In [1]:
#### Library Imports
import pandas as pd 
from sklearn.model_selection import KFold, train_test_split
from model_creators.xgboost_model import XGBModel
from model_creators.catboost_model import CatModel
from model_creators.lgbm_model import LGBMModel
from model_creators.histgbm_model import HistGBMModel
from model_creators.gbm_model import GBMModel
from sklearn.metrics import accuracy_score
from tabulate import tabulate

In [2]:
### Parameters
seed = 7

### Model training
model_list = ['xgboost', 'catboost', 'lgbm', 'gbm', 'histgbm']    # Options include 'xgboost', 'catboost', 'lgbm', 'gbm', 'histgbm'
cross_validation_strat = KFold(n_splits=5, shuffle=True, random_state=seed)    # See ReadMe.md for more information
USE_ENSEMBLE = False    # If False, will only use best model
metric = 'accuracy'

### Logging
USE_LOGGER = False
LOG_FILE = ''


### Data
target_col = 'Transported'
data_filename = './data/train-cleaned.csv'
validation_size = 0.1


### Model eval
validation_accuracy = []
best_params = {}

In [3]:
#### Load data

df = pd.read_csv(data_filename)
df.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Earth,Europa,Mars,55 Cancri e,PSO J318.5-22,TRAPPIST-1e
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0,0.0,0.0,1.0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,1.0,0.0,0.0,0.0,0.0,1.0
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,0.0,1.0,0.0,0.0,0.0,1.0
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,0.0,1.0,0.0,0.0,0.0,1.0
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,1.0,0.0,0.0,0.0,0.0,1.0


In [4]:
# Split data into train/test 

train_df, val_df = train_test_split(df, test_size=validation_size, random_state=seed)
print(f"Train size: {train_df.shape}\nTest size: {val_df.shape}")

train_x, train_y = train_df.drop(target_col, axis=1), train_df[target_col]
val_x, val_y = val_df.drop(target_col, axis=1), val_df[target_col]

Train size: (7823, 15)
Test size: (870, 15)


In [5]:
# Train XGB model
if 'xgboost' in model_list:
    xgb_model = XGBModel(train_x, train_y, cross_validation_strat)
    xgb_model.run_trial(n_trials=10)

    print("Best model params: ")
    best_params = xgb_model.get_best_params()
    print(best_params)

    # Evaluate model accuracy
    best_params['xgboost'] = best_params
    best_xgb = xgb_model.get_best_model()
    xgb_preds = best_xgb.predict(val_x)
    xgb_acc = accuracy_score(val_y, xgb_preds)
    print(f"XGBoost Validation Accuracy: {xgb_acc}")
    validation_accuracy.append(xgb_acc)

[I 2023-08-17 04:12:55,207] A new study created in memory with name: no-name-e27c1d23-b251-4137-bfec-3c73e12404eb
[I 2023-08-17 04:13:01,720] Trial 0 finished with value: 0.7908724250917202 and parameters: {'max_depth': 5, 'subsample': 0.7, 'n_estimators': 3225, 'eta': 0.05, 'reg_alpha': 40, 'reg_lambda': 32, 'min_child_weight': 12, 'colsample_bytree': 0.6250347718011828}. Best is trial 0 with value: 0.7908724250917202.
[I 2023-08-17 04:13:05,506] Trial 1 finished with value: 0.7851200738664684 and parameters: {'max_depth': 12, 'subsample': 0.95, 'n_estimators': 2950, 'eta': 0.03, 'reg_alpha': 48, 'reg_lambda': 27, 'min_child_weight': 16, 'colsample_bytree': 0.22187708928700178}. Best is trial 0 with value: 0.7908724250917202.
[I 2023-08-17 04:13:11,456] Trial 2 finished with value: 0.7920228299682146 and parameters: {'max_depth': 9, 'subsample': 0.7, 'n_estimators': 3575, 'eta': 0.09999999999999999, 'reg_alpha': 24, 'reg_lambda': 13, 'min_child_weight': 20, 'colsample_bytree': 0.35758

Best model params: 
{'max_depth': 10, 'subsample': 0.6, 'n_estimators': 4150, 'eta': 0.04, 'reg_alpha': 25, 'reg_lambda': 37, 'min_child_weight': 7, 'colsample_bytree': 0.8431320119175758}
XGBoost Validation Accuracy: 0.8091954022988506


In [6]:
# Train CatBoost model
if 'catboost' in model_list:
    cat_model = CatModel(train_x, train_y, cross_validation_strat)
    cat_model.run_trial(n_trials=10)

    print("Best model params: ")
    best_params = cat_model.get_best_params()
    print(best_params)

    # Evaluate model accuracy
    best_params['catboost'] = best_params
    best_cat = cat_model.get_best_model()
    cat_preds = best_cat.predict(val_x)
    cat_acc = accuracy_score(val_y, cat_preds)
    print(f"CatBoost Validation Accuracy: {cat_acc}")
    validation_accuracy.append(cat_acc)

[I 2023-08-17 04:13:57,827] A new study created in memory with name: no-name-43ad3119-f521-4283-9a63-0fe4b46989c9
[I 2023-08-17 04:14:15,901] Trial 0 finished with value: 0.7855043592655843 and parameters: {'max_depth': 5, 'subsample': 1.0, 'n_estimators': 4725, 'eta': 0.04, 'reg_lambda': 31}. Best is trial 0 with value: 0.7855043592655843.
[I 2023-08-17 04:14:29,254] Trial 1 finished with value: 0.794195435640571 and parameters: {'max_depth': 2, 'subsample': 0.6, 'n_estimators': 4650, 'eta': 0.01, 'reg_lambda': 80}. Best is trial 1 with value: 0.794195435640571.
[I 2023-08-17 04:14:31,623] Trial 2 finished with value: 0.7950903311734473 and parameters: {'max_depth': 6, 'subsample': 0.75, 'n_estimators': 325, 'eta': 0.05, 'reg_lambda': 95}. Best is trial 2 with value: 0.7950903311734473.
[I 2023-08-17 04:14:33,021] Trial 3 finished with value: 0.7885715336280366 and parameters: {'max_depth': 7, 'subsample': 1.0, 'n_estimators': 175, 'eta': 0.01, 'reg_lambda': 20}. Best is trial 2 with 

Best model params: 
{'max_depth': 3, 'subsample': 0.8, 'n_estimators': 1800, 'eta': 0.09, 'reg_lambda': 95}
CatBoost Validation Accuracy: 0.7931034482758621


In [7]:
# Train LightGBM model
if 'lgbm' in model_list:
    lgbm_model = LGBMModel(train_x, train_y, cross_validation_strat)
    lgbm_model.run_trial(n_trials=10)

    print("Best model params: ")
    best_params = lgbm_model.get_best_params()
    print(best_params)

    # Evaluate model accuracy
    best_params['lgbm'] = best_params
    best_lgbm = lgbm_model.get_best_model()
    lgbm_preds = best_lgbm.predict(val_x)
    lgbm_acc = accuracy_score(val_y, lgbm_preds)
    print(f"LGBMBoost Validation Accuracy: {lgbm_acc}")
    validation_accuracy.append(lgbm_acc)

[I 2023-08-17 04:51:36,881] A new study created in memory with name: no-name-44cc9ea1-e68e-460d-a482-4d0c506d18a0
[I 2023-08-17 04:51:38,589] Trial 0 finished with value: 0.7935567848475688 and parameters: {'max_depth': 8, 'subsample': 0.8, 'n_estimators': 3325, 'eta': 0.09, 'reg_lambda': 24, 'reg_alpha': 16, 'min_child_weight': 19, 'colsample_bytree': 0.6739765076795549}. Best is trial 0 with value: 0.7935567848475688.
[I 2023-08-17 04:51:40,242] Trial 1 finished with value: 0.7894663474502177 and parameters: {'max_depth': 10, 'subsample': 0.7, 'n_estimators': 3500, 'eta': 0.09, 'reg_lambda': 70, 'reg_alpha': 33, 'min_child_weight': 20, 'colsample_bytree': 0.8141207423691543}. Best is trial 0 with value: 0.7935567848475688.
[I 2023-08-17 04:51:41,101] Trial 2 finished with value: 0.7840972193850453 and parameters: {'max_depth': 8, 'subsample': 0.8, 'n_estimators': 300, 'eta': 0.09999999999999999, 'reg_lambda': 94, 'reg_alpha': 46, 'min_child_weight': 11, 'colsample_bytree': 0.32187899

Best model params: 
{'max_depth': 13, 'subsample': 0.6, 'n_estimators': 150, 'eta': 0.060000000000000005, 'reg_lambda': 43, 'reg_alpha': 4, 'min_child_weight': 19, 'colsample_bytree': 0.6885190059844645}
LGBMBoost Validation Accuracy: 0.8


In [8]:
# Train HistGBM model
if 'histgbm' in model_list:
    hist_model = HistGBMModel(train_x, train_y, cross_validation_strat)
    hist_model.run_trial(n_trials=10)

    print("Best model params: ")
    best_params = hist_model.get_best_params()
    print(best_params)

    # Evaluate model accuracy
    best_params['histgbm'] = best_params
    best_hist = hist_model.get_best_model()
    hist_preds = best_hist.predict(val_x)
    hist_acc = accuracy_score(val_y, hist_preds)
    print(f"HistGBM Validation Accuracy: {hist_acc}")
    validation_accuracy.append(hist_acc)

[I 2023-08-17 04:51:47,073] A new study created in memory with name: no-name-46885158-bae7-44f7-9f2a-d7232dd4e7c9
[I 2023-08-17 04:51:47,835] Trial 0 finished with value: 0.7823088991118047 and parameters: {'max_iter': 550, 'learning_rate': 0.38, 'max_bins': 122, 'max_depth': 3, 'l2_regularization': 3.1}. Best is trial 0 with value: 0.7823088991118047.
[I 2023-08-17 04:51:57,422] Trial 1 finished with value: 0.7585339467082846 and parameters: {'max_iter': 3475, 'learning_rate': 0.24000000000000002, 'max_bins': 157, 'max_depth': 15, 'l2_regularization': 0.5}. Best is trial 0 with value: 0.7823088991118047.
[I 2023-08-17 04:52:09,323] Trial 2 finished with value: 0.7550817515504604 and parameters: {'max_iter': 4825, 'learning_rate': 0.31, 'max_bins': 32, 'max_depth': 13, 'l2_regularization': 1.8000000000000003}. Best is trial 0 with value: 0.7823088991118047.
[I 2023-08-17 04:52:16,950] Trial 3 finished with value: 0.7608343479077977 and parameters: {'max_iter': 2900, 'learning_rate': 0.

Best model params: 
{'max_iter': 550, 'learning_rate': 0.38, 'max_bins': 122, 'max_depth': 3, 'l2_regularization': 3.1}
HistGBM Validation Accuracy: 0.7954022988505747


In [9]:
# Train GBM model
if 'gbm' in model_list:
    gbm_model = GBMModel(train_x, train_y, cross_validation_strat)
    gbm_model.run_trial(n_trials=10)

    print("Best model params: ")
    best_params = gbm_model.get_best_params()
    print(best_params)

    # Evaluate model accuracy
    best_params['gbm'] = best_params
    best_gbm = gbm_model.get_best_model()
    gbm_preds = best_gbm.predict(val_x)
    gbm_acc = accuracy_score(val_y, gbm_preds)
    print(f"GBM Validation Accuracy: {gbm_acc}")
    validation_accuracy.append(gbm_acc)

[I 2023-08-17 04:52:47,683] A new study created in memory with name: no-name-5868c6df-e042-4020-98f1-df7c02b7cf80
[I 2023-08-17 04:54:11,547] Trial 0 finished with value: 0.7733613328648588 and parameters: {'max_depth': 9, 'subsample': 0.9, 'n_estimators': 4075, 'learning_rate': 0.01}. Best is trial 0 with value: 0.7733613328648588.
[I 2023-08-17 04:54:29,616] Trial 1 finished with value: 0.7847388934737667 and parameters: {'max_depth': 5, 'subsample': 0.65, 'n_estimators': 2250, 'learning_rate': 0.02}. Best is trial 1 with value: 0.7847388934737667.
[I 2023-08-17 04:54:45,195] Trial 2 finished with value: 0.7878043519116218 and parameters: {'max_depth': 2, 'subsample': 0.6, 'n_estimators': 4100, 'learning_rate': 0.060000000000000005}. Best is trial 2 with value: 0.7878043519116218.
[I 2023-08-17 04:55:07,239] Trial 3 finished with value: 0.7725942328591391 and parameters: {'max_depth': 11, 'subsample': 0.9, 'n_estimators': 800, 'learning_rate': 0.09}. Best is trial 2 with value: 0.787

Best model params: 
{'max_depth': 4, 'subsample': 0.65, 'n_estimators': 2400, 'learning_rate': 0.01}
GBM Validation Accuracy: 0.8045977011494253


In [10]:
print(tabulate(zip(model_list, validation_accuracy)))

--------  --------
xgboost   0.809195
catboost  0.793103
lgbm      0.8
gbm       0.795402
histgbm   0.804598
--------  --------
