In [1]:
#### Library Imports
import pandas as pd 
from sklearn.model_selection import KFold, train_test_split
from model_creators.xgboost_model import XGBModel
from sklearn.metrics import accuracy_score


In [2]:
#### Parameters
# Single seed reused below for the KFold splitter (and for the train/test split cell).
seed = 7

### Model training
model_list = []    # Names of the models to train; options include 'xgboost'
cross_validation_strat = KFold(n_splits=5, shuffle=True, random_state=seed)    # Shuffled, seeded 5-fold CV. See ReadMe.md for more information
USE_ENSEMBLE = False    # If False, will only train the first model — NOTE(review): original comment was cut off; presumably "the first model in model_list", confirm
metric = 'accuracy'    # NOTE(review): presumably the evaluation metric name; not read by any cell visible here — confirm

### Logging
# NOTE(review): neither logging setting is read by the cells visible here; confirm where they are consumed.
USE_LOGGER = False
LOG_FILE = ''


### Data
target_col = 'Transported'    # Label column; split off from the features before training
data_filename = './data/train-cleaned.csv'    # Relative path of the CSV loaded into `df`
validation_size = 0.1    # Passed as `test_size` to train_test_split (fraction of rows held out)

In [3]:
#### Load data
# Read the CSV named by `data_filename` into `df`, then end the cell with a
# five-row preview so the notebook renders it as the cell output.
df = pd.read_csv(filepath_or_buffer=data_filename)
df.head(n=5)

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Earth,Europa,Mars,55 Cancri e,PSO J318.5-22,TRAPPIST-1e
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0,0.0,0.0,1.0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,1.0,0.0,0.0,0.0,0.0,1.0
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,0.0,1.0,0.0,0.0,0.0,1.0
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,0.0,1.0,0.0,0.0,0.0,1.0
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,1.0,0.0,0.0,0.0,0.0,1.0


In [4]:
# Split data into train/test
# `validation_size` is the held-out fraction; `seed` makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=validation_size,
    random_state=seed,
)
print(f"Train size: {train_df.shape}\nTest size: {test_df.shape}")

# Separate the training rows into the target series and the remaining feature columns.
train_y = train_df[target_col]
train_x = train_df.drop(columns=target_col)

Train size: (7823, 15)
Test size: (870, 15)


In [5]:
# Train XGB model
# Wrap the training features/target and the CV strategy in the project's XGBModel,
# then launch its trial run (each trial is reported in the log output under this cell).
n_search_trials = 10    # number of trials requested from run_trial
xgb_model = XGBModel(train_x, train_y, cross_validation_strat)
xgb_model.run_trial(n_trials=n_search_trials)

[I 2023-08-11 16:46:34,910] A new study created in memory with name: no-name-a557500b-64da-4c62-82c3-447c500edd6c
[I 2023-08-11 16:46:38,025] Trial 0 finished with value: 0.7924067068138548 and parameters: {'max_depth': 6, 'subsample': 0.75, 'n_estimators': 1300, 'eta': 0.06999999999999999, 'reg_alpha': 42, 'reg_lambda': 86, 'min_child_weight': 2, 'colsample_bytree': 0.5664146998709992}. Best is trial 0 with value: 0.7924067068138548.
[I 2023-08-11 16:46:42,531] Trial 1 finished with value: 0.7906173243015779 and parameters: {'max_depth': 7, 'subsample': 0.8, 'n_estimators': 3525, 'eta': 0.06999999999999999, 'reg_alpha': 40, 'reg_lambda': 56, 'min_child_weight': 17, 'colsample_bytree': 0.4803120379225171}. Best is trial 0 with value: 0.7924067068138548.
[I 2023-08-11 16:46:47,388] Trial 2 finished with value: 0.7936847437961155 and parameters: {'max_depth': 6, 'subsample': 1.0, 'n_estimators': 3025, 'eta': 0.05, 'reg_alpha': 22, 'reg_lambda': 96, 'min_child_weight': 9, 'colsample_bytre