In [5]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV, train_test_split, cross_val_score
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import joblib
import numpy as np
import warnings

In [6]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
df_train = df_train.drop(['id'], axis=1)
df_test = df_test.drop(['id'], axis=1)
X = df_train.drop(['smoking'], axis=1)
y = df_train['smoking']

In [7]:
#split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [20]:
# Define cross-validation strategy
cross_val_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define XGBoost classifier
xgb_classifier = XGBClassifier(random_state=42)

# Define parameter grid for grid search
param_grid = {
    'max_depth': [2, 3, 4, 5, 6, None],
    'learning_rate': np.linspace(0.001, 0.1, 5),
    'n_estimators': [50, 100, 150]
}

# Perform grid search
grid_search_xgbm = GridSearchCV(
    xgb_classifier, 
    param_grid=param_grid,
    verbose=3,
    cv=cross_val_strategy, 
    scoring='neg_log_loss'
)

# Fit the model
grid_search_xgbm.fit(X_train, y_train)

# Print the best parameters and score
print(grid_search_xgbm.best_params_)
print(grid_search_xgbm.best_score_)
gs_xgbm_model = grid_search_xgbm.best_estimator_
# Save the model
joblib.dump(gs_xgbm_model, 'gs_xgbm_model.pkl')


Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV 1/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=-0.672 total time=   0.4s
[CV 2/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=-0.672 total time=   0.3s
[CV 3/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=-0.672 total time=   0.2s
[CV 4/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=-0.672 total time=   0.2s
[CV 5/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=-0.672 total time=   0.2s
[CV 1/5] END learning_rate=0.001, max_depth=2, n_estimators=100;, score=-0.660 total time=   0.3s
[CV 2/5] END learning_rate=0.001, max_depth=2, n_estimators=100;, score=-0.659 total time=   0.3s
[CV 3/5] END learning_rate=0.001, max_depth=2, n_estimators=100;, score=-0.660 total time=   0.3s
[CV 4/5] END learning_rate=0.001, max_depth=2, n_estimators=100;, score=-0.660 total time=   0.3s
[CV 5/5] END learning_rate=0.001, max_depth=2, n_estimators=1

['gs_xgbm_model.pkl']

In [15]:
# Define cross-validation strategy
cross_val_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define XGBoost classifier
xgb_classifier = XGBClassifier(random_state=42)

# Define parameter grid for random search
param_grid = {
    'max_depth': range(1, 11),  # exploring depths from 1 to 10
    'learning_rate': np.linspace(0.001, 0.999, 10),
    'n_estimators': range(50, 200, 10)  # exploring number of trees from 50 to 200 with step of 10
}

# Perform random search
random_search_xgb = RandomizedSearchCV(
    xgb_classifier, 
    param_distributions=param_grid,
    verbose=3,
    cv=cross_val_strategy, 
    scoring='neg_log_loss',
    n_iter=100,
)

# Fit the model
random_search_xgb.fit(X_train, y_train)

# Print the best parameters and score
print(random_search_xgb.best_params_)
print(random_search_xgb.best_score_)

# Save the model
joblib.dump(random_search_xgb.best_estimator_, 'rs_xgb_model.pkl')

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END learning_rate=0.001, max_depth=8, n_estimators=70;, score=-0.662 total time=   1.1s
[CV 2/5] END learning_rate=0.001, max_depth=8, n_estimators=70;, score=-0.662 total time=   0.8s
[CV 3/5] END learning_rate=0.001, max_depth=8, n_estimators=70;, score=-0.662 total time=   0.7s
[CV 4/5] END learning_rate=0.001, max_depth=8, n_estimators=70;, score=-0.662 total time=   0.7s
[CV 5/5] END learning_rate=0.001, max_depth=8, n_estimators=70;, score=-0.662 total time=   0.8s
[CV 1/5] END learning_rate=0.5554444444444444, max_depth=9, n_estimators=130;, score=-0.541 total time=   1.2s
[CV 2/5] END learning_rate=0.5554444444444444, max_depth=9, n_estimators=130;, score=-0.539 total time=   1.2s
[CV 3/5] END learning_rate=0.5554444444444444, max_depth=9, n_estimators=130;, score=-0.550 total time=   1.2s
[CV 4/5] END learning_rate=0.5554444444444444, max_depth=9, n_estimators=130;, score=-0.544 total time=   1.3s
[CV 5/5]

['rs_xgb_model.pkl']

In [9]:
# Define cross-validation strategy
cross_val_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define LightGBM classifier
lgbm_classifier = LGBMClassifier(random_state=23)

# Define parameter grid for grid search
param_grid = {
    'max_depth': [2, 3, 4, 5, 6, None],
    'learning_rate': np.linspace(0.001, 0.999, 10),
    'n_estimators': [50, 100, 150]
}

# Perform grid search
grid_search_lgbm = GridSearchCV(
    lgbm_classifier, 
    param_grid=param_grid,
    verbose=1,
    cv=cross_val_strategy, 
    scoring='neg_log_loss'
)

# Fit the model
grid_search_lgbm.fit(X_train, y_train)

# Print the best parameters and score
print(grid_search_lgbm.best_params_)
print(grid_search_lgbm.best_score_)

# Save the model
joblib.dump(grid_search_lgbm.best_estimator_, 'gs_lgbm_model.pkl')

[LightGBM] [Info] Number of positive: 44610, number of negative: 57314
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004201 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2110
[LightGBM] [Info] Number of data points in the train set: 101924, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.437679 -> initscore=-0.250587
[LightGBM] [Info] Start training from score -0.250587
[LightGBM] [Info] Number of positive: 44609, number of negative: 57314
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005467 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2103
[LightGBM] [Info] Number of data points in the train set: 101923, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.437674 -> initscore=-0.250609
[LightGBM] [Info] Start training from score -0.250609
[LightGBM]

['gs_lgbm_model.pkl']

In [8]:
# Define cross-validation strategy
cross_val_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define CatBoost classifier
catboost_classifier = CatBoostClassifier(random_state=42)

# Define parameter grid for grid search
param_grid = {
    'max_depth': [2, 3, 4, 5, 6, None],
    'learning_rate':np.linspace(0.01, 0.5,10),
    'n_estimators': [50, 100, 150]
}

# Perform grid search
grid_search_catboost = GridSearchCV(
    catboost_classifier, 
    param_grid=param_grid,
    verbose=2,
    cv=cross_val_strategy, 
    scoring='neg_log_loss'
)

# Fit the model
grid_search_catboost.fit(X_train, y_train)

# Print the best parameters and score
print(grid_search_catboost.best_params_)
print(grid_search_catboost.best_score_)

# Save the model
joblib.dump(grid_search_catboost.best_estimator_, 'gs_catboost_model.pkl')

Fitting 5 folds for each of 180 candidates, totalling 900 fits
0:	learn: 0.6893127	total: 80.5ms	remaining: 3.95s
1:	learn: 0.6854778	total: 96.5ms	remaining: 2.31s
2:	learn: 0.6817526	total: 110ms	remaining: 1.72s
3:	learn: 0.6780867	total: 123ms	remaining: 1.41s
4:	learn: 0.6745380	total: 136ms	remaining: 1.22s
5:	learn: 0.6710810	total: 149ms	remaining: 1.09s
6:	learn: 0.6677365	total: 161ms	remaining: 988ms
7:	learn: 0.6643166	total: 173ms	remaining: 907ms
8:	learn: 0.6609818	total: 185ms	remaining: 843ms
9:	learn: 0.6577393	total: 199ms	remaining: 795ms
10:	learn: 0.6546436	total: 213ms	remaining: 756ms
11:	learn: 0.6515336	total: 226ms	remaining: 717ms
12:	learn: 0.6489742	total: 238ms	remaining: 678ms
13:	learn: 0.6460038	total: 251ms	remaining: 645ms
14:	learn: 0.6436097	total: 263ms	remaining: 614ms
15:	learn: 0.6408754	total: 276ms	remaining: 585ms
16:	learn: 0.6382274	total: 288ms	remaining: 558ms
17:	learn: 0.6357479	total: 300ms	remaining: 534ms
18:	learn: 0.6331435	total:

['gs_catboost_model.pkl']

In [28]:
# Define CatBoost classifier
catboost_classifier = CatBoostClassifier(random_state=42, max_depth=3, learning_rate=0.01, n_estimators=100)

# Define cross-validation strategy
cross_val_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Compute cross-validation score
scores = cross_val_score(catboost_classifier, X_train, y_train, cv=cross_val_strategy, scoring='neg_log_loss')

# Print the scores
print(f"Mean Log Loss: {scores.mean():.2f}")

0:	learn: 0.6885591	total: 15.1ms	remaining: 1.49s
1:	learn: 0.6840537	total: 29.4ms	remaining: 1.44s
2:	learn: 0.6797094	total: 45.8ms	remaining: 1.48s
3:	learn: 0.6757148	total: 63.5ms	remaining: 1.52s
4:	learn: 0.6715069	total: 78.1ms	remaining: 1.48s
5:	learn: 0.6675010	total: 92.6ms	remaining: 1.45s
6:	learn: 0.6635374	total: 106ms	remaining: 1.41s
7:	learn: 0.6597411	total: 120ms	remaining: 1.38s
8:	learn: 0.6560755	total: 134ms	remaining: 1.35s
9:	learn: 0.6523945	total: 147ms	remaining: 1.32s
10:	learn: 0.6487785	total: 161ms	remaining: 1.3s
11:	learn: 0.6453168	total: 174ms	remaining: 1.28s
12:	learn: 0.6418658	total: 187ms	remaining: 1.25s
13:	learn: 0.6387538	total: 202ms	remaining: 1.24s
14:	learn: 0.6354844	total: 215ms	remaining: 1.22s
15:	learn: 0.6325381	total: 229ms	remaining: 1.2s
16:	learn: 0.6295968	total: 243ms	remaining: 1.19s
17:	learn: 0.6268397	total: 256ms	remaining: 1.17s
18:	learn: 0.6239689	total: 269ms	remaining: 1.15s
19:	learn: 0.6210553	total: 290ms	rem

In [9]:
#load all the saved models from checpoints folder abd create a dictionary of models with their names
models = {
    'gs_xgbm_model': joblib.load('checkpoints/gs_xgbm_model.pkl'),
    'rs_xgb_model': joblib.load('checkpoints/rs_xgb_model.pkl'),
    'gs_lgbm_model': joblib.load('checkpoints/gs_lgbm_model.pkl'),
    'gs_catboost_model': joblib.load('checkpoints/gs_catboost_model.pkl')
}
#now using the models predict the probabilities of the test (eval) data and calulate roc_auc_score
predictions = {}
for model_name, model in models.items():
    predictions[model_name] = model.predict_proba(X_test)[:, 1]
    print(f'{model_name} roc_auc_score: {roc_auc_score(y_test, predictions[model_name])}')

gs_xgbm_model roc_auc_score: 0.8847283005325309
rs_xgb_model roc_auc_score: 0.8850898382411589
gs_lgbm_model roc_auc_score: 0.8800076320197806
gs_catboost_model roc_auc_score: 0.8659115378134018
