In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import math
import xgboost as xgb
import shap
import optuna
from optuna import Trial
from sklearn.preprocessing import StandardScaler, LabelEncoder
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

/kaggle/input/playground-series-s3e26/sample_submission.csv
/kaggle/input/playground-series-s3e26/train.csv
/kaggle/input/playground-series-s3e26/test.csv


In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s3e26/train.csv').drop(columns=['id'])
test = pd.read_csv('/kaggle/input/playground-series-s3e26/test.csv').drop(columns=['id'])

In [None]:
numeric_columns = train.select_dtypes(include='number').columns.tolist()
categorical_columns = train.select_dtypes(include='object').columns.tolist()
categorical_columns.remove('Status')

## Data Pre-Processing

In [12]:
# Encode target variable

# Creating a instance of label Encoder.
le = LabelEncoder()

# Using .fit_transform function to fit label
# encoder and return encoded label
train['Status'] = le.fit_transform(train['Status'])
le_name_mapping = dict(zip(le.transform(['C', 'D', 'CL']), ['Status_' + item for item in le.inverse_transform([0, 1, 2])]))

print(le_name_mapping)

{0: 'Status_C', 2: 'Status_CL', 1: 'Status_D'}


In [13]:
# Add stage to categorical columns
categorical_columns.append('Stage')
numeric_columns.remove('Stage')

In [14]:
# Create dummies for categorical variables
train = pd.get_dummies(train, columns=categorical_columns, drop_first=True, dtype=int)
test = pd.get_dummies(test, columns=categorical_columns, drop_first=True, dtype=int)

In [15]:
X = train.drop(columns=['Status'], axis =1).values
y = train['Status'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.15, random_state=4131)

## Fitting Individual Models
For the competition, an ensemble of models will be developed. As such, individual models need to be fitted. The following models shall be fitted:
* XGBoost
* LGBM
* CatBoost

The Optuna hyperparameter tuning framework shall be used to efficiently search the parameter space and select the model pa

In [19]:
def objective_xgb(trial):
    
    xgb_params = {
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.01, 1),
        'subsample': trial.suggest_loguniform('subsample', 0.1, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.1, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1),
        'eval_metric':trial.suggest_categorical('eval_metric', ['mlogloss']),
    }
    xgb_model = xgb.XGBClassifier(**xgb_params,random_state=42)
    xgb_model.fit(X_train,y_train)
    
    return log_loss(y_test, xgb_model.predict_proba(X_test))

study_xgb = optuna.create_study(direction='minimize')

study_xgb.optimize(objective_xgb, n_trials=50, show_progress_bar=True)

[I 2024-01-02 13:15:50,682] A new study created in memory with name: no-name-cdb3be81-9ad8-4b6b-9256-b24dd64b7346


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2024-01-02 13:15:55,907] Trial 0 finished with value: 0.44367731782823766 and parameters: {'booster': 'gbtree', 'max_depth': 14, 'learning_rate': 0.021776758513913308, 'n_estimators': 1136, 'min_child_weight': 0.11487140285373688, 'subsample': 0.11426374913141848, 'colsample_bylevel': 0.8528572614677947, 'colsample_bytree': 0.21399355862278832, 'colsample_bynode': 0.5662414088691785, 'reg_alpha': 0.019544258086253263, 'reg_lambda': 0.6510515806305793, 'eval_metric': 'mlogloss'}. Best is trial 0 with value: 0.44367731782823766.
[I 2024-01-02 13:16:03,157] Trial 1 finished with value: 0.4357076533012237 and parameters: {'booster': 'gbtree', 'max_depth': 12, 'learning_rate': 0.015208171401725774, 'n_estimators': 1379, 'min_child_weight': 0.3690832691324445, 'subsample': 0.31181618187542265, 'colsample_bylevel': 0.9850524556364444, 'colsample_bytree': 0.1502139289642078, 'colsample_bynode': 0.6093190078948872, 'reg_alpha': 0.02965813553305143, 'reg_lambda': 0.1747375184405854, 'eval_met

In [20]:
def objective_cat(trial):
    
    cat_params = {
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['MVS','Bayesian']),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.1, 1),
        'iterations' : trial.suggest_int('iterations', 500, 1100),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg',0.1,4.0),
        'bagging_temperature' :trial.suggest_float('bagging_temperature', 1, 3),
        'depth' : trial.suggest_int('depth', 1, 12),
        'random_strength': trial.suggest_float('random_strength', 0.1, 10.0),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 20),
        'grow_policy' : trial.suggest_categorical('grow_policy', ['SymmetricTree','Depthwise']),
        'boosting_type': trial.suggest_categorical('boosting_type',['Plain']),
        'score_function': trial.suggest_categorical('score_function',['Cosine','L2']),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations',1,15)
    }
    
    cat = CatBoostClassifier(**cat_params,logging_level='Silent',random_state=42)
    cat.fit(X_train,y_train)
 
    # Show the log loss score directly
    return log_loss(y_test, cat.predict_proba(X_test))

study_cat = optuna.create_study(direction='minimize')

study_cat.optimize(objective_cat, n_trials=50, show_progress_bar=True)

[I 2024-01-02 13:21:47,330] A new study created in memory with name: no-name-f1d926c7-7814-4412-b2e1-29a7d2547165


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2024-01-02 13:21:50,740] Trial 0 finished with value: 0.46460324834237454 and parameters: {'bootstrap_type': 'MVS', 'colsample_bylevel': 0.21448044391161836, 'iterations': 626, 'l2_leaf_reg': 3.4637813054314464, 'bagging_temperature': 2.7848133188954876, 'depth': 3, 'random_strength': 7.228274377567994, 'min_data_in_leaf': 11, 'grow_policy': 'SymmetricTree', 'boosting_type': 'Plain', 'score_function': 'Cosine', 'leaf_estimation_iterations': 2}. Best is trial 0 with value: 0.46460324834237454.
[I 2024-01-02 13:21:55,595] Trial 1 finished with value: 0.46268924327731337 and parameters: {'bootstrap_type': 'MVS', 'colsample_bylevel': 0.5367397627825986, 'iterations': 500, 'l2_leaf_reg': 1.2933320326959816, 'bagging_temperature': 1.3037827216784657, 'depth': 3, 'random_strength': 5.435350302574786, 'min_data_in_leaf': 1, 'grow_policy': 'Depthwise', 'boosting_type': 'Plain', 'score_function': 'Cosine', 'leaf_estimation_iterations': 5}. Best is trial 1 with value: 0.46268924327731337.
[I 2

In [21]:
def objective_lgbm(trial):
    
    lgbm_params = {
    'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
    'max_depth': trial.suggest_int('max_depth', 10, 30),
    'num_leaves': trial.suggest_int('num_leaves', 20, 100),
    'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 30),
    'boosting_type': trial.suggest_categorical('boosting_type',['gbdt', 'dart','rf']),
    'n_estimators': trial.suggest_int('n_estimators', 500, 1400),
    'subsample': trial.suggest_float('subsample',0.1,1.0),
    'subsample_freq': trial.suggest_int('subsample_freq',1,20),
    'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
    'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
    'min_child_samples': trial.suggest_int('min_child_samples', 10, 30),
    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.0, 1.0)
    }
    
    lgbm = LGBMClassifier(**lgbm_params,verbose=-1,random_state=22)
    lgbm.fit(X_train,y_train)
    
    # Show the log loss score directly
    return log_loss(y_test, lgbm.predict_proba(X_test))

study_lgbm = optuna.create_study(direction='minimize')

study_lgbm.optimize(objective_lgbm, n_trials=50, show_progress_bar=True)


[I 2024-01-02 13:34:46,966] A new study created in memory with name: no-name-29ebcd0f-2dc5-46b1-b0ca-67ab8923f848


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2024-01-02 13:34:49,790] Trial 0 finished with value: 0.4408142073693818 and parameters: {'learning_rate': 0.08892953901501109, 'max_depth': 17, 'num_leaves': 83, 'min_data_in_leaf': 11, 'boosting_type': 'gbdt', 'n_estimators': 1040, 'subsample': 0.4966784183901134, 'subsample_freq': 8, 'reg_alpha': 1.833177064306184, 'reg_lambda': 2.9561034238201556, 'min_child_samples': 21, 'colsample_bytree': 0.019291986209904133}. Best is trial 0 with value: 0.4408142073693818.
[I 2024-01-02 13:34:51,930] Trial 1 finished with value: 0.44296736023035943 and parameters: {'learning_rate': 0.05009528112249193, 'max_depth': 16, 'num_leaves': 66, 'min_data_in_leaf': 18, 'boosting_type': 'gbdt', 'n_estimators': 910, 'subsample': 0.685563929947971, 'subsample_freq': 13, 'reg_alpha': 6.580463298993703, 'reg_lambda': 4.9661594758986185, 'min_child_samples': 23, 'colsample_bytree': 0.04693635985918421}. Best is trial 0 with value: 0.4408142073693818.
[I 2024-01-02 13:34:53,195] Trial 2 finished with value

## Fitting Ensemble of Trained Classifiers

In [22]:
xgb_model = xgb.XGBClassifier(**study_xgb.best_params,random_state=42)

lgb_model = LGBMClassifier(**study_lgbm.best_params,random_state=42,verbose=-1)

cat_model = CatBoostClassifier(**study_cat.best_params,logging_level='Silent',random_state=42)

# Create ensemble classifier using the pre-trained models
_classifier = VotingClassifier(estimators=[('xgb', xgb_model),
                                     ('cat', cat_model),
                                     ('lgb', lgb_model)], voting='soft')
_classifier = _classifier.fit(X_train, y_train)



In [23]:
y_pred_proba = _classifier.predict_proba(X_test)

# Evaluate the model performance using log loss
logloss = log_loss(y_test, y_pred_proba)
print("Log Loss on Test Set:", logloss)

Log Loss on Test Set: 0.4292927563824632
