## Train current best model & create submission

In [62]:
# essentials
import os
import pathlib

import pandas as pd
import numpy as np
from tqdm import tqdm

# visualisation
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn imports
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, cross_validate, RepeatedStratifiedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MaxAbsScaler, PowerTransformer, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2, f_classif, SequentialFeatureSelector, RFECV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.base import clone as clone_model
from sklearn.metrics import classification_report, confusion_matrix, log_loss, recall_score, make_scorer


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay

# others
import xgboost as xgb 
import lightgbm as lgb
import catboost as cb

from imblearn.over_sampling import RandomOverSampler, ADASYN, SMOTE, SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imbPipeline

RANDOM_SEED = 64

In [63]:
IN_KAGGLE = False
kaggle_folder = "/kaggle/input/playground-series-s3e26"
local_folder = "./data"
train_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder + "/train.csv", index_col="id")
test_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder  + "/test.csv", index_col="id")

target_column = "Status"

def feature_engineering(df):
    df['date_of_diagnosis'] = df['Age'] - df['N_Days']
    df['no_diseases'] = (df['Ascites'] + df['Hepatomegaly'] + df['Spiders'] + df['Edema']) == 0
    df['diseases'] = df['Ascites'] + df['Hepatomegaly'] + df['Spiders'] + df['Edema']

    df['Drug'] = df['Drug'].map({"D-penicillamine": 1,"placebo": 0})
    df['Sex'] = df['Sex'].map({"F": 1,"M": 0})

    # change "Stage" to string
    df["Stage"] = df["Stage"].apply(lambda x: str(x))
    return df

run_feature_engineering = True

categorical_features = ["Drug", "Sex", "Ascites", "Hepatomegaly", "Spiders", "Edema", "Stage"]
numerical_features = ["N_Days", "Age", "Bilirubin", "Cholesterol", "Albumin", "Copper", "Alk_Phos", "SGOT", "Tryglicerides", "Platelets", "Prothrombin"]

if run_feature_engineering:
    train_df = feature_engineering(train_df)

if run_feature_engineering:
    categorical_features += ["no_diseases", "diseases"]
    numerical_features += ["date_of_diagnosis"]

X = train_df.drop(columns=target_column)
y = train_df[target_column]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=RANDOM_SEED, stratify=y, shuffle=True)

le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

In [64]:
numeric_transformer = Pipeline(
    [
        #("power_transformer", PowerTransformer()),
        ("scaler", StandardScaler()),
        
    ]
)

categorical_transformer = Pipeline(
    [
        ("onehot", OneHotEncoder(handle_unknown="ignore", drop="if_binary")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)





models = {
    "default catboost": cb.CatBoostClassifier(random_state=RANDOM_SEED, verbose=False),
    "default lightgbm": lgb.LGBMClassifier(random_state=RANDOM_SEED, n_jobs=4),
    "stacking classifier": StackingClassifier(
        estimators=[
            ("xgb", xgb.XGBClassifier(
                objective="multi:softprob", random_state=RANDOM_SEED, n_jobs=-1,
                subsample=0.8,
                min_child_weight=7,
                max_depth=7,
                reg_lambda=0.9,
                gamma=0.9,
                eta=0.08,
                colsample_bytree=0.5,
                reg_alpha=0.5
            )),
            ("catboost", cb.CatBoostClassifier(random_state=RANDOM_SEED, verbose=False)),
        ],
        stack_method="predict_proba",
        final_estimator=LogisticRegression(),
        n_jobs=-1,
        verbose=1), 
}

def recall_scorer_class_0(estimator, X_data, y_labels):
    y_pred = estimator.predict(X_data)
    return recall_score(y_labels, y_pred, average=None)[0]

def recall_scorer_class_1(estimator, X_data, y_labels):
    y_pred = estimator.predict(X_data)
    return recall_score(y_labels, y_pred, average=None)[1]

def recall_scorer_class_2(estimator, X_data, y_labels):
    y_pred = estimator.predict(X_data)
    return recall_score(y_labels, y_pred, average=None)[2]

data = []
for model_name, model in models.items():
    print(f"Training {model_name}")

    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", model)
    ])

    scores = cross_validate(pipeline, X, y, scoring={
        "neg_log_loss": "neg_log_loss",
        "precision_macro": "precision_macro",
        "recall_macro": "recall_macro",
        "recall_class_0": recall_scorer_class_0,
        "recall_class_1": recall_scorer_class_1,
        "recall_class_2": recall_scorer_class_2,
    }, cv=skf, n_jobs=-1)


    data.append({
        "model": model_name, 
        "log_loss_score": -scores['test_neg_log_loss'].mean(),
        "precision_macro": scores['test_precision_macro'].mean(),
        "recall_macro": scores['test_recall_macro'].mean(),
        "recall_class_0": scores['test_recall_class_0'].mean(),
        "recall_class_1": scores['test_recall_class_1'].mean(),
        "recall_class_2": scores['test_recall_class_2'].mean()
    })

for model_name, model in models.items():
    print(f"Training {model_name}")

    pipeline = imbPipeline(steps=[
        ('sampler', RandomOverSampler()),
        ("preprocessor", preprocessor),
        ("classifier", model)
    ])

    scores = cross_validate(pipeline, X, y, scoring={
        "neg_log_loss": "neg_log_loss",
        "precision_macro": "precision_macro",
        "recall_macro": "recall_macro",
        "recall_class_0": recall_scorer_class_0,
        "recall_class_1": recall_scorer_class_1,
        "recall_class_2": recall_scorer_class_2,
    }, cv=skf, n_jobs=-1)


    data.append({
        "model": model_name + "_oversampled", 
        "log_loss_score": -scores['test_neg_log_loss'].mean(),
        "precision_macro": scores['test_precision_macro'].mean(),
        "recall_macro": scores['test_recall_macro'].mean(),
        "recall_class_0": scores['test_recall_class_0'].mean(),
        "recall_class_1": scores['test_recall_class_1'].mean(),
        "recall_class_2": scores['test_recall_class_2'].mean(),
    })

for model_name, model in models.items():
    print(f"Training {model_name}")

    pipeline = imbPipeline(steps=[
        ('sampler', RandomUnderSampler()),
        ("preprocessor", preprocessor),
        ("classifier", model)
    ])

    scores = cross_validate(pipeline, X, y, scoring={
        "neg_log_loss": "neg_log_loss",
        "precision_macro": "precision_macro",
        "recall_macro": "recall_macro",
        "recall_class_0": recall_scorer_class_0,
        "recall_class_1": recall_scorer_class_1,
        "recall_class_2": recall_scorer_class_2,
    }, cv=skf, n_jobs=-1)


    data.append({
        "model": model_name + "_undersampled", 
        "log_loss_score": -scores['test_neg_log_loss'].mean(),
        "precision_macro": scores['test_precision_macro'].mean(),
        "recall_macro": scores['test_recall_macro'].mean(),
        "recall_class_0": scores['test_recall_class_0'].mean(),
        "recall_class_1": scores['test_recall_class_1'].mean(),
        "recall_class_2": scores['test_recall_class_2'].mean(),
    })

Training default catboost




Training default lightgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000359 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2220
[LightGBM] [Info] Number of data points in the train set: 5270, number of used features: 37
[LightGBM] [Info] Start training from score -0.465082
[LightGBM] [Info] Start training from score -3.360299
[LightGBM] [Info] Start training from score -1.087104
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000573 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2218
[LightGBM] [Info] Number of data points in the train set: 5270, number of used features: 37
[LightGBM] [Info] Start training from score -0.465082
[LightGBM] [Info] Start training from score -3.360299
[LightGBM] [Info] Start training from score -1.087104
[LightGBM]



Training stacking classifier




Training default catboost




Training default lightgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000536 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2254
[LightGBM] [Info] Number of data points in the train set: 9930, number of used features: 40
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000700 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2244
[LightGBM] [Info] Number of data points in the train set: 9930, number of used features: 40
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




Training stacking classifier




Training default catboost




Training default lightgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000108 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1257
[LightGBM] [Info] Number of data points in the train set: 552, number of used features: 28
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000084 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.

[LightGBM] [Info] Number of data points in the train set: 549, number of used features: 28
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612










You can set `force_col_wise=true` t



Training stacking classifier




In [66]:
for model_name, model in models.items():
    print(f"Training {model_name}")

    pipeline = imbPipeline(steps=[
        ("preprocessor", preprocessor),
        ('sampler', ADASYN()),
        ("classifier", model)
    ])

    scores = cross_validate(pipeline, X, y, scoring={
        "neg_log_loss": "neg_log_loss",
        "precision_macro": "precision_macro",
        "recall_macro": "recall_macro",
        "recall_class_0": recall_scorer_class_0,
        "recall_class_1": recall_scorer_class_1,
        "recall_class_2": recall_scorer_class_2,
    }, cv=skf, n_jobs=-1)


    data.append({
        "model": model_name + "_adasyn", 
        "log_loss_score": -scores['test_neg_log_loss'].mean(),
        "precision_macro": scores['test_precision_macro'].mean(),
        "recall_macro": scores['test_recall_macro'].mean(),
        "recall_class_0": scores['test_recall_class_0'].mean(),
        "recall_class_1": scores['test_recall_class_1'].mean(),
        "recall_class_2": scores['test_recall_class_2'].mean(),
    })



Training default lightgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001640 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6474
[LightGBM] [Info] Number of data points in the train set: 9818, number of used features: 40
[LightGBM] [Info] Start training from score -1.087269
[LightGBM] [Info] Start training from score -1.076153
[LightGBM] [Info] Start training from score -1.133329
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001690 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6113
[LightGBM] [Info] Number of data points in the train set: 9849, number of used features: 38
[LightGBM] [Info] Start training from score -1.090422
[LightGBM] [Info] Start training from score -1.067425
[LightGBM] [Info] Start training from score -1.139333
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of 



Training stacking classifier




In [67]:
for model_name, model in models.items():
    print(f"Training {model_name}")

    pipeline = imbPipeline(steps=[
        ("preprocessor", preprocessor),
        ('sampler', SMOTE()),
        ("classifier", model)
    ])

    scores = cross_validate(pipeline, X, y, scoring={
        "neg_log_loss": "neg_log_loss",
        "precision_macro": "precision_macro",
        "recall_macro": "recall_macro",
        "recall_class_0": recall_scorer_class_0,
        "recall_class_1": recall_scorer_class_1,
        "recall_class_2": recall_scorer_class_2,
    }, cv=skf, n_jobs=-1)


    data.append({
        "model": model_name + "_adasyn", 
        "log_loss_score": -scores['test_neg_log_loss'].mean(),
        "precision_macro": scores['test_precision_macro'].mean(),
        "recall_macro": scores['test_recall_macro'].mean(),
        "recall_class_0": scores['test_recall_class_0'].mean(),
        "recall_class_1": scores['test_recall_class_1'].mean(),
        "recall_class_2": scores['test_recall_class_2'].mean(),
    })

Training default catboost




Training default lightgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000820 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6623
[LightGBM] [Info] Number of data points in the train set: 9930, number of used features: 40
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001906 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6317
[LightGBM] [Info] Number of data points in the train set: 9930, number of used features: 39
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM]



Training stacking classifier




In [68]:
for model_name, model in models.items():
    print(f"Training {model_name}")

    pipeline = imbPipeline(steps=[
        ("preprocessor", preprocessor),
        ('sampler', SVMSMOTE()),
        ("classifier", model)
    ])

    scores = cross_validate(pipeline, X, y, scoring={
        "neg_log_loss": "neg_log_loss",
        "precision_macro": "precision_macro",
        "recall_macro": "recall_macro",
        "recall_class_0": recall_scorer_class_0,
        "recall_class_1": recall_scorer_class_1,
        "recall_class_2": recall_scorer_class_2,
    }, cv=skf, n_jobs=-1)


    data.append({
        "model": model_name + "_adasyn", 
        "log_loss_score": -scores['test_neg_log_loss'].mean(),
        "precision_macro": scores['test_precision_macro'].mean(),
        "recall_macro": scores['test_recall_macro'].mean(),
        "recall_class_0": scores['test_recall_class_0'].mean(),
        "recall_class_1": scores['test_recall_class_1'].mean(),
        "recall_class_2": scores['test_recall_class_2'].mean(),
    })

Training default catboost




Training default lightgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000885 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5793
[LightGBM] [Info] Number of data points in the train set: 9930, number of used features: 38
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001480 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6042
[LightGBM] [Info] Number of data points in the train set: 8162, number of used features: 38
[LightGBM] [Info] Start training from score -0.902541
[LightGBM] [Info] Start training from score -1.666409
[LightGBM] [Info] Start training from score -0.902541
[LightGBM]



Training stacking classifier




In [71]:
pd.DataFrame(data).drop_duplicates().sort_values(by=["recall_class_1", "log_loss_score"], ascending=False)

Unnamed: 0,model,log_loss_score,precision_macro,recall_macro,recall_class_0,recall_class_1,recall_class_2
8,stacking classifier_undersampled,0.742449,0.584866,0.685247,0.721853,0.672719,0.661169
7,default lightgbm_undersampled,0.957457,0.574847,0.663571,0.717623,0.614548,0.658541
6,default catboost_undersampled,0.807973,0.576075,0.663164,0.731118,0.614469,0.643905
4,default lightgbm_oversampled,0.482498,0.671288,0.64932,0.873112,0.305622,0.769227
3,default catboost_oversampled,0.48518,0.665832,0.645459,0.875932,0.294593,0.765853
11,stacking classifier_adasyn,0.504653,0.655089,0.632608,0.869889,0.261706,0.766229
12,default catboost_adasyn,0.502396,0.650677,0.628039,0.879355,0.250916,0.753845
9,default catboost_adasyn,0.507306,0.643157,0.625906,0.869285,0.250836,0.757597
10,default lightgbm_adasyn,0.488482,0.651657,0.626153,0.880363,0.247253,0.750842
16,default lightgbm_adasyn,0.48801,0.682979,0.629298,0.880161,0.240006,0.767726
