In [18]:
# essentials
import os
import pathlib

import pandas as pd
import numpy as np
from tqdm import tqdm

# visualisation
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn imports
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MaxAbsScaler, PowerTransformer, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2, f_classif, SequentialFeatureSelector, RFECV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.base import clone as clone_model
from sklearn.metrics import classification_report, confusion_matrix, log_loss


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# others
import xgboost as xgb 
import lightgbm as lgb
import catboost as cb

RANDOM_SEED = 64

In [19]:
IN_KAGGLE = False
kaggle_folder = "/kaggle/input/playground-series-s3e26"
local_folder = "./data"
train_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder + "/train.csv", index_col="id")
test_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder  + "/test.csv", index_col="id")

target_column = "Status"

In [20]:
#  read models params from file and construct estimators list

models_params = pd.read_csv("great_hyperparam_search_result.csv")

# from models_params[["model", "params"]] create a dictionary with row's "model" as key and "params" as value
models_params_dict = dict(zip(models_params["model"], models_params["params"]))
models_params_dict = {k: eval(v) for k, v in models_params_dict.items()}

# models_params_dict contains model names as keys and dicts with params to use as values. From the inner dictionaries keys remove "classifier__" prefix
models_params_dict = {k: {k2.split("__")[1]: v2 for k2, v2 in v.items()} for k, v in models_params_dict.items()}
models_params_dict

{'xgboost': {'subsample': 0.8999999999999999,
  'scale_pos_weight': 0.8,
  'min_child_weight': 6,
  'max_depth': 5,
  'lambda': 0.9,
  'gamma': 0.0,
  'eta': 0.11,
  'colsample_bytree': 0.5,
  'alpha': 0.8},
 'catboost': {'min_data_in_leaf': 4,
  'learning_rate': 0.02,
  'l2_leaf_reg': 7,
  'depth': 6,
  'border_count': 9},
 'random_forest': {'n_estimators': 700,
  'min_samples_split': 2,
  'min_samples_leaf': 1,
  'max_depth': 9,
  'criterion': 'entropy'},
 'logistic_regression': {'solver': 'liblinear', 'penalty': 'l1', 'C': 0.6}}

In [37]:
optimized_estimators = [
    ("xgboost", xgb.XGBClassifier(**models_params_dict["xgboost"])),
    ("catboost", cb.CatBoostClassifier(**models_params_dict["catboost"])),
    #("random_forest", RandomForestClassifier(**models_params_dict["random_forest"])),
    #("logistic_regression", LogisticRegression(**models_params_dict["logistic_regression"])),
]

In [38]:
def feature_engineering(df):
    #train_df['Status'] = train_df['Status'].map({"D": 0,"C": 1,"CL": 2})
    df['date_of_diagnosis'] = df['Age'] - df['N_Days']
    
    df['no_diseases'] = (df['Ascites'] + df['Hepatomegaly'] + df['Spiders'] + df['Edema']) == 0
    df['diseases'] = df['Ascites'] + df['Hepatomegaly'] + df['Spiders'] + df['Edema']
    #df['Drug'] = df['Drug'].map({"D-penicillamine": 1,"placebo": 0})

    # change "Stage" to string
    df["Stage"] = df["Stage"].apply(lambda x: str(x))
    return df

train_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder + "/train.csv", index_col="id")

run_feature_engineering = True

categorical_features = ["Drug", "Sex", "Ascites", "Hepatomegaly", "Spiders", "Edema", "Stage"]
numerical_features = ["N_Days", "Age", "Bilirubin", "Cholesterol", "Albumin", "Copper", "Alk_Phos", "SGOT", "Tryglicerides", "Platelets", "Prothrombin"]

if run_feature_engineering:
    train_df = feature_engineering(train_df)

if run_feature_engineering:
    categorical_features += ["no_diseases", "diseases"]
    numerical_features += ["date_of_diagnosis"]

for col in categorical_features:
    train_df[col] = train_df[col].astype("category")

X = train_df.drop(columns=target_column)
y = train_df[target_column]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=RANDOM_SEED, stratify=y, shuffle=True)

le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

numeric_transformer = Pipeline(
    [
        ("power_transformer", PowerTransformer()),
        ("scaler", MaxAbsScaler()),
    ]
)

categorical_transformer = Pipeline(
    [
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)


model = VotingClassifier(
    estimators=optimized_estimators,
    voting="soft",
    n_jobs=-1,
)

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)


pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        #("selection", SelectKBest(f_classif, k=22)),
        ("classifier", model),
    ]
)


clf = CalibratedClassifierCV(pipeline, cv=skf, method="sigmoid")

clf.fit(X_train, y_train)

Parameters: { "scale_pos_weight" } are not used.

0:	learn: 1.0797423	total: 2.93ms	remaining: 2.93s
1:	learn: 1.0616201	total: 5.32ms	remaining: 2.65s
2:	learn: 1.0439719	total: 8.64ms	remaining: 2.87s
3:	learn: 1.0268401	total: 11.6ms	remaining: 2.88s
4:	learn: 1.0113963	total: 14.6ms	remaining: 2.91s
5:	learn: 0.9966924	total: 18.4ms	remaining: 3.05s
6:	learn: 0.9817343	total: 25.2ms	remaining: 3.58s
7:	learn: 0.9668881	total: 28.4ms	remaining: 3.52s
8:	learn: 0.9539725	total: 32ms	remaining: 3.52s
9:	learn: 0.9407491	total: 36ms	remaining: 3.57s
10:	learn: 0.9280812	total: 39.1ms	remaining: 3.52s
11:	learn: 0.9158965	total: 42.6ms	remaining: 3.51s
12:	learn: 0.9043426	total: 46.8ms	remaining: 3.56s
13:	learn: 0.8935453	total: 50.7ms	remaining: 3.57s
14:	learn: 0.8823710	total: 54ms	remaining: 3.54s
15:	learn: 0.8719786	total: 56.9ms	remaining: 3.5s
16:	learn: 0.8618979	total: 60.2ms	remaining: 3.48s
17:	learn: 0.8528367	total: 63.7ms	remaining: 3.48s
18:	learn: 0.8429066	total: 67.

In [39]:
y_pred = clf.predict_proba(X_val)
print(f"Log loss: {log_loss(y_val, y_pred)}")

Log loss: 0.44016313193440093


In [40]:
train_df = pd.read_csv("./data/train.csv", index_col="id")
test_df = pd.read_csv("./data/test.csv", index_col="id")

run_feature_engineering = True

train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

X_train = train_df.drop(columns=target_column)
y_train = le.transform(train_df[target_column])

X_test = test_df

clf = clone_model(pipeline)

clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)

submission_df = pd.DataFrame(y_pred, index=X_test.index, columns=[f"Status_{target}" for target in le.classes_])
submission_df.to_csv("./submission.csv")
submission_df

Parameters: { "scale_pos_weight" } are not used.

0:	learn: 1.0792330	total: 49.7ms	remaining: 49.7s
1:	learn: 1.0599141	total: 53.2ms	remaining: 26.6s
2:	learn: 1.0416843	total: 57ms	remaining: 18.9s
3:	learn: 1.0241888	total: 60.8ms	remaining: 15.1s
4:	learn: 1.0072334	total: 64.4ms	remaining: 12.8s
5:	learn: 0.9916383	total: 67.5ms	remaining: 11.2s
6:	learn: 0.9764636	total: 70.6ms	remaining: 10s
7:	learn: 0.9619347	total: 75.1ms	remaining: 9.31s
8:	learn: 0.9473454	total: 78.7ms	remaining: 8.66s
9:	learn: 0.9343353	total: 82.3ms	remaining: 8.14s
10:	learn: 0.9222044	total: 85.5ms	remaining: 7.68s
11:	learn: 0.9095116	total: 88.8ms	remaining: 7.31s
12:	learn: 0.8978944	total: 91.9ms	remaining: 6.98s
13:	learn: 0.8862060	total: 94.9ms	remaining: 6.68s
14:	learn: 0.8749911	total: 98.2ms	remaining: 6.45s
15:	learn: 0.8639316	total: 102ms	remaining: 6.26s
16:	learn: 0.8533506	total: 105ms	remaining: 6.09s
17:	learn: 0.8432662	total: 109ms	remaining: 5.95s
18:	learn: 0.8333181	total: 112

Unnamed: 0_level_0,Status_C,Status_CL,Status_D
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7905,0.648260,0.030806,0.320934
7906,0.680551,0.171755,0.147695
7907,0.059081,0.031877,0.909041
7908,0.956676,0.003493,0.039831
7909,0.769332,0.042212,0.188456
...,...,...,...
13171,0.895071,0.028639,0.076290
13172,0.970134,0.004549,0.025317
13173,0.877393,0.011898,0.110709
13174,0.974918,0.015113,0.009969
