## Train current best model & create submission

In [None]:
# essentials
import os
import pathlib
from copy import copy


import pandas as pd
import numpy as np
from tqdm import tqdm

# visualisation
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn imports
import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MaxAbsScaler, PowerTransformer, FunctionTransformer, StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MaxAbsScaler, PowerTransformer, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline, make_union, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2, f_classif, SequentialFeatureSelector, RFECV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.base import clone as clone_model
from sklearn.metrics import classification_report, confusion_matrix, log_loss
from sklearn.impute import SimpleImputer, MissingIndicator, KNNImputer


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import RocCurveDisplay, roc_auc_score, make_scorer, roc_curve

from sklearn.preprocessing import Binarizer, Normalizer, RobustScaler, StandardScaler
from tpot.builtins import StackingEstimator, ZeroCount
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer

# others
import xgboost as xgb 
import lightgbm as lgb
import catboost as cb
import optuna

RANDOM_SEED = 64

In [None]:
IN_KAGGLE = False

kaggle_folder = "/kaggle/input/"
local_folder = "./data/"
train_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder + "playground-series-s3e26/train.csv", index_col="id")
test_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder  + "playground-series-s3e26/test.csv", index_col="id")
original_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder  + "cirrhosis-prediction-dataset/cirrhosis.csv", index_col="ID")

original_df['generated'] = 0
train_df['generated'] = 1
test_df['generated'] = 1
train_df = pd.concat([train_df, original_df], axis=0)
train_df = train_df.reset_index(drop=True)

target_column = "Status"

target_map = {"C": "censored", "CL": "censored due to liver transplant", "D": "death"} # for readability of charts
train_df[target_column] = train_df[target_column].map(target_map)

categorical_features = ["Drug", "Sex", "Ascites", "Hepatomegaly", "Spiders", "Edema", "Stage"]
numerical_features = ["N_Days", "Age", "Bilirubin", "Cholesterol", "Albumin", "Copper", "Alk_Phos", "SGOT", "Tryglicerides", "Platelets", "Prothrombin"]

#categorical_features += ["generated"]

def num_features_1(df):
    df['bilirubin_increased_levels'] = df['Bilirubin'] > 1.1
    df['cholesterol_increased'] = df['Cholesterol'] > 240
    df["albumin_low"] = df['Albumin'] < 3.5
    df["urinary_copper_increased"] = df['Copper'] > 40
    df["Alk_Phos_increased"] = df['Alk_Phos'] > 1400
    df['SGOT_increased'] = df["SGOT"] > 80 
    df["Tryglicerides_normal"] = df['Tryglicerides'] < 150 
    df["Platelets_normal"] = (df['Platelets'] >= 150) & (df['Platelets'] <= 400)
    
    threshold_platelets = 150
    df['thrombocytopenia'] = np.where(df['Platelets'] < threshold_platelets, 1, 0)

    new_cat_feature_names = [
        "thrombocytopenia"
    ]
    new_num_feature_names = [
        'bilirubin_increased_levels',
        'cholesterol_increased',
        'albumin_low',
        'urinary_copper_increased',
        'Alk_Phos_increased',
        'SGOT_increased',
        'Tryglicerides_normal',
        'Platelets_normal',
    ]
    return df, new_cat_feature_names, new_num_feature_names

def num_features_2(df):
    normal_ranges = {
        'Bilirubin': (0.1, 1.2),
        'Cholesterol': (0, 200),
        'Albumin': (3.5, 5.5),
        'Copper': (10, 30),
        'Alk_Phos': (40, 129),
        'SGOT': (8, 45),
        'Tryglicerides': (48.68, 168.15),
        'Platelets': (150, 400),
        'Prothrombin': (9.4, 12.5)
    }

    for feature, (normal_range_min, normal_range_max) in normal_ranges.items():
        if feature == 'Albumin':
            df[f'{feature}_is_normal'] = (df[feature] >= normal_range_min)
        else:
            df[f'{feature}_is_normal'] = (df[feature] >= normal_range_min) & (df[feature] <= normal_range_max) 

        # Add deviation calculation as before
        df.loc[~df[f'{feature}_is_normal'], f'{feature}_deviation'] = df[feature] - ((normal_range_min + normal_range_max) / 2)

    threshold_platelets = 150
    df['thrombocytopenia'] = np.where(df['Platelets'] < threshold_platelets, 1, 0)


    df.fillna(0, inplace=True)
    
    new_cat_feature_names = [f'{feature}_is_normal' for feature in normal_ranges.keys() ]
    new_num_feature_names = [f'{feature}_deviation' for feature in normal_ranges.keys()]

    new_cat_feature_names.append('thrombocytopenia')
    
    return df, new_cat_feature_names, new_num_feature_names

def num_features_3(df):
    ##initial pass as creating features for meaningful cutoffs
    df['APRI']=100 * (df['SGOT'])/df['Platelets']
    df['under769days']=np.where(df['N_Days']<769, 1, 0)
    df['bilirubin_1.2']=np.where(df['Bilirubin']>1.2, 1, 0)
    df['albumin_low']=np.where(df['Albumin']<2.23, 1, 0)
    df['copper_high']=np.where(df['Copper']>73, 1, 0)
    df['SGOT_high']=np.where(df['SGOT']>73, 1, 0)
    df['Prothrombin_high']=np.where(df['Prothrombin']>10.8, 1, 0)
    df['Edema_yn']=np.where(df['Edema']=='N', 0, 1)
    df['bilirubin_3']=np.where(df['Bilirubin']>3, 1, 0)
    df['high_cholesteroal']=np.where(df['Cholesterol']>240, 1, 0)
    df['age_over_70']=np.where((df['Age']/365)>=70, 1, 0)
    df['abnormal_alp']=np.where(((df['Alk_Phos']<30 )| (df['Alk_Phos']>147)), 1, 0)
    df['very_high_tri']=np.where(df['Tryglicerides']>500, 1, 0)
    df['high_tri']=np.where(df['Tryglicerides']>200, 1, 0)
    df['copper_deficient']=np.where(((df['Sex']=='F') & (df['Copper']<80) |(df['Sex']=='M') & (df['Copper']<70)), 1, 0)
    df['FIB4']=(df['Age']/365)* (df['SGOT']/df['Platelets'])
    df['ALBI']=.66*np.log(df['Bilirubin'])-.085 * df['Albumin']

    new_cat_feature_names = [
        "under769days",
        "bilirubin_1.2",
        "albumin_low",
        "copper_high",
        "SGOT_high",
        "Prothrombin_high",
        "Edema_yn",
        "bilirubin_3",
        "high_cholesteroal",
        "age_over_70",
        "abnormal_alp",
        "very_high_tri",
        "high_tri",
        "copper_deficient",
    ]
    new_num_feature_names = [
        "APRI",
        "FIB4",
        "ALBI",
    ]

    
    return df, new_cat_feature_names, new_num_feature_names

def base_feature_engineering(df):
    df['Drug'] = df['Drug'].map({"D-penicillamine": 1,"Placebo": 0})
    df['Sex'] = df['Sex'].map({"F": 1,"M": 0})
    
    df["Stage"] = df["Stage"].apply(lambda x: str(x))
    df['Stage'] = df['Stage'].astype('category')

    for c in categorical_features:
        df[c] = df[c].astype('category')
    return df


train_df, new_cat, new_num = num_features_2(train_df)
train_df = base_feature_engineering(train_df)


categorical_features += new_cat
numerical_features += new_num

X = train_df.drop(columns=target_column)
y = train_df[target_column]

le = LabelEncoder()

y = le.fit_transform(y)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=RANDOM_SEED, stratify=y, shuffle=True)

In [10]:
num_imputer = SimpleImputer(strategy="constant", fill_value=0, add_indicator=True)
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing", add_indicator=True)

numeric_transformer = Pipeline(
    [
        ("num_imputer", num_imputer),
        ("power_transformer", PowerTransformer()),
    ]
)

categorical_transformer = Pipeline(
    [
        ("cast as str", FunctionTransformer(lambda x: x.astype(str), validate=False)),
        ("cat_imputer", cat_imputer),
        ("onehot", OneHotEncoder(handle_unknown="ignore", drop="if_binary")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)


lgbm_best_params = {'n_estimators': 394, 'max_depth': 24, 'learning_rate': 0.018470603878367985, 'reg_alpha': 0.41378770694989003, 'reg_lambda': 0.032090753198459054, 'min_child_weight': 3.8061942196940897, 'min_child_samples': 47, 'subsample': 0.8047498902111587, 'subsample_freq': 5, 'colsample_bytree': 0.20152270171348546, 'num_leaves': 126, 'max_bin': 872, 'boosting_type': 'gbdt', 'num_imputer_strategy': 'constant', 'cat_imputer_strategy': 'constant'}
xgb_params = {'n_estimators': 500, 'max_depth': 48, 'learning_rate': 0.012196600643907861, 'reg_alpha': 0.04387533822107198, 'reg_lambda': 0.0019799138401186277, 'min_child_weight': 3.877063073846295, 'min_child_samples': 6, 'subsample': 0.8443268948580747, 'subsample_freq': 0, 'colsample_bytree': 0.23694087295634517, 'num_leaves': 81, 'max_bin': 405, 'boosting_type': 'gbdt', 'num_imputer_strategy': 'constant', 'cat_imputer_strategy': 'most_frequent'}
lgbm_model_optuna = lgb.LGBMClassifier(**lgbm_best_params, random_state=RANDOM_SEED, verbose=-1, n_jobs=5)
xgb_model_optuna = xgb.XGBClassifier(
        **xgb_params, objective="multi:softprob", random_state=RANDOM_SEED, n_jobs=-1, tree_method="hist", device="cuda"
    )

models = {
    "voting classifier 1": VotingClassifier(
        estimators=[
            ("catboost", cb.CatBoostClassifier(random_state=RANDOM_SEED, verbose=False)),
            ("lightgbm_optuna", lgbm_model_optuna),
            ("xgb_optuna", xgb_model_optuna),
        ],
        voting="soft",
        n_jobs=-1,
        verbose=1),
    "stacking classifier 1": StackingClassifier(
        estimators=[
            ("catboost", cb.CatBoostClassifier(random_state=RANDOM_SEED, verbose=False)),
            ("lightgbm_optuna", lgbm_model_optuna),
            ("xgb_optuna", xgb_model_optuna),
        ],
        stack_method="predict_proba",
        final_estimator=LogisticRegression(),
        n_jobs=-1,
        verbose=1),
}

In [None]:
data = []
for model_name, model in models.items():
    clf = Pipeline(
        [
            ("preprocessor", preprocessor),
            ("classifier", model),
        ]
    )

    cc_cv = CalibratedClassifierCV(clf, cv=skf)

    cc_cv.fit(X_train, y_train)
    y_pred_proba = cc_cv.predict_proba(X_val)
    y_pred = cc_cv.predict(X_val)
    
    cr = classification_report(y_val, y_pred, output_dict=True)

    recall_class_0 = confusion_matrix(y_val, y_pred, normalize="true")[0, 0]
    recall_class_1 = confusion_matrix(y_val, y_pred, normalize="true")[1, 1]
    recall_class_2 = confusion_matrix(y_val, y_pred, normalize="true")[2, 2]
    data.append({
        "model": model_name, 
        "log_loss_score": log_loss(y_val, y_pred_proba),
        "avg_precision": cr["macro avg"]["precision"],
        "f1_score": cr["macro avg"]["f1-score"],
        "recall_class_0": recall_class_0,
        "recall_class_1": recall_class_1,
        "recall_class_2": recall_class_2,
    })    

In [11]:
output_df = pd.DataFrame(data).drop_duplicates().sort_values(by=["log_loss_score", "recall_class_1"], ascending=[True, False])
output_df

Unnamed: 0,model,log_loss_score,avg_precision,f1_score,recall_class_0,recall_class_1,recall_class_2
0,voting classifier 1,0.405173,0.799585,0.707588,0.936538,0.3,0.770318


## Parameter optimization

- XGBoost
- LightGBM
- CatBoost

In [12]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)

def objective_xgboost(trial):
    xgboost_model_optuna = xgb.XGBClassifier(
        objective="multi:softprob", n_jobs=-1, tree_method="hist", device="cuda"
    )
    clf = Pipeline(
        [
            ("preprocessor", preprocessor),
            ("classifier", xgboost_model_optuna),
        ]
    )

    params = {
        "classifier__random_state": trial.suggest_int("random_state", 1, 1000),
        'classifier__n_estimators' : trial.suggest_int('n_estimators',50,500),
        "classifier__max_depth":trial.suggest_int('max_depth',3,50),
        "classifier__eta" : trial.suggest_float('eta',1e-4, 0.25, log=True),

        'classifier__reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'classifier__reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        "classifier__min_child_weight" : trial.suggest_float('min_child_weight', 0.5,10),
        "classifier__subsample" : trial.suggest_float('subsample', 0.1, 1),
        "classifier__colsample_bytree" : trial.suggest_float('colsample_bytree',0.2,1),
        'classifier__num_leaves' : trial.suggest_int('num_leaves', 2, 64),
        'classifier__max_delta_step' : trial.suggest_int('max_delta_step', 0, 10),
        'classifier__scale_pos_weight' : trial.suggest_int('scale_pos_weight', 0, 10),
        'preprocessor__num__num_imputer__strategy': trial.suggest_categorical('num_imputer_strategy', ['mean', 'median', 'constant']),
        'preprocessor__cat__cat_imputer__strategy': trial.suggest_categorical('cat_imputer_strategy', ['most_frequent', 'constant']),
    }
    clf.set_params(**params)
    
    cv = abs(cross_val_score(clf, X, y, cv = skf,scoring='neg_log_loss').mean())
    return cv

def objective_lightgbm(trial):
    lgbm_best_params = {
        'n_estimators': 377, 
        'max_depth': 24, 
        'learning_rate': 0.018470603878367985, 
        'reg_alpha': 0.41378770694989003, 
        'reg_lambda': 0.032090753198459054, 
        'min_child_weight': 3.8061942196940897, 
        'min_child_samples': 47, 
        'subsample': 0.8047498902111587, 
        'subsample_freq': 5, 
        'colsample_bytree': 0.20152270171348546,
        'num_leaves': 126, 
        'max_bin': 872, 
        }
    lightgbm_model_optuna = lgb.LGBMClassifier(**lgbm_best_params, random_state=RANDOM_SEED, verbose=-1, n_jobs=-1)
    clf = Pipeline(
        [
            ("preprocessor", preprocessor),
            ("classifier", lightgbm_model_optuna),
        ]
    )

    params = {
        'classifier__n_estimators' : 377,
        "classifier__max_depth":trial.suggest_int('max_depth',3,50),
        "classifier__learning_rate" : trial.suggest_float('learning_rate',1e-4, 0.25, log=True),
        'classifier__reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0),
        'classifier__reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),
        "classifier__min_child_weight" : trial.suggest_float('min_child_weight', 0.5,4),
        "classifier__min_child_samples" : trial.suggest_int('min_child_samples',1,100),
        "classifier__subsample" : trial.suggest_float('subsample', 0.4, 1),
        "classifier__subsample_freq" : trial.suggest_int('subsample_freq',0,5),
        "classifier__colsample_bytree" : trial.suggest_float('colsample_bytree',0.2,1),
        "classifier__num_leaves" : trial.suggest_int('num_leaves', 2, 64*2),
        "classifier__max_bin" : trial.suggest_int('max_bin', 128, 1024),
        #'preprocessor__num__num_imputer__n_neighbors': 44,
    }
    
    clf.set_params(**params)
    
    cv = abs(cross_val_score(clf, X, y, cv = skf,scoring='neg_log_loss').mean())
    return cv


In [None]:
%%time
study = optuna.create_study(direction='minimize')
study.optimize(objective_lightgbm, n_trials=50, timeout=1000)

In [14]:
print("Best score:", study.best_value)
print("Best params:", study.best_params)
lgbm_best_params = study.best_params

Best score: 0.4266954252981998
Best params: {'max_depth': 49, 'learning_rate': 0.024023967768376772, 'reg_alpha': 0.35728585250443956, 'reg_lambda': 7.807435805826675, 'min_child_weight': 0.95207025899839, 'min_child_samples': 47, 'subsample': 0.45170744695557963, 'subsample_freq': 0, 'colsample_bytree': 0.21692098536099505, 'num_leaves': 75, 'max_bin': 702}


In [None]:
%%time
study = optuna.create_study(direction='minimize')
study.optimize(objective_xgboost, n_trials=200, timeout=2000)

In [None]:
print("Best score:", study.best_value)
print("Best params:", study.best_params)
xgb_best_params = study.best_params

In [None]:
%%time
study = optuna.create_study(direction='minimize')
study.optimize(objective_catboost_preprocessor, n_trials=50, timeout=2000)
print("Best score:", study.best_value)
print("Best params:", study.best_params)
xgb_best_params = study.best_params

In [15]:
#lgbm_best_params = {'n_estimators': 394, 'max_depth': 24, 'learning_rate': 0.018470603878367985, 'reg_alpha': 0.41378770694989003, 'reg_lambda': 0.032090753198459054, 'min_child_weight': 3.8061942196940897, 'min_child_samples': 47, 'subsample': 0.8047498902111587, 'subsample_freq': 5, 'colsample_bytree': 0.20152270171348546, 'num_leaves': 126, 'max_bin': 872, 'boosting_type': 'gbdt', 'num_imputer_strategy': 'constant', 'cat_imputer_strategy': 'constant'}
lgbmmodel_optuna = lgb.LGBMClassifier(**lgbm_best_params, verbose=-1, n_jobs=-1)
clf = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", lgbmmodel_optuna),
    ]
)
clf.fit(X_train, y_train)
y_pred_proba = clf.predict_proba(X_val)
y_pred = clf.predict(X_val)

cr = classification_report(y_val, y_pred, output_dict=True)

recall_class_0 = confusion_matrix(y_val, y_pred, normalize="true")[0, 0]
recall_class_1 = confusion_matrix(y_val, y_pred, normalize="true")[1, 1]
recall_class_2 = confusion_matrix(y_val, y_pred, normalize="true")[2, 2]
data.append({
    "model": "LightGBM_tuned", 
    "log_loss_score": log_loss(y_val, y_pred_proba),
    "avg_precision": cr["macro avg"]["precision"],
    "f1_score": cr["macro avg"]["f1-score"],
    "recall_class_0": recall_class_0,
    "recall_class_1": recall_class_1,
    "recall_class_2": recall_class_2,
})    

In [None]:
#xgb_best_params = {'n_estimators': 468, 'max_depth': 8, 'eta': 0.015470486727229083, 'reg_alpha': 0.20921594374776836, 'reg_lambda': 0.13852230882501773, 'min_child_weight': 0.7650897230059427, 'subsample': 0.35568531774241186, 'sampling_method': 'uniform', 'colsample_bytree': 0.5094764848989666, 'num_leaves': 42, 'max_delta_step': 8, 'scale_pos_weight': 2, 'num_imputer_strategy': 'constant', 'cat_imputer_strategy': 'constant'}
model_optuna = xgb.XGBClassifier(
        **xgb_best_params, objective="multi:softprob", n_jobs=-1,
    )
clf = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", model_optuna),
    ]
)
clf.fit(X_train, y_train)
y_pred_proba = clf.predict_proba(X_val)
y_pred = clf.predict(X_val)

cr = classification_report(y_val, y_pred, output_dict=True)

recall_class_0 = confusion_matrix(y_val, y_pred, normalize="true")[0, 0]
recall_class_1 = confusion_matrix(y_val, y_pred, normalize="true")[1, 1]
recall_class_2 = confusion_matrix(y_val, y_pred, normalize="true")[2, 2]
data.append({
    "model": "xgboost_tuned", 
    "log_loss_score": log_loss(y_val, y_pred_proba),
    "avg_precision": cr["macro avg"]["precision"],
    "f1_score": cr["macro avg"]["f1-score"],
    "recall_class_0": recall_class_0,
    "recall_class_1": recall_class_1,
    "recall_class_2": recall_class_2,
})    

In [None]:
output_df = pd.DataFrame(data).drop_duplicates().sort_values(by=["log_loss_score", "recall_class_1"], ascending=[True, False])
output_df

In [16]:
#model = models["LightGBM_tuned"]
model = lgb.LGBMClassifier(**lgbm_best_params, verbose=-1, n_jobs=5)

#model = VotingClassifier(
#        estimators=[
#            ("lightgbm_optuna", lgb.LGBMClassifier(**lgbm_best_params, verbose=-1, n_jobs=5)),
#            ("xgb_optuna", xgb.XGBClassifier(
#                **xgb_best_params, objective="multi:softprob", n_jobs=-1, tree_method="hist", device="cuda"
#            )),
#        ],
#        voting="soft",
#        n_jobs=-1,
#        verbose=1),


## Submission

In [17]:
IN_KAGGLE = False

kaggle_folder = "/kaggle/input/"
local_folder = "./data/"
train_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder + "playground-series-s3e26/train.csv", index_col="id")
test_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder  + "playground-series-s3e26/test.csv", index_col="id")
original_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder  + "cirrhosis-prediction-dataset/cirrhosis.csv", index_col="ID")

categorical_features = ["Drug", "Sex", "Ascites", "Hepatomegaly", "Spiders", "Edema", "Stage"]
numerical_features = ["N_Days", "Age", "Bilirubin", "Cholesterol", "Albumin", "Copper", "Alk_Phos", "SGOT", "Tryglicerides", "Platelets", "Prothrombin"]

#categorical_features += ["generated"]
#original_df['generated'] = 0
#train_df['generated'] = 1
#test_df['generated'] = 1
train_df = pd.concat([train_df, original_df], axis=0)
train_df = train_df.reset_index(drop=True)

target_column = "Status"

train_df, new_cat, new_num = num_features_2(train_df)
train_df = base_feature_engineering(train_df)

categorical_features += new_cat
numerical_features += new_num

test_df, _, _ = num_features_2(test_df)
test_df = base_feature_engineering(test_df)

le = LabelEncoder()

X_train = train_df.drop(columns=target_column)
y_train = le.fit_transform(train_df[target_column])

X_test = test_df


num_imputer = SimpleImputer(strategy="constant", add_indicator=True)
cat_imputer = SimpleImputer(strategy="constant", fill_value="MISSING", add_indicator=True)

numeric_transformer = Pipeline(
    [
        ("num_imputer", num_imputer),
        ("standard scaler", StandardScaler()),
        #("power_transformer", PowerTransformer()),
    ]
)

categorical_transformer = Pipeline(
    [
        ("cast as str", FunctionTransformer(lambda x: x.astype(str), validate=False)),
        ("cat_imputer", cat_imputer),
        ("onehot", OneHotEncoder(handle_unknown="ignore", drop="if_binary")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", model),
    ]
)

clf = pipeline
#clf = clone_model(pipeline)

clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)

submission_df = pd.DataFrame(y_pred, index=X_test.index, columns=[f"Status_{target}" for target in le.classes_])
submission_df.to_csv("./submission.csv")
submission_df



Unnamed: 0_level_0,Status_C,Status_CL,Status_D
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7905,0.622510,0.035168,0.342322
7906,0.666710,0.082269,0.251021
7907,0.166860,0.024842,0.808298
7908,0.922391,0.009413,0.068197
7909,0.608441,0.067972,0.323587
...,...,...,...
13171,0.792300,0.059861,0.147839
13172,0.900534,0.014434,0.085032
13173,0.868473,0.014163,0.117364
13174,0.939651,0.010690,0.049659
