# Import Libraries

In [23]:
import pandas as pd

In [24]:
df_train = pd.read_csv("../data/raw/train.csv").drop(columns=["id"])
df_test = pd.read_csv("../data/raw/test.csv").drop(columns=["id"])
df_sample_submission = pd.read_csv("../data/raw/sample_submission.csv")

In [25]:
def analyze_dataframe(df: pd.DataFrame) -> None:
    """Analyzes the given DataFrame. Analysis include
        1) DataFrame shape
        2) Number of NULL values in each column
        3) Data types
        4) Total duplicate data points
        5) Total unique values
        6) DataFrame description
        7) DataFrame information
        8) DataFrame data types

    Args:
        df (pd.DataFrame): _description_
    """
    
    # 1) DataFrame shape
    print("*"*3 + "DataFrame shape" + "*"*3)
    print("*"*len("DataFrame shape") + "*"*6)
    print(df.shape)
    print("="*40, end="\n"*3)
    
    # 2) Number of NULL values in each column
    print("*"*3 + "Number of NULL values in each column" + "*"*3)
    print("*"*len("Number of NULL values in each column") + "*"*6)
    print(df.isnull().sum())
    print("="*40, end="\n"*3)
    
    # 3) Data types
    print("*"*3 + "Data types" + "*"*3)
    print("*"*len("Data types") + "*"*6)
    print(df.dtypes)
    print("="*40, end="\n"*3)
    
    # 4) Total duplicate data points
    print("*"*3 + "Total duplicate data points" + "*"*3)
    print("*"*len("Total duplicate data points") + "*"*6)
    print(df.duplicated().sum())
    print("="*40, end="\n"*3)
    
    # 5) Total unique values
    print("*"*3 + "Total unique values" + "*"*3)
    print("*"*len("Total unique values") + "*"*6)
    print(df.nunique())
    print("="*40, end="\n"*3)
    
    # 6) DataFrame description
    print("*"*3 + "DataFrame description" + "*"*3)
    print("*"*len("DataFrame description") + "*"*6)
    print(df.describe())
    print("="*40, end="\n"*3)
    
    # 7) DataFrame information
    print("*"*3 + "DataFrame information" + "*"*3)
    print("*"*len("DataFrame information") + "*"*6)
    print(df.info(verbose=True))
    print("="*40, end="\n"*3)
    
    # 8) DataFrame data types
    print("*"*3 + "DataFrame data types" + "*"*3)
    print("*"*len("DataFrame data types") + "*"*6)
    print(f"bool  : {list(df.select_dtypes(include='bool').columns)}")
    print(f"int   : {list(df.select_dtypes(include='int').columns)}")
    print(f"float : {list(df.select_dtypes(include='float').columns)}")
    print(f"object: {list(df.select_dtypes(include='object').columns)}")
    print("="*40, end="\n"*3)
    

In [26]:
analyze_dataframe(df=df_train)

***DataFrame shape***
*********************
(7905, 19)


***Number of NULL values in each column***
******************************************
N_Days           0
Drug             0
Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
Status           0
dtype: int64


***Data types***
****************
N_Days             int64
Drug              object
Age                int64
Sex               object
Ascites           object
Hepatomegaly      object
Spiders           object
Edema             object
Bilirubin        float64
Cholesterol      float64
Albumin          float64
Copper           float64
Alk_Phos         float64
SGOT             float64
Tryglicerides    float64
Platelets        float64
Prothrombin      float64
Stage            float

In [27]:
df_train[df_train.select_dtypes(include=["object"]).columns]

Unnamed: 0,Drug,Sex,Ascites,Hepatomegaly,Spiders,Edema,Status
0,D-penicillamine,M,N,N,N,N,D
1,Placebo,F,N,N,N,N,C
2,Placebo,F,N,Y,Y,Y,D
3,Placebo,F,N,N,N,N,C
4,Placebo,F,N,Y,N,N,C
...,...,...,...,...,...,...,...
7900,D-penicillamine,F,N,N,N,N,C
7901,Placebo,F,N,Y,N,N,C
7902,D-penicillamine,F,N,N,Y,S,D
7903,D-penicillamine,M,N,Y,N,N,D


In [28]:
list(df_train.select_dtypes(include=["int"]).columns)+["Drug"]

['N_Days', 'Age', 'Drug']

In [29]:
import matplotlib.pyplot as plt
import plotly.express as px

In [30]:
fig = px.histogram(data_frame=df_train[list(df_train.select_dtypes(include=["int"]).columns)+["Drug"]], 
                   color="Drug", 
                   nbins=40, 
                   marginal="box" # box, violin, rug
                   )
fig.show()

In [31]:
fig = px.box(data_frame=df_train[df_train.select_dtypes(include=["int"]).columns])
fig.show()

In [32]:
fig = px.histogram(data_frame=df_train[list(df_train.select_dtypes(include=["float"]).columns)+["Drug"]], 
                   color="Drug", 
                   nbins=40, 
                   marginal="box" # box, violin, rug
                   )
fig.show()

In [33]:
fig = px.box(data_frame=df_train[df_train.select_dtypes(include=["float"]).columns])
fig.show()

In [34]:
df_train["Status"] = df_train["Status"].map({"D":0, "C":1, "CL":2})

In [35]:
X = df_train.drop(columns=["Status"])
X = pd.get_dummies(X)
y = df_train["Status"]

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    train_size=0.8, 
                                                    random_state=42, 
                                                    stratify=y)

In [38]:
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import log_loss

In [39]:
def objective_xgb(trial: optuna.Trial) -> float:
    """_summary_

    Args:
        trial (optuna.Trial): _description_

    Returns:
        float: _description_
    """
    
    params = {
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 300, 700),
        "min_child_weight": trial.suggest_int("min_child_weight", 0.01, 1),
        "subsample": trial.suggest_float("subsample", 0.1, 1.0, log=True),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.1, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.01, 1),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.01, 1),
        "eval_metric":trial.suggest_categorical("eval_metric", ["mlogloss"]),
    }
    
    model_xgb = XGBClassifier(**params)
    model_xgb.fit(X_train, y_train)
    y_pred = model_xgb.predict_proba(X_test)
    
    return log_loss(y_test, y_pred)

In [40]:
study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=500, show_progress_bar=True)

[I 2023-12-07 21:53:41,729] A new study created in memory with name: no-name-9f1a9d58-11c5-4b23-afd2-6ad03ac5e017


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2023-12-07 21:53:42,845] Trial 0 finished with value: 0.43939057572914736 and parameters: {'booster': 'gbtree', 'max_depth': 4, 'learning_rate': 0.04879173851950904, 'n_estimators': 498, 'min_child_weight': 1, 'subsample': 0.2516171499714178, 'colsample_bylevel': 0.8223598451332499, 'colsample_bytree': 0.8132506205150634, 'colsample_bynode': 0.25379526310826744, 'reg_alpha': 0.8743729785859975, 'reg_lambda': 0.2193957060376273, 'eval_metric': 'mlogloss'}. Best is trial 0 with value: 0.43939057572914736.
[I 2023-12-07 21:53:44,325] Trial 1 finished with value: 0.44880602561901123 and parameters: {'booster': 'gbtree', 'max_depth': 4, 'learning_rate': 0.04413852028565803, 'n_estimators': 651, 'min_child_weight': 1, 'subsample': 0.11670993961666355, 'colsample_bylevel': 0.7133120905825483, 'colsample_bytree': 0.9058575273022575, 'colsample_bynode': 0.30886104029818007, 'reg_alpha': 0.13035043098051344, 'reg_lambda': 0.20611135381925194, 'eval_metric': 'mlogloss'}. Best is trial 0 with v

In [41]:
study_xgb.best_params

{'booster': 'gbtree',
 'max_depth': 12,
 'learning_rate': 0.05350354061455062,
 'n_estimators': 694,
 'min_child_weight': 0,
 'subsample': 0.7682034387466382,
 'colsample_bylevel': 0.24172181149217678,
 'colsample_bytree': 0.11680570926810949,
 'colsample_bynode': 0.33913326941398936,
 'reg_alpha': 0.49183425619335386,
 'reg_lambda': 0.47623090528289336,
 'eval_metric': 'mlogloss'}

In [42]:
xgb = XGBClassifier(**study_xgb.best_params)
xgb.fit(X_train, y_train)

y_pred = xgb.predict_proba(X_test)
print(f"Log Loss = {log_loss(y_test, y_pred)}")

Log Loss = 0.41359892718633373


In [43]:
from lightgbm import LGBMClassifier

In [44]:
def objective_lgbm(trial: optuna.Trial) -> float:
    """_summary_

    Args:
        trial (optuna.Trial): _description_

    Returns:
        float: _description_
    """
    params = {
        "logging_level" : trial.suggest_categorical("logging_level" , [" Silent" ]),
        "grow_policy" : trial.suggest_categorical("grow_policy" , ["Lossguide"]),
        "max_leaves" : trial.suggest_int("max_leaves", 8, 64),
        "random_seed" : trial.suggest_categorical("random_seed" , [42]),
        "iterations" : trial.suggest_int("iterations", 100, 300),
        "learning_rate" : trial.suggest_float("learning_rate" , 0.01, 0.1),
        "depth" : trial.suggest_int("depth", 3, 12),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1),
        "min_data_in_leaf" : trial.suggest_int("min_data_in_leaf" , 3, 15),
        "bagging_temperature"  :trial.suggest_float("bagging_temperature" , 1, 3),
        "leaf_estimation_iterations" : trial.suggest_int("leaf_estimation_iterations" ,1,15),
        "l2_leaf_reg" : trial.suggest_float("l2_leaf_reg" ,0.01,0.1)
    }
    
    model_lgbm = LGBMClassifier(**params)
    model_lgbm.fit(X_train, y_train)
    y_pred = model_lgbm.predict_proba(X_test)
    
    return log_loss(y_test, y_pred)

In [45]:
study_lgbm = optuna.create_study(direction="minimize")
study_lgbm.optimize(objective_lgbm, n_trials=500, show_progress_bar=True)

[I 2023-12-07 22:20:03,252] A new study created in memory with name: no-name-d70d026a-1bbc-4ca1-8c97-347a49975c63


  0%|          | 0/500 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000993 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1935
[LightGBM] [Info] Number of data points in the train set: 6324, number of used features: 25
[LightGBM] [Info] Start training from score -1.087291
[LightGBM] [Info] Start training from score -0.465082
[LightGBM] [Info] Start training from score -3.358480
[I 2023-12-07 22:20:03,680] Trial 0 finished with value: 0.4452132058683649 and parameters: {'logging_level': ' Silent', 'grow_policy': 'Lossguide', 'max_leaves': 14, 'random_seed': 42, 'iterations': 208, 'learning_rate': 0.05637726123382103, 'depth': 12, 'colsample_bylevel': 0.5362074398461365, 'min_data_in_leaf': 8, 'bagging_temperature': 2.976559605673738, 'leaf_estimation_iterations': 5, 'l2_leaf_reg': 0.08991761031619791}. Best is trial 0 with value: 0.4452132058683649.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead

In [46]:
study_lgbm.best_params

{'logging_level': ' Silent',
 'grow_policy': 'Lossguide',
 'max_leaves': 63,
 'random_seed': 42,
 'iterations': 286,
 'learning_rate': 0.04618687260035939,
 'depth': 11,
 'colsample_bylevel': 0.2903332418719399,
 'min_data_in_leaf': 3,
 'bagging_temperature': 1.1435765917169007,
 'leaf_estimation_iterations': 11,
 'l2_leaf_reg': 0.06262459050095867}

In [50]:
lgbm = LGBMClassifier(**study_lgbm.best_params)
lgbm.fit(X_train, y_train)

y_pred = lgbm.predict_proba(X_test)
print(f"Log Loss = {log_loss(y_test, y_pred)}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000937 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1935
[LightGBM] [Info] Number of data points in the train set: 6324, number of used features: 25
[LightGBM] [Info] Start training from score -1.087291
[LightGBM] [Info] Start training from score -0.465082
[LightGBM] [Info] Start training from score -3.358480
Log Loss = 0.4390006493526982


In [51]:
from catboost import CatBoostClassifier

In [52]:
def objective_catb(trial: optuna.Trial) -> float:
    """_summary_

    Args:
        trial (optuna.Trial): _description_

    Returns:
        float: _description_
    """
    params = {
        "logging_level" : trial.suggest_categorical("logging_level", ["Silent"]),
        "grow_policy" : trial.suggest_categorical("grow_policy", ["Lossguide"]),
        "max_leaves" : trial.suggest_int("max_leaves", 8, 64),
        "random_seed" : trial.suggest_categorical("random_seed", [42]),
        "iterations" : trial.suggest_int("iterations", 100, 300),
        "learning_rate" : trial.suggest_float("learning_rate", 0.01, 0.1),
        "depth" : trial.suggest_int("depth", 3, 12),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1),
        "min_data_in_leaf" : trial.suggest_int("min_data_in_leaf", 3, 15),
        "bagging_temperature"  :trial.suggest_float("bagging_temperature", 1, 3),
        "leaf_estimation_iterations" : trial.suggest_int("leaf_estimation_iterations", 1, 15),
        "l2_leaf_reg" : trial.suggest_float("l2_leaf_reg", 0.01, 0.1)
    }
    
    model_cb = CatBoostClassifier(**params)
    model_cb.fit(X_train, y_train)
    y_pred = model_cb.predict_proba(X_test)
    
    return log_loss(y_test, y_pred)

In [53]:
study_catb = optuna.create_study(direction="minimize")
study_catb.optimize(objective_catb, n_trials=500, show_progress_bar=True)

[I 2023-12-07 22:28:11,161] A new study created in memory with name: no-name-05a4bf7e-b844-483e-a63b-75c0f5e6c261


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2023-12-07 22:28:13,559] Trial 0 finished with value: 0.46786054153357215 and parameters: {'logging_level': 'Silent', 'grow_policy': 'Lossguide', 'max_leaves': 22, 'random_seed': 42, 'iterations': 156, 'learning_rate': 0.08572937717894069, 'depth': 12, 'colsample_bylevel': 0.8067494467751753, 'min_data_in_leaf': 8, 'bagging_temperature': 2.923790459538713, 'leaf_estimation_iterations': 12, 'l2_leaf_reg': 0.01234409393233422}. Best is trial 0 with value: 0.46786054153357215.
[I 2023-12-07 22:28:14,340] Trial 1 finished with value: 0.449206739733766 and parameters: {'logging_level': 'Silent', 'grow_policy': 'Lossguide', 'max_leaves': 56, 'random_seed': 42, 'iterations': 109, 'learning_rate': 0.07588712495824919, 'depth': 5, 'colsample_bylevel': 0.3560146118032369, 'min_data_in_leaf': 13, 'bagging_temperature': 2.8594899271011425, 'leaf_estimation_iterations': 2, 'l2_leaf_reg': 0.031118832765723164}. Best is trial 1 with value: 0.449206739733766.
[I 2023-12-07 22:28:17,380] Trial 2 fin

In [54]:
study_catb.best_params

{'logging_level': 'Silent',
 'grow_policy': 'Lossguide',
 'max_leaves': 42,
 'random_seed': 42,
 'iterations': 226,
 'learning_rate': 0.05877627518454376,
 'depth': 6,
 'colsample_bylevel': 0.25140412197611195,
 'min_data_in_leaf': 12,
 'bagging_temperature': 2.831931731250476,
 'leaf_estimation_iterations': 2,
 'l2_leaf_reg': 0.038855828884959275}

In [55]:
catm = CatBoostClassifier(**study_catb.best_params)
catm.fit(X_train, y_train)

y_pred = catm.predict_proba(X_test)
print(f"Log Loss = {log_loss(y_test, y_pred)}")

Log Loss = 0.43600419065744006


In [56]:
from sklearn.ensemble import VotingClassifier

In [57]:
voting_clf = VotingClassifier(estimators=[("xgb", xgb), 
                                          ("lgbm", lgbm), 
                                          ("catm", catm)], 
                              voting="soft")

In [58]:
voting_clf.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000871 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1935
[LightGBM] [Info] Number of data points in the train set: 6324, number of used features: 25
[LightGBM] [Info] Start training from score -1.087291
[LightGBM] [Info] Start training from score -0.465082
[LightGBM] [Info] Start training from score -3.358480


In [59]:
y_pred = voting_clf.predict_proba(X_test)
print(f"Log Loss = {log_loss(y_test, y_pred)}")

Log Loss = 0.41930726498514476



The y_pred values do not sum to one. Starting from 1.5 thiswill result in an error.



In [60]:
df_test = pd.get_dummies(df_test)

In [61]:
voting_clf.predict_proba(df_test)



array([[0.35204691, 0.62045637, 0.0274967 ],
       [0.19809708, 0.66522261, 0.1366803 ],
       [0.95872821, 0.03228377, 0.00898801],
       ...,
       [0.09870967, 0.88882634, 0.012464  ],
       [0.01077686, 0.98259622, 0.00662691],
       [0.63599262, 0.35099004, 0.01301735]])

In [63]:
result = voting_clf.predict_proba(df_test)
df_sample_submission["Status_C"] = result[:, 1]
df_sample_submission["Status_CL"] = result[:,2]
df_sample_submission["Status_D"] = result[:, 0]



In [64]:
df_sample_submission.to_csv("../submission/submission.csv",index=False)
df_sample_submission

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.620456,0.027497,0.352047
1,7906,0.665223,0.136680,0.198097
2,7907,0.032284,0.008988,0.958728
3,7908,0.952291,0.004248,0.043461
4,7909,0.772464,0.055381,0.172155
...,...,...,...,...
5266,13171,0.892982,0.033254,0.073765
5267,13172,0.966650,0.003131,0.030219
5268,13173,0.888826,0.012464,0.098710
5269,13174,0.982596,0.006627,0.010777
