In [1]:
import numpy as np
import pandas as pd

from xgboost import XGBClassifier

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

from functools import partial

In [2]:
df_train = pd.read_csv("../data/raw/train.csv")
df_test = pd.read_csv("../data/raw/test.csv")
df_sample_submission = pd.read_csv("../data/raw/sample_submission.csv")

In [3]:
def analyze_dataframe(df: pd.DataFrame) -> None:
    """Analyzes the given DataFrame. Analysis include
        1) DataFrame shape
        2) Number of NULL values in each column
        3) Data types
        4) Total duplicate data points
        5) Total unique values
        6) DataFrame description
        7) DataFrame information
        8) DataFrame data types

    Args:
        df (pd.DataFrame): _description_
    """
    
    # 1) DataFrame shape
    print("*"*3 + "DataFrame shape" + "*"*3)
    print("*"*len("DataFrame shape") + "*"*6)
    print(df.shape)
    print("="*40, end="\n"*3)
    
    # 2) Number of NULL values in each column
    print("*"*3 + "Number of NULL values in each column" + "*"*3)
    print("*"*len("Number of NULL values in each column") + "*"*6)
    print(df.isnull().sum())
    print("="*40, end="\n"*3)
    
    # 3) Data types
    print("*"*3 + "Data types" + "*"*3)
    print("*"*len("Data types") + "*"*6)
    print(df.dtypes)
    print("="*40, end="\n"*3)
    
    # 4) Total duplicate data points
    print("*"*3 + "Total duplicate data points" + "*"*3)
    print("*"*len("Total duplicate data points") + "*"*6)
    print(df.duplicated().sum())
    print("="*40, end="\n"*3)
    
    # 5) Total unique values
    print("*"*3 + "Total unique values" + "*"*3)
    print("*"*len("Total unique values") + "*"*6)
    print(df.nunique())
    print("="*40, end="\n"*3)
    
    # 6) DataFrame description
    print("*"*3 + "DataFrame description" + "*"*3)
    print("*"*len("DataFrame description") + "*"*6)
    print(df.describe())
    print("="*40, end="\n"*3)
    
    # 7) DataFrame information
    print("*"*3 + "DataFrame information" + "*"*3)
    print("*"*len("DataFrame information") + "*"*6)
    print(df.info(verbose=True))
    print("="*40, end="\n"*3)
    
    # 8) DataFrame data types
    print("*"*3 + "DataFrame data types" + "*"*3)
    print("*"*len("DataFrame data types") + "*"*6)
    print(f"bool  : {list(df.select_dtypes(include='bool').columns)}")
    print(f"int   : {list(df.select_dtypes(include='int').columns)}")
    print(f"float : {list(df.select_dtypes(include='float').columns)}")
    print(f"object: {list(df.select_dtypes(include='object').columns)}")
    print("="*40, end="\n"*3)

In [4]:
analyze_dataframe(df=df_train)

***DataFrame shape***
*********************
(7905, 20)


***Number of NULL values in each column***
******************************************
id               0
N_Days           0
Drug             0
Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
Status           0
dtype: int64


***Data types***
****************
id                 int64
N_Days             int64
Drug              object
Age                int64
Sex               object
Ascites           object
Hepatomegaly      object
Spiders           object
Edema             object
Bilirubin        float64
Cholesterol      float64
Albumin          float64
Copper           float64
Alk_Phos         float64
SGOT             float64
Tryglicerides    float64
Platelets        float64
Pro

In [5]:
analyze_dataframe(df=df_test)

***DataFrame shape***
*********************
(5271, 19)


***Number of NULL values in each column***
******************************************
id               0
N_Days           0
Drug             0
Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
dtype: int64


***Data types***
****************
id                 int64
N_Days             int64
Drug              object
Age                int64
Sex               object
Ascites           object
Hepatomegaly      object
Spiders           object
Edema             object
Bilirubin        float64
Cholesterol      float64
Albumin          float64
Copper           float64
Alk_Phos         float64
SGOT             float64
Tryglicerides    float64
Platelets        float64
Prothrombin      float

In [6]:
def pre_process_data(df: pd.DataFrame) -> pd.DataFrame:
    """_summary_

    Args:
        df (pd.DataFrame): _description_

    Returns:
        pd.DataFrame: _description_
    """
    df = df.drop(columns=["id"])
    if "Status" in df.columns:
        df["Status"] = df["Status"].map({"D":0, "C":1, "CL":2})
    df = pd.get_dummies(df)
    
    return df

In [7]:
df_train.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,999,D-penicillamine,21532,M,N,N,N,N,2.3,316.0,3.35,172.0,1601.0,179.8,63.0,394.0,9.7,3.0,D
1,1,2574,Placebo,19237,F,N,N,N,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C
2,2,3428,Placebo,13727,F,N,Y,Y,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D
3,3,2576,Placebo,18460,F,N,N,N,N,0.6,256.0,3.5,58.0,1653.0,71.3,96.0,269.0,10.7,3.0,C
4,4,788,Placebo,16658,F,N,Y,N,N,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,C


In [8]:
df_train = pre_process_data(df_train)
df_test = pre_process_data(df_test)

In [9]:
df_1, df_2, df_3 = df_train.copy(), df_train.copy(), df_train.copy()

In [10]:
df_1["Status"] = df_1["Status"].map({0: 0, 1: 1, 2: 1})
df_2["Status"] = df_2["Status"].map({0: 1, 1: 0, 2: 1})
df_3["Status"] = df_3["Status"].map({0: 1, 1: 1, 2: 0})

In [11]:
def train_model(model, 
                xtrain: pd.DataFrame, 
                ytrain: pd.Series, 
                xtest: pd.DataFrame, 
                ytest: pd.Series) -> float:
    """_summary_

    Args:
        model (_type_): _description_
        xtrain (pd.DataFrame): _description_
        ytrain (pd.Series): _description_
        xtest (pd.DataFrame): _description_
        ytest (pd.Series): _description_

    Returns:
        float: _description_
    """
    
    model.fit(xtrain, ytrain)
    ypred = model.predict_proba(xtest)
    
    return log_loss(y_true=ytest, y_pred=ypred)

In [12]:
def objective_xgb(trial: optuna.Trial, df: pd.DataFrame) -> float:
    """_summary_

    Args:
        trial (optuna.Trial): _description_

    Returns:
        float: _description_
    """
    
    params = {
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 300, 700),
        "min_child_weight": trial.suggest_int("min_child_weight", 0.01, 1),
        "subsample": trial.suggest_float("subsample", 0.1, 1.0, log=True),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.1, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.01, 1),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.01, 1),
        "eval_metric":trial.suggest_categorical("eval_metric", ["mlogloss"]),
    }
    
    kf = StratifiedKFold(n_splits=11)
    ll_error_metric = []
    X = df.drop(columns=["Status"])
    y = df["Status"]

    for idx in kf.split(X=X, y=y):
        train_idx, test_idx = idx[0], idx[1]
        train = df.loc[train_idx, :]
        test = df.loc[test_idx, :]
        
        xtrain = train.drop(columns=["Status"])
        xtrain = pd.get_dummies(xtrain)
        ytrain = train["Status"]
        
        xtest = test.drop(columns=["Status"])
        xtest = pd.get_dummies(xtest)
        ytest = test["Status"]
        
        model = XGBClassifier(**params)
        error = train_model(model=model, 
                            xtrain=xtrain, 
                            xtest=xtest, 
                            ytrain=ytrain, 
                            ytest=ytest)
        
        ll_error_metric.append(error)

    return np.mean(ll_error_metric)

In [13]:
objective_xgb_function = partial(objective_xgb, df=df_1)
study_xgb_1 = optuna.create_study(direction="minimize")
study_xgb_1.optimize(objective_xgb_function, n_trials=500, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/500 [00:00<?, ?it/s]

In [14]:
objective_xgb_function = partial(objective_xgb, df=df_2)
study_xgb_2 = optuna.create_study(direction="minimize")
study_xgb_2.optimize(objective_xgb_function, n_trials=500, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/500 [00:00<?, ?it/s]

In [15]:
objective_xgb_function = partial(objective_xgb, df=df_3)
study_xgb_3 = optuna.create_study(direction="minimize")
study_xgb_3.optimize(objective_xgb_function, n_trials=500, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/500 [00:00<?, ?it/s]

In [16]:
def train_final_model(df: pd.DataFrame, params: dict):
    X = df.drop(columns=["Status"])
    y = df["Status"]
    
    model = XGBClassifier(**params)
    model.fit(X, y)
    
    return model

In [18]:
model_1 = train_final_model(df_1, study_xgb_1.best_params)
model_2 = train_final_model(df_2, study_xgb_2.best_params)
model_3 = train_final_model(df_3, study_xgb_3.best_params)

In [19]:
df_sample_submission["Status_D"] = model_1.predict_proba(df_test)[:, 0]
df_sample_submission["Status_C"] = model_2.predict_proba(df_test)[:, 0]
df_sample_submission["Status_CL"] = model_3.predict_proba(df_test)[:, 0]

In [20]:
df_sample_submission.to_csv("../submission/submission.csv", index=False)
df_sample_submission

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.442080,0.026698,0.653509
1,7906,0.561146,0.198031,0.205139
2,7907,0.037831,0.012700,0.915139
3,7908,0.977083,0.004519,0.024074
4,7909,0.806393,0.049841,0.105101
...,...,...,...,...
5266,13171,0.830151,0.074755,0.041937
5267,13172,0.941485,0.007038,0.024343
5268,13173,0.873389,0.019592,0.080543
5269,13174,0.975673,0.012465,0.011580


In [94]:
df_sample_submission.to_csv("../submission/submission.csv", index=False)
df_sample_submission

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.403697,0.002497,0.135683
1,7906,0.509195,0.100343,0.129051
2,7907,0.013022,0.001492,0.991268
3,7908,0.992393,0.000119,0.036691
4,7909,0.927168,0.002618,0.007938
...,...,...,...,...
5266,13171,0.900103,0.023131,0.013301
5267,13172,0.997614,0.000015,0.004388
5268,13173,0.967101,0.000300,0.068599
5269,13174,0.997648,0.000700,0.000562
