In [1]:
import numpy as np
import pandas as pd

from xgboost import XGBClassifier

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

from functools import partial

import plotly.express as px

In [2]:
# Load the raw CSV files. The "id" column is removed from the train and test
# frames right away; the sample submission is read unchanged.
df_train = pd.read_csv("../data/raw/train.csv")
df_train = df_train.drop("id", axis="columns")

df_test = pd.read_csv("../data/raw/test.csv")
df_test = df_test.drop("id", axis="columns")

df_sample_submission = pd.read_csv("../data/raw/sample_submission.csv")

In [3]:
def analyze_dataframe(df: pd.DataFrame) -> None:
    """Print a plain-text overview of a DataFrame to stdout.

    The report contains, in this order:
        1) DataFrame shape
        2) Number of NULL values in each column
        3) Data types
        4) Total duplicate data points
        5) Total unique values
        6) DataFrame description
        7) DataFrame information
        8) Column names grouped by data type (bool / int / float / object)

    Args:
        df (pd.DataFrame): Frame to summarise. It is only read, never modified.

    Returns:
        None: Everything is written to stdout.
    """

    def _print_header(title: str) -> None:
        # Banner: the title wrapped in three asterisks on each side, underlined
        # by a row of asterisks of the same total width.
        print("*"*3 + title + "*"*3)
        print("*"*len(title) + "*"*6)

    def _print_footer() -> None:
        # Separator line followed by two blank lines.
        print("="*40, end="\n"*3)

    # 1) DataFrame shape
    _print_header("DataFrame shape")
    print(df.shape)
    _print_footer()

    # 2) Number of NULL values in each column
    _print_header("Number of NULL values in each column")
    print(df.isnull().sum())
    _print_footer()

    # 3) Data types
    _print_header("Data types")
    print(df.dtypes)
    _print_footer()

    # 4) Total duplicate data points
    _print_header("Total duplicate data points")
    print(df.duplicated().sum())
    _print_footer()

    # 5) Total unique values
    _print_header("Total unique values")
    print(df.nunique())
    _print_footer()

    # 6) DataFrame description
    _print_header("DataFrame description")
    print(df.describe())
    _print_footer()

    # 7) DataFrame information
    _print_header("DataFrame information")
    # DataFrame.info() writes its report itself and returns None, so wrapping it
    # in print() would append a stray "None" line to the report.
    df.info(verbose=True)
    _print_footer()

    # 8) DataFrame data types
    _print_header("DataFrame data types")
    print(f"bool  : {list(df.select_dtypes(include='bool').columns)}")
    print(f"int   : {list(df.select_dtypes(include='int').columns)}")
    print(f"float : {list(df.select_dtypes(include='float').columns)}")
    print(f"object: {list(df.select_dtypes(include='object').columns)}")
    _print_footer()

In [4]:
# Print the overview report (shape, nulls, dtypes, duplicates, ...) for the training frame.
analyze_dataframe(df=df_train)

***DataFrame shape***
*********************
(7905, 19)


***Number of NULL values in each column***
******************************************
N_Days           0
Drug             0
Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
Status           0
dtype: int64


***Data types***
****************
N_Days             int64
Drug              object
Age                int64
Sex               object
Ascites           object
Hepatomegaly      object
Spiders           object
Edema             object
Bilirubin        float64
Cholesterol      float64
Albumin          float64
Copper           float64
Alk_Phos         float64
SGOT             float64
Tryglicerides    float64
Platelets        float64
Prothrombin      float64
Stage            float

In [5]:
# Print the same overview report for the test frame.
analyze_dataframe(df=df_test)

***DataFrame shape***
*********************
(5271, 18)


***Number of NULL values in each column***
******************************************
N_Days           0
Drug             0
Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
dtype: int64


***Data types***
****************
N_Days             int64
Drug              object
Age                int64
Sex               object
Ascites           object
Hepatomegaly      object
Spiders           object
Edema             object
Bilirubin        float64
Cholesterol      float64
Albumin          float64
Copper           float64
Alk_Phos         float64
SGOT             float64
Tryglicerides    float64
Platelets        float64
Prothrombin      float64
Stage            float64
dtype: object




In [6]:
# for column in list(df_train.select_dtypes(include="int").columns):
#     fig = px.scatter(data_frame=df_train, 
#                     x=df_train.index, 
#                     y=column, 
#                     color="Status")
#     fig.show()

In [7]:
# NOTE(review): this loads plotly's bundled "tips" sample dataset, which is
# unrelated to the train/test data. No later top-level cell visible here reads
# `df`, so this looks like a leftover experiment - confirm before removing.
df = px.data.tips()

In [8]:
# Display the sample frame assigned to `df`.
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [9]:
# One donut chart per integer column of the training data, with sectors
# labelled by "Status" and the column name as the chart title.
for column in df_train.select_dtypes(include="int").columns.tolist():
    fig = px.pie(
        data_frame=df_train,
        names="Status",
        values=column,
        title=column,
        hole=0.3,
    )
    fig.show()

In [10]:
# One donut chart per float column of the training data, with sectors labelled
# by "Status". The column name is used as the title (as in the integer-column
# charts) so each figure can be identified on its own.
for column in list(df_train.select_dtypes(include="float").columns):
    fig = px.pie(data_frame=df_train, 
                 values=column, 
                 names="Status", 
                 hole=0.3, 
                 title=column)
    fig.show()

In [11]:
from sklearn.utils import resample

In [12]:
def upsample_category(df: pd.DataFrame, column: str, category) -> pd.DataFrame:
    """Oversample one category of ``column`` to the size of all other rows combined.

    The rows whose ``column`` equals ``category`` are replaced by a bootstrap
    sample of themselves (drawn with replacement, ``random_state=42``) that has
    as many rows as there are rows belonging to any other category. The rows of
    the other categories are kept as they are and come first in the result.

    The original index labels are kept, so the result contains repeated index
    labels; reset the index afterwards if unique labels are needed.

    Args:
        df (pd.DataFrame): Frame to rebalance. It is not modified.
        column (str): Name of the column holding the category labels.
        category: Label of the category to oversample.

    Returns:
        pd.DataFrame: All rows of the other categories followed by the
            resampled rows of ``category``.

    Raises:
        ValueError: If no row of ``df`` has ``category`` in ``column``.
    """
    # Separate the desired category from everything else.
    df_rest = df[df[column] != category]
    df_target = df[df[column] == category]

    # Fail early with a clear message: there is nothing to draw samples from.
    if df_target.empty:
        raise ValueError(
            f"No rows with {column!r} == {category!r}; nothing to upsample."
        )

    # Draw (with replacement) as many target rows as there are other rows.
    df_target_upsampled = resample(df_target, 
                                   replace=True, 
                                   n_samples=len(df_rest), 
                                   random_state=42)

    # Combine the untouched rows with the upsampled category.
    return pd.concat([df_rest, df_target_upsampled])

In [13]:
# First balancing pass: the "CL" rows are resampled with replacement until
# there are as many of them as rows with any other Status.
df_upsampled = upsample_category(df=df_train, 
                                 column="Status", 
                                 category="CL")

In [14]:
# Same donut charts as before, now drawn from the frame in which "CL" has been
# upsampled: one chart per integer column, sectors labelled by "Status".
for column in df_upsampled.select_dtypes(include="int").columns.tolist():
    fig = px.pie(
        data_frame=df_upsampled,
        names="Status",
        values=column,
        title=column,
        hole=0.3,
    )
    fig.show()

In [15]:
# Second balancing pass: the "D" rows are resampled with replacement until they
# match the number of non-"D" rows (which already include the upsampled "CL").
# NOTE: df_upsampled is rebuilt from itself here, so re-running only this cell
# resamples the already-resampled "D" rows again; re-run the "CL" cell first.
df_upsampled = upsample_category(df=df_upsampled, 
                                 column="Status", 
                                 category="D")

In [16]:
# Donut charts once more, after "D" has been upsampled as well: one chart per
# integer column, sectors labelled by "Status".
for column in df_upsampled.select_dtypes(include="int").columns.tolist():
    fig = px.pie(
        data_frame=df_upsampled,
        names="Status",
        values=column,
        title=column,
        hole=0.3,
    )
    fig.show()

In [17]:
def pre_process_data(df: pd.DataFrame) -> pd.DataFrame:
    """Encode a raw frame into the numeric layout used for modelling.

    Steps:
        1) Drop the "id" column if it is present.
        2) If a "Status" column is present, encode it as D -> 0, C -> 1,
           CL -> 2 (any other value becomes NaN).
        3) One-hot encode the remaining non-numeric columns with
           ``pd.get_dummies``.

    The input frame is never modified; a new frame is returned. This keeps the
    function safe to call repeatedly on the same input.

    Args:
        df (pd.DataFrame): Raw frame, with or without "id" / "Status" columns.

    Returns:
        pd.DataFrame: New, encoded frame.
    """
    # drop(..., errors="ignore") always returns a new frame, so nothing below
    # can write through to the caller's object.
    processed = df.drop(columns=["id"], errors="ignore")

    if "Status" in processed.columns:
        processed = processed.assign(
            Status=processed["Status"].map({"D":0, "C":1, "CL":2})
        )

    return pd.get_dummies(processed)

In [18]:
# Encode both frames. The upsampled rows were concatenated with their original
# index labels (so labels repeat); replace them with a clean 0..n-1 index.
df_train = pre_process_data(df_upsampled).reset_index(drop=True)
df_test = pre_process_data(df_test)

In [19]:
# Three independent copies of the encoded training data; each one gets its own
# binary (one-vs-rest) target in the next cell.
df_1 = df_train.copy()
df_2 = df_train.copy()
df_3 = df_train.copy()

In [20]:
# Turn the 3-class target (0 = D, 1 = C, 2 = CL) into one binary target per
# frame. In every frame the class of interest becomes 0 and the other two
# classes become 1:
#   df_1 -> D vs rest, df_2 -> C vs rest, df_3 -> CL vs rest.
for frame, own_class in ((df_1, 0), (df_2, 1), (df_3, 2)):
    frame["Status"] = frame["Status"].map(
        {label: int(label != own_class) for label in (0, 1, 2)}
    )

In [21]:
def train_model(model, 
                xtrain: pd.DataFrame, 
                ytrain: pd.Series, 
                xtest: pd.DataFrame, 
                ytest: pd.Series) -> float:
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Unfitted classifier exposing ``fit`` and ``predict_proba``.
            It is fitted in place.
        xtrain (pd.DataFrame): Training features.
        ytrain (pd.Series): Training labels.
        xtest (pd.DataFrame): Evaluation features.
        ytest (pd.Series): Evaluation labels.

    Returns:
        float: Log loss of the predicted probabilities on the test split
            (lower is better).
    """
    
    model.fit(xtrain, ytrain)
    ypred = model.predict_proba(xtest)
    
    # Without `labels`, log_loss infers the label set from ytest alone and
    # fails when the evaluation split misses one of the classes the model was
    # trained on. Passing the fitted classes keeps the probability columns and
    # labels aligned. Models without `classes_` fall back to the inferred set.
    labels = getattr(model, "classes_", None)
    
    return log_loss(y_true=ytest, y_pred=ypred, labels=labels)

In [22]:
def objective_xgb(trial: optuna.Trial, df: pd.DataFrame, n_splits: int = 11) -> float:
    """Optuna objective: mean cross-validated log loss of an XGBoost classifier.

    A set of XGBoost hyper-parameters is sampled from ``trial``, a fresh
    ``XGBClassifier`` is trained on every stratified fold of ``df`` and the
    log loss on the held-out part is recorded.

    NOTE(review): the frames this notebook passes in were upsampled with
    replacement before cross-validation, so copies of the same row can end up
    in both the training and the held-out part of a fold. The returned score
    is therefore likely optimistic.

    Args:
        trial (optuna.Trial): Trial used to sample the hyper-parameters.
        df (pd.DataFrame): Features plus the target column "Status".
        n_splits (int): Number of stratified folds. Defaults to 11.

    Returns:
        float: Mean log loss over all folds (lower is better).
    """
    
    params = {
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 300, 700),
        # The range 0.01..1 is continuous, so it has to be sampled with
        # suggest_float; suggest_int expects integer bounds.
        "min_child_weight": trial.suggest_float("min_child_weight", 0.01, 1),
        "subsample": trial.suggest_float("subsample", 0.1, 1.0, log=True),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.1, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.01, 1),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.01, 1),
        "eval_metric":trial.suggest_categorical("eval_metric", ["mlogloss"]),
    }
    
    # One-hot encode once, before splitting, so the training and held-out part
    # of every fold share exactly the same columns.
    X = pd.get_dummies(df.drop(columns=["Status"]))
    y = df["Status"]

    kf = StratifiedKFold(n_splits=n_splits)
    fold_losses = []

    for train_idx, test_idx in kf.split(X=X, y=y):
        # split() yields positional row numbers, so select with .iloc; .loc
        # would only be correct for a frame whose index is exactly 0..n-1.
        model = XGBClassifier(**params)
        error = train_model(model=model, 
                            xtrain=X.iloc[train_idx], 
                            ytrain=y.iloc[train_idx], 
                            xtest=X.iloc[test_idx], 
                            ytest=y.iloc[test_idx])
        
        fold_losses.append(error)

    return float(np.mean(fold_losses))

In [23]:
# Hyper-parameter search for the df_1 target: 50 trials, minimising the value
# returned by objective_xgb, with trials run in parallel on all cores.
study_xgb_1 = optuna.create_study(direction="minimize")
objective_xgb_function = partial(objective_xgb, df=df_1)
study_xgb_1.optimize(
    objective_xgb_function,
    n_trials=50,
    n_jobs=-1,
    show_progress_bar=True,
)

  0%|          | 0/50 [00:00<?, ?it/s]

In [24]:
# Hyper-parameter search for the df_2 target: 50 trials, minimising the value
# returned by objective_xgb, with trials run in parallel on all cores.
study_xgb_2 = optuna.create_study(direction="minimize")
objective_xgb_function = partial(objective_xgb, df=df_2)
study_xgb_2.optimize(
    objective_xgb_function,
    n_trials=50,
    n_jobs=-1,
    show_progress_bar=True,
)

  0%|          | 0/50 [00:00<?, ?it/s]

In [25]:
# Hyper-parameter search for the df_3 target: 50 trials, minimising the value
# returned by objective_xgb, with trials run in parallel on all cores.
study_xgb_3 = optuna.create_study(direction="minimize")
objective_xgb_function = partial(objective_xgb, df=df_3)
study_xgb_3.optimize(
    objective_xgb_function,
    n_trials=50,
    n_jobs=-1,
    show_progress_bar=True,
)

  0%|          | 0/50 [00:00<?, ?it/s]

In [26]:
def train_final_model(df: pd.DataFrame, params: dict):
    """Fit an XGBoost classifier on the complete frame.

    Args:
        df (pd.DataFrame): Features plus the target column "Status".
        params (dict): Keyword arguments forwarded to ``XGBClassifier``.

    Returns:
        The fitted ``XGBClassifier``.
    """
    target = df["Status"]
    features = df.drop(columns=["Status"])

    classifier = XGBClassifier(**params)
    classifier.fit(features, target)

    return classifier

In [27]:
# Refit one model per one-vs-rest target on its full frame, using the best
# hyper-parameters found by the matching study.
model_1, model_2, model_3 = (
    train_final_model(df=df_1, params=study_xgb_1.best_params),
    train_final_model(df=df_2, params=study_xgb_2.best_params),
    train_final_model(df=df_3, params=study_xgb_3.best_params),
)

In [28]:
# The test frame was one-hot encoded on its own, so its columns are not
# guaranteed to match the training layout (a category absent from the test
# data yields no dummy column). Reindex to the exact feature columns the models
# were fitted on; dummy columns missing from the test data are filled with 0.
feature_columns = df_1.columns.drop("Status")
X_test = df_test.reindex(columns=feature_columns, fill_value=0)

# In each one-vs-rest frame the class of interest was relabelled 0, so column 0
# of predict_proba is the probability of that class.
df_sample_submission["Status_D"] = model_1.predict_proba(X_test)[:, 0]
df_sample_submission["Status_C"] = model_2.predict_proba(X_test)[:, 0]
df_sample_submission["Status_CL"] = model_3.predict_proba(X_test)[:, 0]

In [29]:
# Write the submission file (without the DataFrame index) and display it.
# NOTE(review): to_csv does not create missing directories - presumably
# "../submission/" already exists; confirm on a fresh checkout.
df_sample_submission.to_csv("../submission/submission.csv", index=False)
df_sample_submission

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.207941,0.002001,0.647008
1,7906,0.884674,0.050305,0.091583
2,7907,0.000630,0.003827,0.996115
3,7908,0.998649,0.001091,0.032449
4,7909,0.990934,0.011484,0.033503
...,...,...,...,...
5266,13171,0.988484,0.127156,0.005790
5267,13172,0.999713,0.000311,0.003021
5268,13173,0.994916,0.001535,0.032032
5269,13174,0.999917,0.000387,0.001586
