# Notebook to Implement Model Training - LGBM

---

### 1) Setup

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV

import optuna
import lightgbm as lgb

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

In [2]:
SEED = 42

In [3]:
TRAIN_CLINICAL_FILENAME = "train_set_clinical.csv"
TEST_CLINICAL_FILENAME = "test_set_clinical.csv"

---

### 2) Read and Preprocess Data

In [4]:
train = pd.read_csv(TRAIN_CLINICAL_FILENAME, sep=";", index_col="ID")

In [5]:
train.shape

(132, 650)

In [6]:
train.head()

Unnamed: 0_level_0,Age (Y),Sex,Fever,Cough,Headache,Sore throat,Muscle or Body Aches,Fadigue,Congestion or runny nose,Shortness of breath or difficulty breathing,...,Freq.8943.76551923189,Freq.9058.85825530971,Freq.9098.58510797401,Freq.9437.74469644083,Freq.9593.90405666006,Freq.9799.842201746,Freq.10432.4853106264,Freq.11006.9514551194,Freq.11161.31855876,Group
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,53,F,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.7e-05,5e-06,2e-05,2.9e-05,3.7e-05,2.4e-05,1.6e-05,3e-05,3e-05,MILD
2,21,F,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,2.1e-05,1.3e-05,2e-06,4e-06,5e-06,2e-06,8e-06,0.000103,2e-06,MILD
5,62,F,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,9.9e-05,7.9e-05,1.3e-05,1.2e-05,1e-06,6.2e-05,3e-06,8.9e-05,2.8e-05,MILD
6,34,F,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,2.5e-05,2.1e-05,1.7e-05,1e-05,2.2e-05,0.000156,1.5e-05,5.3e-05,7e-06,SEVERE
8,42,M,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,9e-06,0.000153,6e-06,4.6e-05,3e-06,1.7e-05,8e-06,0.00011,7e-06,SEVERE


In [7]:
##### Preprocessing

train_mod = train.copy()
# Drop NaN Values 
train_mod.dropna(inplace=True)
# Convert Sex column to boolean (Female: 1, Male: 0)
train_mod["Sex"] = np.where(train_mod["Sex"]=="F", 1, 0)

In [8]:
train_mod.shape

(131, 650)

In [9]:
train_mod.head()

Unnamed: 0_level_0,Age (Y),Sex,Fever,Cough,Headache,Sore throat,Muscle or Body Aches,Fadigue,Congestion or runny nose,Shortness of breath or difficulty breathing,...,Freq.8943.76551923189,Freq.9058.85825530971,Freq.9098.58510797401,Freq.9437.74469644083,Freq.9593.90405666006,Freq.9799.842201746,Freq.10432.4853106264,Freq.11006.9514551194,Freq.11161.31855876,Group
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,53,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.7e-05,5e-06,2e-05,2.9e-05,3.7e-05,2.4e-05,1.6e-05,3e-05,3e-05,MILD
2,21,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,2.1e-05,1.3e-05,2e-06,4e-06,5e-06,2e-06,8e-06,0.000103,2e-06,MILD
5,62,1,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,9.9e-05,7.9e-05,1.3e-05,1.2e-05,1e-06,6.2e-05,3e-06,8.9e-05,2.8e-05,MILD
6,34,1,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,2.5e-05,2.1e-05,1.7e-05,1e-05,2.2e-05,0.000156,1.5e-05,5.3e-05,7e-06,SEVERE
8,42,0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,9e-06,0.000153,6e-06,4.6e-05,3e-06,1.7e-05,8e-06,0.00011,7e-06,SEVERE


---

### 3) Baseline Model Training and CV

In [10]:
# Define Classifier (or pipeline)
clf = lgb.LGBMClassifier(random_state=SEED)

In [11]:
# Get Features and Target
X, y = train_mod.drop("Group", axis=1), train_mod["Group"]

In [12]:
# Defining RepeatedKFold Cross Validator
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=SEED)

In [13]:
# Define metric scorer
metric_scorer = make_scorer(balanced_accuracy_score)

In [14]:
# Cross validate model
scores = cross_val_score(clf, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1)

In [15]:
# Score from each CV Iteration
scores

array([0.75833333, 0.79699248, 0.76875   , 0.71895425, 0.85947712,
       0.73901099, 0.71568627, 0.94444444, 0.70606061, 0.79166667,
       0.88888889, 0.80769231, 0.67261905, 0.7593985 , 0.83333333,
       0.87058824, 0.69230769, 0.76388889, 0.875     , 0.73308271,
       0.72058824, 0.73076923, 0.80451128, 0.79761905, 0.64583333,
       0.87058824, 0.62727273, 0.75625   , 0.86363636, 0.75694444,
       0.79117647, 0.65238095, 0.675     , 0.74404762, 0.81818182,
       0.725     , 0.80357143, 0.85      , 0.72222222, 0.72556391,
       0.86111111, 0.89166667, 0.73809524, 0.6875    , 0.75151515,
       0.70723684, 0.84242424, 0.68452381, 0.82857143, 0.75      ])

In [16]:
# Mean Metric Value
np.mean(scores)

0.7703995514289632

---

### 4) Experiments

##### 4.1) Hyper Parameter Optimization with Optuna

In [17]:
# Define objectiva funtion to maximize metric
def objective(trial):
    
    # Hyper Parameter Grid
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "n_estimators": trial.suggest_int("max_depth", 100, 1000)
    }
    # Create model with param trial
    clf = lgb.LGBMClassifier(random_state=SEED, **param)
    # Get CV Metric we want to maximize
    balanced_accuracy = np.mean(cross_val_score(clf, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1))
    
    return balanced_accuracy

In [18]:
study = optuna.create_study(direction="maximize")

[32m[I 2022-06-07 21:33:10,292][0m A new study created in memory with name: no-name-8dc14008-d12f-4d4a-9f86-4896d741ebc6[0m


In [None]:
study.optimize(objective, n_trials=500)

In [20]:
# Get best trial based on metric score
trial = study.best_trial

In [21]:
# Print params for best trial
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

    lambda_l1: 1.906040265636331e-06
    lambda_l2: 0.00013566593007094776
    num_leaves: 103
    feature_fraction: 0.9084190810467557
    bagging_fraction: 0.793872889712087
    bagging_freq: 2
    min_child_samples: 32
    max_depth: 891


In [22]:
# Best Score from HP Opt
trial.values[0]

0.8481697872990442

- Aumento de performance considerável em relação ao valor baseline para um LGBM

##### 4.2) PCA Dimension Reduction + Hyper Parameter Optimization with Optuna

In [23]:
# Define objectiva funtion to maximize metric
def objective(trial):
    
    # PCA Parameter Grid
    pca_param = {
        "n_components": trial.suggest_int("n_components", 5, 100)
    }
    # Hyper Parameter Grid
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "n_estimators": trial.suggest_int("max_depth", 100, 1000)
    }
    # Create model pipeline with param trial
    clf = Pipeline([("pca", PCA(**pca_param)),
                    ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])
    # Get CV Metric we want to maximize
    balanced_accuracy = np.mean(cross_val_score(clf, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1))
    
    return balanced_accuracy

In [24]:
study = optuna.create_study(direction="maximize")

[32m[I 2022-06-07 22:03:03,265][0m A new study created in memory with name: no-name-093fcda5-6d53-4232-a790-502672351a7d[0m


In [None]:
study.optimize(objective, n_trials=500)

In [26]:
# Get best trial based on metric score
trial = study.best_trial

In [27]:
# Print params for best trial
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

    n_components: 98
    lambda_l1: 4.997984938308335e-06
    lambda_l2: 9.51579276153768e-06
    num_leaves: 44
    feature_fraction: 0.5421454819677247
    bagging_fraction: 0.48235662397332973
    bagging_freq: 4
    min_child_samples: 8
    max_depth: 563


In [28]:
# Best Score from HP Opt
trial.values[0]

0.7134464057614213

- Utilizar redução de dimensionalidade via PCA piorou bastante a performance do modelo

##### 4.3) Features Scaler + Hyper Parameter Optimization with Optuna

In [29]:
# Define objectiva funtion to maximize metric
def objective(trial):
    
    # List scalers to chose from
    scalers = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])
    # Define scalers
    if scalers == "minmax":
        scaler = MinMaxScaler()
    elif scalers == "standard":
        scaler = StandardScaler()
    else:
        scaler = RobustScaler()
    # LGBM Hyper Parameter Grid
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "n_estimators": trial.suggest_int("max_depth", 100, 1000)
    }
    # Create model pipeline with param trial
    clf = Pipeline([("scaler", scaler),
                    ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])
    # Get CV Metric we want to maximize
    balanced_accuracy = np.mean(cross_val_score(clf, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1))
    
    return balanced_accuracy

In [30]:
study = optuna.create_study(direction="maximize")

[32m[I 2022-06-07 22:15:09,043][0m A new study created in memory with name: no-name-9d027bd4-11fc-4b2d-8846-da6f6ffb30f6[0m


In [None]:
study.optimize(objective, n_trials=500)

In [32]:
# Get best trial based on metric score
trial = study.best_trial

In [33]:
# Print params for best trial
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

    scalers: robust
    lambda_l1: 4.3223282482835764e-07
    lambda_l2: 3.3796186878776776e-05
    num_leaves: 188
    feature_fraction: 0.650978565866076
    bagging_fraction: 0.9212536490874126
    bagging_freq: 7
    min_child_samples: 36
    max_depth: 956


In [34]:
# Best Score from HP Opt
trial.values[0]

0.8521054248039541

- A adição de um estágio de feature scaling antes do treinamento parece ter ajudado o modelo

##### 4.4) Boruta Feature Selection + Features Scaler + Hyper Parameter Optimization with Optuna

In [35]:
# define random forest classifier, with utilising all cores and
# sampling in proportion to y labels
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=SEED, )

In [None]:
feat_selector.fit(np.array(X), y)

In [37]:
print("Number of selected features: ", feat_selector.n_features_)

Number of selected features:  29


In [38]:
# Best Features (accord to Boruta)
X.columns[feat_selector.support_]

Index(['Freq.1324.08044804632', 'Freq.1715.32907573994',
       'Freq.1793.31292765446', 'Freq.1989.65991446642',
       'Freq.2032.95119529926', 'Freq.2148.55916353654',
       'Freq.2182.52259691583', 'Freq.2186.33577081196',
       'Freq.2242.07973381149', 'Freq.2761.81291676166',
       'Freq.3044.21704373186', 'Freq.3414.71158220371',
       'Freq.3425.70211639867', 'Freq.3912.82100942603',
       'Freq.4006.25935764913', 'Freq.4266.3135397872',
       'Freq.4282.69712175929', 'Freq.4305.85988898402',
       'Freq.4318.14922038936', 'Freq.4395.12541812139',
       'Freq.4773.1748593189', 'Freq.4823.05474215093',
       'Freq.5085.12753419191', 'Freq.5224.39772946441',
       'Freq.5433.51287445961', 'Freq.6079.05181901815',
       'Freq.7501.51838000843', 'Freq.7738.28945568542',
       'Freq.8943.76551923189'],
      dtype='object')

In [39]:
# Filter most importante features
X_transform = feat_selector.transform(np.array(X))

In [40]:
# Define objectiva funtion to maximize metric
def objective(trial):
    
    # List scalers to chose from
    scalers = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])
    # Define scalers
    if scalers == "minmax":
        scaler = MinMaxScaler()
    elif scalers == "standard":
        scaler = StandardScaler()
    else:
        scaler = RobustScaler()
    # LGBM Hyper Parameter Grid
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "n_estimators": trial.suggest_int("max_depth", 100, 1000)
    }
    # Create model pipeline with param trial
    clf = Pipeline([("scaler", scaler),
                    ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])
    # Get CV Metric we want to maximize
    balanced_accuracy = np.mean(cross_val_score(clf, X_transform, y, scoring=metric_scorer, cv=rkf, n_jobs=-1))
    
    return balanced_accuracy

In [41]:
study = optuna.create_study(direction="maximize")

[32m[I 2022-06-07 22:53:12,323][0m A new study created in memory with name: no-name-c4546cea-e73f-48ab-b880-81890a252ef4[0m


In [None]:
study.optimize(objective, n_trials=500)

In [43]:
# Get best trial based on metric score
trial = study.best_trial

In [44]:
# Print params for best trial
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

    scalers: standard
    lambda_l1: 0.2404118700722963
    lambda_l2: 1.492665030235694
    num_leaves: 135
    feature_fraction: 0.6682498832291581
    bagging_fraction: 0.46400514850106434
    bagging_freq: 1
    min_child_samples: 14
    max_depth: 924


In [45]:
# Best Score from HP Opt
trial.values[0]

0.9043265457023197

- Pré selecionar as features mais importantes auxiliou a obter uma performance ainda melhor a partir do pipeline com Robust Scaler e LightGBM