In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')


In [57]:
ec_train = pd.read_csv('../../dataset/processed/targetECTrain.csv')
ec_test = pd.read_csv('../../dataset/processed/targetEC.csv')

In [58]:
BIN = 5

ec_train["old_building"] = pd.qcut(ec_train["old_building"], BIN, labels=np.arange(1, BIN + 1))
ec_train["plinth_area (ft^2)"] = pd.qcut(ec_train["plinth_area (ft^2)"], BIN, labels=np.arange(1, BIN + 1))
ec_train["height_before_eq (ft)"] = pd.qcut(
    ec_train["height_before_eq (ft)"], BIN, labels=np.arange(1, BIN + 1), duplicates='drop')

ec_train.head()

Unnamed: 0,floors_before_eq (total),old_building,plinth_area (ft^2),height_before_eq (ft),land_surface_condition,type_of_foundation,type_of_roof,type_of_ground_floor,type_of_other_floor,position,...,type_of_reinforcement_concrete,residential_type,no_family_residing,public_place_type,industrial_use_type,govermental_use_type,flexible_superstructure,wall_binding,wall_material,damage_grade
0,1.0,7,3,1,3.705136,3.921709,3.747907,3.838859,2.981242,3.637114,...,0.0,3.692359,0.0,3.64726,3.635343,3.644915,3.689582,5.0,2.0,5.0
1,3.0,7,5,4,3.629034,3.925715,3.806631,3.838859,3.899777,3.637114,...,0.0,3.692359,1.0,3.64726,3.635343,3.644915,3.689582,5.0,2.0,4.0
2,2.0,3,6,3,3.629034,3.925715,3.743924,3.7418,3.745711,3.637114,...,0.0,3.692359,1.0,3.64726,3.635343,3.644915,3.689582,5.0,2.0,5.0
3,2.0,7,2,2,3.629034,3.925715,3.747907,3.838859,3.899777,3.637114,...,0.0,3.692359,0.0,3.64726,3.635343,3.644915,3.689582,5.0,2.0,5.0
4,2.0,2,4,2,3.629034,3.921709,3.747907,3.838859,3.899777,3.637114,...,0.0,3.692359,1.0,3.64726,3.635343,3.644915,3.138329,5.0,2.0,2.0


In [59]:
X = ec_train.drop(['damage_grade'], axis=1)
y = ec_train['damage_grade']
y -= 1

In [60]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=69420)

In [61]:
import optuna
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

In [62]:
def lgbm_objective(trial):

    params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "num_class": 5,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    l = LGBMClassifier(**params, random_state=69420)
    l.fit(X_train, y_train)

    y_pred = l.predict(X_test)

    return f1_score(y_test, y_pred, average="macro")


l_study = optuna.create_study(direction='maximize')
l_study.optimize(lgbm_objective, n_trials=1000, n_jobs=6,
                 timeout=600, show_progress_bar=True)

[32m[I 2023-04-12 12:50:29,719][0m A new study created in memory with name: no-name-86397a1f-322c-430e-9082-fa19377e8779[0m
[32m[I 2023-04-12 12:50:35,267][0m Trial 2 finished with value: 0.6622489215810156 and parameters: {'lambda_l1': 0.002981940600711814, 'lambda_l2': 0.0007794181417511942, 'num_leaves': 107, 'feature_fraction': 0.6570083065462546, 'bagging_fraction': 0.9825147645714506, 'bagging_freq': 2, 'min_child_samples': 96}. Best is trial 2 with value: 0.6622489215810156.[0m
[32m[I 2023-04-12 12:50:35,785][0m Trial 1 finished with value: 0.6575493890316146 and parameters: {'lambda_l1': 3.227121985789857e-08, 'lambda_l2': 0.3099221237559527, 'num_leaves': 106, 'feature_fraction': 0.5525707176652443, 'bagging_fraction': 0.5924785433075628, 'bagging_freq': 7, 'min_child_samples': 51}. Best is trial 2 with value: 0.6622489215810156.[0m
[32m[I 2023-04-12 12:50:35,910][0m Trial 4 finished with value: 0.6574974391862856 and parameters: {'lambda_l1': 0.0008407476064821677,

In [7]:
l_study.best_params

{'lambda_l1': 1.684829292324147e-08,
 'lambda_l2': 0.018416385634331654,
 'num_leaves': 133,
 'feature_fraction': 0.7582373738177911,
 'bagging_fraction': 0.9860563262737466,
 'bagging_freq': 6,
 'min_child_samples': 53}

In [None]:
{'lambda_l1': 1.684829292324147e-08,
 'lambda_l2': 0.018416385634331654,
 'num_leaves': 133,
 'feature_fraction': 0.7582373738177911,
 'bagging_fraction': 0.9860563262737466,
 'bagging_freq': 6,
 'min_child_samples': 53} # 67.3
