In [1]:

from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')


tqdm.pandas()

In [2]:
train = pd.read_csv("../../dataset/processed/cleanDataset.csv", index_col=0)
test = pd.read_csv("../../dataset/processed/cleanTest.csv", index_col=0)

train.head()

Unnamed: 0,floors_before_eq (total),old_building,plinth_area (ft^2),height_before_eq (ft),land_surface_condition,type_of_foundation,type_of_roof,type_of_ground_floor,type_of_other_floor,position,...,type_of_reinforcement_concrete,residential_type,no_family_residing,public_place_type,industrial_use_type,govermental_use_type,flexible_superstructure,wall_binding,wall_material,damage_grade
0,2,1.0,17.0688,22.0,Flat,bamboo or timber,bamboo/timber light roof,clay,timber/bamboo-mud,Not attached,...,No reinforcement concrete,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,Unknown/not stated,Unknown/not stated,1.0
1,3,3.0,300.228,18.0,Flat,clay sand mixed mortar-stone/brick,wood light roof or bamboo heavy roof,clay,timber/bamboo-mud,Not attached,...,No reinforcement concrete,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,"Mud + Mortar,Clay",Stone Bricks,5.0
3,2,18.0,56.388,15.0,Flat,clay sand mixed mortar-stone/brick,wood light roof or bamboo light roof,clay,timber/bamboo-mud,Not attached,...,No reinforcement concrete,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,"Mud + Mortar,Clay",Stone Bricks,4.0
4,2,22.0,27.432,17.0,Flat,clay sand mixed mortar-stone/brick,bamboo or timber light roof,clay,timber/bamboo-mud,Not attached,...,No reinforcement concrete,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,"Mud + Mortar,Clay",Stone Bricks,1.0
7,2,40.0,153.6192,14.0,Flat,clay mortar-stone/brick,bamboo/timber light roof,clay,timber/bamboo-mud,Not attached,...,No reinforcement concrete,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,"Mud + Mortar,Clay",Stone Bricks,4.0


In [3]:
dict_type_of_roof = {
    "bamboo/timber-heavy roof": "bamboo/timber heavy roof",
    "bamboo or timber light roof": "bamboo/timber-light roof",
    "bamboo/timber light roof ": "bamboo/timber-light roof",
    "bamboo or timber heavy roof": "bamboo/timber heavy roof",
    "bamboo/timber light roof": "bamboo/timber-light roof",
    "reinforced brick concrete/rcc/rbc": "reinforced cement concrete/rb/rbc"
}

train["type_of_roof"].replace(dict_type_of_roof, inplace=True)
test["type_of_roof"].replace(dict_type_of_roof, inplace=True)

In [4]:
train["type_of_roof"].value_counts()

bamboo/timber-light roof                139695
wood light roof or bamboo heavy roof     70095
wood light roof or bamboo light roof     59880
reinforced cement concrete/rb/rbc        15116
bamboo/timber heavy roof                 14353
reinforced brick slab/rcc/rbc             2468
Name: type_of_roof, dtype: int64

In [5]:
train.shape

(301607, 24)

## FE

In [6]:
from sklearn.model_selection import train_test_split

objectCol = list(train.select_dtypes(include=['object']).columns)
numCol = list(train.select_dtypes(exclude=['object']).columns)[:-1]

X = train.iloc[:, :-1]
y = train.iloc[:, -1]
# y -= 1
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [7]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
numeric_features = numCol
categorical_features = objectCol
numeric_transformer = make_pipeline(StandardScaler())
categorical_transformer = make_pipeline(OneHotEncoder())
Preprocessor = ColumnTransformer(
    transformers=[
        ('numeric_transformer', numeric_transformer, numeric_features),
        ('categorical_transformer', categorical_transformer, categorical_features)
    ])

In [8]:
# from imblearn.under_sampling import RandomUnderSampler

# rus = RandomUnderSampler()
# X_rus, y_rus = rus.fit_resample(X_train, y_train)
# X_train = Preprocessor.fit_transform(X_rus)
# X_test = Preprocessor.transform(X_test)

In [14]:
X_train = Preprocessor.fit_transform(X_train)
X_test = Preprocessor.transform(X_test)
y_rus = y_train

In [15]:
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

l = LGBMClassifier()
l.fit(X_train, y_rus)

y_pred = l.predict(X_test)
print(f1_score(y_test, y_pred, average='macro'))

0.7940132749568173


In [None]:
# y_pred_sub = l.predict(Preprocessor.transform(test))
# submission = pd.read_csv("../../dataset/raw/sample_submission.csv")
# submission["damage_grade"] = y_pred_sub + 1
# submission["damage_grade"] = submission["damage_grade"].astype(int)
# submission.head()

Unnamed: 0,id,damage_grade
0,0,4
1,1,5
2,2,5
3,3,4
4,4,2


In [None]:
# submission.to_csv("../../dataset/submission_jeki_lgbm.csv", index=False)

In [16]:
import optuna

In [17]:
def lgbm_objective(trial):

    params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "num_class": 5,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    l = LGBMClassifier(**params, random_state=69420)
    l.fit(X_train, y_rus)

    y_pred = l.predict(X_test)

    return f1_score(y_test, y_pred, average="macro")


l_study = optuna.create_study(direction='maximize')
l_study.optimize(lgbm_objective, n_trials=1000, n_jobs=6, timeout=60*15, show_progress_bar=True)

[32m[I 2023-04-14 08:49:46,696][0m A new study created in memory with name: no-name-b8bd5087-f5a9-4843-a7e6-d6c8d702e9e2[0m
[32m[I 2023-04-14 08:50:00,999][0m Trial 4 finished with value: 0.8026636357084115 and parameters: {'lambda_l1': 0.000773085782478493, 'lambda_l2': 0.0021750686263723594, 'num_leaves': 85, 'feature_fraction': 0.8965018192519545, 'bagging_fraction': 0.7246813812181809, 'bagging_freq': 1, 'min_child_samples': 53}. Best is trial 4 with value: 0.8026636357084115.[0m
[32m[I 2023-04-14 08:50:02,796][0m Trial 2 finished with value: 0.8013803059267494 and parameters: {'lambda_l1': 2.8170774105019836e-07, 'lambda_l2': 1.5461057974773343e-06, 'num_leaves': 90, 'feature_fraction': 0.7316840529523945, 'bagging_fraction': 0.798332833119805, 'bagging_freq': 6, 'min_child_samples': 60}. Best is trial 4 with value: 0.8026636357084115.[0m
[32m[I 2023-04-14 08:50:07,011][0m Trial 0 finished with value: 0.7988488189219011 and parameters: {'lambda_l1': 0.000399003878974879

In [20]:
l_study.best_value

0.8088915698431872

In [21]:
l_study.best_params

{'lambda_l1': 0.0064107987442678345,
 'lambda_l2': 5.113815569784563,
 'num_leaves': 246,
 'feature_fraction': 0.7184199960135934,
 'bagging_fraction': 0.5279363243152616,
 'bagging_freq': 5,
 'min_child_samples': 99}

In [22]:
params = {'lambda_l1': 0.0064107987442678345,
          'lambda_l2': 5.113815569784563,
          'num_leaves': 246,
          'feature_fraction': 0.7184199960135934,
          'bagging_fraction': 0.5279363243152616,
          'bagging_freq': 5,
          'min_child_samples': 99}

l_tuned = LGBMClassifier(**params, random_state=69420)
l_tuned.fit(X_train, y_rus)

y_pred = l_tuned.predict(X_test)

print(f1_score(y_test, y_pred, average='macro'))


0.8088915698431872


In [23]:
y_pred_sub_tuned = l_tuned.predict(Preprocessor.transform(test))

In [24]:
y_pred_sub = l.predict(Preprocessor.transform(test))
submission = pd.read_csv("../../dataset/raw/sample_submission.csv")
submission["damage_grade"] = y_pred_sub_tuned
submission["damage_grade"] = submission["damage_grade"].astype(int)
submission.head()

Unnamed: 0,id,damage_grade
0,0,5
1,1,5
2,2,5
3,3,4
4,4,2


In [25]:
submission.to_csv("../../dataset/submission_jeki_lgbm_tanpa_undersampling.csv", index=False)