In [2]:
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')


tqdm.pandas()

In [3]:
train = pd.read_csv("../../dataset/processed/cleanDataset.csv", index_col=0)
test = pd.read_csv("../../dataset/processed/cleanTest.csv", index_col=0)

train.head()

Unnamed: 0,floors_before_eq (total),old_building,plinth_area (ft^2),height_before_eq (ft),land_surface_condition,type_of_foundation,type_of_roof,type_of_ground_floor,type_of_other_floor,position,...,type_of_reinforcement_concrete,residential_type,no_family_residing,public_place_type,industrial_use_type,govermental_use_type,flexible_superstructure,wall_binding,wall_material,damage_grade
0,2,1.0,17.0688,22.0,Flat,bamboo or timber,bamboo/timber light roof,clay,timber/bamboo-mud,Not attached,...,No reinforcement concrete,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,Unknown/not stated,Unknown/not stated,1.0
1,3,3.0,300.228,18.0,Flat,clay sand mixed mortar-stone/brick,wood light roof or bamboo heavy roof,clay,timber/bamboo-mud,Not attached,...,No reinforcement concrete,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,"Mud + Mortar,Clay",Stone Bricks,5.0
3,2,18.0,56.388,15.0,Flat,clay sand mixed mortar-stone/brick,wood light roof or bamboo light roof,clay,timber/bamboo-mud,Not attached,...,No reinforcement concrete,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,"Mud + Mortar,Clay",Stone Bricks,4.0
4,2,22.0,27.432,17.0,Flat,clay sand mixed mortar-stone/brick,bamboo or timber light roof,clay,timber/bamboo-mud,Not attached,...,No reinforcement concrete,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,"Mud + Mortar,Clay",Stone Bricks,1.0
7,2,40.0,153.6192,14.0,Flat,clay mortar-stone/brick,bamboo/timber light roof,clay,timber/bamboo-mud,Not attached,...,No reinforcement concrete,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,"Mud + Mortar,Clay",Stone Bricks,4.0


In [4]:
dict_type_of_roof = {
    "bamboo/timber-heavy roof": "bamboo/timber heavy roof",
    "bamboo or timber light roof": "bamboo/timber-light roof",
    "bamboo/timber light roof ": "bamboo/timber-light roof",
    "bamboo or timber heavy roof": "bamboo/timber heavy roof",
    "bamboo/timber light roof": "bamboo/timber-light roof",
    "reinforced brick concrete/rcc/rbc": "reinforced cement concrete/rb/rbc"
}

train["type_of_roof"].replace(dict_type_of_roof, inplace=True)
test["type_of_roof"].replace(dict_type_of_roof, inplace=True)

In [5]:
train["type_of_roof"].value_counts()

bamboo/timber-light roof                139695
wood light roof or bamboo heavy roof     70095
wood light roof or bamboo light roof     59880
reinforced cement concrete/rb/rbc        15116
bamboo/timber heavy roof                 14353
reinforced brick slab/rcc/rbc             2468
Name: type_of_roof, dtype: int64

In [6]:
train.shape

(301607, 24)

## FE

In [22]:
from sklearn.model_selection import train_test_split

objectCol = list(train.select_dtypes(include=['object']).columns)
numCol = list(train.select_dtypes(exclude=['object']).columns)[:-1]

X = train.iloc[:, :-1]
y = train.iloc[:, -1]
y -= 1
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [23]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from category_encoders import TargetEncoder
numeric_features = numCol
categorical_features = objectCol
numeric_transformer = make_pipeline(StandardScaler())
categorical_transformer = make_pipeline(TargetEncoder())
Preprocessor = ColumnTransformer(
    transformers=[
        ('numeric_transformer', numeric_transformer, numeric_features),
        ('categorical_transformer', categorical_transformer, categorical_features)
    ])

In [24]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler()
X_rus, y_rus = rus.fit_resample(X_train, y_train)
X_train = Preprocessor.fit_transform(X_rus, y_rus)
X_test = Preprocessor.transform(X_test)

In [25]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

x = XGBClassifier()
x.fit(X_train, y_rus)

y_pred = x.predict(X_test)
print(f1_score(y_test, y_pred, average='macro'))

0.8147239334272716


In [27]:
from lightgbm import LGBMClassifier

l = LGBMClassifier()
l.fit(X_train, y_rus)

y_pred = l.predict(X_test)
print(f1_score(y_test, y_pred, average='macro'))

0.8156723152881638


In [44]:
y_pred_sub = l.predict(Preprocessor.transform(test))
submission = pd.read_csv("../../dataset/raw/sample_submission.csv")
submission["damage_grade"] = y_pred_sub + 1
submission["damage_grade"] = submission["damage_grade"].astype(int)
submission.head()

Unnamed: 0,id,damage_grade
0,0,4
1,1,5
2,2,5
3,3,4
4,4,2


In [45]:
# submission.to_csv("../../dataset/submission_jeki_lgbm.csv", index=False)

In [29]:
import optuna

In [30]:
def lgbm_objective(trial):

    params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "num_class": 5,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    l = LGBMClassifier(**params, random_state=69420)
    l.fit(X_train, y_rus)

    y_pred = l.predict(X_test)

    return f1_score(y_test, y_pred, average="macro")


l_study = optuna.create_study(direction='maximize')
l_study.optimize(lgbm_objective, n_trials=1000, n_jobs=6, timeout=60*15, show_progress_bar=True)

[32m[I 2023-04-13 20:03:29,320][0m A new study created in memory with name: no-name-426ba131-cc51-4172-b1a4-2819f2d3e8a4[0m
[32m[I 2023-04-13 20:03:37,286][0m Trial 1 finished with value: 0.8165224607109908 and parameters: {'lambda_l1': 1.983951289910384e-08, 'lambda_l2': 1.780030368375846e-08, 'num_leaves': 26, 'feature_fraction': 0.5794990130825164, 'bagging_fraction': 0.871482896603495, 'bagging_freq': 2, 'min_child_samples': 93}. Best is trial 1 with value: 0.8165224607109908.[0m
[32m[I 2023-04-13 20:03:39,224][0m Trial 5 finished with value: 0.8161343505785318 and parameters: {'lambda_l1': 4.7157796800040687e-07, 'lambda_l2': 7.97584384901542e-05, 'num_leaves': 37, 'feature_fraction': 0.9070720497765552, 'bagging_fraction': 0.7078914425845146, 'bagging_freq': 7, 'min_child_samples': 61}. Best is trial 1 with value: 0.8165224607109908.[0m
[32m[I 2023-04-13 20:03:46,140][0m Trial 4 finished with value: 0.8146318439732116 and parameters: {'lambda_l1': 5.965156070170147, 'l

In [51]:
l_study.best_value

0.8177861188172632

In [52]:
l_study.best_params

{'lambda_l1': 0.09918938374098207,
 'lambda_l2': 0.12147618595222721,
 'num_leaves': 81,
 'feature_fraction': 0.5375403412968238,
 'bagging_fraction': 0.9903473914047031,
 'bagging_freq': 4,
 'min_child_samples': 78}