In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')


In [16]:
train_cp = pd.read_csv("../../dataset/processed/train_cleaned_v4_139k.csv")

In [17]:
from category_encoders.target_encoder import TargetEncoder


def clean(df, mode="test"):
    try:
        df.drop(["no_family_residing"], axis=1, inplace=True)
    except:
        pass
    ordinal_col = ["land_surface_condition", "technical_solution_proposed"]

    binary_col = ["flexible_superstructure", "public_place_type",
                "govermental_use_type", "has_secondary_use"]
    numerical_col = ["floors_before_eq (total)", "old_building",
                    "plinth_area (ft^2)", "height_before_eq (ft)"]
    target_col = ["damage_grade"]
    categorical_col_lt5 = [col for col in train_cp.columns if col not in ordinal_col +
                        binary_col + numerical_col + target_col and train_cp[col].nunique() <= 4]
    cat_col = [col for col in train_cp.columns if col not in ordinal_col +
            binary_col + numerical_col + target_col and train_cp[col].nunique() >= 5]
    angka = {
        "one": 1,
        "two": 2,
        "three": 3,
        "four": 4,
        "five": 5,
        "six": 6,
        "seven": 7,
        "eight": 8,
        "nine": 9,
        "1st": 1,
        "2nd": 2,
        "3rd": 3,
        "fifth": 5,
        "second": 2,
        "third": 3,
        "3.00": 3,
    }

    delete = {
        "story": "",
        "stories": "",
        "floor": "",
        "floors": "",
        "has": "",
        "there is": "",
        "just": "",
        "-": "",
        "fl": "",
        "/": "",
    }

    df["floors_before_eq (total)"] = df["floors_before_eq (total)"]\
        .str.lower()\
        .replace(delete, regex=True)\
        .str.strip()\
        .replace(angka, regex=True)

    df["floors_before_eq (total)"] = df["floors_before_eq (total)"].astype(
        "float")
    df["plinth_area (ft^2)"] = df["plinth_area (ft^2)"].str.lower()

    df["plinth_area (ft^2)"] = df["plinth_area (ft^2)"].apply(
        lambda x: x.replace(" ft^2", ""))
    df["plinth_area (ft^2)"] = df["plinth_area (ft^2)"].apply(
        lambda x: x.replace("more than ", ""))
    df["plinth_area (ft^2)"] = df["plinth_area (ft^2)"].astype(
        "float")

    type_of_foundation_dict = {
        # "Mud mortar-Stone/Brick": "Clay mortar-Stone/Brick",
        "Bamboo/TImber": "Bamboo or Timber",
        "Bamboo/Timber": "Bamboo or Timber",
        "RC": "Reinforced Concrete",
        "Others": "Other",
        "Cement-Stone or Cement-Brick": "Cement-Stone/Brick"
    }

    df["type_of_foundation"] = df["type_of_foundation"]\
        .replace(type_of_foundation_dict)\
        .str.strip()

    type_of_roof_dict = {
        "Bamboo/TImber-Heavy Roof": "Bamboo/Timber Heavy roof",
        "Bamboo/TImber-Light Roof": "Bamboo or Timber Light roof",
        "Bamboo/Timber Light roof": "Bamboo or Timber Light roof ",
        "reinforced cement concrete/rb/rbc": "rcc/rb/rbc",
        "Reinforced brick concrete/rcc/rbc": "rcc/rb/rbc",
        "Bamboo or Timber Heavy roof": "Bamboo/Timber Heavy roof",
        "Reinforced Brick Slab/rcc/rbc": "rcc/rb/rbc",
    }

    df["type_of_roof"] = df["type_of_roof"].replace(
        type_of_roof_dict).str.strip()

    type_of_ground_floor_dict = {
        "rc": "reinforced concrete",
        "brick/stone": "brick or stone",
        # "lumber": "wood",
        # "timber": "wood",
        # "mud": "clay"
    }

    df["type_of_ground_floor"] = df["type_of_ground_floor"].str.lower()\
        .replace(type_of_ground_floor_dict)\


    type_of_other_floor_dict = {
        # "lumber-plank": "wood-plank",
        # "timber-planck": "wood-plank",
        "timber/bamboo-mud": "wood-mud or bamboo mud",
        "reinforced cement concrete/rb/rbc": "rcc/rb/rbc",
        "wood or bamboo mud": "wood-mud or bamboo mud",
        "timber mud or bamboo-mud": "wood-mud or bamboo mud"
    }

    df["type_of_other_floor"] = df["type_of_other_floor"]\
        .str.lower()\
        .replace(type_of_other_floor_dict)

    # df['no_family_residing'] = df['no_family_residing'].replace(
    #     'None', '0.0').astype('float')

    df["residential_type"] = df["residential_type"].replace({
        "Other Residential Type": "Other"
    })
    df['govermental_use_type'] = df['govermental_use_type'].replace(
        ['Police Offices'], 'Govermental Buildings')
    df["public_place_type"] = df["public_place_type"]\
        .apply(lambda x: "Public" if x != "Non-public" else "Non-public")

    legal_ownership_status_dict = {
        "Private": "Private",
        "Private Use": "Private",
        "Prvt": "Private",
        "Privste": "Private",
        "Public Use": "Public",
        "Public Space": "Public",
        "Institutional Use": "Institutional",
        "Institutionals": "Institutional",
        "Unknown": "Other",
        "Unspecified": "Other",
    }

    df["legal_ownership_status"] = df["legal_ownership_status"].str.strip(
    ).replace(legal_ownership_status_dict)

    land_surface_condition_dict = {
        "Steep slope": 0,
        "Moderate slope": 1,
        "Flat": 2,
    }

    df["land_surface_condition"] = df["land_surface_condition"].replace(
        land_surface_condition_dict)

    technical_solution_proposed_dict = {
        "Reconstruction": 0,
        "Major repair": 1,
        "Minor repair": 2,
        "No need": 3,
    }

    df["technical_solution_proposed"] = df["technical_solution_proposed"].replace(
        technical_solution_proposed_dict)

    flexible_superstructure_dict = {
        "unavailable": 0,
        "available": 1,
    }

    df["flexible_superstructure"] = df["flexible_superstructure"].replace(
        flexible_superstructure_dict)

    public_place_type_dict = {
        "Public": 0,
        "Non-public": 1,
    }

    df["public_place_type"] = df["public_place_type"].replace(
        public_place_type_dict)

    governmental_use_type_dict = {
        "Govermental Buildings": 0,
        "Non-govermental": 1,
    }

    df["govermental_use_type"] = df["govermental_use_type"].replace(
        governmental_use_type_dict)

    # ohe_df = pd.DataFrame(ohe.transform(train_cp[categorical_col_lt5]))
    # ohe_df.columns = ohe.get_feature_names(categorical_col_lt5)

    # train_ohe = train_cp.copy()
    # for cat in categorical_col_lt5:
    #     train_ohe.drop(cat, axis=1, inplace=True)

    # print(train_ohe.shape)

    # train_ohe_merge = pd.concat([train_ohe, ohe_df], axis=1)

    # print(train_ohe_merge.shape)
    train_ohe_merge = df.copy()
    # train_ohe_merge[numerical_col] = s.transform(
    #     train_ohe_merge[numerical_col])

    enc_df = train_ohe_merge.copy()
    if mode == "test":
        enc_df[cat_col + categorical_col_lt5] = enc.transform(
            train_ohe_merge[cat_col + categorical_col_lt5])
    else:
        enc = TargetEncoder(cols=cat_col + categorical_col_lt5)
        enc.fit(train_ohe_merge[cat_col + categorical_col_lt5],
                train_ohe_merge["damage_grade"])
        enc_df[cat_col + categorical_col_lt5] = enc.transform(
            train_ohe_merge[cat_col + categorical_col_lt5])

    return enc_df


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

train_enc = clean(train_cp, mode="train")

X = train_enc.drop(["damage_grade"], axis=1)
y = train_enc["damage_grade"].copy()
y -= 1

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=69420, stratify=y)

print(f"""\
X_train: {X_train.shape}
X_test: {X_test.shape}""")

X_train: (104267, 22)
X_test: (34756, 22)


In [20]:
import optuna
from sklearn.ensemble import RandomForestClassifier

def rf_objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ["auto", "sqrt", "log2"]),
    }

    rf = RandomForestClassifier(**params, random_state=69420,  n_jobs=6)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    return f1_score(y_test, y_pred, average="macro")


study = optuna.create_study(direction="maximize")
study.optimize(rf_objective, n_trials=100, timeout=600, n_jobs=6)


[32m[I 2023-04-12 10:49:21,934][0m A new study created in memory with name: no-name-c90c3d7c-bc2d-497f-b3fa-8332a4545c40[0m
[32m[I 2023-04-12 10:49:34,125][0m Trial 2 finished with value: 0.646479245153788 and parameters: {'n_estimators': 49, 'max_depth': 51, 'min_samples_split': 8, 'min_samples_leaf': 7, 'max_features': 'log2'}. Best is trial 2 with value: 0.646479245153788.[0m
[32m[I 2023-04-12 10:49:45,933][0m Trial 5 finished with value: 0.6455516229852611 and parameters: {'n_estimators': 121, 'max_depth': 61, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'auto'}. Best is trial 2 with value: 0.646479245153788.[0m
[32m[I 2023-04-12 10:49:54,157][0m Trial 4 finished with value: 0.6486850978729032 and parameters: {'n_estimators': 203, 'max_depth': 77, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'auto'}. Best is trial 4 with value: 0.6486850978729032.[0m
[32m[I 2023-04-12 10:49:55,068][0m Trial 6 finished with value: 0.17595926063204034

In [21]:
study.best_params

{'n_estimators': 702,
 'max_depth': 47,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto'}

In [22]:
from xgboost import XGBClassifier

def xgb_objective(trial):

    params = {
        # 'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        # 'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02]),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
    }

    model = XGBClassifier(**params, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return f1_score(y_test, y_pred, average='macro')


study = optuna.create_study(direction='maximize')
study.optimize(xgb_objective, n_trials=100, n_jobs=6, timeout=600, show_progress_bar=True)

[32m[I 2023-04-12 11:00:57,907][0m A new study created in memory with name: no-name-f958a6fc-f5d9-4d73-84d2-f772ec58fa9f[0m
[32m[I 2023-04-12 11:02:02,293][0m Trial 2 finished with value: 0.6490085020658836 and parameters: {'learning_rate': 0.01, 'n_estimators': 103, 'max_depth': 9}. Best is trial 2 with value: 0.6490085020658836.[0m
[32m[I 2023-04-12 11:02:29,846][0m Trial 0 finished with value: 0.6489523098551897 and parameters: {'learning_rate': 0.014, 'n_estimators': 261, 'max_depth': 6}. Best is trial 2 with value: 0.6490085020658836.[0m
[32m[I 2023-04-12 11:03:36,004][0m Trial 6 finished with value: 0.6486355833274727 and parameters: {'learning_rate': 0.018, 'n_estimators': 392, 'max_depth': 5}. Best is trial 2 with value: 0.6490085020658836.[0m
[32m[I 2023-04-12 11:03:37,998][0m Trial 5 finished with value: 0.648772994311064 and parameters: {'learning_rate': 0.01, 'n_estimators': 454, 'max_depth': 6}. Best is trial 2 with value: 0.6490085020658836.[0m
[32m[I 2023