In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

In [105]:
# Alternative input (pre-cleaned snapshot), currently disabled:
#   pd.read_csv("../../dataset/processed/train_cleaned_v4_139k.csv")
#
# Load the raw training data, discard every row that has a missing value and
# remove the `no_family_residing` column.
train_cp = (
    pd.read_csv("../../dataset/raw/train.csv")
    .dropna()
    .drop(columns=["no_family_residing"])
)

In [151]:
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

# Column groups used by the cleaning / encoding steps below.
ordinal_col = [
    "land_surface_condition",
    "technical_solution_proposed",
]
binary_col = [
    "flexible_superstructure",
    "public_place_type",
    "govermental_use_type",
    "has_secondary_use",
]
numerical_col = [
    "floors_before_eq (total)",
    "old_building",
    "plinth_area (ft^2)",
    "height_before_eq (ft)",
]
target_col = ["damage_grade"]

# Every column of `train_cp` that has not been assigned to a group above is
# treated as categorical and split by its number of distinct values.
_grouped_cols = ordinal_col + binary_col + numerical_col + target_col
_ungrouped_cols = [col for col in train_cp.columns if col not in _grouped_cols]

# At most 4 distinct values: one-hot encoded (then PCA-reduced) in `encode`.
categorical_col_lt5 = [
    col for col in _ungrouped_cols if train_cp[col].nunique() <= 4
]
# 5 or more distinct values (excluding the "Unnamed: 0" column): target encoded.
cat_col = [
    col
    for col in _ungrouped_cols
    if train_cp[col].nunique() >= 5 and col != "Unnamed: 0"
]
def clean(df, mode="train"):
    """Normalise the raw survey columns and integer-encode the ordinal/binary ones.

    The frame is modified in place and the same object is returned.

    Steps, in order:

    * drop ``no_family_residing`` when it is present;
    * parse ``floors_before_eq (total)`` and ``plinth_area (ft^2)`` from free
      text into floats;
    * merge spelling variants of the foundation / roof / floor / residential /
      ownership categories;
    * collapse ``public_place_type`` to "Public" / "Non-public" and map
      "Police Offices" to "Govermental Buildings";
    * replace the ordinal and binary category labels with integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw columns listed above. The floors and plinth-area
        columns must be string-typed (the ``.str`` accessor is used on them).
    mode : str, default "train"
        Accepted for signature symmetry with ``encode``; not used here.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after the in-place modifications.
    """
    # The column may already have been removed by the caller, so a missing
    # column is tolerated explicitly rather than hiding every possible error
    # behind a bare `except`.
    df.drop(columns=["no_family_residing"], errors="ignore", inplace=True)

    # --- floors_before_eq (total): free text -> float -----------------------
    # Filler tokens removed first (applied as regular expressions, in this
    # order).
    noise_tokens = {
        "story": "",
        "stories": "",
        "floor": "",
        "floors": "",
        "has": "",
        "there is": "",
        "just": "",
        "-": "",
        "fl": "",
        "/": "",
    }
    # Spelled-out / ordinal numbers, also matched as regular expressions.
    word_to_number = {
        "one": 1,
        "two": 2,
        "three": 3,
        "four": 4,
        "five": 5,
        "six": 6,
        "seven": 7,
        "eight": 8,
        "nine": 9,
        "1st": 1,
        "2nd": 2,
        "3rd": 3,
        "fifth": 5,
        "second": 2,
        "third": 3,
        "3.00": 3,
    }
    floors = (
        df["floors_before_eq (total)"]
        .str.lower()
        .replace(noise_tokens, regex=True)
        .str.strip()
        .replace(word_to_number, regex=True)
    )
    df["floors_before_eq (total)"] = floors.astype("float")

    # --- plinth_area (ft^2): strip the unit and the "more than" prefix ------
    # `.str.replace(..., regex=False)` does the same literal substitution as
    # the per-element `str.replace`, but passes missing values through instead
    # of raising on them.
    plinth = (
        df["plinth_area (ft^2)"]
        .str.lower()
        .str.replace(" ft^2", "", regex=False)
        .str.replace("more than ", "", regex=False)
    )
    df["plinth_area (ft^2)"] = plinth.astype("float")

    # --- harmonise category spellings ---------------------------------------
    type_of_foundation_dict = {
        "Bamboo/TImber": "Bamboo or Timber",
        "Bamboo/Timber": "Bamboo or Timber",
        "RC": "Reinforced Concrete",
        "Others": "Other",
        "Cement-Stone or Cement-Brick": "Cement-Stone/Brick",
    }
    df["type_of_foundation"] = (
        df["type_of_foundation"].replace(type_of_foundation_dict).str.strip()
    )

    # One replacement value carries a trailing space; the `.str.strip()` that
    # follows removes it.
    type_of_roof_dict = {
        "Bamboo/TImber-Heavy Roof": "Bamboo/Timber Heavy roof",
        "Bamboo/TImber-Light Roof": "Bamboo or Timber Light roof",
        "Bamboo/Timber Light roof": "Bamboo or Timber Light roof ",
        "reinforced cement concrete/rb/rbc": "rcc/rb/rbc",
        "Reinforced brick concrete/rcc/rbc": "rcc/rb/rbc",
        "Bamboo or Timber Heavy roof": "Bamboo/Timber Heavy roof",
        "Reinforced Brick Slab/rcc/rbc": "rcc/rb/rbc",
    }
    df["type_of_roof"] = df["type_of_roof"].replace(type_of_roof_dict).str.strip()

    type_of_ground_floor_dict = {
        "rc": "reinforced concrete",
        "brick/stone": "brick or stone",
    }
    df["type_of_ground_floor"] = (
        df["type_of_ground_floor"].str.lower().replace(type_of_ground_floor_dict)
    )

    type_of_other_floor_dict = {
        "timber/bamboo-mud": "wood-mud or bamboo mud",
        "reinforced cement concrete/rb/rbc": "rcc/rb/rbc",
        "wood or bamboo mud": "wood-mud or bamboo mud",
        "timber mud or bamboo-mud": "wood-mud or bamboo mud",
    }
    df["type_of_other_floor"] = (
        df["type_of_other_floor"].str.lower().replace(type_of_other_floor_dict)
    )

    df["residential_type"] = df["residential_type"].replace({
        "Other Residential Type": "Other"
    })
    df["govermental_use_type"] = df["govermental_use_type"].replace(
        ["Police Offices"], "Govermental Buildings")
    # Anything that is not exactly "Non-public" (missing values included)
    # becomes "Public".
    df["public_place_type"] = df["public_place_type"].apply(
        lambda x: "Public" if x != "Non-public" else "Non-public")

    legal_ownership_status_dict = {
        "Private": "Private",
        "Private Use": "Private",
        "Prvt": "Private",
        "Privste": "Private",
        "Public Use": "Public",
        "Public Space": "Public",
        "Institutional Use": "Institutional",
        "Institutionals": "Institutional",
        "Unknown": "Other",
        "Unspecified": "Other",
    }
    df["legal_ownership_status"] = (
        df["legal_ownership_status"].str.strip().replace(legal_ownership_status_dict)
    )

    # --- ordinal / binary labels -> integers --------------------------------
    land_surface_condition_dict = {
        "Steep slope": 0,
        "Moderate slope": 1,
        "Flat": 2,
    }
    technical_solution_proposed_dict = {
        "Reconstruction": 0,
        "Major repair": 1,
        "Minor repair": 2,
        "No need": 3,
    }
    flexible_superstructure_dict = {
        "unavailable": 0,
        "available": 1,
    }
    public_place_type_dict = {
        "Public": 0,
        "Non-public": 1,
    }
    governmental_use_type_dict = {
        "Govermental Buildings": 0,
        "Non-govermental": 1,
    }
    label_maps = {
        "land_surface_condition": land_surface_condition_dict,
        "technical_solution_proposed": technical_solution_proposed_dict,
        "flexible_superstructure": flexible_superstructure_dict,
        "public_place_type": public_place_type_dict,
        "govermental_use_type": governmental_use_type_dict,
    }
    for column, mapping in label_maps.items():
        df[column] = df[column].replace(mapping)

    return df

# Module-level encoders shared by every `encode` call: they are fitted when
# `encode` runs with mode="train" and only applied otherwise.

# A float n_components asks PCA to keep enough components to explain that
# fraction of the variance.
pca = PCA(n_components=0.95)
# Target (mean) encoding for the higher-cardinality categorical columns.
enc = TargetEncoder(cols=cat_col)
# Categories unseen during fit are ignored at transform time instead of raising.
ohe = OneHotEncoder(handle_unknown="ignore")
def encode(df, mode="train"):
    """Encode the categorical columns of a cleaned frame.

    * Columns in ``categorical_col_lt5`` are one-hot encoded with ``ohe``,
      reduced with ``pca`` and returned as ``pca_0 .. pca_k`` columns.
    * Columns in ``cat_col`` are replaced by their target encoding from ``enc``.

    With ``mode="train"`` the three module-level encoders are fitted on ``df``
    (the target encoder against ``df["damage_grade"]``); with any other mode
    the already-fitted encoders are only applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned frame containing the ``categorical_col_lt5`` and ``cat_col``
        columns (plus ``damage_grade`` in train mode). It is not modified.
    mode : str, default "train"
        "train" fits the encoders; every other value reuses them.

    Returns
    -------
    pandas.DataFrame
        New frame with the low-cardinality columns replaced by PCA components
        and ``cat_col`` target-encoded. Both inputs to the concat go through
        ``reset_index()``, so each contributes an "index" column; these are
        removed only when "Unnamed: 0" is present too (see the end of the body).
    """
    fit_encoders = mode == "train"

    # One-hot encode the low-cardinality columns, then compress with PCA.
    low_card = df[categorical_col_lt5]
    onehot = ohe.fit_transform(low_card) if fit_encoders else ohe.transform(low_card)
    ohe_df = pd.DataFrame(onehot.toarray())
    components = pca.fit_transform(ohe_df) if fit_encoders else pca.transform(ohe_df)
    pca_df = pd.DataFrame(components)
    pca_df.columns = [f"pca_{i}" for i in range(pca_df.shape[1])]

    # Drop on a copy so the caller's frame keeps its columns; the previous
    # in-place drop made a second call on the same frame fail.
    remaining = df.drop(columns=categorical_col_lt5)
    merged = pd.concat([remaining.reset_index(), pca_df.reset_index()], axis=1)

    # NOTE(review): in train mode the target encoder sees every row passed in.
    # If a validation split is carved out of the result afterwards, its labels
    # have contributed to the encoding. Confirm this is acceptable.
    if fit_encoders:
        enc.fit(merged[cat_col], merged["damage_grade"])
    enc_df = merged.copy()
    enc_df[cat_col] = enc.transform(merged[cat_col])

    try:
        enc_df.drop(["index", "Unnamed: 0"], axis=1, inplace=True)
    except KeyError:
        # `drop` is all-or-nothing: when either label is missing nothing is
        # removed, and callers are left to drop "index" themselves. Only the
        # missing-label error is tolerated; anything else now propagates.
        pass
    return enc_df


In [152]:
# Clean a copy so the loaded frame `train_cp` stays untouched, then fit the
# encoders on the cleaned data (`encode` defaults to mode="train").
train_clean = clean(train_cp.copy())
train_enc = encode(train_clean)

train_enc.head()

Unnamed: 0,floors_before_eq (total),old_building,plinth_area (ft^2),height_before_eq (ft),land_surface_condition,type_of_foundation,type_of_roof,type_of_ground_floor,type_of_other_floor,building_plan_configuration,...,govermental_use_type,flexible_superstructure,wall_binding,damage_grade,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5
0,1.0,22.0,300.0,10.0,1,3.921709,3.747907,3.84786,2.981242,3.664172,...,1,0,3.96323,5.0,-0.327557,-0.147526,-0.006454,0.012881,-0.008573,0.007853
1,3.0,22.0,360.0,18.0,2,3.924681,3.806631,3.837207,3.899777,3.664172,...,1,0,3.96323,4.0,-0.327557,-0.147526,-0.006454,0.012881,-0.008573,0.007853
2,2.0,8.0,380.0,17.0,2,3.924681,3.743924,3.7418,3.725698,3.664172,...,1,0,3.96323,5.0,-0.327557,-0.147526,-0.006454,0.012881,-0.008573,0.007853
3,2.0,22.0,250.0,14.0,2,3.924681,3.747907,3.837207,3.899777,3.664172,...,1,0,3.96323,5.0,-0.327557,-0.147526,-0.006454,0.012881,-0.008573,0.007853
4,2.0,7.0,312.0,13.0,2,3.921709,3.747907,3.837207,3.899777,3.664172,...,1,1,3.96323,2.0,-0.327557,-0.147526,-0.006454,0.012881,-0.008573,0.007853


In [153]:

# Load the raw test set and push it through the same cleaning step, then apply
# the encoders that were fitted on the training data.
test = pd.read_csv("../../dataset/raw/test.csv")
test_clean = clean(test)
test_enc = encode(test_clean, mode="test")

# `encode` may or may not have removed its "index" helper column(s) already,
# depending on which columns the input has. `errors="ignore"` makes this step
# succeed in both situations instead of raising KeyError.
test_enc = test_enc.drop(columns=["index"], errors="ignore")
test_enc.head()


Unnamed: 0,index,id,floors_before_eq (total),old_building,plinth_area (ft^2),height_before_eq (ft),land_surface_condition,type_of_foundation,type_of_roof,type_of_ground_floor,...,govermental_use_type,flexible_superstructure,wall_binding,index.1,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5
0,0,0,2.0,7,418.0,14,2,3.924681,3.747907,3.837207,...,1,0,3.96323,0,-0.327557,-0.147526,-0.006454,0.012881,-0.008573,0.007853
1,1,1,3.0,13,396.0,21,2,3.924681,3.810115,3.7418,...,1,0,3.96323,1,-0.327557,-0.147526,-0.006454,0.012881,-0.008573,0.007853
2,2,2,3.0,40,400.0,18,2,3.924681,3.747907,3.837207,...,1,0,3.96323,2,-0.327557,-0.147526,-0.006454,0.012881,-0.008573,0.007853
3,3,3,2.0,25,378.0,20,2,2.491733,3.810115,3.84786,...,1,0,3.96323,3,0.467252,-0.289476,0.002503,0.365752,1.095728,-0.044322
4,4,4,2.0,5,375.0,20,2,3.92995,3.743924,3.837207,...,1,0,3.96323,4,1.014481,-0.569998,-0.043639,-0.098948,-0.088093,-0.00598


In [155]:
# (rows, columns) of the encoded training and test frames.
train_enc.shape, test_enc.shape

((46801, 26), (242082, 26))

In [116]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


# Features: every encoded column except the label.
X = train_enc.drop(columns=["damage_grade"])
# Labels shifted down by one (the submission cell adds the 1 back). The
# subtraction yields a new Series, so `train_enc` is left unchanged.
y = train_enc["damage_grade"] - 1

In [117]:
# Hold out 25% of the rows for evaluation, keeping the label distribution
# equal in both parts (stratified on y); the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=69420,
)

print(f"X_train: {X_train.shape}", f"X_test: {X_test.shape}", sep="\n")


X_train: (35100, 25)
X_test: (11701, 25)


In [118]:
from xgboost import XGBClassifier

# Baseline: XGBoost with default hyper-parameters (only the seed is set),
# scored with macro-averaged F1 on the hold-out split.
xgb = XGBClassifier(random_state=69420).fit(X_train, y_train)
y_pred = xgb.predict(X_test)

print(f1_score(y_true=y_test, y_pred=y_pred, average="macro"))

0.655396928306659


In [None]:
from xgboost import XGBClassifier
import optuna

def xgb_objective(trial):
    """Optuna objective: macro F1 of an XGBoost model on the hold-out split.

    Samples ``learning_rate``, ``n_estimators`` and ``max_depth`` from
    ``trial``, fits on ``X_train``/``y_train`` and returns the macro-averaged
    F1 score of the predictions for ``X_test``. ``colsample_bytree`` and
    ``subsample`` are not tuned and stay at the library defaults.
    """
    # NOTE(review): the score being maximised comes from the same hold-out
    # split the notebook uses to report model quality, so the reported figure
    # for tuned parameters is likely optimistic. Confirm whether a separate
    # validation split is wanted.
    learning_rate = trial.suggest_categorical(
        'learning_rate', [0.008, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02])
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    max_depth = trial.suggest_int('max_depth', 5, 15)

    model = XGBClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42,
    )
    model.fit(X_train, y_train)

    return f1_score(y_test, model.predict(X_test), average='macro')


# Search for the XGBoost parameters that maximise the objective: at most 100
# trials or 600 seconds, whichever comes first, using 6 parallel jobs.
study = optuna.create_study(direction="maximize")
study.optimize(
    xgb_objective,
    n_trials=100,
    timeout=600,
    n_jobs=6,
    show_progress_bar=True,
)


In [121]:
from lightgbm import LGBMClassifier

# Baseline: LightGBM with default hyper-parameters (only the seed is set),
# scored with macro-averaged F1 on the hold-out split.
l = LGBMClassifier(random_state=69420).fit(X_train, y_train)
y_pred = l.predict(X_test)

print(f1_score(y_true=y_test, y_pred=y_pred, average="macro"))

0.6620426919142173


In [None]:
def lgbm_objective(trial):
    """Optuna objective: macro F1 of a LightGBM model on the hold-out split.

    Combines a fixed multiclass configuration with regularisation, tree-size
    and sampling parameters drawn from ``trial``, fits on
    ``X_train``/``y_train`` and returns the macro-averaged F1 score of the
    predictions for ``X_test``.
    """
    # Settings that are the same for every trial.
    fixed = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "num_class": 5,
    }
    # Settings sampled per trial (L1/L2 penalties on a log scale).
    searched = {
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    model = LGBMClassifier(**fixed, **searched, random_state=69420)
    model.fit(X_train, y_train)

    return f1_score(y_test, model.predict(X_test), average="macro")

# Search for the LightGBM parameters that maximise the objective: at most
# 1000 trials or 600 seconds, whichever comes first, using 6 parallel jobs.
l_study = optuna.create_study(direction="maximize")
l_study.optimize(
    lgbm_objective,
    n_trials=1000,
    timeout=600,
    n_jobs=6,
    show_progress_bar=True,
)


In [127]:
# Parameter values of the best trial found by the LightGBM study.
l_study.best_params

{'lambda_l1': 8.386932414451237e-05,
 'lambda_l2': 0.18503504675424937,
 'num_leaves': 242,
 'feature_fraction': 0.5594507978947841,
 'bagging_fraction': 0.9848409189977901,
 'bagging_freq': 7,
 'min_child_samples': 92}

67% {'lambda_l1': 1.1230499970601132e-05,
 'lambda_l2': 9.960255518498727,
 'num_leaves': 213,
 'feature_fraction': 0.5395087891258938,
 'bagging_fraction': 0.9924827152529022,
 'bagging_freq': 6,
 'min_child_samples': 100}

\\

67.2 {'lambda_l1': 8.386932414451237e-05,
 'lambda_l2': 0.18503504675424937,
 'num_leaves': 242,
 'feature_fraction': 0.5594507978947841,
 'bagging_fraction': 0.9848409189977901,
 'bagging_freq': 7,
 'min_child_samples': 92}

In [133]:
# Best LightGBM parameters found by the study, pinned here so the final model
# can be rebuilt without re-running the search.
param_ = dict(
    lambda_l1=8.386932414451237e-05,
    lambda_l2=0.18503504675424937,
    num_leaves=242,
    feature_fraction=0.5594507978947841,
    bagging_fraction=0.9848409189977901,
    bagging_freq=7,
    min_child_samples=92,
)

# Refit with the tuned parameters and score on the hold-out split (macro F1).
l = LGBMClassifier(random_state=69420, **param_).fit(X_train, y_train)
y_pred = l.predict(X_test)

print(f1_score(y_true=y_test, y_pred=y_pred, average="macro"))

0.6726994845786596


In [158]:
# Select the features by the column names the model was fitted on, in the same
# order, rather than relying on "everything except id" happening to line up.
# A missing training column now fails loudly with a KeyError.
y_sub = l.predict(test_enc[X_train.columns])

# The model was trained on labels shifted down by one, so shift the
# predictions back before writing them out as integers.
submission = pd.DataFrame({
    "id": test_enc["id"],
    "damage_grade": (y_sub + 1).astype(int),
})
submission.head()

Unnamed: 0,id,damage_grade
0,0,5
1,1,5
2,2,5
3,3,5
4,4,2


In [159]:
# Write the predictions to disk; index=False keeps the row index out of the file.
submission.to_csv("../../dataset/submission_ohepcate_lgbm.csv", index=False)