In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Suppress all Python warnings raised after this point (this also hides
# deprecation warnings emitted by the libraries used below).
warnings.filterwarnings('ignore')

In [14]:
# Read the imputed training table and discard the "Unnamed: 0" and
# "no_family_residing" columns in one chained expression.
train = pd.read_csv("../../dataset/processed/technicalImpute.csv").drop(
    columns=["Unnamed: 0", "no_family_residing"]
)
train.shape


(286527, 23)

In [24]:
# Preview the first rows of the loaded training frame.
train.head()

Unnamed: 0,floors_before_eq (total),old_building,plinth_area (ft^2),height_before_eq (ft),land_surface_condition,type_of_foundation,type_of_roof,type_of_ground_floor,type_of_other_floor,position,...,has_secondary_use,type_of_reinforcement_concrete,residential_type,public_place_type,industrial_use_type,govermental_use_type,flexible_superstructure,wall_binding,wall_material,damage_grade
0,2.0,1.0,256 ft^2,22.0,Flat,Bamboo or Timber,Bamboo/Timber Light roof,Clay,TImber/Bamboo-Mud,Not attached,...,0.0,0.0,Non-residential,Non-public,Non-industrial,Non-govermental,unavailable,0.0,0.0,1.0
1,3.0,3.0,985 ft^2,18.0,Flat,Clay Sand Mixed mortar-Stone/Brick,Wood Light Roof or Bamboo Heavy Roof,Clay,TImber/Bamboo-Mud,Not attached,...,0.0,0.0,Non-residential,Non-public,Non-industrial,Non-govermental,unavailable,5.0,2.0,5.0
2,2.0,18.0,185 ft^2,15.0,Flat,Clay Sand Mixed mortar-Stone/Brick,Wood Light Roof or Bamboo Light Roof,Clay,TImber/Bamboo-Mud,Not attached,...,0.0,0.0,Non-residential,Non-public,Non-industrial,Non-govermental,unavailable,5.0,2.0,4.0
3,2.0,22.0,290 ft^2,17.0,Flat,Clay Sand Mixed mortar-Stone/Brick,Bamboo or Timber Light roof,Clay,Timber Mud or Bamboo-Mud,Not attached,...,0.0,0.0,Non-residential,Non-public,Non-industrial,Non-govermental,unavailable,5.0,2.0,1.0
4,2.0,40.0,504 ft^2,14.0,Flat,Clay mortar-Stone/Brick,Bamboo/Timber Light roof,Clay,TImber/Bamboo-Mud,Not attached,...,0.0,0.0,Non-residential,Non-public,Non-industrial,Non-govermental,unavailable,5.0,2.0,4.0


In [15]:
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

# Columns grouped by how they are handled downstream.
ordinal_col = ["land_surface_condition", "technical_solution_proposed"]
binary_col = [
    "flexible_superstructure",
    "public_place_type",
    "govermental_use_type",
    "has_secondary_use",
]
numerical_col = [
    "floors_before_eq (total)",
    "old_building",
    "plinth_area (ft^2)",
    "height_before_eq (ft)",
]
target_col = ["damage_grade"]

# Every column of `train` not listed above is treated as categorical and is
# split by cardinality: at most 4 distinct values vs. 5 or more.
_already_grouped = ordinal_col + binary_col + numerical_col + target_col
_remaining_cols = [col for col in train.columns if col not in _already_grouped]

categorical_col_lt5 = [
    col for col in _remaining_cols if train[col].nunique() <= 4
]
cat_col = [
    col
    for col in _remaining_cols
    if train[col].nunique() >= 5 and col != "Unnamed: 0"
]


def clean(df, mode="train", f=True):
    """Normalise the survey columns of ``df`` in place and return it.

    The frame passed in is modified (columns are overwritten and
    ``no_family_residing`` is dropped when present) and the same object is
    returned, so pass a copy if the original must be kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns rewritten below:
        ``floors_before_eq (total)``, ``plinth_area (ft^2)``,
        ``type_of_foundation``, ``type_of_roof``, ``type_of_ground_floor``,
        ``type_of_other_floor``, ``residential_type``,
        ``govermental_use_type``, ``public_place_type``,
        ``legal_ownership_status``, ``land_surface_condition``,
        ``technical_solution_proposed`` and ``flexible_superstructure``.
    mode : str, default "train"
        Not used by this function; kept so existing calls keep working.
    f : bool, default True
        When true, ``floors_before_eq (total)`` is treated as free text
        (e.g. ``"Two"``, ``"2 floors"``) and parsed before the float cast.
        Pass ``False`` when the column is already numeric.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after cleaning.

    Raises
    ------
    ValueError
        If a floor count or plinth area cannot be converted to ``float``
        after the text clean-up.
    """
    # errors="ignore" makes this a no-op when the column is already gone,
    # without hiding unrelated exceptions the way a bare ``except`` would.
    df.drop(columns=["no_family_residing"], inplace=True, errors="ignore")

    if f:
        # Spelled-out numbers and ordinals and the integer they stand for.
        angka = {
            "one": 1,
            "two": 2,
            "three": 3,
            "four": 4,
            "five": 5,
            "six": 6,
            "seven": 7,
            "eight": 8,
            "nine": 9,
            "1st": 1,
            "2nd": 2,
            "3rd": 3,
            "fifth": 5,
            "second": 2,
            "third": 3,
            "3.00": 3,
        }

        # Filler text removed before conversion.  Longer variants come before
        # their prefixes ("floors" before "floor"): the patterns are applied
        # in order, and removing "floor" first leaves a stray "s" that the
        # float cast below rejects.
        delete = {
            "stories": "",
            "story": "",
            "floors": "",
            "floor": "",
            "has": "",
            "there is": "",
            "just": "",
            "-": "",
            "fl": "",
            "/": "",
        }

        df["floors_before_eq (total)"] = (
            df["floors_before_eq (total)"]
            .str.lower()
            .replace(delete, regex=True)
            .str.strip()
            .replace(angka, regex=True)
        )

    df["floors_before_eq (total)"] = df["floors_before_eq (total)"].astype(
        "float")

    # Strip the unit and the "more than" qualifier, then cast.  The vectorised
    # string methods let missing values pass through as NaN instead of raising
    # inside a per-element lambda.
    df["plinth_area (ft^2)"] = (
        df["plinth_area (ft^2)"]
        .str.lower()
        .str.replace(" ft^2", "", regex=False)
        .str.replace("more than ", "", regex=False)
        .astype("float")
    )

    # --- spelling variants collapsed onto one label per category ----------
    type_of_foundation_dict = {
        # "Mud mortar-Stone/Brick": "Clay mortar-Stone/Brick",
        "Bamboo/TImber": "Bamboo or Timber",
        "Bamboo/Timber": "Bamboo or Timber",
        "RC": "Reinforced Concrete",
        "Others": "Other",
        "Cement-Stone or Cement-Brick": "Cement-Stone/Brick",
    }
    df["type_of_foundation"] = (
        df["type_of_foundation"]
        .replace(type_of_foundation_dict)
        .str.strip()
    )

    type_of_roof_dict = {
        "Bamboo/TImber-Heavy Roof": "Bamboo/Timber Heavy roof",
        "Bamboo/TImber-Light Roof": "Bamboo or Timber Light roof",
        "Bamboo/Timber Light roof": "Bamboo or Timber Light roof",
        "reinforced cement concrete/rb/rbc": "rcc/rb/rbc",
        "Reinforced brick concrete/rcc/rbc": "rcc/rb/rbc",
        "Bamboo or Timber Heavy roof": "Bamboo/Timber Heavy roof",
        "Reinforced Brick Slab/rcc/rbc": "rcc/rb/rbc",
    }
    df["type_of_roof"] = (
        df["type_of_roof"]
        .replace(type_of_roof_dict)
        .str.strip()
    )

    # The two floor columns are lower-cased first, so their keys are lower-case.
    type_of_ground_floor_dict = {
        "rc": "reinforced concrete",
        "brick/stone": "brick or stone",
        # "lumber": "wood",
        # "timber": "wood",
        # "mud": "clay"
    }
    df["type_of_ground_floor"] = (
        df["type_of_ground_floor"]
        .str.lower()
        .replace(type_of_ground_floor_dict)
    )

    type_of_other_floor_dict = {
        # "lumber-plank": "wood-plank",
        # "timber-planck": "wood-plank",
        "timber/bamboo-mud": "wood-mud or bamboo mud",
        "reinforced cement concrete/rb/rbc": "rcc/rb/rbc",
        "wood or bamboo mud": "wood-mud or bamboo mud",
        "timber mud or bamboo-mud": "wood-mud or bamboo mud",
    }
    df["type_of_other_floor"] = (
        df["type_of_other_floor"]
        .str.lower()
        .replace(type_of_other_floor_dict)
    )

    df["residential_type"] = df["residential_type"].replace({
        "Other Residential Type": "Other"
    })
    df["govermental_use_type"] = df["govermental_use_type"].replace(
        ["Police Offices"], "Govermental Buildings")

    # Anything that is not exactly "Non-public" (missing values included)
    # becomes "Public".
    public_place = df["public_place_type"]
    df["public_place_type"] = public_place.where(
        public_place == "Non-public", "Public")

    legal_ownership_status_dict = {
        "Private": "Private",
        "Private Use": "Private",
        "Prvt": "Private",
        "Privste": "Private",
        "Public Use": "Public",
        "Public Space": "Public",
        "Institutional Use": "Institutional",
        "Institutionals": "Institutional",
        "Unknown": "Other",
        "Unspecified": "Other",
    }
    df["legal_ownership_status"] = (
        df["legal_ownership_status"]
        .str.strip()
        .replace(legal_ownership_status_dict)
    )

    # --- ordinal and binary labels replaced by integer codes ---------------
    land_surface_condition_dict = {
        "Steep slope": 0,
        "Moderate slope": 1,
        "Flat": 2,
    }
    df["land_surface_condition"] = df["land_surface_condition"].replace(
        land_surface_condition_dict)

    technical_solution_proposed_dict = {
        "Reconstruction": 0,
        "Major repair": 1,
        "Minor repair": 2,
        "No need": 3,
    }
    df["technical_solution_proposed"] = df["technical_solution_proposed"].replace(
        technical_solution_proposed_dict)

    flexible_superstructure_dict = {
        "unavailable": 0,
        "available": 1,
    }
    df["flexible_superstructure"] = df["flexible_superstructure"].replace(
        flexible_superstructure_dict)

    public_place_type_dict = {
        "Public": 0,
        "Non-public": 1,
    }
    df["public_place_type"] = df["public_place_type"].replace(
        public_place_type_dict)

    governmental_use_type_dict = {
        "Govermental Buildings": 0,
        "Non-govermental": 1,
    }
    df["govermental_use_type"] = df["govermental_use_type"].replace(
        governmental_use_type_dict)

    return df


# Module-level transformer instances.  `enc` is the one fitted and applied by
# encode(); it targets every high- and low-cardinality categorical column.
_target_encoded_cols = cat_col + categorical_col_lt5

ohe = OneHotEncoder(handle_unknown="ignore")
enc = TargetEncoder(cols=_target_encoded_cols)
pca = PCA(n_components=0.95)


def encode(df, mode="train"):
    """Target-encode the categorical columns of ``df`` and return a new frame.

    Uses the module-level ``enc`` encoder on the columns listed in
    ``cat_col + categorical_col_lt5``.  ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned frame containing the categorical columns; it must also contain
        ``damage_grade`` when ``mode == "train"``.
    mode : str, default "train"
        ``"train"`` fits ``enc`` on ``df`` before transforming; any other
        value only transforms, reusing the state from a previous fit.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the categorical columns replaced by their encoded
        values and the ``index`` / ``Unnamed: 0`` columns removed when present.
    """
    encoded_cols = cat_col + categorical_col_lt5
    enc_df = df.copy()

    if mode == "train":
        enc.fit(df[encoded_cols], df["damage_grade"])
    enc_df[encoded_cols] = enc.transform(df[encoded_cols])

    # errors="ignore" drops whichever of the two columns exists.  The previous
    # list-drop inside a bare ``except`` was all-or-nothing: a single missing
    # label raised KeyError, which was swallowed, so neither was removed.
    return enc_df.drop(columns=["index", "Unnamed: 0"], errors="ignore")

In [17]:
# Clean a copy so `train` itself stays untouched.  f=False skips the free-text
# floor parsing in clean().
train_clean = clean(train.copy(), f=False)
train_enc = encode(train_clean, mode="train")

train_enc.head()


Unnamed: 0,floors_before_eq (total),old_building,plinth_area (ft^2),height_before_eq (ft),land_surface_condition,type_of_foundation,type_of_roof,type_of_ground_floor,type_of_other_floor,position,...,has_secondary_use,type_of_reinforcement_concrete,residential_type,public_place_type,industrial_use_type,govermental_use_type,flexible_superstructure,wall_binding,wall_material,damage_grade
0,2.0,1.0,256.0,22.0,2,2.497153,3.73356,3.82385,3.88611,3.610866,...,0.0,3.72214,3.674244,1,3.615194,1,0,2.288225,2.243314,1.0
1,3.0,3.0,985.0,18.0,2,3.911473,3.782195,3.82385,3.88611,3.610866,...,0.0,3.72214,3.674244,1,3.615194,1,0,3.954282,3.959826,5.0
2,2.0,18.0,185.0,15.0,2,3.911473,3.733544,3.82385,3.88611,3.610866,...,0.0,3.72214,3.674244,1,3.615194,1,0,3.954282,3.959826,4.0
3,2.0,22.0,290.0,17.0,2,3.911473,3.73356,3.82385,3.88611,3.610866,...,0.0,3.72214,3.674244,1,3.615194,1,0,3.954282,3.959826,1.0
4,2.0,40.0,504.0,14.0,2,3.914136,3.73356,3.82385,3.88611,3.610866,...,0.0,3.72214,3.674244,1,3.615194,1,0,3.954282,3.959826,4.0


In [8]:
# Separate the target column from the feature matrix.
y = train_enc["damage_grade"]
X = train_enc.drop(columns="damage_grade")

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation, stratified on the target so the
# class proportions match in both parts; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [10]:
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

# Fit a LightGBM classifier with its default hyper-parameters and report the
# macro-averaged F1 score on the held-out split.
l = LGBMClassifier(random_state=42)
l.fit(X_train, y_train)

y_pred = l.predict(X_test)
macro_f1 = f1_score(y_test, y_pred, average="macro")

print(macro_f1)

0.7924491324710738


In [18]:
# Load the raw test file, clean a copy of it (clean() defaults to f=True, so
# the floor column goes through the free-text parsing) and encode it with the
# encoder that was already fitted on the training data.
test = pd.read_csv('../../dataset/raw/test.csv')
test_clean = clean(test.copy())
test_enc = encode(test_clean, mode="test")

In [21]:
# Predict on every test column except the identifier, then pair each id with
# its predicted grade cast to an integer.
test_features = test_enc.drop(columns="id")
y_sub = l.predict(test_features)

submission = pd.DataFrame({"id": test_enc["id"], "damage_grade": y_sub})
submission["damage_grade"] = submission["damage_grade"].astype(int)

submission.head()

Unnamed: 0,id,damage_grade
0,0,5
1,1,5
2,2,5
3,3,4
4,4,2


In [22]:
# Write the submission file; index=False keeps the row index out of the CSV.
submission.to_csv("../../dataset/submission_teimpute_lgbm.csv", index=False)

In [23]:
# Save the encoded train and test frames to the current working directory,
# without the row index.
train_enc.to_csv("technical_impute_encoded.csv", index=False)
test_enc.to_csv("technical_impute_encoded_test.csv", index=False)