In [1]:
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import numpy as np

In [2]:
# Load the raw training features and the matching damage-grade labels.
train_values, train_labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/train_values.csv",
        "../data/raw/train_labels.csv",
    )
)

Absolute baseline model: predict the most frequent damage grade (the mode, which is 2) for every building.

In [3]:
# Work on an independent copy so the true labels in train_labels stay untouched.
base_mode = train_labels.copy(deep=True)

In [4]:
# Series.mode() returns a Series (one entry per tied mode), not a scalar.
# Assigning that Series through .loc is subject to index alignment, so whether
# every row receives the value depends on pandas internals. Extract the scalar
# explicitly so the constant prediction is broadcast to all rows unambiguously.
most_frequent_grade = base_mode.loc[:, "damage_grade"].mode().iloc[0]
base_mode.loc[:, "damage_grade"] = most_frequent_grade

# Sanity check: the baseline column should now hold a single distinct value.
base_mode.loc[:, "damage_grade"].unique()

array([2])

In [5]:
# Micro-averaged F1 of the constant prediction against the true damage grades.
mode_f1 = f1_score(
    train_labels["damage_grade"],
    base_mode["damage_grade"],
    average="micro",
)
baseline_report = f"The F1 micro score for the absolute baseline model (all values are 2) is: {mode_f1}."
print(baseline_report)

The F1 micro score for the absolute baseline model (all values are 2) is: 0.5689118614280068.


Basic model: a simple decision tree trained only on the int64 columns (the object-dtype columns are left out).

In [6]:
# Keep only the int64 columns of the training features.
train_values_int = train_values.select_dtypes(include=["int64"])

In [7]:
# Show the remaining column dtypes followed by the frame's shape.
print(train_values_int.dtypes, train_values_int.shape, sep="\n")

building_id                               int64
geo_level_1_id                            int64
geo_level_2_id                            int64
geo_level_3_id                            int64
count_floors_pre_eq                       int64
age                                       int64
area_percentage                           int64
height_percentage                         int64
has_superstructure_adobe_mud              int64
has_superstructure_mud_mortar_stone       int64
has_superstructure_stone_flag             int64
has_superstructure_cement_mortar_stone    int64
has_superstructure_mud_mortar_brick       int64
has_superstructure_cement_mortar_brick    int64
has_superstructure_timber                 int64
has_superstructure_bamboo                 int64
has_superstructure_rc_non_engineered      int64
has_superstructure_rc_engineered          int64
has_superstructure_other                  int64
count_families                            int64
has_secondary_use                       

In [8]:
# Remove the building_id column from both the features and the labels.
# Both names are reassigned in place, so without errors="ignore" a second run
# of this cell would raise KeyError because the column is already gone.
train_values_int = train_values_int.drop(columns="building_id", errors="ignore")
train_labels = train_labels.drop(columns="building_id", errors="ignore")

In [9]:
# Count how often each distinct value occurs in the full label frame.
unique, counts = np.unique(train_labels, return_counts=True)
{grade: n_rows for grade, n_rows in zip(unique, counts)}

{1: 25124, 2: 148259, 3: 87218}

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_values_int,
    train_labels,
    random_state=42,
    test_size=0.2,
)

In [11]:
# Print the shape of each split, one per line, in the order
# X_train, X_test, y_train, y_test.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)), sep="\n")

(208480, 30)
(52121, 30)
(208480, 1)
(52121, 1)


In [12]:
# Count how often each distinct value occurs in the training labels.
unique, counts = np.unique(y_train, return_counts=True)
{grade: n_rows for grade, n_rows in zip(unique, counts)}

{1: 19954, 2: 118772, 3: 69754}

In [13]:
# Count how often each distinct value occurs in the held-out labels.
unique, counts = np.unique(y_test, return_counts=True)
{grade: n_rows for grade, n_rows in zip(unique, counts)}

{1: 5170, 2: 29487, 3: 17464}

In [14]:
# Build a depth-limited decision tree with a fixed seed and fit it on the
# training split in one step (fit returns the estimator itself).
dt_classifier = DecisionTreeClassifier(
    max_depth=5,
    max_leaf_nodes=1000,
    random_state=42,
).fit(X_train, y_train)

# Display the fitted estimator as the cell output.
dt_classifier

In [16]:
# Predict labels for the held-out split.
y_pred = dt_classifier.predict(X=X_test)

In [17]:
# Count how often each distinct value occurs in the hold-out predictions.
unique, counts = np.unique(y_pred, return_counts=True)
{grade: n_rows for grade, n_rows in zip(unique, counts)}

{1: 1997, 2: 43777, 3: 6347}

In [34]:
# Micro-averaged F1 of the tree's predictions on the held-out split.
f1_dct_c = f1_score(y_true=y_test, y_pred=y_pred, average="micro")
tree_report = f"The F1 micro score for the Decision Tree Classifier with max depth = 5 and max leaf = 1000 is: {f1_dct_c}."
print(tree_report)

The F1 micro score for the Decision Tree Classifier with max depth = 5 and max leaf = 1000 is: 0.6362886360584026.


In [19]:
# Load the raw test features.
test_values = pd.read_csv(filepath_or_buffer="../data/raw/test_values.csv")

In [20]:
# Keep only the int64 columns of the test features (same selection as for training).
test_values = test_values.select_dtypes(include=["int64"])

In [21]:
# Feature frame for prediction: the test columns without building_id.
test_values_wo_ID = test_values.drop(labels="building_id", axis="columns")

In [22]:
# Shape (rows, columns) of the test features after dropping building_id.
test_values_wo_ID.shape

(86868, 30)

In [23]:
# Predict labels for the test features with the fitted tree.
test_predict = dt_classifier.predict(X=test_values_wo_ID)

In [24]:
# Display the array of predictions produced for the test features.
test_predict

array([3, 2, 2, ..., 2, 2, 2])

In [25]:
# Count how often each distinct value occurs in the test-set predictions.
unique, counts = np.unique(test_predict, return_counts=True)
{grade: n_rows for grade, n_rows in zip(unique, counts)}

{1: 3402, 2: 72630, 3: 10836}

In [26]:
# Put the building ids and the predictions side by side. The prediction Series
# has no name, so its column is labelled 0 in the resulting frame.
test_submit = pd.concat(
    objs=[test_values["building_id"], pd.Series(test_predict)],
    axis="columns",
)

In [27]:
# Display the submission frame (building_id next to the predicted values).
test_submit

Unnamed: 0,building_id,0
0,300051,3
1,99355,2
2,890251,2
3,745817,1
4,421793,3
...,...,...
86863,310028,2
86864,663567,2
86865,1049160,2
86866,442785,2


In [28]:
# Give both submission columns their final names.
test_submit = test_submit.set_axis(["building_id", "damage_grade"], axis="columns")

In [31]:
# Write the submission file without the row index.
test_submit.to_csv("../data/processed/20231005_dct_c_01.csv", index=False)

In [35]:
# Record the score reported for this submission (plain literal: nothing to interpolate).
print("The submission score was 0.6391")

The submission score was 0.6391
