In [1]:
from src.Models import MyModel

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os

## 1. Load and split data

In [2]:
# Load the raw feature/label tables (paths are relative to the working directory).
train_values = pd.read_csv("./data/raw/train_values.csv")
train_labels = pd.read_csv("./data/raw/train_labels.csv")
test_values = pd.read_csv("./data/raw/test_values.csv")

# Once building_id is dropped, features and labels are paired purely by row
# position, so verify that both files list the same ids in the same order first.
assert train_values['building_id'].equals(train_labels['building_id']), (
    "train_values and train_labels are not row-aligned on building_id"
)

# !!! DROP building_id !!!
# The id column is removed so it is not passed to the models as a feature.
# Reassignment is used instead of inplace=True so no frame is mutated behind
# another reference.
train_values = train_values.drop(columns='building_id')
train_labels = train_labels.drop(columns='building_id')
# test_values itself keeps building_id; only this copy goes without it.
test_wo_id = test_values.drop(columns='building_id')


In [3]:
# Hold out 20% of the labelled rows for evaluation; the fixed random_state
# makes the split repeatable across runs.
# NOTE(review): y_train / y_test are one-column DataFrames here, which is
# presumably what triggers the `column_or_1d` warning printed during fit —
# confirm whether MyModel would also accept a Series.
X_train, X_test, y_train, y_test = train_test_split(
    train_values,
    train_labels,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Columns handed to MyModel via `columns_to_targetencode`.
columns_to_target_encode = [
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
    'ground_floor_type',
]

# Columns handed to MyModel via `columns_to_labelencode`.
columns_to_label_encode = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]

## 2. DecisionTree-Pipeline

In [5]:
# Build the DecisionTree variant of the project's MyModel wrapper, telling it
# which columns to label-encode and which to target-encode.
dt = MyModel(
    model="DecisionTree",
    columns_to_labelencode=columns_to_label_encode,
    columns_to_targetencode=columns_to_target_encode,
)

In [6]:
# Fit the DecisionTree model on the training split.
dt.fit(X=X_train, y=y_train)

  y = column_or_1d(y, warn=True)


[Pipeline] ..... (step 1 of 2) Processing label_encoder, total=   0.7s
[Pipeline] ......... (step 2 of 2) Processing estimator, total=   1.7s


In [15]:
# Show the fitted DecisionTree's feature importances keyed by column name.
# NOTE(review): names are taken from X_train.columns; this pairing is only
# right if the encoding step inside MyModel keeps the original column order —
# confirm against the MyModel implementation.
dt.get_feature_importance(X_train.columns)

{'roof_type': 0.7450329159751323,
 'foundation_type': 0.06381221182995707,
 'geo_level_2_id': 0.03959463230609724,
 'position': 0.03367368415224366,
 'land_surface_condition': 0.02668650123258803,
 'ground_floor_type': 0.023228202578782144,
 'geo_level_3_id': 0.013799054129055254,
 'plan_configuration': 0.010093964941882465,
 'has_superstructure_stone_flag': 0.00797140013811492,
 'has_superstructure_timber': 0.004866506035513119,
 'has_superstructure_adobe_mud': 0.004150494168958291,
 'count_families': 0.003710253871560692,
 'count_floors_pre_eq': 0.003564032094528476,
 'other_floor_type': 0.00279472101838733,
 'has_superstructure_cement_mortar_brick': 0.0026781329831438894,
 'has_secondary_use': 0.0024362023305204467,
 'age': 0.0015973663895440156,
 'geo_level_1_id': 0.0015028649736778578,
 'has_superstructure_bamboo': 0.0011710778544867164,
 'has_superstructure_other': 0.001063494200594398,
 'has_superstructure_mud_mortar_brick': 0.0009999400120678874,
 'has_superstructure_mud_mortar

In [7]:
# F1 score of the DecisionTree model on the held-out split.
dt.get_f1_score(X_test, y_test)

0.7290151762245544

In [8]:
# Predict on the held-out features with the DecisionTree model.
# NOTE(review): `prediction` is not used by any later cell visible here.
prediction = dt.predict(X=X_test)

## 3. XGBoost-Pipeline

In [9]:
# Build the XGBoost variant of MyModel with the same encoding column lists
# that the DecisionTree model received.
xg = MyModel(
    model="XGBoost",
    columns_to_labelencode=columns_to_label_encode,
    columns_to_targetencode=columns_to_target_encode,
)

In [10]:
# Fit the XGBoost model on the training split.
xg.fit(X=X_train, y=y_train)

  y = column_or_1d(y, warn=True)


[Pipeline] ..... (step 1 of 2) Processing label_encoder, total=   0.5s


  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


[Pipeline] ......... (step 2 of 2) Processing estimator, total=  34.7s


In [11]:
# Show the fitted XGBoost model's feature importances keyed by column name.
# NOTE(review): names are taken from X_train.columns; this pairing is only
# right if the encoding step inside MyModel keeps the original column order —
# confirm against the MyModel implementation.
xg.get_feature_importance(X_train.columns)

{'roof_type': 0.17494015,
 'geo_level_3_id': 0.087452106,
 'has_superstructure_stone_flag': 0.061802138,
 'geo_level_2_id': 0.05159849,
 'has_superstructure_timber': 0.049260247,
 'foundation_type': 0.043266203,
 'ground_floor_type': 0.03957217,
 'position': 0.031832606,
 'has_superstructure_cement_mortar_brick': 0.028083744,
 'has_superstructure_mud_mortar_stone': 0.026463572,
 'count_floors_pre_eq': 0.023313362,
 'has_secondary_use': 0.022341942,
 'land_surface_condition': 0.021719104,
 'count_families': 0.020473188,
 'has_superstructure_cement_mortar_stone': 0.02005862,
 'has_superstructure_other': 0.01825314,
 'legal_ownership_status': 0.017602082,
 'has_superstructure_mud_mortar_brick': 0.017257812,
 'other_floor_type': 0.01714942,
 'has_secondary_use_agriculture': 0.015362011,
 'has_superstructure_bamboo': 0.014780173,
 'height_percentage': 0.014701442,
 'area_percentage': 0.014466862,
 'has_superstructure_rc_non_engineered': 0.014285784,
 'plan_configuration': 0.013939156,
 'has

In [12]:
# F1 score of the XGBoost model on the held-out split.
xg.get_f1_score(X_test, y_test)

0.7406995261027224

In [13]:
# Produce the submission table from the XGBoost model. The first argument is
# the test features without building_id; the second is the original test frame,
# which still carries building_id. The displayed result has building_id and
# damage_grade columns.
# NOTE(review): whether this call also writes a file is not visible here —
# check MyModel.predict2submit.
xg.predict2submit(test_wo_id, test_values)


Unnamed: 0,building_id,damage_grade
0,300051,3
1,99355,2
2,890251,2
3,745817,1
4,421793,3
...,...,...
86863,310028,2
86864,663567,3
86865,1049160,2
86866,442785,2


## 99. Export this Jupyter Notebook as an HTML-file

In [14]:
# This command exports the Jupyter notebook as an HTML file and saves it to ./reports/jupyter_html
#os.system('DATE=$(date +%s); jupyter nbconvert --to html main.ipynb && mv ./main.html ./reports/jupyter_html/ && mv ./reports/jupyter_html/main.html ./reports/jupyter_html/notebook_${DATE}.html')