In [8]:
# import required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from src.data.make_dataset import merge_data 
from src.features.build_features import features,split_data,feature_engineering, drop_unnecessary_ft, get_unnecessary_ft, drop_correlated_features
from xgboost import XGBClassifier
# Show up to 70 columns when a DataFrame is rendered; section III later lifts
# this limit entirely via pd.set_option('display.max_columns', None).
pd.options.display.max_columns =70

# I. Import whole dataset

In [9]:
# Read the three raw CSV files (training features, training labels and test
# features), each indexed by building_id.
train_values = pd.read_csv(
    '../data/external/train_values.csv',
    index_col='building_id',
)
train_target = pd.read_csv(
    '../data/external/train_labels.csv',
    index_col='building_id',
)
test_values = pd.read_csv(
    '../data/external/test_values.csv',
    index_col='building_id',
)

# Combine everything into one frame (rows without a label show an empty
# damage_grade in the output) and turn building_id back into a regular column.
df = merge_data(train_values, train_target, test_values).reset_index()
display(df)

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,802906,6,487,12198,2,30,6,5,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,3.0
1,28830,8,900,2812,2,10,8,7,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,2.0
2,94947,21,363,8973,2,10,5,5,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,3.0
3,590882,22,418,10694,2,10,6,5,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,2.0
4,201944,11,131,1488,3,30,8,9,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347464,310028,4,605,3623,3,70,20,6,t,r,q,f,q,t,d,0,1,0,0,0,0,1,0,0,0,0,w,1,1,1,0,0,0,0,0,0,0,0,0,
347465,663567,10,1407,11907,3,25,6,7,n,r,n,f,q,s,d,1,1,1,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,
347466,1049160,22,1136,7712,1,50,3,3,t,r,n,f,j,s,d,0,1,0,0,0,0,1,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,
347467,442785,6,1041,912,2,5,9,5,t,r,n,f,q,s,d,1,1,0,0,0,0,0,0,0,0,0,a,1,0,0,0,0,0,0,0,0,0,0,0,


# II. Preprocess data

In [10]:
# Run the project's preprocessing step on the merged frame and show the result.
# Judging by the output below, the categorical columns come back one-hot
# encoded and count_floors_pre_eq / count_families are rescaled; the exact
# rules live in src.features.build_features — TODO confirm there.
# NOTE: df is overwritten, so re-running this cell on its own output is not
# guaranteed to work; re-run from the loading cell instead.
df=features(df)
display(df)

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade,land_surface_condition_n,land_surface_condition_o,land_surface_condition_t,foundation_type_h,foundation_type_i,foundation_type_r,foundation_type_u,foundation_type_w,roof_type_n,roof_type_q,roof_type_x,ground_floor_type_f,ground_floor_type_m,ground_floor_type_v,ground_floor_type_x,ground_floor_type_z,other_floor_type_j,other_floor_type_q,other_floor_type_s,other_floor_type_x,position_j,position_o,position_s,position_t,plan_configuration_a,plan_configuration_c,plan_configuration_d,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
0,802906,6,487,12198,0.125,30,6,5,1,1,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0,0,0,0,0,0,0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,28830,8,900,2812,0.125,10,8,7,0,1,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0,0,0,0,0,0,0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,94947,21,363,8973,0.125,10,5,5,0,1,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0,0,0,0,0,0,0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,590882,22,418,10694,0.125,10,6,5,0,1,0,0,0,0,1,1,0,0,0,0.111111,0,0,0,0,0,0,0,0,0,0,0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,201944,11,131,1488,0.250,30,8,9,1,0,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0,0,0,0,0,0,0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347464,310028,4,605,3623,0.250,70,20,6,0,1,0,0,0,0,1,0,0,0,0,0.111111,1,1,0,0,0,0,0,0,0,0,0,,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
347465,663567,10,1407,11907,0.250,25,6,7,1,1,1,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0,0,0,0,0,0,0,,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
347466,1049160,22,1136,7712,0.000,50,3,3,0,1,0,0,0,0,1,0,0,0,0,0.111111,0,0,0,0,0,0,0,0,0,0,0,,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
347467,442785,6,1041,912,0.125,5,9,5,1,1,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0,0,0,0,0,0,0,,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


# III. Add new features

In [11]:
# Remove the column limit so that the newly added columns are all visible.
pd.options.display.max_columns = None

# Append the engineered columns (district_class_*, age_*, dens_*, ... in the
# output below) and show the widened frame. The geo_level_2_id / geo_level_3_id
# values also come back rescaled; see src.features.build_features for details.
df = feature_engineering(df)
display(df)

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade,land_surface_condition_n,land_surface_condition_o,land_surface_condition_t,foundation_type_h,foundation_type_i,foundation_type_r,foundation_type_u,foundation_type_w,roof_type_n,roof_type_q,roof_type_x,ground_floor_type_f,ground_floor_type_m,ground_floor_type_v,ground_floor_type_x,ground_floor_type_z,other_floor_type_j,other_floor_type_q,other_floor_type_s,other_floor_type_x,position_j,position_o,position_s,position_t,plan_configuration_a,plan_configuration_c,plan_configuration_d,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w,district_class_2,district_class_1,district_class_3,district_class_4,age_u_40,age_40_100,age_ue_100,low_mortar_percentage,high_mortar_percentage,low_percentage_r,high_percentage_r,fragile,stable,ft_imp_1_pos,ft_high_imp_1_pos,dens_1,dens_2,dens_3
0,802906,6,0.341275,0.970637,0.125,30,6,5,1,1,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0,0,0,0,0,0,0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0,1,0,1,1.0,0.0,0.0,0.0,1.000000,0.066157,0.054971
1,28830,8,0.630694,0.223761,0.125,10,8,7,0,1,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0,0,0,0,0,0,0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0,1,0,1,1.0,0.0,0.0,0.0,0.781553,0.047708,0.024561
2,94947,21,0.254380,0.714013,0.125,10,5,5,0,1,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0,0,0,0,0,0,0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0,1,0,1,1.0,0.0,0.0,0.0,0.609752,0.436452,0.198830
3,590882,22,0.292922,0.850959,0.125,10,6,5,0,1,0,0,0,0,1,1,0,0,0,0.111111,0,0,0,0,0,0,0,0,0,0,0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1,0,1,1.0,0.0,0.0,0.0,0.249222,0.051994,0.045614
4,201944,11,0.091801,0.118405,0.250,30,8,9,1,0,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0,0,0,0,0,0,0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0,0,1,1.0,0.0,0.0,0.0,0.328168,0.260157,0.185965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347464,310028,4,0.423966,0.288295,0.250,70,20,6,0,1,0,0,0,0,1,0,0,0,0,0.111111,1,1,0,0,0,0,0,0,0,0,0,,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,1,0,1,1.0,0.0,1.0,0.0,0.594754,0.076966,0.052632
347465,663567,10,0.985985,0.947481,0.250,25,6,7,1,1,1,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0,0,0,0,0,0,0,,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0,1,0,1,1.0,0.0,0.0,0.0,0.903971,0.090570,0.028070
347466,1049160,22,0.796076,0.613671,0.000,50,3,3,0,1,0,0,0,0,1,0,0,0,0,0.111111,0,0,0,0,0,0,0,0,0,0,0,,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,1,0,1,1.0,1.0,0.0,1.0,0.249222,0.049012,0.016374
347467,442785,6,0.729502,0.072571,0.125,5,9,5,1,1,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0,0,0,0,0,0,0,,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0,1,0,1,1.0,0.0,0.0,0.0,1.000000,0.079202,0.113450


In [12]:
# Show the columns that get_unnecessary_ft flags as unnecessary, then remove
# columns with drop_unnecessary_ft (presumably the same list — the selection
# criterion is implemented in src.features.build_features and is not visible
# in this notebook).
display(get_unnecessary_ft(df))
df = drop_unnecessary_ft(df)

['has_secondary_use_institution',
 'has_secondary_use_school',
 'has_secondary_use_industry',
 'has_secondary_use_health_post',
 'has_secondary_use_gov_office',
 'has_secondary_use_use_police',
 'has_secondary_use_other',
 'ground_floor_type_m',
 'ground_floor_type_z',
 'plan_configuration_a',
 'plan_configuration_c',
 'plan_configuration_f',
 'plan_configuration_m',
 'plan_configuration_n',
 'plan_configuration_o',
 'plan_configuration_s',
 'legal_ownership_status_r']

In [13]:
# Drop highly correlated features. According to the printed log, the helper
# reports each correlated pair together with its correlation and removes one
# feature of every pair; the threshold is defined inside
# drop_correlated_features and is not visible here.
df = drop_correlated_features(df)
# Bare expression: renders the reduced frame as the cell output.
df

The following features are correlated: land_surface_condition_t and land_surface_condition_n. Correlation = 0.88
The following features are correlated: roof_type_q and roof_type_n. Correlation = 0.85
The following features are correlated: position_t and position_s. Correlation = 0.83
The following features are correlated: age_40_100 and age_u_40. Correlation = 0.96
The following features are correlated: high_mortar_percentage and low_mortar_percentage. Correlation = 1.0
The following features are correlated: high_percentage_r and low_percentage_r. Correlation = 1.0
The following features are correlated: ft_high_imp_1_pos and stable. Correlation = 0.9
Drop the following features: {'position_s', 'low_percentage_r', 'stable', 'age_u_40', 'roof_type_n', 'low_mortar_percentage', 'land_surface_condition_n'}


Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,damage_grade,land_surface_condition_o,land_surface_condition_t,foundation_type_h,foundation_type_i,foundation_type_r,foundation_type_u,foundation_type_w,roof_type_q,roof_type_x,ground_floor_type_f,ground_floor_type_v,ground_floor_type_x,other_floor_type_j,other_floor_type_q,other_floor_type_s,other_floor_type_x,position_j,position_o,position_t,plan_configuration_d,plan_configuration_q,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_v,legal_ownership_status_w,district_class_2,district_class_1,district_class_3,district_class_4,age_40_100,age_ue_100,high_mortar_percentage,high_percentage_r,fragile,ft_imp_1_pos,ft_high_imp_1_pos,dens_1,dens_2,dens_3
0,802906,6,0.341275,0.970637,0.125,30,6,5,1,1,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,3.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,1.0,0.0,0.0,1.000000,0.066157,0.054971
1,28830,8,0.630694,0.223761,0.125,10,8,7,0,1,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,1.0,0.0,0.0,0.781553,0.047708,0.024561
2,94947,21,0.254380,0.714013,0.125,10,5,5,0,1,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,3.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,1.0,0.0,0.0,0.609752,0.436452,0.198830
3,590882,22,0.292922,0.850959,0.125,10,6,5,0,1,0,0,0,0,1,1,0,0,0,0.111111,0,0,0,0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1.0,0.0,0.0,0.249222,0.051994,0.045614
4,201944,11,0.091801,0.118405,0.250,30,8,9,1,0,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,3.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1.0,0.0,0.0,0.328168,0.260157,0.185965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347464,310028,4,0.423966,0.288295,0.250,70,20,6,0,1,0,0,0,0,1,0,0,0,0,0.111111,1,1,0,0,,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1,1,1.0,1.0,0.0,0.594754,0.076966,0.052632
347465,663567,10,0.985985,0.947481,0.250,25,6,7,1,1,1,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,1.0,0.0,0.0,0.903971,0.090570,0.028070
347466,1049160,22,0.796076,0.613671,0.000,50,3,3,0,1,0,0,0,0,1,0,0,0,0,0.111111,0,0,0,0,,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,1.0,0.0,1.0,0.249222,0.049012,0.016374
347467,442785,6,0.729502,0.072571,0.125,5,9,5,1,1,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,1.0,0.0,0.0,1.000000,0.079202,0.113450


# IV. Modelling

In [14]:
# Split the combined frame back into training features, training target and
# test features. NOTE: this rebinds train_target, which previously held the raw
# labels frame read from train_labels.csv.
train_data, train_target, test_data = split_data(df)
# Bare expression: renders the training features (indexed by building_id).
train_data

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,land_surface_condition_o,land_surface_condition_t,foundation_type_h,foundation_type_i,foundation_type_r,foundation_type_u,foundation_type_w,roof_type_q,roof_type_x,ground_floor_type_f,ground_floor_type_v,ground_floor_type_x,other_floor_type_j,other_floor_type_q,other_floor_type_s,other_floor_type_x,position_j,position_o,position_t,plan_configuration_d,plan_configuration_q,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_v,legal_ownership_status_w,district_class_2,district_class_1,district_class_3,district_class_4,age_40_100,age_ue_100,high_mortar_percentage,high_percentage_r,fragile,ft_imp_1_pos,ft_high_imp_1_pos,dens_1,dens_2,dens_3
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1
802906,6,0.341275,0.970637,0.125,30,6,5,1,1,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,1.0,0.0,0.0,1.000000,0.066157,0.054971
28830,8,0.630694,0.223761,0.125,10,8,7,0,1,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,1.0,0.0,0.0,0.781553,0.047708,0.024561
94947,21,0.254380,0.714013,0.125,10,5,5,0,1,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,1.0,0.0,0.0,0.609752,0.436452,0.198830
590882,22,0.292922,0.850959,0.125,10,6,5,0,1,0,0,0,0,1,1,0,0,0,0.111111,0,0,0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1.0,0.0,0.0,0.249222,0.051994,0.045614
201944,11,0.091801,0.118405,0.250,30,8,9,1,0,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1.0,0.0,0.0,0.328168,0.260157,0.185965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688636,25,0.935529,0.128989,0.000,55,6,3,0,1,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,1.0,0.0,1.0,0.222181,0.027767,0.018713
669485,17,0.501051,0.163921,0.125,0,6,5,0,1,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,1.0,0.0,0.0,0.899801,0.064107,0.069006
602512,17,0.035739,0.649558,0.250,55,6,7,0,1,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,1,1.0,0.0,0.0,0.899801,0.078643,0.037427
151409,26,0.027330,0.147291,0.125,10,14,6,0,0,0,0,0,1,0,0,0,0,0,0.111111,0,0,0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,1.0,1.0,1.0,0.922735,1.000000,0.360234


In [15]:
# Hold out 20 % of the labelled buildings for evaluation. The split is
# stratified on the target and uses a fixed seed so that it is reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    train_data,
    train_target,
    test_size=0.2,
    random_state=42,
    stratify=train_target,
)

In [None]:
# Parameter grid for XGBoost: 2 * 2 * 1 * 3 = 12 candidate settings.
random_grid = {
    'n_estimators': [200, 250],
    'gamma': [0.4, 0.6],
    'max_depth': [9],
    'learning_rate': [0.1, 0.2, 0.3],
}
# Base model whose hyper-parameters are tuned.
xgbm = XGBClassifier()
# Exhaustive grid search (despite the variable names, nothing is sampled at
# random): every candidate is scored with 5-fold cross-validation on the
# micro-averaged F1, and refit=True retrains the best candidate on the whole
# of data_train once the search has finished.
rf_random = GridSearchCV(xgbm, random_grid, cv=5, scoring='f1_micro', verbose=2, refit=True)
# Run the search: 12 candidates x 5 folds = 60 fits, each taking several
# minutes according to the log below.
rf_random.fit(data_train, target_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits




[CV] END gamma=0.4, learning_rate=0.1, max_depth=9, n_estimators=200; total time= 4.5min




[CV] END gamma=0.4, learning_rate=0.1, max_depth=9, n_estimators=200; total time= 4.2min




[CV] END gamma=0.4, learning_rate=0.1, max_depth=9, n_estimators=200; total time= 4.5min




[CV] END gamma=0.4, learning_rate=0.1, max_depth=9, n_estimators=200; total time= 4.1min




[CV] END gamma=0.4, learning_rate=0.1, max_depth=9, n_estimators=200; total time= 4.4min




[CV] END gamma=0.4, learning_rate=0.1, max_depth=9, n_estimators=250; total time= 5.6min




[CV] END gamma=0.4, learning_rate=0.1, max_depth=9, n_estimators=250; total time= 5.2min




[CV] END gamma=0.4, learning_rate=0.1, max_depth=9, n_estimators=250; total time= 5.1min




[CV] END gamma=0.4, learning_rate=0.1, max_depth=9, n_estimators=250; total time= 5.0min




[CV] END gamma=0.4, learning_rate=0.1, max_depth=9, n_estimators=250; total time= 4.5min




[CV] END gamma=0.4, learning_rate=0.2, max_depth=9, n_estimators=200; total time= 3.9min




[CV] END gamma=0.4, learning_rate=0.2, max_depth=9, n_estimators=200; total time= 3.7min




[CV] END gamma=0.4, learning_rate=0.2, max_depth=9, n_estimators=200; total time= 3.6min




[CV] END gamma=0.4, learning_rate=0.2, max_depth=9, n_estimators=200; total time= 3.6min




[CV] END gamma=0.4, learning_rate=0.2, max_depth=9, n_estimators=200; total time= 3.6min




[CV] END gamma=0.4, learning_rate=0.2, max_depth=9, n_estimators=250; total time= 4.5min




[CV] END gamma=0.4, learning_rate=0.2, max_depth=9, n_estimators=250; total time= 4.4min




[CV] END gamma=0.4, learning_rate=0.2, max_depth=9, n_estimators=250; total time= 4.4min




[CV] END gamma=0.4, learning_rate=0.2, max_depth=9, n_estimators=250; total time= 4.5min




[CV] END gamma=0.4, learning_rate=0.2, max_depth=9, n_estimators=250; total time= 4.5min




[CV] END gamma=0.4, learning_rate=0.3, max_depth=9, n_estimators=200; total time= 3.6min




[CV] END gamma=0.4, learning_rate=0.3, max_depth=9, n_estimators=200; total time= 3.5min




[CV] END gamma=0.4, learning_rate=0.3, max_depth=9, n_estimators=200; total time= 3.6min




[CV] END gamma=0.4, learning_rate=0.3, max_depth=9, n_estimators=200; total time= 3.6min




[CV] END gamma=0.4, learning_rate=0.3, max_depth=9, n_estimators=200; total time= 3.6min




[CV] END gamma=0.4, learning_rate=0.3, max_depth=9, n_estimators=250; total time= 4.4min




[CV] END gamma=0.4, learning_rate=0.3, max_depth=9, n_estimators=250; total time= 5.3min




[CV] END gamma=0.4, learning_rate=0.3, max_depth=9, n_estimators=250; total time= 4.7min




[CV] END gamma=0.4, learning_rate=0.3, max_depth=9, n_estimators=250; total time= 4.5min




[CV] END gamma=0.4, learning_rate=0.3, max_depth=9, n_estimators=250; total time= 6.3min




[CV] END gamma=0.6, learning_rate=0.1, max_depth=9, n_estimators=200; total time= 3.9min




[CV] END gamma=0.6, learning_rate=0.1, max_depth=9, n_estimators=200; total time= 3.9min




[CV] END gamma=0.6, learning_rate=0.1, max_depth=9, n_estimators=200; total time= 4.0min




[CV] END gamma=0.6, learning_rate=0.1, max_depth=9, n_estimators=200; total time= 3.9min




[CV] END gamma=0.6, learning_rate=0.1, max_depth=9, n_estimators=200; total time= 4.3min




[CV] END gamma=0.6, learning_rate=0.1, max_depth=9, n_estimators=250; total time= 5.4min




[CV] END gamma=0.6, learning_rate=0.1, max_depth=9, n_estimators=250; total time= 5.2min




[CV] END gamma=0.6, learning_rate=0.1, max_depth=9, n_estimators=250; total time= 5.2min




[CV] END gamma=0.6, learning_rate=0.1, max_depth=9, n_estimators=250; total time= 5.2min




[CV] END gamma=0.6, learning_rate=0.1, max_depth=9, n_estimators=250; total time= 5.5min




[CV] END gamma=0.6, learning_rate=0.2, max_depth=9, n_estimators=200; total time= 4.3min




[CV] END gamma=0.6, learning_rate=0.2, max_depth=9, n_estimators=200; total time= 4.1min




[CV] END gamma=0.6, learning_rate=0.2, max_depth=9, n_estimators=200; total time= 4.1min




[CV] END gamma=0.6, learning_rate=0.2, max_depth=9, n_estimators=200; total time= 4.1min




[CV] END gamma=0.6, learning_rate=0.2, max_depth=9, n_estimators=200; total time= 4.1min




[CV] END gamma=0.6, learning_rate=0.2, max_depth=9, n_estimators=250; total time= 5.6min




[CV] END gamma=0.6, learning_rate=0.2, max_depth=9, n_estimators=250; total time= 5.7min






In [None]:
# Evaluate the tuned model on the held-out split.
# The search was created with refit=True, so best_estimator_ has already been
# retrained on all of data_train with the winning parameters. Fitting it again
# on the same data would only repeat several minutes of training, so the model
# is used as returned.
best_random = rf_random.best_estimator_
prediction = best_random.predict(data_test)

acc = accuracy_score(target_test, prediction)
f1_mic = f1_score(target_test, prediction, average='micro')
f1_mac = f1_score(target_test, prediction, average='macro')
# NOTE: assuming a single-label multiclass target, micro-averaged precision and
# recall are expected to coincide with the accuracy; they are kept so the
# report stays unchanged.
pr = precision_score(target_test, prediction, average='micro')
re = recall_score(target_test, prediction, average='micro')

print("The Accuracy on test set: {:.4f}".format(acc))
print("The F1-Score micro on test set: {:.4f}".format(f1_mic))
print("The F1-Score macro on test set: {:.4f}".format(f1_mac))
print("The Precision-Score on test set: {:.4f}".format(pr))
print("The Recall-Score on test set: {:.4f}".format(re))

In [None]:
# Train one XGBoost model with hand-picked hyper-parameters and score it on
# the held-out split.
model = XGBClassifier(
    n_estimators=250,
    gamma=0.6,
    max_depth=10,
    learning_rate=0.2,
)
model.fit(data_train, target_train)
pred = model.predict(data_test)

# Accuracy plus micro- and macro-averaged F1 of the predictions.
acc = accuracy_score(target_test, pred)
f1_mic, f1_mac = (
    f1_score(target_test, pred, average=averaging)
    for averaging in ('micro', 'macro')
)
print('The Accuracy is:{:,.4f}'.format(acc))
print('The f1_mic is:{:,.4f}'.format(f1_mic))
print('The f1_mac is:{:,.4f}'.format(f1_mac))

In [None]:
# NOTE(review): this cell uses exactly the same configuration as the cell
# directly above it — consider keeping only one of the two.
model = XGBClassifier(
    n_estimators=250, gamma=0.6,
    max_depth=10, learning_rate=0.2,
)
model.fit(data_train, target_train)
pred = model.predict(data_test)

# Score the held-out predictions.
acc = accuracy_score(y_true=target_test, y_pred=pred)
f1_mic = f1_score(y_true=target_test, y_pred=pred, average='micro')
f1_mac = f1_score(y_true=target_test, y_pred=pred, average='macro')

# One report line per metric.
print(
    'The Accuracy is:{:,.4f}'.format(acc),
    'The f1_mic is:{:,.4f}'.format(f1_mic),
    'The f1_mac is:{:,.4f}'.format(f1_mac),
    sep='\n',
)

In [None]:
# Variant with min_child_weight and verbose training output; n_estimators is
# left at the library default.
# NOTE(review): use_label_encoder is deprecated in newer XGBoost releases —
# confirm against the installed version. eta is XGBoost's alias for
# learning_rate, which the other cells pass under its scikit-learn name.
model = XGBClassifier(
    use_label_encoder=True,
    verbosity=2,
    gamma=0.65,
    eta=0.2,
    max_depth=9,
    min_child_weight=0.3,
)
model.fit(data_train, target_train)
pred = model.predict(data_test)

# Accuracy plus micro- and macro-averaged F1 on the held-out split.
acc = accuracy_score(target_test, pred)
f1_mic, f1_mac = (
    f1_score(target_test, pred, average=averaging)
    for averaging in ('micro', 'macro')
)
print('The Accuracy is:{:,.4f}'.format(acc))
print('The f1_mic is:{:,.4f}'.format(f1_mic))
print('The f1_mac is:{:,.4f}'.format(f1_mac))

In [None]:
# Same base settings as the previous experiment, now with 900 boosting rounds,
# 12 worker threads and 12 parallel trees per round.
# NOTE(review): no row/column subsampling is configured here, so the parallel
# trees of one round may end up identical and only add training time —
# confirm against the XGBoost random-forest documentation.
model = XGBClassifier(
    use_label_encoder=True,
    verbosity=2,
    gamma=0.65,
    eta=0.2,
    max_depth=9,
    min_child_weight=0.3,
    n_jobs=12,
    n_estimators=900,
    num_parallel_tree=12,
)
model.fit(data_train, target_train)
pred = model.predict(data_test)

# Score the held-out predictions.
acc = accuracy_score(y_true=target_test, y_pred=pred)
f1_mic = f1_score(y_true=target_test, y_pred=pred, average='micro')
f1_mac = f1_score(y_true=target_test, y_pred=pred, average='macro')
print('The Accuracy is:{:,.4f}'.format(acc))
print('The f1_mic is:{:,.4f}'.format(f1_mic))
print('The f1_mac is:{:,.4f}'.format(f1_mac))

In [None]:
# Experiment with row and column subsampling, 20 parallel trees per round and
# a cap on the number of leaves.
# NOTE(review): max_leaves=4 is combined with max_depth=10; whether max_leaves
# takes effect depends on the tree method / grow policy of the installed
# XGBoost version — confirm. alpha is XGBoost's alias for reg_alpha.
model = XGBClassifier(
    use_label_encoder=True,
    verbosity=2,
    gamma=0.65,
    eta=0.2,
    max_depth=10,
    min_child_weight=0.3,
    n_jobs=12,
    n_estimators=250,
    importance_type='gain',
    colsample_bytree=0.4,
    subsample=0.7,
    max_delta_step=4,
    alpha=0.0,
    num_parallel_tree=20,
    max_leaves=4,
)
model.fit(data_train, target_train)
pred = model.predict(data_test)

# Accuracy plus micro- and macro-averaged F1 on the held-out split.
acc = accuracy_score(target_test, pred)
f1_mic, f1_mac = (
    f1_score(target_test, pred, average=averaging)
    for averaging in ('micro', 'macro')
)
print('The Accuracy is:{:,.4f}'.format(acc))
print('The f1_mic is:{:,.4f}'.format(f1_mic))
print('The f1_mac is:{:,.4f}'.format(f1_mac))

In [None]:
# Subsampled experiment with 900 boosting rounds, depth 9 and 12 parallel
# trees per round (no max_leaves cap in this run).
model = XGBClassifier(
    use_label_encoder=True,
    verbosity=2,
    gamma=0.65,
    eta=0.2,
    max_depth=9,
    min_child_weight=0.3,
    n_jobs=12,
    n_estimators=900,
    importance_type='gain',
    colsample_bytree=0.4,
    subsample=0.7,
    max_delta_step=4,
    alpha=0.0,
    num_parallel_tree=12,
)
model.fit(data_train, target_train)
pred = model.predict(data_test)

# Score the held-out predictions.
acc = accuracy_score(y_true=target_test, y_pred=pred)
f1_mic = f1_score(y_true=target_test, y_pred=pred, average='micro')
f1_mac = f1_score(y_true=target_test, y_pred=pred, average='macro')

# One report line per metric.
print(
    'The Accuracy is:{:,.4f}'.format(acc),
    'The f1_mic is:{:,.4f}'.format(f1_mic),
    'The f1_mac is:{:,.4f}'.format(f1_mac),
    sep='\n',
)