In [2]:
import pandas as pd
pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
%matplotlib inline

In [3]:
X = pd.read_csv('train_values.csv')
y = pd.read_csv('train_labels.csv')
building_id = X['building_id'].values.tolist()
X.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [4]:
X = X.merge(y, on = 'building_id')
category_cols = ['roof_type', 'foundation_type', 'land_surface_condition', 'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'legal_ownership_status']

In [14]:
replace_global = {}
for col in category_cols:
    replace_local = {}
    z = X[col].value_counts(normalize = True)
    for key, value in z.items():
        replace_local[key] = value
    replace_global[col] = replace_local
    
replace_global

{'roof_type': {'n': 0.7016166476721117,
  'q': 0.2362845883170057,
  'x': 0.06209876401088254},
 'foundation_type': {'r': 0.8411172635561645,
  'w': 0.058012056745753084,
  'u': 0.054719667230747386,
  'i': 0.04059462550028588,
  'h': 0.005556386967049244},
 'land_surface_condition': {'t': 0.8317581283264454,
  'n': 0.13633101945119167,
  'o': 0.03191085222236292},
 'ground_floor_type': {'f': 0.8043675964405356,
  'x': 0.09546010951608014,
  'v': 0.09437032091204561,
  'z': 0.00385263295229105,
  'm': 0.001949340179047663},
 'other_floor_type': {'q': 0.6342339438451886,
  'x': 0.1667223072820135,
  'j': 0.15288889912164574,
  's': 0.046154849751152145},
 'position': {'s': 0.7754766865821697,
  't': 0.16460412661501683,
  'j': 0.050966803657698935,
  'o': 0.008952383145114562},
 'plan_configuration': {'d': 0.9595972386905652,
  'q': 0.02184181948649468,
  'u': 0.014002248648316775,
  's': 0.0013277002006899436,
  'c': 0.0012471172405324614,
  'a': 0.0009669955218897855,
  'o': 0.0006101

In [15]:
X.replace(replace_global, inplace = True)
X.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,802906,6,487,12198,2,30,6,5,0.831758,0.841117,0.701617,0.804368,0.634234,0.164604,0.959597,1,1,0,0,0,0,0,0,0,0,0,0.962924,1,0,0,0,0,0,0,0,0,0,0,0,3
1,28830,8,900,2812,2,10,8,7,0.031911,0.841117,0.701617,0.09546,0.634234,0.775477,0.959597,0,1,0,0,0,0,0,0,0,0,0,0.962924,1,0,0,0,0,0,0,0,0,0,0,0,2
2,94947,21,363,8973,2,10,5,5,0.831758,0.841117,0.701617,0.804368,0.166722,0.164604,0.959597,0,1,0,0,0,0,0,0,0,0,0,0.962924,1,0,0,0,0,0,0,0,0,0,0,0,3
3,590882,22,418,10694,2,10,6,5,0.831758,0.841117,0.701617,0.804368,0.166722,0.775477,0.959597,0,1,0,0,0,0,1,1,0,0,0,0.962924,1,0,0,0,0,0,0,0,0,0,0,0,2
4,201944,11,131,1488,3,30,8,9,0.831758,0.841117,0.701617,0.804368,0.166722,0.775477,0.959597,1,0,0,0,0,0,0,0,0,0,0,0.962924,1,0,0,0,0,0,0,0,0,0,0,0,3


In [None]:
#lb = LabelBinarizer()
#for col in category_cols:
    #sample = X[col].values.tolist()
    #sample_new = lb.fit_transform(sample)
    #sample_splits = np.hsplit(sample_new, sample_new.shape[1])
    #X.drop([col], axis = 1, inplace = True)
    #for i in range(len(sample_splits)):
        #heading = col + '_v' + str(i)
        #X[heading] = sample_splits[i]
        
#X.head()

In [None]:
#temp = pd.DataFrame()
#for col in X.columns:
    #if col in category_cols:
        #temp[col] = X[col]
#temp = pd.get_dummies(temp, columns = temp.columns)
#len(temp.columns)

In [None]:
#pca = PCA(n_components = 5)
#pca_data = pca.fit_transform(temp)
#pca_data_splits = np.hsplit(pca_data, pca_data.shape[1])
#count = 0
#s = 'category_vec_'
#for i in range(len(pca_data_splits)):
    #if max(pca_data_splits[i] > 1):
        #X[s + str(count)] = pca_data_splits[i]
        #count += 1
#X.drop(category_cols, axis = 1, inplace = True)
#X.head()

In [16]:
y.head()

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [17]:
y.drop(['building_id'], axis = 1, inplace = True)
y = np.asarray(y)
y = y.ravel()

In [None]:
#X.dtypes

In [None]:
X_category = X[[x for x in X.columns if X[x].dtype == 'object' or x == 'building_id']]
X_category.head()

In [None]:
X_int = X[['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id', 'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage', 'damage_grade']]
X_int.head()

In [None]:
#fig, axs = plt.subplots(4, 2, figsize = (12, 12))
#fig.subplots_adjust(hspace = 0.5)
#sns.despine(left=True)

#cols = [x for x in X_int.columns]

#for i in range(0, 8):
    #sns.distplot(X[cols[i]], color = 'red', kde = True, ax = axs[i // 2, i % 2])
    
#plt.setp(axs)
#plt.tight_layout()

In [None]:
#fig, axs = plt.subplots(4, 2, figsize = (12, 12))
#fig.subplots_adjust(hspace = 0.5)
#sns.despine(left=True)

#cols = [x for x in X_int.columns]
#axs = axs.ravel()

#for i in range(0, 8):
    #axs[i].scatter(X[cols[7]], X[cols[i]])
    #axs[i].set_title('{} vs {}'.format(cols[7], cols[i]))
    
#plt.setp(axs)
#plt.tight_layout()

In [None]:
#X_int[['height_percentage', 'damage_grade']][X['height_percentage'] > 27].sum()

In [18]:
for col in ['geo_level_1_id', 'height_percentage', 'area_percentage']:
    sample = X[col].values.tolist()
    sample_normal = []
    m1 = min(sample)
    m2 = max(sample)
    for x in sample:
        z = (x - m1) / (m2 - m1)
        sample_normal.append(z)
    X[col] = np.array(sample_normal).T
    

In [19]:
for col in ['age', 'geo_level_2_id', 'geo_level_3_id']:
    sample = X[col].values.tolist()
    sample_normal = []
    m1 = min(sample)
    m2 = max(sample)
    for x in sample:
        z = (x - m1) / (m2 - m1)
        sample_normal.append(z)
    X[col] = np.array(sample_normal).T
    
X.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,802906,0.2,0.341275,0.970637,2,0.030151,0.050505,0.1,0.831758,0.841117,0.701617,0.804368,0.634234,0.164604,0.959597,1,1,0,0,0,0,0,0,0,0,0,0.962924,1,0,0,0,0,0,0,0,0,0,0,0,3
1,28830,0.266667,0.630694,0.223761,2,0.01005,0.070707,0.166667,0.031911,0.841117,0.701617,0.09546,0.634234,0.775477,0.959597,0,1,0,0,0,0,0,0,0,0,0,0.962924,1,0,0,0,0,0,0,0,0,0,0,0,2
2,94947,0.7,0.25438,0.714013,2,0.01005,0.040404,0.1,0.831758,0.841117,0.701617,0.804368,0.166722,0.164604,0.959597,0,1,0,0,0,0,0,0,0,0,0,0.962924,1,0,0,0,0,0,0,0,0,0,0,0,3
3,590882,0.733333,0.292922,0.850959,2,0.01005,0.050505,0.1,0.831758,0.841117,0.701617,0.804368,0.166722,0.775477,0.959597,0,1,0,0,0,0,1,1,0,0,0,0.962924,1,0,0,0,0,0,0,0,0,0,0,0,2
4,201944,0.366667,0.091801,0.118405,3,0.030151,0.070707,0.233333,0.831758,0.841117,0.701617,0.804368,0.166722,0.775477,0.959597,1,0,0,0,0,0,0,0,0,0,0,0.962924,1,0,0,0,0,0,0,0,0,0,0,0,3


In [20]:
X.drop(['building_id', 'damage_grade'], axis = 1, inplace = True)
X.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,0.2,0.341275,0.970637,2,0.030151,0.050505,0.1,0.831758,0.841117,0.701617,0.804368,0.634234,0.164604,0.959597,1,1,0,0,0,0,0,0,0,0,0,0.962924,1,0,0,0,0,0,0,0,0,0,0,0
1,0.266667,0.630694,0.223761,2,0.01005,0.070707,0.166667,0.031911,0.841117,0.701617,0.09546,0.634234,0.775477,0.959597,0,1,0,0,0,0,0,0,0,0,0,0.962924,1,0,0,0,0,0,0,0,0,0,0,0
2,0.7,0.25438,0.714013,2,0.01005,0.040404,0.1,0.831758,0.841117,0.701617,0.804368,0.166722,0.164604,0.959597,0,1,0,0,0,0,0,0,0,0,0,0.962924,1,0,0,0,0,0,0,0,0,0,0,0
3,0.733333,0.292922,0.850959,2,0.01005,0.050505,0.1,0.831758,0.841117,0.701617,0.804368,0.166722,0.775477,0.959597,0,1,0,0,0,0,1,1,0,0,0,0.962924,1,0,0,0,0,0,0,0,0,0,0,0
4,0.366667,0.091801,0.118405,3,0.030151,0.070707,0.233333,0.831758,0.841117,0.701617,0.804368,0.166722,0.775477,0.959597,1,0,0,0,0,0,0,0,0,0,0,0.962924,1,0,0,0,0,0,0,0,0,0,0,0


In [41]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.225, random_state = 42)
#xgb = XGBClassifier()
#svc = SVC()
#lsvc = LinearSVC()
rf = RandomForestClassifier()
#dt = DecisionTreeClassifier()

In [42]:
#xgb.fit(X_train, y_train)
#lsvc.fit(X_train, y_train)
rf.fit(X_train, y_train)
#dt.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [43]:
#y_pred = xgb.predict(X_val)
#y_pred = svc.predict(X_val)
y_pred = rf.predict(X_val)
#y_pred = dt.predict(X_val)

In [44]:
f1_score(y_val, y_pred, average = 'micro')

0.6989733269663688

In [45]:
X_test = pd.read_csv('test_values.csv')
building_id = X_test['building_id'].values.tolist()
X_test.drop(['building_id'], axis = 1, inplace = True)

X_test.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,17,596,11307,3,20,7,6,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
1,6,141,11987,2,25,13,5,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,1,1,0,0,0,0,0,0,0,0,0
2,22,19,10044,2,5,4,5,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
3,26,39,633,1,0,19,3,t,r,x,v,j,t,d,0,0,0,0,0,1,0,0,0,0,0,v,2,1,0,0,1,0,0,0,0,0,0,0
4,17,289,7970,3,15,8,7,t,r,q,f,q,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [46]:
for col in ['geo_level_1_id', 'height_percentage', 'area_percentage']:
    sample = X_test[col].values.tolist()
    sample_normal = []
    m1 = min(sample)
    m2 = max(sample)
    for x in sample:
        z = (x - m1) / (m2 - m1)
        sample_normal.append(z)
    X_test[col] = np.array(sample_normal).T

In [47]:
for col in ['age', 'geo_level_2_id', 'geo_level_3_id']:
    sample = X_test[col].values.tolist()
    sample_normal = []
    m1 = min(sample)
    m2 = max(sample)
    for x in sample:
        z = (x - m1) / (m2 - m1)
        sample_normal.append(z)
    X_test[col] = np.array(sample_normal).T
    
X_test.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,0.566667,0.417659,0.899737,3,0.020101,0.065934,0.133333,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
1,0.2,0.098809,0.953847,2,0.025126,0.131868,0.1,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,1,1,0,0,0,0,0,0,0,0,0
2,0.733333,0.013315,0.799236,2,0.005025,0.032967,0.1,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
3,0.866667,0.02733,0.05037,1,0.0,0.197802,0.033333,t,r,x,v,j,t,d,0,0,0,0,0,1,0,0,0,0,0,v,2,1,0,0,1,0,0,0,0,0,0,0
4,0.566667,0.202523,0.634201,3,0.015075,0.076923,0.166667,t,r,q,f,q,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [48]:
replace_global = {}
for col in category_cols:
    replace_local = {}
    z = X_test[col].value_counts(normalize = True)
    for key, value in z.items():
        replace_local[key] = value
    replace_global[col] = replace_local
    
replace_global

{'roof_type': {'n': 0.7037459133397799,
  'q': 0.2340217341253396,
  'x': 0.062232352534880506},
 'foundation_type': {'r': 0.8424045678500713,
  'w': 0.0567527743242621,
  'u': 0.05350646958603859,
  'i': 0.0414767233043238,
  'h': 0.00585946493530414},
 'land_surface_condition': {'t': 0.8309158723580605,
  'n': 0.1368167794815122,
  'o': 0.03226734816042731},
 'ground_floor_type': {'f': 0.8054979969609062,
  'x': 0.09476447023069484,
  'v': 0.09368236865128701,
  'z': 0.0037988672468573007,
  'm': 0.002256296910254639},
 'other_floor_type': {'q': 0.633190588018603,
  'x': 0.1691186627987291,
  'j': 0.150446654694479,
  's': 0.047244094488188976},
 'position': {'s': 0.7755790394621724,
  't': 0.16533130727080167,
  'j': 0.050248653128885205,
  'o': 0.008841000138140627},
 'plan_configuration': {'d': 0.9584081595063775,
  'q': 0.022436340194317814,
  'u': 0.014504765851636967,
  'c': 0.001438964866233826,
  's': 0.0011857070497766726,
  'a': 0.0011626836119169315,
  'o': 0.0004144218814

In [49]:
X_test.replace(replace_global, inplace = True)

In [50]:
#lb = LabelBinarizer()
#category_cols = ['roof_type', 'foundation_type', 'land_surface_condition', 'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'legal_ownership_status']
#for col in category_cols:
    #sample = X_test[col].values.tolist()
    #sample_new = lb.fit_transform(sample)
    #sample_splits = np.hsplit(sample_new, sample_new.shape[1])
    #X_test.drop([col], axis = 1, inplace = True)
    #for i in range(len(sample_splits)):
        #heading = col + '_v' + str(i)
        #X_test[heading] = sample_splits[i]
        
X_test.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,0.566667,0.417659,0.899737,3,0.020101,0.065934,0.133333,0.830916,0.842405,0.703746,0.805498,0.633191,0.775579,0.958408,0,1,0,0,0,0,0,0,0,0,0,0.963462,1,0,0,0,0,0,0,0,0,0,0,0
1,0.2,0.098809,0.953847,2,0.025126,0.131868,0.1,0.830916,0.842405,0.703746,0.805498,0.633191,0.775579,0.958408,0,1,0,0,0,0,0,0,0,0,0,0.963462,1,1,1,0,0,0,0,0,0,0,0,0
2,0.733333,0.013315,0.799236,2,0.005025,0.032967,0.1,0.830916,0.842405,0.703746,0.805498,0.633191,0.775579,0.958408,0,1,0,0,0,0,0,0,0,0,0,0.963462,1,0,0,0,0,0,0,0,0,0,0,0
3,0.866667,0.02733,0.05037,1,0.0,0.197802,0.033333,0.830916,0.842405,0.062232,0.093682,0.150447,0.165331,0.958408,0,0,0,0,0,1,0,0,0,0,0,0.963462,2,1,0,0,1,0,0,0,0,0,0,0
4,0.566667,0.202523,0.634201,3,0.015075,0.076923,0.166667,0.830916,0.842405,0.234022,0.805498,0.633191,0.165331,0.958408,0,1,0,0,0,0,0,0,0,0,0,0.963462,1,0,0,0,0,0,0,0,0,0,0,0


In [51]:
y_test = rf.predict(X_test)
y_test.shape

(86868,)

In [52]:
df = pd.DataFrame()
df['building_id'] = np.array(building_id).T
df['damage_grade'] = y_test.T
df.to_csv('solution.csv', index = False)