In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

from xgboost import XGBClassifier


In [2]:
# Notebook-wide display defaults: never truncate columns or rows, use wide figures

for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

plt.rcParams.update({"figure.figsize": (15, 5)})

In [3]:
# Load the per-building feature table used for training and preview its first rows

data = pd.read_csv(filepath_or_buffer='train_values.csv')
data.head(n=5)

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [4]:
# Load the table holding the target variable (damage_grade per building_id)

label = pd.read_csv(filepath_or_buffer='train_labels.csv')
label.head(n=5)

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [5]:
# Add target variable to training data.
# Look the label up by building_id instead of copying it row-by-row, so the result
# stays correct even if the two CSV files do not list buildings in the same order.
damage_by_building = label.set_index('building_id')['damage_grade']
data['damage'] = data['building_id'].map(damage_by_building)

# Fail loudly rather than train on rows whose label could not be found
if data['damage'].isna().any():
    raise ValueError('Some building_id values in train_values.csv have no '
                     'damage_grade in train_labels.csv')


In [6]:
# Check the distribution of the target variable to find out whether there is a class-imbalance problem
data['damage'].value_counts()

2    148259
3     87218
1     25124
Name: damage, dtype: int64

In [7]:
# Build the model inputs: one-hot encoded features and the integer damage grade.
# The label-based slice keeps every column up to and including
# 'has_secondary_use_other', i.e. everything except the appended 'damage' column.
# NOTE(review): building_id sits inside that slice, so it is used as a feature -
# confirm that this is intended.
raw_features = data.loc[:, :'has_secondary_use_other']
X = pd.get_dummies(raw_features)
y = data['damage'].astype(int)

In [9]:
# Candidate hyper-parameter values; each list/array is one dimension of the search space

class_weight=['balanced']
n_jobs=[-1]
n_estimators=np.arange(100,1000,100)
# 'auto' is intentionally left out: for RandomForestClassifier it was only an alias
# of 'sqrt' (a duplicate candidate) and scikit-learn >= 1.3 rejects it outright
max_features = ['sqrt','log2']
max_depth=np.arange(10,200,10)
bootstrap = [True, False]

In [10]:
# Search space for the hyper-parameter search, keyed by estimator argument name
param_grid = dict(
    n_jobs=n_jobs,
    class_weight=class_weight,
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    bootstrap=bootstrap,
)

In [76]:
%%time
# Fit scaled traing data on Random Forest Classifier
clf=RandomForestClassifier()
kf=KFold(n_splits=3,shuffle=True)
rs=RandomizedSearchCV(clf,param_distributions=param_grid,cv=kf,scoring='f1_micro')

rs.fit(X,y)

Wall time: 1h 55min 26s


RandomizedSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=True),
                   error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_...
                   param_distributions={'bootstrap': [True, False],
      

In [79]:
# Show the best parameter combination found by the randomized search
rs.best_params_

{'n_jobs': -1,
 'n_estimators': 900,
 'max_features': 'sqrt',
 'max_depth': 40,
 'class_weight': 'balanced',
 'bootstrap': True}

In [11]:
# Initiate the classifier using the best parameters selected by the randomized search
# (values copied from the rs.best_params_ output)

clf=RandomForestClassifier(n_jobs= -1,
 n_estimators= 900,
 max_features= 'sqrt',
 max_depth= 40,
 class_weight= 'balanced',
 bootstrap= True,
 random_state= 42)  # fixed seed so the fitted model and the submission are reproducible

In [None]:
# Train on the full training data (all rows of X / y, no hold-out split)
clf.fit(X,y)

In [None]:
# Load the feature table of the buildings to be scored and preview its first rows
X_test = pd.read_csv(filepath_or_buffer='test_values.csv')

X_test.head(n=5)

In [None]:
# Predict
# One-hot encode the test features, then align them to the training design matrix.
# get_dummies only creates columns for levels present in the frame it is given, so a
# categorical level missing from the test file would otherwise produce fewer (or
# differently ordered) columns than the model was fitted on.
X_test_encoded=pd.get_dummies(X_test).reindex(columns=X.columns,fill_value=0)
prediction=clf.predict(X_test_encoded)

In [None]:
# Wrap the predicted labels in a DataFrame; its single column gets the default name 0
result=pd.DataFrame(prediction)

In [None]:
# Shape the predictions into the submission layout: building_id, damage_grade
result = (
    result
    .assign(building_id=X_test['building_id'])
    .rename(columns={0: 'damage_grade'})
    [['building_id', 'damage_grade']]
)
result.head()

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV
result.to_csv('result.csv',index=False)