In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier

from sklearn.model_selection import cross_validate,KFold, RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.utils import resample

from xgboost import XGBClassifier


In [2]:
# Notebook-wide display defaults: show every column and every row of a
# DataFrame (None = no truncation) and use a wide 15x5 default figure size.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

plt.rcParams.update({"figure.figsize": (15, 5)})

In [3]:
# Read the test-set features (these are the rows predicted on further down)
X_test = pd.read_csv('Data/test_values.csv')

In [4]:
# Load the training features and preview the first five rows
data = pd.read_csv('Data/train_values.csv')
data.head(n=5)

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# Load the table holding the target variable (`damage_grade`, keyed by
# `building_id`) and preview the first five rows
label = pd.read_csv('Data/train_labels.csv')
label.head(n=5)

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [6]:
# Add the target variable to the training data.
# The label is looked up by `building_id` instead of by row position, so the
# result stays correct even if the two CSV files are not in the same row
# order. Duplicate ids in the label file make `reindex` raise, and the assert
# catches training rows that have no label at all.
damage_by_building = label.set_index('building_id')['damage_grade']
data['damage'] = damage_by_building.reindex(data['building_id']).to_numpy()
assert data['damage'].notna().all(), "some training rows have no matching damage_grade"


In [7]:
# Check the distribution of the target variable to find out whether there is a
# class-imbalance problem (counts per damage grade, most frequent first)
data['damage'].value_counts()

2    148259
3     87218
1     25124
Name: damage, dtype: int64

In [None]:
# Engineered feature: product of the area and height percentages.
# NOTE(review): the new column is appended after `damage`, so the
# `.loc[:, :'has_secondary_use_other']` slice that builds X does not include
# it, and the test set never gets it either — confirm whether it is meant to
# be used by the model.
data['volume_percentage'] = data['area_percentage'].mul(data['height_percentage'])
data.columns

In [8]:
# Build the design matrix and the target vector.
# X: every column up to and including `has_secondary_use_other`, with the
#    string-typed columns one-hot encoded. The slice stops before `damage`,
#    so the target is not part of the features.
#    NOTE(review): `building_id` lies inside this slice and is therefore fed
#    to the models as a feature — confirm that this is intended.
# y: the damage grade as integers.
X = pd.get_dummies(
    data.loc[:, :'has_secondary_use_other']
)
y = data['damage'].astype(int)

## Voting Classifier

In [9]:
# Initialise the 4 base classifiers for the voting classifier.
# A shared seed fixes the random sampling inside every model, so re-running
# the notebook reproduces the same fit and the same submission file.
SEED = 42

# Two class-weighted random forests that differ only in the split criterion
clf1 = RandomForestClassifier(n_estimators=600, max_depth=12, n_jobs=-1,
                              class_weight='balanced', criterion='gini',
                              oob_score=True, random_state=SEED)

clf2 = RandomForestClassifier(n_estimators=600, max_depth=12, n_jobs=-1,
                              class_weight='balanced', criterion='entropy',
                              oob_score=True, random_state=SEED)

# XGBoost gradient-boosted trees
clf3 = XGBClassifier(n_jobs=-1, n_estimators=600, max_depth=10,
                     learning_rate=0.1, random_state=SEED)

# scikit-learn gradient boosting with default hyper-parameters
clf4 = GradientBoostingClassifier(random_state=SEED)

In [10]:
# Soft-voting ensemble over the four base classifiers: the predicted class
# probabilities of the members are averaged. The members are fitted in
# parallel (n_jobs=-1).
vc1 = VotingClassifier(
    estimators=[
        ('rc1', clf1),
        ('rc2', clf2),
        ('xgb', clf3),
        ('gb', clf4),
    ],
    voting='soft',
    n_jobs=-1,
)

In [11]:
# Fit the voting ensemble on the complete training data (X, y); no rows are
# held out in this cell.
vc1.fit(X,y)

VotingClassifier(estimators=[('rc1',
                              RandomForestClassifier(class_weight='balanced',
                                                     max_depth=12,
                                                     n_estimators=600,
                                                     n_jobs=-1,
                                                     oob_score=True)),
                             ('rc2',
                              RandomForestClassifier(class_weight='balanced',
                                                     criterion='entropy',
                                                     max_depth=12,
                                                     n_estimators=600,
                                                     n_jobs=-1,
                                                     oob_score=True)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                      

In [12]:
# Predict for test data.
# The test set is one-hot encoded on its own, so its dummy columns only match
# the training matrix if both files contain exactly the same category values.
# Reindexing to `X.columns` makes the layout identical: categories missing
# from the test set become all-zero columns, and categories never seen in
# training are dropped (the fitted models have no input for them).
X_test_encoded = pd.get_dummies(X_test).reindex(columns=X.columns, fill_value=0)
pred = vc1.predict(X_test_encoded)

In [13]:
# Shape the predictions into the submission layout: one row per test
# building, with the columns `building_id` and `damage_grade`.
X_test2 = X_test.reset_index()
p = pd.DataFrame({
    'building_id': X_test2['building_id'],
    'damage_grade': pred,
})
p.head()

Unnamed: 0,building_id,damage_grade
0,300051,3
1,99355,2
2,890251,2
3,745817,1
4,421793,3


In [15]:
# Write the submission table to disk, leaving out the row index
p.to_csv('result.csv', index=False)

In [14]:
# Accuracy score.
# `pred` belongs to the competition test set, for which this notebook has no
# labels (there is no `y_test`), so it cannot be scored directly. Accuracy is
# estimated instead on a stratified 20% hold-out split of the labelled
# training data.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fresh ensemble with the same members and settings as `vc1`. VotingClassifier
# fits clones of its estimators, so the already-fitted `vc1` is left untouched.
# Note: this refits all four models and takes about as long as the main fit.
vc_val = VotingClassifier(estimators=vc1.estimators, voting=vc1.voting, n_jobs=vc1.n_jobs)
vc_val.fit(X_train, y_train)

accuracy_score(y_val, vc_val.predict(X_val))