In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, train_test_split , KFold

from sklearn.utils import resample

from xgboost import XGBClassifier


In [2]:
# Pre set values for max cols and chart size

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

plt.rcParams["figure.figsize"] = (15,5)

In [3]:
# Read training data

data=pd.read_csv('train_values.csv')
data.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [4]:
# Read table with target variable 

label=pd.read_csv('train_labels.csv')
label.head()

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [5]:
# Add Target variable to training data 

data['damage']=label['damage_grade']
data.columns

Index(['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_i

In [6]:
# Check distribution of target variable to find out if there is class imbaance problem
data['damage'].value_counts()

2    148259
3     87218
1     25124
Name: damage, dtype: int64

In [7]:
# Define X and y variables
X=pd.get_dummies(data.loc[:,:'has_secondary_use_other'])
y=data['damage'].astype(int)

In [8]:
# Split the data into training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=256)

#### Downsample the majority training data

In [9]:
X_train['damage']=y_train
X_train['damage'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


2    118679
3     69666
1     20135
Name: damage, dtype: int64

In [12]:
# Separate majority and minority classes

filt1=X_train['damage']==1
filt2=X_train['damage']==2
filt3=X_train['damage']==3

df_majority2 = X_train[filt2]
df_majority3 = X_train[filt3]
df_minority  = X_train[filt1]
 
# Downsample majority class, damage=2 
df_majority_downsampled2 = resample(df_majority2, 
                                 replace=True,     # sample with replacement
                                 n_samples=20135,    # to match majority class
                                 random_state=123) # reproducible results
 
# Downsample majority class, damage=3 
df_majority_downsampled3 = resample(df_majority3, 
                                 replace=True,     # sample with replacement
                                 n_samples=20135,    # to match majority class
                                 random_state=123) # reproducible results

# Combine majority class with upsampled minority classes
df_downsampled = pd.concat([df_minority, df_majority_downsampled2,df_majority_downsampled3])
 
# Display new class counts
df_downsampled.damage.value_counts()


3    20135
2    20135
1    20135
Name: damage, dtype: int64

In [13]:
X_train=pd.get_dummies(df_downsampled.drop(columns=['damage']))
y_train=df_downsampled['damage'].astype(int)

In [14]:
X_train.shape

(60405, 69)

In [15]:
y_train.shape

(60405,)

In [16]:
X_test.shape

(52121, 69)

In [17]:
# Random Forest Classifier
clf=RandomForestClassifier(n_estimators = 500, oob_score = True)
clf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [18]:
# Cross validate

kf=KFold(n_splits=3, shuffle=True)
cross_validate(clf,X_train,y_train,cv=kf)

{'fit_time': array([47.83293128, 47.5169332 , 47.51393938]),
 'score_time': array([3.63228703, 3.62430787, 3.68913412]),
 'test_score': array([0.71904644, 0.71611622, 0.71522225])}

In [19]:
# Predict for test dataset
prediction=clf.predict(pd.get_dummies(X_test))

In [20]:
result=pd.DataFrame(prediction)
result[0].value_counts()

2    21174
3    20363
1    10584
Name: 0, dtype: int64

In [21]:
# Format the prediction as per submission requirement
result['building_id']=data['building_id']
result.rename(columns={0:'damage_grade'},inplace=True)
result=result[['building_id','damage_grade']]
result.head()

Unnamed: 0,building_id,damage_grade
0,802906,1
1,28830,2
2,94947,2
3,590882,1
4,201944,3


In [22]:
# Metric required by the competition
f1_score(y_test,prediction,average='micro')

0.6243548665605034