In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

from sklearn.utils import resample

from xgboost import XGBClassifier


In [2]:
# Pre set values for max cols and chart size

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

plt.rcParams["figure.figsize"] = (15,5)

In [16]:
# Read training data

data=pd.read_csv('train_values.csv')
data.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [17]:
# Read table with target variable 

label=pd.read_csv('train_labels.csv')
label.head()

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [18]:
# Add Target variable to training data 
data['damage']=label['damage_grade']


In [19]:
# Check distribution of target variable to find out if there is class imbaance problem
data['damage'].value_counts()

2    148259
3     87218
1     25124
Name: damage, dtype: int64

In [20]:
# Define X and y variables
X=pd.get_dummies(data.loc[:,:'has_secondary_use_other'])
y=data['damage'].astype(int)

In [21]:
# Split the data into training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=256)

#### Downsample the minority training data ( downsampling only training data)

In [9]:
X_train['damage']=y_train
X_train['damage'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


2    118679
3     69666
1     20135
Name: damage, dtype: int64

In [10]:

# Separate majority and minority classes

filt1=X_train['damage']==1
filt2=X_train['damage']==2
filt3=X_train['damage']==3

df_majority2 = X_train[filt2]
df_majority3 = X_train[filt3]
df_minority  = X_train[filt1]
 
# Downsample majority class, damage=2 
df_majority_downsampled2 = resample(df_majority2, 
                                 replace=True,     # sample with replacement
                                 n_samples=20135,    # to match majority class
                                 random_state=123) # reproducible results
 
# Downsample majority class, damage=3 
df_majority_downsampled3 = resample(df_majority3, 
                                 replace=True,     # sample with replacement
                                 n_samples=20135,    # to match majority class
                                 random_state=123) # reproducible results

# Combine majority class with upsampled minority classes
df_downsampled = pd.concat([df_minority, df_majority_downsampled2,df_majority_downsampled3])
 
# Display new class counts
df_downsampled.damage.value_counts()


3    20135
2    20135
1    20135
Name: damage, dtype: int64

In [11]:
df_downsampled.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,land_surface_condition_n,land_surface_condition_o,land_surface_condition_t,foundation_type_h,foundation_type_i,foundation_type_r,foundation_type_u,foundation_type_w,roof_type_n,roof_type_q,roof_type_x,ground_floor_type_f,ground_floor_type_m,ground_floor_type_v,ground_floor_type_x,ground_floor_type_z,other_floor_type_j,other_floor_type_q,other_floor_type_s,other_floor_type_x,position_j,position_o,position_s,position_t,plan_configuration_a,plan_configuration_c,plan_configuration_d,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w,damage
52321,343965,26,39,633,1,15,34,4,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1
88897,55463,3,1229,11320,2,0,19,6,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1
158780,71131,26,491,1386,2,10,6,6,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1
183247,13644,22,165,5452,2,15,7,5,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1
227532,245491,26,39,9133,1,5,19,3,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1


In [12]:
X_train=pd.get_dummies(df_downsampled.drop(columns=['damage']))
y_train=df_downsampled['damage'].astype(int)

In [13]:
X_train.shape

(60405, 69)

In [14]:
y_train.shape

(60405,)

In [15]:
X_test.shape

(52121, 69)

In [23]:
# Random Forest Classifier
clf=RandomForestClassifier(n_estimators = 1000,class_weight='balanced' ,oob_score = True,n_jobs=16)
clf.fit(X_train,y_train)

prediction=clf.predict(pd.get_dummies(X_test))

pred=pd.DataFrame(prediction)


In [27]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kf = KFold(n_splits=3, shuffle=True)

scores = cross_val_score(clf,X_train,y_train, cv=kf, scoring='f1_micro')

In [62]:
# Format the prediction as per submission requirement
pred['building_id']=data['building_id']
pred.rename(columns={0:'damage_grade'},inplace=True)
pred=pred[['building_id','damage_grade']]
pred.head()

Unnamed: 0,building_id,damage_grade
0,802906,1
1,28830,2
2,94947,2
3,590882,2
4,201944,3


In [63]:
# cross_validate(clf,X_train,y_train,cv=5)

In [64]:
# Metric required by the competition
f1_score(y_test,prediction,average='micro')

0.7210145622685674

#### XGBOOST


In [66]:
# Random Forest Classifier
#.67 for XGBC()
clf=XGBClassifier(objective="multi:softprob", random_state=1254,n_jobs=8)
clf.fit(X_train,y_train)

prediction=clf.predict(pd.get_dummies(X_test))


In [67]:
pred['building_id']=data['building_id']
pred.rename(columns={0:'damage_grade'},inplace=True)
pred=pred[['building_id','damage_grade']]


# Metric required by the competition
f1_score(y_test,prediction,average='micro')

0.6753707718577925

#### Parameter tuning for XBGBoost

In [None]:
import xgboost as xgb
from scipy.stats import uniform, randint
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split

xgb_model = xgb.XGBRegressor(n_jobs=4)

params = {
    "colsample_bytree": uniform(0.7, 0.3),
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3), # default 0.1 
    "max_depth": randint(2, 25), # default 3
    "n_estimators": randint(100, 150), # default 100
    "subsample": uniform(0.6, 0.4)
}

search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=42, n_iter=10, cv=3, verbose=1, n_jobs=8, return_train_score=True)

search.fit(X_train,y_train)



Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
def report_best_scores(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}:".format(results['params'][candidate]))
            print("")
            
report_best_scores(search.cv_results_, 1)

In [43]:
clf=XGBClassifier(colsample_bytree= 0.7467983561008608, gamma= 0.02904180608409973, learning_rate= 0.28985284373248055, 
                  max_depth= 5, n_estimators= 139, subsample= 0.8832290311184181,n_jobs=12)

clf.fit(X_train, y_train, early_stopping_rounds=5, 
             eval_set=[(X_test, y_test)], verbose=False, )

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7467983561008608,
              gamma=0.02904180608409973, learning_rate=0.28985284373248055,
              max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
              n_estimators=139, n_jobs=12, nthread=None,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
              subsample=0.8832290311184181, verbosity=1)

#### Another XGBoost 

In [48]:
clf = XGBClassifier(n_estimators=1000, learning_rate=0.05,n_jobs=12,max_depth=100)
clf.fit(X_train, y_train, early_stopping_rounds=5, 
             eval_set=[(X_test, y_test)], verbose=False)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=100,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=12,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [49]:
prediction=clf.predict(pd.get_dummies(X_test))
pred['building_id']=data['building_id']
pred.rename(columns={0:'damage_grade'},inplace=True)
pred=pred[['building_id','damage_grade']]


# Metric required by the competition
f1_score(y_test,prediction,average='micro')

0.6328351336313578

### Bagging Classifier

##### Decision Tree

In [1]:
from sklearn.ensemble import BaggingClassifier
from sklearn import tree
model = BaggingClassifier(tree.DecisionTreeClassifier(random_state=1),n_jobs=-1)
model.fit(X_train, y_train)
model.score(X_test,y_test)
# 0.7109418468563535

NameError: name 'X_train' is not defined

##### Random Forest

In [11]:
from sklearn.ensemble import BaggingClassifier
from sklearn import tree
model = BaggingClassifier(RandomForestClassifier(random_state=1,max_depth=50),n_jobs=-1,n_estimators=50)
model.fit(X_train, y_train)
model.score(X_test,y_test)
# 0.7173691985955757

0.722165729744249

##### Ada Boost

In [10]:
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(random_state=1)
model.fit(X_train, y_train)
model.score(X_test,y_test)
# 0.81081081081081086

TypeError: __init__() got an unexpected keyword argument 'n_jobs'

##### Gradient Boost

In [25]:
from sklearn.ensemble import GradientBoostingClassifier
model= GradientBoostingClassifier(learning_rate=0.01,random_state=1)
model.fit(X_train, y_train)
model.score(X_test,y_test)
# 0.6326049001362215
# 0.556551102242858 on downsampled data

0.6326049001362215

##### XGBoost

In [16]:
import xgboost as xgb
model=xgb.XGBClassifier(random_state=1,learning_rate=0.01,n_jobs=-1)
model.fit(X_train, y_train)
model.score(X_test,y_test)
# 0.6376892231538152
# 0.5479365322998407 on downsampled data

0.5479365322998407

### Learn on  full training data and predict Test data

In [36]:
# Define X and y variables
X=pd.get_dummies(data.loc[:,:'has_secondary_use_other'])
y=data['damage'].astype(int)

In [25]:
# clf=RandomForestClassifier(n_estimators = 100, oob_score = True)
# clf.fit(X,y)


In [26]:
# Read Test dataset
testdata=pd.read_csv('test_values.csv')
testdata.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,300051,17,596,11307,3,20,7,6,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
1,99355,6,141,11987,2,25,13,5,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,1,1,0,0,0,0,0,0,0,0,0
2,890251,22,19,10044,2,5,4,5,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
3,745817,26,39,633,1,0,19,3,t,r,x,v,j,t,d,0,0,0,0,0,1,0,0,0,0,0,v,2,1,0,0,1,0,0,0,0,0,0,0
4,421793,17,289,7970,3,15,8,7,t,r,q,f,q,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [32]:
# Predict Damage_Grade for test dataset
prediction=clf.predict(pd.get_dummies(testdata))
print(np.unique(prediction))


[1 2 3]


In [33]:
pred=pd.DataFrame(prediction)
pred[0].value_counts()

2    76646
3     7968
1     2254
Name: 0, dtype: int64

In [34]:
# Format the prediction as per submission requirement
pred['building_id']=testdata['building_id']

pred.rename(columns={0:'damage_grade'},inplace=True)

pred=pred[['building_id','damage_grade']]
pred.head()


Unnamed: 0,building_id,damage_grade
0,300051,3
1,99355,2
2,890251,2
3,745817,2
4,421793,3


In [35]:
# Save result to .csv file 
pred.to_csv('result.csv',index=False)

#### Voting Classifer

In [None]:
from sklearn import model_selection
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
# create the sub models
estimators = []
model1 = RandomForestClassifier()
estimators.append(('randomforest', model1))
model2 = XGBClassifier()
estimators.append(('xgb', model2))
model3 = SVC()
estimators.append(('svm', model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X, y, cv=kfold)
print(results.mean())

