In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_validate,KFold, RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.utils import resample

from xgboost import XGBClassifier


In [2]:
# Pre set values for max cols and chart size
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

plt.rcParams["figure.figsize"] = (15,5)

In [3]:
# Read training data
data=pd.read_csv('Data/train_values.csv')
data.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [4]:
# Read table with target variable 
label=pd.read_csv('Data/train_labels.csv')
label.head()

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [5]:
# Add Target variable to training data 
data['damage']=label['damage_grade']


In [6]:
# Check distribution of target variable to find out if there is class imbaance problem
data['damage'].value_counts()

2    148259
3     87218
1     25124
Name: damage, dtype: int64

In [7]:
# Define X and y variables
X=pd.get_dummies(data.loc[:,:'has_secondary_use_other'])
y=data['damage'].astype(int)-1

### Light XGBoost

In [9]:
import lightgbm as lgbm

In [10]:
# Split data into train and test sections
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=48796)

In [11]:
train_data=lgbm.Dataset(X,label=y)

In [12]:
#setting parameters for lightgbm
param = {'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class':3,
    'learning_rate': 0.1,
    'max_depth': 10,
    'num_leaves': 20,
    'is_unbalanced':True,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7}

In [13]:
# Function to calculate F-1 Score
def lgb_f1_score(y_pred,data):
    y_true = data.get_label().astype('int')
    y_pred =y_pred.reshape((3,-1)).argmax(axis=0) 
    y_pred=np.round(y_pred)
    return 'f1', f1_score(y_true, y_pred,average='micro'), True

In [15]:
# Train the model
clf=lgbm.train(param,
                       train_data,
                       num_boost_round=5000,                      
                       verbose_eval=500)

In [19]:
#Predicting on test dataset
ypred2=clf.predict(pd.get_dummies(X_test))

In [20]:
# Adding 1 to the prediction because the prediction results are starting from 0 instead of 1
p=pd.DataFrame(ypred2.argmax(axis=1) +1)
p.head()

Unnamed: 0,0
0,1
1,3
2,3
3,2
4,2


In [21]:
# Format the prediction as per submission requirement
p['building_id']=X_test['building_id']
p['damage_grade']=p[0].round().astype(int)
p.drop(columns=[0],inplace=True)
p.head()

Unnamed: 0,building_id,damage_grade
0,,1
1,,3
2,94947.0,3
3,,2
4,,2


In [22]:
# Save result
p.to_csv('result.csv',index=False)

In [23]:
# F-1 Score
f1_score(y_test,p,average='micro')

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').