In [1]:
import numpy as np
import pandas as pd
import seaborn as sns 
%matplotlib inline

In [2]:
train_data = pd.read_csv("./Dataset/train.csv")

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 631761 entries, 0 to 631760
Data columns (total 14 columns):
area_assesed                             631761 non-null object
building_id                              631761 non-null object
damage_grade                             631761 non-null object
district_id                              631761 non-null int64
has_geotechnical_risk                    631761 non-null float64
has_geotechnical_risk_fault_crack        631761 non-null int64
has_geotechnical_risk_flood              631761 non-null int64
has_geotechnical_risk_land_settlement    631761 non-null int64
has_geotechnical_risk_landslide          631761 non-null int64
has_geotechnical_risk_liquefaction       631761 non-null int64
has_geotechnical_risk_other              631761 non-null int64
has_geotechnical_risk_rock_fall          631761 non-null int64
has_repair_started                       598344 non-null float64
vdcmun_id                                631761 non-null int64


In [4]:
train_data.head(5)

Unnamed: 0,area_assesed,building_id,damage_grade,district_id,has_geotechnical_risk,has_geotechnical_risk_fault_crack,has_geotechnical_risk_flood,has_geotechnical_risk_land_settlement,has_geotechnical_risk_landslide,has_geotechnical_risk_liquefaction,has_geotechnical_risk_other,has_geotechnical_risk_rock_fall,has_repair_started,vdcmun_id
0,Both,24385bfd2a2,Grade 4,24,0.0,0,0,0,0,0,0,0,0.0,2489
1,Both,405d1bbebbf,Grade 2,44,0.0,0,0,0,0,0,0,0,1.0,4423
2,Both,351d9bc71f6,Grade 1,36,0.0,0,0,0,0,0,0,0,0.0,3650
3,Building removed,2be3a971166,Grade 5,30,0.0,0,0,0,0,0,0,0,0.0,3016
4,Both,34c7d073ea6,Grade 3,36,0.0,0,0,0,0,0,0,0,0.0,3627


In [5]:
train_data['area_assesed'].unique()

array(['Both', 'Building removed', 'Exterior', 'Not able to inspect',
       'Interior'], dtype=object)

In [6]:
train_data['damage_grade'].unique()

array(['Grade 4', 'Grade 2', 'Grade 1', 'Grade 5', 'Grade 3'], dtype=object)

In [7]:
train_data['has_geotechnical_risk'].unique()

array([ 0.,  1.])

In [8]:
len(train_data['vdcmun_id'].unique())

1420

In [9]:
len(train_data['district_id'].unique())

31

In [10]:
## filling missing values ##
train_data['has_repair_started'].fillna(0,inplace = True)

In [11]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 631761 entries, 0 to 631760
Data columns (total 14 columns):
area_assesed                             631761 non-null object
building_id                              631761 non-null object
damage_grade                             631761 non-null object
district_id                              631761 non-null int64
has_geotechnical_risk                    631761 non-null float64
has_geotechnical_risk_fault_crack        631761 non-null int64
has_geotechnical_risk_flood              631761 non-null int64
has_geotechnical_risk_land_settlement    631761 non-null int64
has_geotechnical_risk_landslide          631761 non-null int64
has_geotechnical_risk_liquefaction       631761 non-null int64
has_geotechnical_risk_other              631761 non-null int64
has_geotechnical_risk_rock_fall          631761 non-null int64
has_repair_started                       631761 non-null float64
vdcmun_id                                631761 non-null int64


In [12]:
## dropping columns which are not useful ##
def drop_features(data,features):
    data.drop(features,inplace = True, axis = 1)

In [13]:
def convert_categorical_to_numerical(data,feature):
    return pd.get_dummies(data[feature], drop_first = True, prefix = feature)

In [14]:
new_col = convert_categorical_to_numerical(train_data,'area_assesed')
train_data = pd.concat([train_data,new_col],axis=1)
drop_features(train_data,'area_assesed')

In [15]:
## deep analysis ##
building_structure = pd.read_csv('./Dataset/Building_Structure.csv')

In [16]:
building_structure.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1052948 entries, 0 to 1052947
Data columns (total 29 columns):
building_id                               1052948 non-null object
district_id                               1052948 non-null int64
vdcmun_id                                 1052948 non-null int64
ward_id                                   1052948 non-null int64
count_floors_pre_eq                       1052948 non-null int64
count_floors_post_eq                      1052948 non-null int64
age_building                              1052948 non-null int64
plinth_area_sq_ft                         1052948 non-null int64
height_ft_pre_eq                          1052948 non-null int64
height_ft_post_eq                         1052948 non-null int64
land_surface_condition                    1052948 non-null object
foundation_type                           1052948 non-null object
roof_type                                 1052948 non-null object
ground_floor_type                     

In [17]:
building_structure.dropna(inplace = True)

In [20]:
categorical_values = [col for col in building_structure.columns if building_structure[col].dtype == 'O']
print(categorical_values)

['building_id', 'land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'condition_post_eq']


In [21]:
for col in categorical_values[1:]:
    new_col = convert_categorical_to_numerical(building_structure,col)
    building_structure = pd.concat([building_structure,new_col],axis=1)

In [24]:
common_cols = list(set(train_data.columns).intersection(set(building_structure.columns)))
common_cols

['vdcmun_id', 'district_id', 'building_id']

In [26]:
common_cols.pop(2)

'building_id'

In [27]:
common_cols

['vdcmun_id', 'district_id']

In [28]:
drop_features(building_structure,categorical_values[1:] + common_cols)

In [29]:
## merging with train data ##

In [30]:
[col for col in building_structure.columns if building_structure[col].dtype == 'O']

['building_id']

In [31]:
train_data = pd.merge(train_data,building_structure,how="left",on=['building_id'])

In [32]:
[col for col in train_data.columns if train_data[col].dtype == 'O']

['building_id', 'damage_grade']

In [33]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 631761 entries, 0 to 631760
Data columns (total 69 columns):
building_id                                                  631761 non-null object
damage_grade                                                 631761 non-null object
district_id                                                  631761 non-null int64
has_geotechnical_risk                                        631761 non-null float64
has_geotechnical_risk_fault_crack                            631761 non-null int64
has_geotechnical_risk_flood                                  631761 non-null int64
has_geotechnical_risk_land_settlement                        631761 non-null int64
has_geotechnical_risk_landslide                              631761 non-null int64
has_geotechnical_risk_liquefaction                           631761 non-null int64
has_geotechnical_risk_other                                  631761 non-null int64
has_geotechnical_risk_rock_fall                        

In [34]:
## deep analysis on building ownership ##

In [35]:
building_ownership = pd.read_csv('./Dataset/Building_Ownership_Use.csv')

In [36]:
building_ownership.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1052948 entries, 0 to 1052947
Data columns (total 17 columns):
building_id                      1052948 non-null object
district_id                      1052948 non-null int64
vdcmun_id                        1052948 non-null int64
ward_id                          1052948 non-null int64
legal_ownership_status           1052948 non-null object
count_families                   1052946 non-null float64
has_secondary_use                1052938 non-null float64
has_secondary_use_agriculture    1052948 non-null int64
has_secondary_use_hotel          1052948 non-null int64
has_secondary_use_rental         1052948 non-null int64
has_secondary_use_institution    1052948 non-null int64
has_secondary_use_school         1052948 non-null int64
has_secondary_use_industry       1052948 non-null int64
has_secondary_use_health_post    1052948 non-null int64
has_secondary_use_gov_office     1052948 non-null int64
has_secondary_use_use_police     1052948 

In [37]:
building_ownership['legal_ownership_status'].unique()

array(['Private', 'Other', 'Public', 'Institutional'], dtype=object)

In [38]:
new_col = convert_categorical_to_numerical(building_ownership,'legal_ownership_status')
building_ownership = pd.concat([building_ownership,new_col],axis=1)

In [40]:
common_cols = list(set(train_data.columns).intersection(set(building_ownership.columns)))
common_cols

['building_id', 'vdcmun_id', 'district_id', 'ward_id']

In [41]:
common_cols.pop(0)

'building_id'

In [42]:
common_cols

['vdcmun_id', 'district_id', 'ward_id']

In [43]:
drop_features(building_ownership,['legal_ownership_status'] + common_cols)

In [44]:
building_ownership.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1052948 entries, 0 to 1052947
Data columns (total 16 columns):
building_id                       1052948 non-null object
count_families                    1052946 non-null float64
has_secondary_use                 1052938 non-null float64
has_secondary_use_agriculture     1052948 non-null int64
has_secondary_use_hotel           1052948 non-null int64
has_secondary_use_rental          1052948 non-null int64
has_secondary_use_institution     1052948 non-null int64
has_secondary_use_school          1052948 non-null int64
has_secondary_use_industry        1052948 non-null int64
has_secondary_use_health_post     1052948 non-null int64
has_secondary_use_gov_office      1052948 non-null int64
has_secondary_use_use_police      1052948 non-null int64
has_secondary_use_other           1052948 non-null int64
legal_ownership_status_Other      1052948 non-null uint8
legal_ownership_status_Private    1052948 non-null uint8
legal_ownership_status_Publ

In [45]:
## merging building_ownership with train data ##

In [47]:
train_data = pd.merge(train_data,building_ownership,how="left",on="building_id")

In [48]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 631761 entries, 0 to 631760
Data columns (total 84 columns):
building_id                                                  631761 non-null object
damage_grade                                                 631761 non-null object
district_id                                                  631761 non-null int64
has_geotechnical_risk                                        631761 non-null float64
has_geotechnical_risk_fault_crack                            631761 non-null int64
has_geotechnical_risk_flood                                  631761 non-null int64
has_geotechnical_risk_land_settlement                        631761 non-null int64
has_geotechnical_risk_landslide                              631761 non-null int64
has_geotechnical_risk_liquefaction                           631761 non-null int64
has_geotechnical_risk_other                                  631761 non-null int64
has_geotechnical_risk_rock_fall                        

In [57]:
## we have one null value in count_families ##
train_data['count_families'].fillna(train_data['count_families'].mode()[0],inplace=True)

In [58]:
## fitting the model with the train data ##

In [59]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

In [60]:
model.fit(train_data.drop(['building_id','damage_grade'],axis = 1),train_data['damage_grade'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [61]:
## reading test data ##
test_data = pd.read_csv('./Dataset/test.csv')

In [62]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421175 entries, 0 to 421174
Data columns (total 13 columns):
area_assesed                             421175 non-null object
building_id                              421175 non-null object
district_id                              421175 non-null int64
has_geotechnical_risk                    421175 non-null float64
has_geotechnical_risk_fault_crack        421175 non-null int64
has_geotechnical_risk_flood              421175 non-null int64
has_geotechnical_risk_land_settlement    421175 non-null int64
has_geotechnical_risk_landslide          421175 non-null int64
has_geotechnical_risk_liquefaction       421175 non-null int64
has_geotechnical_risk_other              421175 non-null int64
has_geotechnical_risk_rock_fall          421175 non-null int64
has_repair_started                       399253 non-null float64
vdcmun_id                                421175 non-null int64
dtypes: float64(2), int64(9), object(2)
memory usage: 41.8+ MB


In [63]:
## filling missing values ##
test_data['has_repair_started'].fillna(test_data['has_repair_started'].mode()[0], inplace = True)

In [64]:
area_assesed = convert_categorical_to_numerical(test_data,'area_assesed')
test_data = pd.concat([test_data,area_assesed], axis = 1)

In [66]:
drop_features(test_data,['area_assesed'])

In [71]:
common_cols = list(set(test_data.columns).intersection(set(building_structure.columns)))
common_cols

['building_id']

In [67]:
## merging building structure data with test data ##

In [68]:
test_data = pd.merge(test_data,building_structure,how='left',on='building_id')

In [69]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 421175 entries, 0 to 421174
Data columns (total 68 columns):
building_id                                                  421175 non-null object
district_id                                                  421175 non-null int64
has_geotechnical_risk                                        421175 non-null float64
has_geotechnical_risk_fault_crack                            421175 non-null int64
has_geotechnical_risk_flood                                  421175 non-null int64
has_geotechnical_risk_land_settlement                        421175 non-null int64
has_geotechnical_risk_landslide                              421175 non-null int64
has_geotechnical_risk_liquefaction                           421175 non-null int64
has_geotechnical_risk_other                                  421175 non-null int64
has_geotechnical_risk_rock_fall                              421175 non-null int64
has_repair_started                                      

In [72]:
common_cols = list(set(test_data.columns).intersection(set(building_ownership.columns)))
common_cols

['building_id']

In [73]:
## merge building ownership data with test data ##

In [74]:
test_data = pd.merge(test_data,building_ownership,how='left',on='building_id')

In [75]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 421175 entries, 0 to 421174
Data columns (total 83 columns):
building_id                                                  421175 non-null object
district_id                                                  421175 non-null int64
has_geotechnical_risk                                        421175 non-null float64
has_geotechnical_risk_fault_crack                            421175 non-null int64
has_geotechnical_risk_flood                                  421175 non-null int64
has_geotechnical_risk_land_settlement                        421175 non-null int64
has_geotechnical_risk_landslide                              421175 non-null int64
has_geotechnical_risk_liquefaction                           421175 non-null int64
has_geotechnical_risk_other                                  421175 non-null int64
has_geotechnical_risk_rock_fall                              421175 non-null int64
has_repair_started                                      

In [76]:
building_id = test_data['building_id']
drop_features(test_data,['building_id'])

In [77]:
predictions = model.predict(test_data)

In [78]:
final_result = pd.DataFrame({ 'building_id' : building_id , 'damage_grade' : predictions})
final_result.to_csv('output.csv', index = False)