# Dependecies

In [1]:
import os 
import numpy as np
import pandas as pd

import lightgbm as lgb

import keras 
from keras.layers import *
from keras.optimizers import *
from keras.models import Model

from sklearn.metrics import f1_score 
from sklearn.model_selection import KFold

In [2]:
DIR  = "data/"
SEED = 1881

if not os.path.isdir("models/"):
    os.makedirs("models")
    
print(os.listdir(DIR))

['Rapport.docx', 'submission_format.csv', 'test_values.csv', 'train_labels.csv', 'train_values.csv']


In [3]:
train_x = pd.read_csv(DIR+"train_values.csv")
train_y = pd.read_csv(DIR+"train_labels.csv")
test_x  = pd.read_csv(DIR+"test_values.csv")
sub_csv = pd.read_csv(DIR+"submission_format.csv")

# Geographic Location ID Embedding w/ Autoencoder

In [4]:
geo1 = np.array(pd.get_dummies(pd.concat([train_x["geo_level_1_id"], test_x["geo_level_1_id"]])))
geo2 = np.array(pd.get_dummies(pd.concat([train_x["geo_level_2_id"], test_x["geo_level_2_id"]])))
geo3 = np.array(pd.get_dummies(pd.concat([train_x["geo_level_3_id"], test_x["geo_level_3_id"]])))

In [5]:
geo3.shape

(347469, 11861)

In [6]:
def NET():
    inp = Input( shape=(geo3.shape[1],))
    i1 = Dense(16, name="intermediate")(inp)
    x2 = Dense(geo2.shape[1], activation='sigmoid')(i1)
    x1 = Dense(geo1.shape[1], activation='sigmoid')(i1)
    
    model = Model(inp, [x2,x1])
    model.compile(loss="binary_crossentropy", optimizer="adam")
    return model

In [7]:
model = NET()
model.fit(geo3, [geo2, geo1], batch_size=128, epochs=10, verbose=2)
model.save("geo_embed.h5")

Epoch 1/10
2715/2715 - 15s - loss: 0.2057 - dense_loss: 0.0532 - dense_1_loss: 0.1525
Epoch 2/10
2715/2715 - 16s - loss: 0.0537 - dense_loss: 0.0052 - dense_1_loss: 0.0486
Epoch 3/10
2715/2715 - 16s - loss: 0.0142 - dense_loss: 0.0041 - dense_1_loss: 0.0101
Epoch 4/10
2715/2715 - 15s - loss: 0.0066 - dense_loss: 0.0035 - dense_1_loss: 0.0031
Epoch 5/10
2715/2715 - 15s - loss: 0.0043 - dense_loss: 0.0030 - dense_1_loss: 0.0012
Epoch 6/10
2715/2715 - 16s - loss: 0.0030 - dense_loss: 0.0024 - dense_1_loss: 5.7560e-04
Epoch 7/10
2715/2715 - 15s - loss: 0.0020 - dense_loss: 0.0017 - dense_1_loss: 2.9792e-04
Epoch 8/10
2715/2715 - 16s - loss: 0.0013 - dense_loss: 0.0011 - dense_1_loss: 1.5836e-04
Epoch 9/10
2715/2715 - 15s - loss: 8.4721e-04 - dense_loss: 7.6138e-04 - dense_1_loss: 8.5835e-05
Epoch 10/10
2715/2715 - 17s - loss: 5.7838e-04 - dense_loss: 5.3077e-04 - dense_1_loss: 4.7616e-05


In [8]:
# Load GEO-Embed Model
model = NET()
model.load_weights("geo_embed.h5")

In [9]:
# "Extract Intermediate Layer" Function
from keras import backend as K

get_int_layer_output = K.function([model.layers[0].input],
                                  [model.layers[1].output])

In [10]:
geo3Expanded = tf.expand_dims(geo3, axis=1)

In [11]:
# Extract GEO-Embeds for all train data points.
# Then assign with train_data


out = []
for dat in geo3Expanded[:260601]:
    layer_output = get_int_layer_output([[dat]])[0]
    out.append(layer_output)

out = np.array(out)
out = np.squeeze(out)

train_data = pd.get_dummies(train_x.copy())
train_data = train_data.drop(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'], axis=1)
train_data = train_data.assign(geo_feat1=out[:,0],
                               geo_feat2=out[:,1],
                               geo_feat3=out[:,2],  
                               geo_feat4=out[:,3],
                               geo_feat5=out[:,4],    
                               geo_feat6=out[:,5],
                               geo_feat7=out[:,6],
                               geo_feat8=out[:,7],
                               geo_feat9=out[:,8],
                               geo_feat10=out[:,9],
                               geo_feat11=out[:,10],
                               geo_feat12=out[:,11],
                               geo_feat13=out[:,12],
                               geo_feat14=out[:,13],
                               geo_feat15=out[:,14],           
                               geo_feat16=out[:,15])

In [12]:
train_data.head()

Unnamed: 0,building_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,...,geo_feat7,geo_feat8,geo_feat9,geo_feat10,geo_feat11,geo_feat12,geo_feat13,geo_feat14,geo_feat15,geo_feat16
0,802906,2,30,6,5,1,1,0,0,0,...,0.844075,1.629543,-1.541913,2.062092,-0.730574,0.891505,-0.115479,1.107712,-0.831422,-1.40173
1,28830,2,10,8,7,0,1,0,0,0,...,-1.631102,-0.40613,0.601423,1.796202,-1.391089,1.531671,-0.643732,-0.822086,-1.54503,0.019457
2,94947,2,10,5,5,0,1,0,0,0,...,-1.986496,1.35311,-2.093531,1.86432,0.532991,1.786665,-1.018599,1.354901,0.208749,-1.243128
3,590882,2,10,6,5,0,1,0,0,0,...,-0.129595,0.732131,-0.381742,1.121952,-1.730679,1.098826,-1.57163,0.912978,-0.981634,0.832327
4,201944,3,30,8,9,1,0,0,0,0,...,-1.846991,1.724917,-1.710472,1.02241,0.794807,1.87563,-0.017773,0.062014,-1.933042,-1.983573


In [13]:
train_data.columns

Index(['building_id', 'count_floors_pre_eq', 'age', 'area_percentage',
       'height_percentage', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'count_families', 'has_secondary_use', 'has_secondary_use_agriculture',
       'has_secondary_use_hotel', 'has_secondary_use_rental',
       'has_secondary_use_institution', 'has_secondary_use_school',
       'has_secondary_use_industry', 'has_secondary_use_health_post',
       'has_secondary_use_gov_office', 'has_secondary_use_use_police',
       'has_secondary_use_other', 'land_surface_condition_n',
       'land_surface_condition_o', 'land_surface_condition

In [14]:
# Extract GEO-Embeds for all test data points.
# Then assign with test_data

out = []
for dat in geo3Expanded[260601:]:
    layer_output = get_int_layer_output([[dat]])[0]
    out.append(layer_output)

out = np.array(out)
out = np.squeeze(out)

test_data = pd.get_dummies(test_x.copy())
test_data = test_data.drop(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'], axis=1)
test_data = test_data.assign(geo_feat1=out[:,0],
                             geo_feat2=out[:,1],
                             geo_feat3=out[:,2],  
                             geo_feat4=out[:,3],
                             geo_feat5=out[:,4],    
                             geo_feat6=out[:,5],
                             geo_feat7=out[:,6],
                             geo_feat8=out[:,7],
                             geo_feat9=out[:,8],
                             geo_feat10=out[:,9],
                             geo_feat11=out[:,10],
                             geo_feat12=out[:,11],
                             geo_feat13=out[:,12],
                             geo_feat14=out[:,13],
                             geo_feat15=out[:,14],           
                             geo_feat16=out[:,15])

In [15]:
test_data.head()

Unnamed: 0,building_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,...,geo_feat7,geo_feat8,geo_feat9,geo_feat10,geo_feat11,geo_feat12,geo_feat13,geo_feat14,geo_feat15,geo_feat16
0,300051,3,20,7,6,0,1,0,0,0,...,-0.675,0.614349,-1.389414,1.320463,-1.732539,1.060726,0.55413,0.308889,-0.575239,0.375221
1,99355,2,25,13,5,0,1,0,0,0,...,0.259388,1.013949,0.518523,0.51164,0.242684,0.535861,-0.683099,0.727221,-1.412074,-1.150415
2,890251,2,5,4,5,0,1,0,0,0,...,-0.42738,-0.299714,0.249709,0.200205,-1.337208,0.851066,-0.681611,0.188826,-1.07359,-0.2465
3,745817,1,0,19,3,0,0,0,0,0,...,-1.792493,2.004001,-2.410599,-0.576685,-0.977422,2.359995,-1.858793,0.477453,1.662198,-0.735731
4,421793,3,15,8,7,0,1,0,0,0,...,0.59899,1.364992,-1.006493,1.054921,-1.392896,1.093669,0.717468,1.190503,-1.531369,0.445548


In [16]:
test_data.columns

Index(['building_id', 'count_floors_pre_eq', 'age', 'area_percentage',
       'height_percentage', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'count_families', 'has_secondary_use', 'has_secondary_use_agriculture',
       'has_secondary_use_hotel', 'has_secondary_use_rental',
       'has_secondary_use_institution', 'has_secondary_use_school',
       'has_secondary_use_industry', 'has_secondary_use_health_post',
       'has_secondary_use_gov_office', 'has_secondary_use_use_police',
       'has_secondary_use_other', 'land_surface_condition_n',
       'land_surface_condition_o', 'land_surface_condition

In [17]:
def threshold_arr(array):
    # Get major confidence-scored predicted value.
    new_arr = []
    for ix, val in enumerate(array):
        loc = np.array(val).argmax(axis=0)
        k = list(np.zeros((len(val))))
        k[loc]=1
        new_arr.append(k)
        
    return np.array(new_arr)

In [24]:
# Add Target variable to training data 
y = np.array(train_y["damage_grade"])-1
df = train_data.drop(["building_id"], axis=1)
x = np.array(df)
x['damage']=y['damage_grade']

TypeError: 'Dataset' object does not support item assignment

In [None]:
# Check distribution of target variable to find out if there is class imbaance problem
x['damage'].value_counts()

In [None]:
# Add a column for volumne percentage 
x['volume_percentage']=x['area_percentage']* x['height_percentage']
x.columns

In [None]:
from xgboost import XGBClassifier


In [None]:
# Define X and y variables
X=pd.get_dummies(x.drop(columns=['building_id']))
X=pd.get_dummies(X.drop(columns=['damage']))
Y=x['damage'].astype(int)

In [None]:
# Initiate classifier using best params selected after grid search
clf=XGBClassifier(n_jobs=-1,n_estimators= 600, max_depth= 10,learning_rate= 0.1)

In [None]:
# Train on full training data
clf.fit(X,Y)

In [None]:
# Add volume percentage in test data
test_data['volume_percentage']=test_data['area_percentage']* test_data['height_percentage']


In [None]:
# Predict for test dataset
prediction=clf.predict((pd.get_dummies(test_data)))

In [None]:
# Format the prediction as per submission requirement
result=pd.DataFrame(prediction)
result['building_id']=X_test['building_id']
result.rename(columns={0:'damage_grade'},inplace=True)
result=result[['building_id','damage_grade']]
result.head()

# LightGBM Training

In [18]:
y = np.array(train_y["damage_grade"])-1

df = train_data.drop(["building_id"], axis=1)
x = np.array(df)

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
for ix, (train_index, test_index) in enumerate(kf.split(x)):
    lgb_params = {
        "objective" : "multiclass",
        "num_class":3,
        "metric" : "multi_error",
        "boosting": 'gbdt',
        "max_depth" : -1,
        "num_leaves" : 30,
        "learning_rate" : 0.1,
        "feature_fraction" : 0.5,
        "min_sum_hessian_in_leaf" : 0.1,
        "max_bin":8192,
        "verbosity" : 1,
        "num_threads":6,
        "seed": SEED
    }

    x_train, x_val, y_train, y_val= x[train_index], x[test_index], y[train_index], y[test_index]

    train_data = lgb.Dataset(x_train, label=y_train)
    val_data   = lgb.Dataset(x_val, label=y_val)

    lgb_clf = lgb.train(lgb_params,
                        train_data,
                        20000,
                        valid_sets = [val_data],
                        early_stopping_rounds=3000,
                        verbose_eval = 1000)

    y_pred = lgb_clf.predict(x_val)
    print("F1-MICRO SCORE: ", f1_score(np.array(pd.get_dummies(y_val)), threshold_arr(y_pred), average='micro'))
    lgb_clf.save_model(f'models/model{ix}.txt')

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 122305
[LightGBM] [Info] Number of data points in the train set: 208480, number of used features: 81
[LightGBM] [Info] Start training from score -2.333567
[LightGBM] [Info] Start training from score -0.564567
[LightGBM] [Info] Start training from score -1.095284
Training until validation scores don't improve for 3000 rounds
[1000]	valid_0's multi_error: 0.246925
[2000]	valid_0's multi_error: 0.245736
[3000]	valid_0's multi_error: 0.247002
[4000]	valid_0's multi_error: 0.247904
Early stopping, best iteration is:
[1840]	valid_0's multi_error: 0.245275
F1-MICRO SCORE:  0.7547245831814432
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 122213
[LightGBM] [Info] Number of data points in the train set: 208481, number of used features: 79
[LightGBM] [Info] Start training from score -2.343166
[LightGBM] [Info] Start training from score -0.562987
[LightGBM] [Info] Start t

# Create Submission File

In [19]:
# Load all LightGB Models and concatenate.
models = []
for i in range(5):
    model = lgb.Booster(model_file=f'models/model{i}.txt')

    y_pred = model.predict(x)
    score  = f1_score(np.array(pd.get_dummies(y)), threshold_arr(y_pred), average='micro')
    print("F1-MICRO SCORE: ", score)
    models.append(model)

F1-MICRO SCORE:  0.8166392300873749
F1-MICRO SCORE:  0.8194020744356315
F1-MICRO SCORE:  0.7932548225064371
F1-MICRO SCORE:  0.838546283398759
F1-MICRO SCORE:  0.8092793197263249


In [20]:
def ensemble(models, x):
    # Ensemble K-Fold CV models with adding all confidence score by class.
    y_preds = []
    
    for model in models:
        y_pred = model.predict(x)
        y_preds.append(y_pred)
        
    init_y_pred = y_preds[0]
    for ypred in y_preds[1:]:
        init_y_pred += ypred
        
    y_pred = threshold_arr(init_y_pred)
    
    return y_pred

In [21]:
df = test_data.drop(["building_id"], axis=1)
x = np.array(df)

In [22]:
y_pred = ensemble(models, x)
y_pred = y_pred.argmax(axis=1)+1

In [23]:
sub_csv["damage_grade"] = y_pred
sub_csv.to_csv("submission.csv", index=False)