# Dependecies

In [38]:
import os 
import numpy as np
import pandas as pd

import lightgbm as lgb

import keras 
from keras.layers import *
from keras.optimizers import *
from keras.models import Model

from sklearn.metrics import f1_score 
from sklearn.model_selection import KFold

In [39]:
DIR  = "data/"
SEED = 1881

if not os.path.isdir("models/"):
    os.makedirs("models")
    
print(os.listdir(DIR))

['submission_format.csv', 'test_values.csv', 'train_labels.csv', 'train_values.csv']


In [40]:
train_x = pd.read_csv(DIR+"train_values.csv")
train_y = pd.read_csv(DIR+"train_labels.csv")
test_x  = pd.read_csv(DIR+"test_values.csv")
sub_csv = pd.read_csv(DIR+"submission_format.csv")

# Geographic Location ID Embedding w/ Autoencoder

In [41]:
geo1 = np.array(pd.get_dummies(pd.concat([train_x["geo_level_1_id"], test_x["geo_level_1_id"]])))
geo2 = np.array(pd.get_dummies(pd.concat([train_x["geo_level_2_id"], test_x["geo_level_2_id"]])))
geo3 = np.array(pd.get_dummies(pd.concat([train_x["geo_level_3_id"], test_x["geo_level_3_id"]])))

In [42]:
geo3.shape

(347469, 11861)

In [45]:
def NET():
    inp = Input( shape=(geo3.shape[1],))
    i1 = Dense(16, name="intermediate")(inp)
    x2 = Dense(geo2.shape[1], activation='sigmoid')(i1)
    x1 = Dense(geo1.shape[1], activation='sigmoid')(i1)
    
    model = Model(inp, [x2,x1])
    model.compile(loss="binary_crossentropy", optimizer="adam")
    return model

In [46]:
model = NET()
model.fit(geo3, [geo2, geo1], batch_size=128, epochs=10, verbose=2)
model.save("geo_embed.h5")

Epoch 1/10
2715/2715 - 17s - loss: 0.2054 - dense_10_loss: 0.0532 - dense_11_loss: 0.1522
Epoch 2/10
2715/2715 - 16s - loss: 0.0531 - dense_10_loss: 0.0052 - dense_11_loss: 0.0480
Epoch 3/10
2715/2715 - 15s - loss: 0.0146 - dense_10_loss: 0.0041 - dense_11_loss: 0.0105
Epoch 4/10
2715/2715 - 15s - loss: 0.0067 - dense_10_loss: 0.0035 - dense_11_loss: 0.0032
Epoch 5/10
2715/2715 - 15s - loss: 0.0043 - dense_10_loss: 0.0030 - dense_11_loss: 0.0013
Epoch 6/10
2715/2715 - 15s - loss: 0.0030 - dense_10_loss: 0.0024 - dense_11_loss: 6.0220e-04
Epoch 7/10
2715/2715 - 14s - loss: 0.0020 - dense_10_loss: 0.0017 - dense_11_loss: 3.0605e-04
Epoch 8/10
2715/2715 - 15s - loss: 0.0013 - dense_10_loss: 0.0011 - dense_11_loss: 1.6005e-04
Epoch 9/10
2715/2715 - 15s - loss: 8.3416e-04 - dense_10_loss: 7.4900e-04 - dense_11_loss: 8.5161e-05
Epoch 10/10
2715/2715 - 15s - loss: 5.6902e-04 - dense_10_loss: 5.2231e-04 - dense_11_loss: 4.6712e-05


In [None]:
# Load GEO-Embed Model
model = NET()
model.load_weights("geo_embed.h5")

In [None]:
# "Extract Intermediate Layer" Function
from keras import backend as K

get_int_layer_output = K.function([model.layers[0].input],
                                  [model.layers[1].output])

In [47]:
geo3Expanded = tf.expand_dims(geo3, axis=1)

In [48]:
# Extract GEO-Embeds for all train data points.
# Then assign with train_data


out = []
for dat in geo3Expanded[:260601]:
    layer_output = get_int_layer_output([[dat]])[0]
    out.append(layer_output)

out = np.array(out)
out = np.squeeze(out)

train_data = pd.get_dummies(train_x.copy())
train_data = train_data.drop(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'], axis=1)
train_data = train_data.assign(geo_feat1=out[:,0],
                               geo_feat2=out[:,1],
                               geo_feat3=out[:,2],  
                               geo_feat4=out[:,3],
                               geo_feat5=out[:,4],    
                               geo_feat6=out[:,5],
                               geo_feat7=out[:,6],
                               geo_feat8=out[:,7],
                               geo_feat9=out[:,8],
                               geo_feat10=out[:,9],
                               geo_feat11=out[:,10],
                               geo_feat12=out[:,11],
                               geo_feat13=out[:,12],
                               geo_feat14=out[:,13],
                               geo_feat15=out[:,14],           
                               geo_feat16=out[:,15])

In [50]:
train_data.head()

Unnamed: 0,building_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,...,geo_feat7,geo_feat8,geo_feat9,geo_feat10,geo_feat11,geo_feat12,geo_feat13,geo_feat14,geo_feat15,geo_feat16
0,802906,2,30,6,5,1,1,0,0,0,...,1.298078,1.887222,1.773457,-1.606197,0.956388,-1.149275,-0.635502,-0.690516,-2.072248,-1.572519
1,28830,2,10,8,7,0,1,0,0,0,...,0.978798,0.707814,1.624692,-2.131671,1.214296,-1.321,1.097184,-0.870171,-1.831236,-1.476778
2,94947,2,10,5,5,0,1,0,0,0,...,2.855777,1.401815,2.107203,-3.018548,0.067623,0.475801,0.703143,1.704041,-1.125114,-2.531196
3,590882,2,10,6,5,0,1,0,0,0,...,1.135501,1.827378,1.406697,-2.348413,1.688675,0.774404,0.590601,1.712054,-1.901126,-1.193765
4,201944,3,30,8,9,1,0,0,0,0,...,-0.650774,1.102846,0.713444,-2.213503,1.267997,-1.579288,1.734004,1.563726,-0.64529,-2.48516


In [51]:
train_data.columns

Index(['building_id', 'count_floors_pre_eq', 'age', 'area_percentage',
       'height_percentage', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'count_families', 'has_secondary_use', 'has_secondary_use_agriculture',
       'has_secondary_use_hotel', 'has_secondary_use_rental',
       'has_secondary_use_institution', 'has_secondary_use_school',
       'has_secondary_use_industry', 'has_secondary_use_health_post',
       'has_secondary_use_gov_office', 'has_secondary_use_use_police',
       'has_secondary_use_other', 'land_surface_condition_n',
       'land_surface_condition_o', 'land_surface_condition

In [53]:
# Extract GEO-Embeds for all test data points.
# Then assign with test_data

out = []
for dat in geo3Expanded[260601:]:
    layer_output = get_int_layer_output([[dat]])[0]
    out.append(layer_output)

out = np.array(out)
out = np.squeeze(out)

test_data = pd.get_dummies(test_x.copy())
test_data = test_data.drop(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'], axis=1)
test_data = test_data.assign(geo_feat1=out[:,0],
                             geo_feat2=out[:,1],
                             geo_feat3=out[:,2],  
                             geo_feat4=out[:,3],
                             geo_feat5=out[:,4],    
                             geo_feat6=out[:,5],
                             geo_feat7=out[:,6],
                             geo_feat8=out[:,7],
                             geo_feat9=out[:,8],
                             geo_feat10=out[:,9],
                             geo_feat11=out[:,10],
                             geo_feat12=out[:,11],
                             geo_feat13=out[:,12],
                             geo_feat14=out[:,13],
                             geo_feat15=out[:,14],           
                             geo_feat16=out[:,15])

In [54]:
test_data.head()

Unnamed: 0,building_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,...,geo_feat7,geo_feat8,geo_feat9,geo_feat10,geo_feat11,geo_feat12,geo_feat13,geo_feat14,geo_feat15,geo_feat16
0,300051,3,20,7,6,0,1,0,0,0,...,1.840251,1.579733,0.319355,-1.440094,1.346968,-0.206693,0.317856,-0.315186,0.391554,-0.087974
1,99355,2,25,13,5,0,1,0,0,0,...,0.6737,0.684438,0.8907,-0.602692,1.234751,-0.979126,-0.891809,0.4547,-1.59686,0.288314
2,890251,2,5,4,5,0,1,0,0,0,...,0.564316,1.182937,0.468564,-1.877435,1.038722,-0.136939,-0.342095,1.024529,-0.981958,-0.437928
3,745817,1,0,19,3,0,0,0,0,0,...,2.473279,3.447883,1.050886,-1.3416,1.213109,-2.727836,1.263915,2.56065,-2.137207,-1.2418
4,421793,3,15,8,7,0,1,0,0,0,...,1.912903,1.471192,0.755208,-1.445566,1.361968,-0.625906,1.252519,0.65118,0.457922,0.214106


In [55]:
test_data.columns

Index(['building_id', 'count_floors_pre_eq', 'age', 'area_percentage',
       'height_percentage', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'count_families', 'has_secondary_use', 'has_secondary_use_agriculture',
       'has_secondary_use_hotel', 'has_secondary_use_rental',
       'has_secondary_use_institution', 'has_secondary_use_school',
       'has_secondary_use_industry', 'has_secondary_use_health_post',
       'has_secondary_use_gov_office', 'has_secondary_use_use_police',
       'has_secondary_use_other', 'land_surface_condition_n',
       'land_surface_condition_o', 'land_surface_condition

In [56]:
def threshold_arr(array):
    # Get major confidence-scored predicted value.
    new_arr = []
    for ix, val in enumerate(array):
        loc = np.array(val).argmax(axis=0)
        k = list(np.zeros((len(val))))
        k[loc]=1
        new_arr.append(k)
        
    return np.array(new_arr)

# LightGBM Training

In [57]:
y = np.array(train_y["damage_grade"])-1

df = train_data.drop(["building_id"], axis=1)
x = np.array(df)

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
for ix, (train_index, test_index) in enumerate(kf.split(x)):
    lgb_params = {
        "objective" : "multiclass",
        "num_class":3,
        "metric" : "multi_error",
        "boosting": 'gbdt',
        "max_depth" : -1,
        "num_leaves" : 30,
        "learning_rate" : 0.1,
        "feature_fraction" : 0.5,
        "min_sum_hessian_in_leaf" : 0.1,
        "max_bin":8192,
        "verbosity" : 1,
        "num_threads":6,
        "seed": SEED
    }

    x_train, x_val, y_train, y_val= x[train_index], x[test_index], y[train_index], y[test_index]

    train_data = lgb.Dataset(x_train, label=y_train)
    val_data   = lgb.Dataset(x_val, label=y_val)

    lgb_clf = lgb.train(lgb_params,
                        train_data,
                        20000,
                        valid_sets = [val_data],
                        early_stopping_rounds=3000,
                        verbose_eval = 1000)

    y_pred = lgb_clf.predict(x_val)
    print("F1-MICRO SCORE: ", f1_score(np.array(pd.get_dummies(y_val)), threshold_arr(y_pred), average='micro'))
    lgb_clf.save_model(f'models/model{ix}.txt')

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 120542
[LightGBM] [Info] Number of data points in the train set: 208480, number of used features: 81
[LightGBM] [Info] Start training from score -2.333567
[LightGBM] [Info] Start training from score -0.564567
[LightGBM] [Info] Start training from score -1.095284
Training until validation scores don't improve for 3000 rounds
[1000]	valid_0's multi_error: 0.24729
[2000]	valid_0's multi_error: 0.246695
[3000]	valid_0's multi_error: 0.248364
[4000]	valid_0's multi_error: 0.249803
Early stopping, best iteration is:
[1703]	valid_0's multi_error: 0.245717
F1-MICRO SCORE:  0.7542833023157651
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 120507
[LightGBM] [Info] Number of data points in the train set: 208481, number of used features: 79
[LightGBM] [Info] Start training from score -2.343166
[LightGBM] [Info] Start training from score -0.562987
[LightGBM] [Info] Start tr

KeyboardInterrupt: 

# Create Submission File

In [58]:
# Load all LightGB Models and concatenate.
models = []
for i in range(5):
    model = lgb.Booster(model_file=f'models/model{i}.txt')

    y_pred = model.predict(x)
    score  = f1_score(np.array(pd.get_dummies(y)), threshold_arr(y_pred), average='micro')
    print("F1-MICRO SCORE: ", score)
    models.append(model)

F1-MICRO SCORE:  0.8127712479998158
F1-MICRO SCORE:  0.8276253736555117
F1-MICRO SCORE:  0.8206914017981511


LightGBMError: Could not open models/model3.txt

In [59]:
def ensemble(models, x):
    # Ensemble K-Fold CV models with adding all confidence score by class.
    y_preds = []
    
    for model in models:
        y_pred = model.predict(x)
        y_preds.append(y_pred)
        
    init_y_pred = y_preds[0]
    for ypred in y_preds[1:]:
        init_y_pred += ypred
        
    y_pred = threshold_arr(init_y_pred)
    
    return y_pred

In [60]:
df = test_data.drop(["building_id"], axis=1)
x = np.array(df)

In [61]:
y_pred = ensemble(models, x)
y_pred = y_pred.argmax(axis=1)+1

In [62]:
sub_csv["damage_grade"] = y_pred
sub_csv.to_csv("submission.csv", index=False)