<center><h1>Choosing the best model and producing the submission file</h1></center>

The tree-based model was adopted as it had shown better generalization and stability.

In [1]:
import pandas as pd

# loading results
df_xgb = pd.read_csv("./simulation_results/df_xgb.csv", sep='\t')

temp = df_xgb.sort_values("$f_o$", ascending=False).iloc[0,:][0:3].to_dict()
temp.update({'tree_method':'gpu_hist', 'objective':'multi:softmax'})
best_params = {key:value if key!='n_estimators'and key!='max_depth' else int(value) for key, value in temp.items()}
print("best_params:")
display(best_params)

best_params:


{'learning_rate': 0.1,
 'n_estimators': 500,
 'max_depth': 12,
 'tree_method': 'gpu_hist',
 'objective': 'multi:softmax'}

In [2]:
%%time
# training the chosen model in all the training dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

# loading preprocessed data set
X_tr = pd.read_csv("X_tr.csv")
Y_tr = pd.read_csv("Y_tr.csv", header=None)

# Train/validation split
X_train, X_validation, y_train, y_validation = train_test_split(X_tr, Y_tr, test_size=0.2)

# no scaling is needed!!!

# model fitting
xgb = XGBClassifier(**best_params)
xgb.fit(X_train, y_train, eval_set=[(X_validation,y_validation)]
        ,early_stopping_rounds=30, verbose=False)

# model final evaluation on validation set
y_pred = xgb.predict(X_validation)
print("F1_validation: {}".format(f1_score(y_validation, y_pred, average='weighted')))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


F1_validation: 0.7730810044333333
CPU times: user 9min 26s, sys: 2min 27s, total: 11min 53s
Wall time: 11min 53s


In [3]:
import pickle

In [4]:
# saving final model
# with open('model_data.pkl', 'wb') as output:
#     pickle.dump(xgb, output, pickle.HIGHEST_PROTOCOL)

In [5]:
# loading final model
with open('model_data.pkl', 'rb') as input:
    xgb = pickle.load(input)

In [6]:
xgb

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=12,
              min_child_weight=1, missing=nan, n_estimators=500, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, tree_method='gpu_hist', verbosity=1)

In [7]:
# preprocessing of the test data set
test       = pd.read_csv("./dataset/test.csv")
build_own  = pd.read_csv("./dataset/Building_Ownership_Use.csv") 
build_str  = pd.read_csv("./dataset/Building_Structure.csv")
build_data = pd.merge(build_str, build_own,  on=['building_id', 'district_id', 'vdcmun_id', 'ward_id'])
testFull   = pd.merge(test,      build_data, on=['building_id', 'district_id', 'vdcmun_id'])
test_num   = testFull

# encoding to dummies
catFeat = ['area_assesed','district_id','land_surface_condition','foundation_type','roof_type',
           'ground_floor_type','other_floor_type','position','plan_configuration','condition_post_eq',
           'legal_ownership_status']
test_num = pd.get_dummies(test_num, columns=catFeat,                drop_first=True)
test_num = pd.get_dummies(test_num, columns=['has_repair_started'], drop_first=True, dummy_na=True)

# Converting 'building_id' to numerical format
test_num['building_id'] = test_num['building_id'].apply(lambda x: int(x,16))
print("test.shape = {}".format(test_num.shape))

test.shape = (421175, 113)


In [8]:
test_predictions = xgb.predict(test_num)

# Converting predictions to submission format
damage_grade = ['Grade {}'.format(prediction) for prediction in test_predictions]

In [9]:
# creating and saving predictions data frame for submission
submission = pd.DataFrame(data={'building_id':  testFull['building_id'].values,
                                'damage_grade': damage_grade
                               })

submission.head()

Unnamed: 0,building_id,damage_grade
0,a3380c4f75,Grade 3
1,a338a4e653,Grade 5
2,a338a4e6b7,Grade 5
3,a33a6eaa3a,Grade 3
4,a33b073ff6,Grade 5


In [10]:
submission.to_csv('submission.csv', index=False)