In [1]:
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

In [2]:
train = pd.read_csv('Train/train_set_geo_level_sum.csv')

train.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,count_families,geo_level_1_id_sum,geo_level_2_id_sum,geo_level_3_id_sum,damage_grade
0,6,487,12198,2,30,6,5,1,1,0,...,0,0,0,0,0,1,24381,270,37,3
1,8,900,2812,2,10,8,7,0,1,0,...,0,0,0,0,0,1,19080,199,16,2
2,21,363,8973,2,10,5,5,0,1,0,...,0,0,0,0,0,1,14889,1760,136,3
3,22,418,10694,2,10,6,5,0,1,0,...,1,1,0,0,0,1,6252,205,31,2
4,11,131,1488,3,30,8,9,1,0,0,...,0,0,0,0,0,1,8220,1038,122,3


In [3]:
X, y = train.iloc[:,:-1], train.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [30]:
params = {
    'boosting_type': ['gbdt'],
    'num_leaves': [10, 20, 30],
    'max_depth': [10, 20, 50],
    'learning_rate': [0.1, 0.01, 0.5],
    'n_estimators': [10, 30, 40],
    'objective': ['multiclass'],
    'random_state': [123],
    'silent': [True],
    'force_row_wise': [False],
    'force_col_wise': [False]
}

In [32]:
opt = RandomizedSearchCV(lgb.LGBMClassifier(), params, cv=10, n_iter=81)
opt.fit(X_train, y_train)

RandomizedSearchCV(cv=10, estimator=LGBMClassifier(), n_iter=81,
                   param_distributions={'boosting_type': ['gbdt'],
                                        'force_col_wise': [False],
                                        'force_row_wise': [False],
                                        'learning_rate': [0.1, 0.01, 0.5],
                                        'max_depth': [10, 20, 50],
                                        'n_estimators': [10, 30, 40],
                                        'num_leaves': [10, 20, 30],
                                        'objective': ['multiclass'],
                                        'random_state': [123],
                                        'silent': [True]})

In [33]:
opt.best_estimator_

LGBMClassifier(force_col_wise=False, force_row_wise=False, learning_rate=0.5,
               max_depth=10, n_estimators=40, num_leaves=30,
               objective='multiclass', random_state=123)

In [34]:
preds = opt.predict(X_test)
f1 = f1_score(y_test, preds, average='micro')
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.727883


In [4]:
params = {
    'boosting_type': ['gbdt'],
    'num_leaves': [30, 40, 50],
    'max_depth': [5, 10],
    'learning_rate': [0.1, 0.5, 0.7],
    'n_estimators': [40, 50, 60],
    'objective': ['multiclass'],
    'random_state': [123],
    'silent': [True],
    'force_row_wise': [False],
    'force_col_wise': [False]
}

In [6]:
opt_2 = RandomizedSearchCV(lgb.LGBMClassifier(), params, cv=10, n_iter=54)
opt_2.fit(X_train, y_train)

RandomizedSearchCV(cv=10, estimator=LGBMClassifier(), n_iter=54,
                   param_distributions={'boosting_type': ['gbdt'],
                                        'force_col_wise': [False],
                                        'force_row_wise': [False],
                                        'learning_rate': [0.1, 0.5, 0.7],
                                        'max_depth': [5, 10],
                                        'n_estimators': [40, 50, 60],
                                        'num_leaves': [30, 40, 50],
                                        'objective': ['multiclass'],
                                        'random_state': [123],
                                        'silent': [True]})

In [7]:
opt_2.best_estimator_

LGBMClassifier(force_col_wise=False, force_row_wise=False, learning_rate=0.5,
               max_depth=10, n_estimators=60, num_leaves=50,
               objective='multiclass', random_state=123)

In [8]:
preds = opt_2.predict(X_test)
f1 = f1_score(y_test, preds, average='micro')
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.731720


In [9]:
params = {
    'boosting_type': ['gbdt'],
    'num_leaves': [40, 50, 60, 70],
    'max_depth': [5, 10, 20],
    'learning_rate': [0.1, 0.5, 0.7],
    'n_estimators': [60, 75, 100],
    'objective': ['multiclass'],
    'random_state': [123],
    'silent': [True],
    'force_row_wise': [False],
    'force_col_wise': [False]
}

In [10]:
opt_3 = RandomizedSearchCV(lgb.LGBMClassifier(), params, cv=10, n_iter=108)
opt_3.fit(X_train, y_train)

RandomizedSearchCV(cv=10, estimator=LGBMClassifier(), n_iter=108,
                   param_distributions={'boosting_type': ['gbdt'],
                                        'force_col_wise': [False],
                                        'force_row_wise': [False],
                                        'learning_rate': [0.1, 0.5, 0.7],
                                        'max_depth': [5, 10, 20],
                                        'n_estimators': [60, 75, 100],
                                        'num_leaves': [40, 50, 60, 70],
                                        'objective': ['multiclass'],
                                        'random_state': [123],
                                        'silent': [True]})

In [11]:
opt_3.best_estimator_

LGBMClassifier(force_col_wise=False, force_row_wise=False, learning_rate=0.5,
               max_depth=10, num_leaves=60, objective='multiclass',
               random_state=123)

In [12]:
preds = opt_3.predict(X_test)
f1 = f1_score(y_test, preds, average='micro')
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.736594
