In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
import lightgbm as lgb
import hyperopt as hpt

In [3]:
train_sampled = pd.read_csv("datasets/train_sampled.csv", encoding='utf8', index_col=False)

In [4]:
cat_cols = [
    'site_id',
    'site_domain',
    'site_category',
    'app_id',
    'app_domain',
    'app_category',
    'device_id',
    'device_ip',
    'device_model',
]
train_sampled[cat_cols]

Unnamed: 0,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model
0,5114c672,3f2f3819,3e814130,ecad2386,7801e8d9,07d7df22,a99f214a,cec8d87f,7065d1c1
1,856e6d3f,58a89a43,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,b6be2313,597f7364
2,856e6d3f,58a89a43,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,26f61e9b,76dc4769
3,ce3307ec,7e091613,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,285aa37d,cad4c01d
4,9e8cf15d,0d3cb7be,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,0489ce3f,ecb851b2
...,...,...,...,...,...,...,...,...,...
2021443,f5af7a86,c4e18dd6,3e814130,ecad2386,7801e8d9,07d7df22,a99f214a,e0527567,4962150e
2021444,85f751fd,c4e18dd6,50e219e0,03528b27,2347f47a,8ded1f7a,797b971a,57cd4006,1f0bc64f
2021445,e151e245,7e091613,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,0b7e6fa3,158e4944
2021446,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,38fe3782,d4897fef


In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
import joblib

In [7]:
for col in cat_cols:
    label_encoder = LabelEncoder()
    train_sampled[col] = pd.DataFrame(label_encoder.fit_transform(train_sampled[col]))
    joblib.dump(label_encoder, './label_encoders/{}.save'.format(col))

In [8]:
train_labels = train_sampled.pop('click')
train_features = train_sampled

In [17]:
train_set = lgb.Dataset(train_features, train_labels)

In [6]:
import csv

In [19]:
N_FOLDS = 10

In [20]:
def objective(params, n_folds=N_FOLDS):
    """Objective function for Gradient Boosting Machine Hyperparameter Tuning"""
    
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evalute based on ROC AUC
    cv_results = lgb.cv(params,
                        train_set,
                        nfold=n_folds,
                        num_boost_round = 1000,
                        early_stopping_rounds = 10,
                        metrics = 'auc',
                        seed=27)
  
    # Extract the best score
    best_score = np.max(cv_results['auc-mean'])
    loss = 1 - best_score
    
    n_estimators = int(np.argmax(cv_results['auc-mean']) + 1)
    writer.writerow([loss, params, n_estimators])
    return {'loss': loss,
            'params': params,
            'estimators': n_estimators, 
            'status': hpt.STATUS_OK}

In [21]:
space = {
    'boosting_type': hpt.hp.choice('boosting_type', ['gbdt', 'goss']),
    'num_leaves': hpt.hp.choice('num_leaves', np.arange(2, 1500+1, dtype=int)),#hpt.hp.quniform('num_leaves', 5, 150, 1),
    'max_depth': hpt.hp.choice('max_depth', np.arange(2, 10+1, dtype=int)),
    'learning_rate': hpt.hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'seed': 27,
}

In [7]:
from hyperopt import Trials
# Keep track of results
bayes_trials = Trials()

In [10]:
MAX_EVALS = 100
out_file = './trials/lgb_trials.csv'
of_connection = open(out_file, 'w')
writer = csv.writer(of_connection)
writer.writerow(['loss', 'params', 'estimators'])

24

In [24]:
%%time
best = hpt.fmin(fn=objective,
                     space = space,
                     algo = hpt.tpe.suggest,                      
                     max_evals = MAX_EVALS, 
                     trials = bayes_trials,
                     rstate = np.random.RandomState(27))

of_connection.close()

100%|████████████████████████████████████████████| 100/100 [12:08:21<00:00, 437.02s/it, best loss: 0.24553986869099376]
Wall time: 12h 8min 22s


In [8]:
import matplotlib.pyplot as plt

In [11]:
results=pd.read_csv(out_file)
plt.plot(results['loss'])

EmptyDataError: No columns to parse from file

In [27]:
results.sort_values('loss', ascending = True, inplace = True)
# results.reset_index(inplace = True, drop = True)
results.head()

Unnamed: 0,loss,params,estimators
68,0.24554,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",928
32,0.245678,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",1000
78,0.245692,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",996
59,0.245709,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",1000
67,0.245711,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",1000


In [28]:
base_hp = {
    'objective': 'binary',
    'n_jobs': -1,
    'metrics': 'binary_error'
}

best_hp = eval(results.params.values[0])
best_bayes_estimators = int(results.estimators.values[0])

best_hp.update(base_hp)

In [1]:
best_hp

NameError: name 'best_hp' is not defined

In [None]:
eval_label = df_eval.pop('is_over_kpi')
eval_features = df_eval.copy()
valid_sets = lgb.Dataset(eval_features, eval_label)

In [None]:
test_label = df_test.pop('is_over_kpi')
test_features = df_test.copy()

In [30]:
best_bayes_model = lgb.train(params = best_hp,
                             train_set = train_set,
#                              valid_sets = [train_set, valid_sets],
#                              valid_names=['train', 'eval'],
                             num_boost_round = 10000,)
#                              early_stopping_rounds = 100)

In [None]:
from sklearn.metrics import roc_auc_score, f1_score
predicts = best_bayes_model.predict(test_features)
predictions = pd.DataFrame({'class': predicts})
predictions['class'] = predictions['class'].apply(lambda x : 1 if x >= 0.5 else 0)
print('The best model from Bayes optimization scores {:.5f} AUC ROC on the test set.'.format(roc_auc_score(test_label, preds)))

In [None]:
predicts = best_bayes_model.predict(test_features)
predictions = pd.DataFrame({'class': predicts})
predictions['class'] = predictions['class'].apply(lambda x : 1 if x >= 0.5 else 0)
f1_score(test_label.values, predictions)