In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import gc
import warnings
from bayes_opt import BayesianOptimization

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import KFold
import warnings
import time
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from datetime import datetime
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod(
            (datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [3]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
train = pd.read_csv("data/train.csv", index_col=0)
train = reduce_mem_usage(train)

target = train['target']
del train['target']

Mem. usage decreased to  4.04 Mb (56.2% reduction)


In [5]:
unimportant_features = [
    'auth_category_2_1.0_mean',
    'auth_category_2_2.0_mean',
    'auth_category_2_3.0_mean',
    'auth_category_2_5.0_mean',
    'hist_category_2_3.0_mean',
    'hist_category_2_4.0_mean',
    'hist_category_2_5.0_mean',
    'hist_category_3_A_mean',
    'hist_installments_min',
    'hist_installments_std',
    'hist_month_lag_std',
    'hist_purchase_amount_max',
    'hist_purchase_month_max',
    'hist_purchase_month_min',
    'hist_purchase_month_std',
    'installments_min_mean',
    'new_category_2_1.0_mean',
    'new_category_2_2.0_mean',
    'new_category_2_3.0_mean',
    'new_category_2_5.0_mean',
    'new_city_id_nunique',
    'new_installments_std',
    'new_state_id_nunique',
    'purchase_amount_mean_mean'
]
features = [c for c in train.columns if c not in ['card_id', 'first_active_month']]
#features = [f for f in features if f not in unimportant_features]
categorical_feats = [c for c in features if 'feature_' in c]

In [6]:
def LGB_CV(
          max_depth,
          num_leaves,
          min_data_in_leaf,
          feature_fraction,
          bagging_fraction,
          lambda_l1
         ):
    
    folds = KFold(n_splits=10, shuffle=True, random_state=15)
    oof = np.zeros(train.shape[0])

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(train.iloc[trn_idx][features],
                               label=target.iloc[trn_idx],
                               categorical_feature=categorical_feats)
        val_data = lgb.Dataset(train.iloc[val_idx][features],
                               label=target.iloc[val_idx],
                               categorical_feature=categorical_feats)
    
        param = {
            'num_leaves': int(num_leaves),
            'min_data_in_leaf': int(min_data_in_leaf), 
            'objective':'regression',
            'max_depth': int(max_depth),
            'learning_rate': 0.01,
            "boosting": "gbdt",
            "feature_fraction": feature_fraction,
            "bagging_freq": 1,
            "bagging_fraction": bagging_fraction ,
            "bagging_seed": 11,
            "metric": 'rmse',
            "lambda_l1": lambda_l1,
            "verbosity": -1
        }
    
        clf = lgb.train(param,
                        trn_data,
                        10000,
                        valid_sets = [trn_data, val_data],
                        verbose_eval=500,
                        early_stopping_rounds = 500)
        
        oof[val_idx] = clf.predict(train.iloc[val_idx][features],
                                   num_iteration=clf.best_iteration)
        
        del clf, trn_idx, val_idx
        gc.collect()
        
    return -mean_squared_error(oof, target)**0.5

In [7]:
LGB_BO = BayesianOptimization(LGB_CV, {
    'max_depth': (1, 20),
    'num_leaves': (2, 200),
    'min_data_in_leaf': (1, 200),
    'feature_fraction': (0.2, 1.0),
    'bagging_fraction': (0.2, 1.0),
    'lambda_l1': (0, 6)
    })

In [8]:
print('-'*126)

start_time = timer(None)
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points=2, n_iter=20, acq='ei', xi=0.0)
timer(start_time)


print('-'*130)
print('Final Results')
print('Maximum  value: %f' % LGB_BO.max['target'])
print('Best  parameters: ', LGB_BO.max['params'])


history_df = pd.DataFrame([x['params'] for x in LGB_BO.res])
history_df2 = pd.DataFrame([x['target'] for x in LGB_BO.res])
history_df = pd.concat((history_df, history_df2), axis=1)
history_df.rename(columns = { 0 : 'gini'}, inplace=True)
history_df.to_csv('ParametersOptimization.csv')

------------------------------------------------------------------------------------------------------------------------------
|   iter    |  target   | baggin... | featur... | lambda_l1 | max_depth | min_da... | num_le... |
-------------------------------------------------------------------------------------------------
fold n°0
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.84075	valid_1's rmse: 3.9206
[1000]	training's rmse: 3.84073	valid_1's rmse: 3.92051
[1500]	training's rmse: 3.84073	valid_1's rmse: 3.9205
Early stopping, best iteration is:
[1018]	training's rmse: 3.84073	valid_1's rmse: 3.9205
fold n°1
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.84943	valid_1's rmse: 3.84312
Early stopping, best iteration is:
[272]	training's rmse: 3.84954	valid_1's rmse: 3.84307
fold n°2
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.85707	valid_1's rmse: 3.77361
[1000]

[500]	training's rmse: 3.8459	valid_1's rmse: 3.88128
[1000]	training's rmse: 3.84582	valid_1's rmse: 3.8812
[1500]	training's rmse: 3.84581	valid_1's rmse: 3.88118
Early stopping, best iteration is:
[1202]	training's rmse: 3.84581	valid_1's rmse: 3.88118
fold n°6
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.8294	valid_1's rmse: 4.02527
Early stopping, best iteration is:
[177]	training's rmse: 3.82975	valid_1's rmse: 4.02517
fold n°7
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.8387	valid_1's rmse: 3.94494
[1000]	training's rmse: 3.83864	valid_1's rmse: 3.94471
[1500]	training's rmse: 3.83864	valid_1's rmse: 3.94464
[2000]	training's rmse: 3.83864	valid_1's rmse: 3.94463
Early stopping, best iteration is:
[1634]	training's rmse: 3.83864	valid_1's rmse: 3.94462
fold n°8
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.85158	valid_1's rmse: 3.83045
[1000]	training'

[1000]	training's rmse: 3.84074	valid_1's rmse: 3.92053
[1500]	training's rmse: 3.84074	valid_1's rmse: 3.92049
[2000]	training's rmse: 3.84073	valid_1's rmse: 3.92053
Early stopping, best iteration is:
[1680]	training's rmse: 3.84074	valid_1's rmse: 3.92046
fold n°1
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.84943	valid_1's rmse: 3.84311
Early stopping, best iteration is:
[272]	training's rmse: 3.84955	valid_1's rmse: 3.84306
fold n°2
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.85707	valid_1's rmse: 3.77361
[1000]	training's rmse: 3.85705	valid_1's rmse: 3.7736
Early stopping, best iteration is:
[634]	training's rmse: 3.85706	valid_1's rmse: 3.77357
fold n°3
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.85044	valid_1's rmse: 3.83392
Early stopping, best iteration is:
[462]	training's rmse: 3.85045	valid_1's rmse: 3.83389
fold n°4
Training until validation 

[500]	training's rmse: 3.83868	valid_1's rmse: 3.94481
[1000]	training's rmse: 3.83864	valid_1's rmse: 3.94465
[1500]	training's rmse: 3.83864	valid_1's rmse: 3.94463
[2000]	training's rmse: 3.83864	valid_1's rmse: 3.94463
Early stopping, best iteration is:
[1846]	training's rmse: 3.83864	valid_1's rmse: 3.94461
fold n°8
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.85153	valid_1's rmse: 3.83039
[1000]	training's rmse: 3.85149	valid_1's rmse: 3.83026
[1500]	training's rmse: 3.85149	valid_1's rmse: 3.83022
[2000]	training's rmse: 3.85149	valid_1's rmse: 3.83024
Early stopping, best iteration is:
[1608]	training's rmse: 3.85149	valid_1's rmse: 3.83022
fold n°9
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.85991	valid_1's rmse: 3.7535
[1000]	training's rmse: 3.85988	valid_1's rmse: 3.75348
Early stopping, best iteration is:
[964]	training's rmse: 3.85988	valid_1's rmse: 3.75347
| [0m 8       [0m | [0m-3

fold n°1
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.85012	valid_1's rmse: 3.84297
[1000]	training's rmse: 3.85009	valid_1's rmse: 3.84291
[1500]	training's rmse: 3.85009	valid_1's rmse: 3.84286
[2000]	training's rmse: 3.85009	valid_1's rmse: 3.84287
Early stopping, best iteration is:
[1543]	training's rmse: 3.85009	valid_1's rmse: 3.84286
fold n°2
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.8576	valid_1's rmse: 3.77473
Early stopping, best iteration is:
[414]	training's rmse: 3.85763	valid_1's rmse: 3.77471
fold n°3
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.85109	valid_1's rmse: 3.83408
[1000]	training's rmse: 3.85106	valid_1's rmse: 3.83403
[1500]	training's rmse: 3.85106	valid_1's rmse: 3.83404
Early stopping, best iteration is:
[1239]	training's rmse: 3.85106	valid_1's rmse: 3.83402
fold n°4
Training until validation scores don't improve for 500 roun

Early stopping, best iteration is:
[196]	training's rmse: 3.82964	valid_1's rmse: 4.02519
fold n°7
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.83868	valid_1's rmse: 3.94481
[1000]	training's rmse: 3.83864	valid_1's rmse: 3.94466
[1500]	training's rmse: 3.83864	valid_1's rmse: 3.94463
[2000]	training's rmse: 3.83864	valid_1's rmse: 3.94462
[2500]	training's rmse: 3.83864	valid_1's rmse: 3.94463
Early stopping, best iteration is:
[2030]	training's rmse: 3.83864	valid_1's rmse: 3.94462
fold n°8
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.85153	valid_1's rmse: 3.8304
[1000]	training's rmse: 3.85149	valid_1's rmse: 3.83026
[1500]	training's rmse: 3.85149	valid_1's rmse: 3.83023
[2000]	training's rmse: 3.85149	valid_1's rmse: 3.83024
Early stopping, best iteration is:
[1578]	training's rmse: 3.85149	valid_1's rmse: 3.83022
fold n°9
Training until validation scores don't improve for 500 rounds.
[500]	train

[500]	training's rmse: 3.85017	valid_1's rmse: 3.84308
[1000]	training's rmse: 3.8501	valid_1's rmse: 3.84292
[1500]	training's rmse: 3.85009	valid_1's rmse: 3.84289
[2000]	training's rmse: 3.85009	valid_1's rmse: 3.84287
[2500]	training's rmse: 3.85009	valid_1's rmse: 3.84286
[3000]	training's rmse: 3.85009	valid_1's rmse: 3.84286
Early stopping, best iteration is:
[2730]	training's rmse: 3.85009	valid_1's rmse: 3.84286
fold n°2
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.85765	valid_1's rmse: 3.77475
[1000]	training's rmse: 3.85757	valid_1's rmse: 3.77476
Early stopping, best iteration is:
[712]	training's rmse: 3.85759	valid_1's rmse: 3.77474
fold n°3
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.85115	valid_1's rmse: 3.83411
[1000]	training's rmse: 3.85107	valid_1's rmse: 3.83406
[1500]	training's rmse: 3.85106	valid_1's rmse: 3.83405
[2000]	training's rmse: 3.85106	valid_1's rmse: 3.83404
[2500]	

[1000]	training's rmse: 3.84581	valid_1's rmse: 3.88118
Early stopping, best iteration is:
[648]	training's rmse: 3.84583	valid_1's rmse: 3.88116
fold n°6
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.82934	valid_1's rmse: 4.02534
Early stopping, best iteration is:
[176]	training's rmse: 3.82966	valid_1's rmse: 4.02515
fold n°7
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.83867	valid_1's rmse: 3.94479
[1000]	training's rmse: 3.83864	valid_1's rmse: 3.94463
[1500]	training's rmse: 3.83864	valid_1's rmse: 3.94462
[2000]	training's rmse: 3.83864	valid_1's rmse: 3.94461
Early stopping, best iteration is:
[1516]	training's rmse: 3.83864	valid_1's rmse: 3.94464
fold n°8
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.85152	valid_1's rmse: 3.83036
[1000]	training's rmse: 3.85149	valid_1's rmse: 3.83027
[1500]	training's rmse: 3.85149	valid_1's rmse: 3.83023
[2000]	train

fold n°0
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.84077	valid_1's rmse: 3.9207
[1000]	training's rmse: 3.84074	valid_1's rmse: 3.92056
[1500]	training's rmse: 3.84073	valid_1's rmse: 3.92052
[2000]	training's rmse: 3.84073	valid_1's rmse: 3.92051
[2500]	training's rmse: 3.84073	valid_1's rmse: 3.9205
[3000]	training's rmse: 3.84073	valid_1's rmse: 3.9205
Early stopping, best iteration is:
[2882]	training's rmse: 3.84073	valid_1's rmse: 3.9205
fold n°1
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.84948	valid_1's rmse: 3.84304
[1000]	training's rmse: 3.84942	valid_1's rmse: 3.84312
Early stopping, best iteration is:
[523]	training's rmse: 3.84948	valid_1's rmse: 3.84304
fold n°2
Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 3.8571	valid_1's rmse: 3.77369
[1000]	training's rmse: 3.85706	valid_1's rmse: 3.7736
[1500]	training's rmse: 3.85706	valid_1's rmse: 3.77