# Implementing LightGBM for a different set of space parameters

## Function: K Folds Cross Validation using Hyperopt

In [4]:
data_path = "C:/Users/as14478/Sanchita Kaggle/"
submission_path= data_path+'submission/'
fold_path = data_path+'fold_data/'

from Functions import *

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import lightgbm as lgb

%matplotlib inline
seed=1204

cv_loss_list=[]
n_iteration_list=[]

def score(params):
    print("Training with params: ")
    print(params)
    cv_losses=[]
    cv_iteration=[]
    for (train_idx,val_idx) in cv:
        cv_train = X.iloc[train_idx]
        cv_val = X.iloc[val_idx]
        cv_y_train = y[train_idx]
        cv_y_val = y[val_idx]
        lgb_model = lgb.train(params, lgb.Dataset(cv_train, label=cv_y_train), 2000, 
                          lgb.Dataset(cv_val, label=cv_y_val), verbose_eval=False, 
                          early_stopping_rounds=100)
       
        train_pred = lgb_model.predict(cv_train,lgb_model.best_iteration+1)
        val_pred = lgb_model.predict(cv_val,lgb_model.best_iteration+1)
        
        val_loss = root_mean_squared_error(cv_y_val,val_pred)
        train_loss = root_mean_squared_error(cv_y_train,train_pred)
        print('Train RMSE: {}. Val RMSE: {}'.format(train_loss,val_loss))
        print('Best iteration: {}'.format(lgb_model.best_iteration))
        cv_losses.append(val_loss)
        cv_iteration.append(lgb_model.best_iteration)
    print('6 fold results: {}'.format(cv_losses))
    cv_loss_list.append(cv_losses)
    n_iteration_list.append(cv_iteration)
    
    mean_cv_loss = np.mean(cv_losses)
    print('Average iterations: {}'.format(np.mean(cv_iteration)))
    print("Mean Cross Validation RMSE: {}\n".format(mean_cv_loss))
    return {'loss': mean_cv_loss, 'status': STATUS_OK}

def optimize(space,seed=seed,max_evals=5):
    
    best = fmin(score, space, algo=tpe.suggest, 
        # trials=trials, 
        max_evals=max_evals)
    return best

In [5]:
all_data = get_all_data(data_path,'new_sales_lag_after12.pickle')
X,y = get_X_y(all_data,33)
X.drop('date_block_num',axis=1,inplace=True)
cv = get_cv_idxs(all_data,28,33)

## Defining Space Parameters

In [7]:
space = {
#     'max_depth': hp.choice('max_depth', np.arange(3, 15, dtype=int)),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    'min_data_in_leaf': hp.choice('min_data_in_leaf',np.arange(5, 30,1, dtype=int)),
    'learning_rate': hp.quniform('learning_rate', 0.025, 0.5, 0.025),
    'seed':seed,
    'objective': 'regression',
    'metric':'rmse',
}
best_hyperparams = optimize(space,max_evals=20)
print("The best hyperparameters are: ")
print(best_hyperparams)

Training with params: 
{'colsample_bytree': 0.55, 'learning_rate': 0.45, 'metric': 'rmse', 'min_data_in_leaf': 7, 'objective': 'regression', 'seed': 1204, 'subsample': 0.5}
Train RMSE: 0.8138498804706965. Val RMSE: 0.8519609778799113
Best iteration: 34
Train RMSE: 0.7698732291520568. Val RMSE: 0.7881710281819708
Best iteration: 90
Train RMSE: 0.8302663445399524. Val RMSE: 0.7180596121305637
Best iteration: 19
Train RMSE: 0.7436565895092244. Val RMSE: 0.7892349365907114
Best iteration: 139
Train RMSE: 0.8406105801204801. Val RMSE: 0.9051770401067432
Best iteration: 12
Train RMSE: 0.8034656581567735. Val RMSE: 0.9784288640980342
Best iteration: 40
6 fold results: [0.8519609778799113, 0.7881710281819708, 0.7180596121305637, 0.7892349365907114, 0.9051770401067432, 0.9784288640980342]
Average iterations: 55.666666666666664
Mean Cross Validation RMSE: 0.8385054098313224

Training with params: 
{'colsample_bytree': 0.8500000000000001, 'learning_rate': 0.07500000000000001, 'metric': 'rmse', 'm

Train RMSE: 0.7958667785088822. Val RMSE: 0.8569868881494845
Best iteration: 46
Train RMSE: 0.8143898895714956. Val RMSE: 0.7845434909674709
Best iteration: 28
Train RMSE: 0.7197725214530892. Val RMSE: 0.7130503989162306
Best iteration: 204
Train RMSE: 0.7406430547485386. Val RMSE: 0.7973888509649526
Best iteration: 150
Train RMSE: 0.7957415734882141. Val RMSE: 0.8894136249920724
Best iteration: 37
Train RMSE: 0.8245250612663878. Val RMSE: 0.9247403924574509
Best iteration: 20
6 fold results: [0.8569868881494845, 0.7845434909674709, 0.7130503989162306, 0.7973888509649526, 0.8894136249920724, 0.9247403924574509]
Average iterations: 80.83333333333333
Mean Cross Validation RMSE: 0.8276872744079435

Training with params: 
{'colsample_bytree': 0.65, 'learning_rate': 0.05, 'metric': 'rmse', 'min_data_in_leaf': 21, 'objective': 'regression', 'seed': 1204, 'subsample': 0.65}
Train RMSE: 0.7574297437328377. Val RMSE: 0.8268654907073704
Best iteration: 787
Train RMSE: 0.7805627574492477. Val RMS

Train RMSE: 0.7224752592865761. Val RMSE: 0.7717207048843059
Best iteration: 536
Train RMSE: 0.7540546689665839. Val RMSE: 0.8897251983188682
Best iteration: 283
Train RMSE: 0.7891404844394228. Val RMSE: 0.9107621166245996
Best iteration: 129
6 fold results: [0.8417506454099192, 0.775276859210448, 0.7036487984662023, 0.7717207048843059, 0.8897251983188682, 0.9107621166245996]
Average iterations: 284.0
Mean Cross Validation RMSE: 0.8154807204857238

Training with params: 
{'colsample_bytree': 0.9500000000000001, 'learning_rate': 0.15000000000000002, 'metric': 'rmse', 'min_data_in_leaf': 9, 'objective': 'regression', 'seed': 1204, 'subsample': 0.9500000000000001}
Train RMSE: 0.7469320814746675. Val RMSE: 0.8238934892293873
Best iteration: 287
Train RMSE: 0.8025137818178907. Val RMSE: 0.7741391115214828
Best iteration: 83
Train RMSE: 0.7927860697851847. Val RMSE: 0.7073130483562933
Best iteration: 102
Train RMSE: 0.747121933591073. Val RMSE: 0.7706965512732511
Best iteration: 272
Train RM

In [8]:
all_data = get_all_data(data_path,'new_sales_lag_after12.pickle')

X,y = get_X_y(all_data,33)
X.drop('date_block_num',axis=1,inplace=True)

cv = get_cv_idxs(all_data,28,33)

In [10]:
lgb_params = {
               'colsample_bytree': 0.65,
               'metric': 'rmse',
               'min_data_in_leaf': 16, 
               'subsample': 0.65, 
               'learning_rate': 0.05, 
               'objective': 'regression', 
               'bagging_seed': 128, 
               'num_leaves': 128,
               'bagging_freq':1,
               'seed':1204
              }

# Train on entire dataset

In [11]:
%%time
lgb_model_full = lgb.train(lgb_params, lgb.Dataset(X, label=y), 708, 
                      lgb.Dataset(X, label=y), verbose_eval=10)

[10]	valid_0's rmse: 0.999437
[20]	valid_0's rmse: 0.902923
[30]	valid_0's rmse: 0.855396
[40]	valid_0's rmse: 0.828039
[50]	valid_0's rmse: 0.810034
[60]	valid_0's rmse: 0.798959
[70]	valid_0's rmse: 0.790089
[80]	valid_0's rmse: 0.78329
[90]	valid_0's rmse: 0.776922
[100]	valid_0's rmse: 0.771793
[110]	valid_0's rmse: 0.768016
[120]	valid_0's rmse: 0.763588
[130]	valid_0's rmse: 0.760196
[140]	valid_0's rmse: 0.757463
[150]	valid_0's rmse: 0.754458
[160]	valid_0's rmse: 0.75124
[170]	valid_0's rmse: 0.748789
[180]	valid_0's rmse: 0.746627
[190]	valid_0's rmse: 0.744028
[200]	valid_0's rmse: 0.742242
[210]	valid_0's rmse: 0.740063
[220]	valid_0's rmse: 0.738322
[230]	valid_0's rmse: 0.736947
[240]	valid_0's rmse: 0.734106
[250]	valid_0's rmse: 0.732244
[260]	valid_0's rmse: 0.730864
[270]	valid_0's rmse: 0.729421
[280]	valid_0's rmse: 0.727752
[290]	valid_0's rmse: 0.726284
[300]	valid_0's rmse: 0.724299
[310]	valid_0's rmse: 0.722679
[320]	valid_0's rmse: 0.721255
[330]	valid_0's rms

# Predicting on test dataset

In [13]:
test_lag = pd.read_csv(os.path.join(data_path, 'test_lag_data.csv'),encoding = "ISO-8859-1")
test_lag.drop(['ID','item_name','date_block_num'],axis=1,inplace=True)
test_lag_pred = lgb_model_full.predict(test_lag,708)

In [14]:
def get_submission(item_cnt_month,sub_name,clip=20,data_path ='C:/Users/as14478/Sanchita Kaggle/' ):
    item_cnt_month = np.clip(item_cnt_month,0,clip)
    test= pd.read_csv(os.path.join(data_path, 'test.csv.gz'))
    sub = test.copy()
    sub['item_cnt_month'] = item_cnt_month
    sub.drop(['item_id','shop_id'],axis=1,inplace=True)
    sub.to_csv(data_path+'submission/' + sub_name+'.csv',index=False)
    return sub
get_submission(test_lag_pred,'lightgbm_basic_6folds_v2');