In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, PredefinedSplit
import lightgbm as lgb
from funs import rmse, clip20, clip40
import pickle
import pyarrow.parquet as pq

## Data

In [14]:
X_train = pq.read_table('data/X_train.parquet').to_pandas()
X_val = pq.read_table('data/X_val.parquet').to_pandas()
y_train = np.load('data/y_train.npy')
y_val = np.load('data/y_val.npy')

In [3]:
X_train.head()

Unnamed: 0,shop_id,item_id,target_item_lag_1,target_lag_1,target_shop_lag_1,target_item_lag_2,target_lag_2,target_shop_lag_2,target_item_lag_3,target_lag_3,target_shop_lag_3,item_category_id
0,54,10297,42.0,3.0,4282.0,2.0,0.0,3085.0,0.0,0.0,0.0,37
1,54,10298,369.0,21.0,4282.0,1309.0,119.0,3085.0,144.0,7.0,2464.0,40
2,54,10300,54.0,1.0,4282.0,361.0,31.0,3085.0,53.0,0.0,2464.0,37
3,54,10292,156.0,8.0,4282.0,203.0,16.0,3085.0,279.0,15.0,2464.0,40
4,54,10143,18.0,1.0,4282.0,2.0,0.0,3085.0,0.0,0.0,0.0,55


In [4]:
X_train.head()

Unnamed: 0,shop_id,item_id,target_item_lag_1,target_lag_1,target_shop_lag_1,target_item_lag_2,target_lag_2,target_shop_lag_2,target_item_lag_3,target_lag_3,target_shop_lag_3,item_category_id
0,54,10297,42.0,3.0,4282.0,2.0,0.0,3085.0,0.0,0.0,0.0,37
1,54,10298,369.0,21.0,4282.0,1309.0,119.0,3085.0,144.0,7.0,2464.0,40
2,54,10300,54.0,1.0,4282.0,361.0,31.0,3085.0,53.0,0.0,2464.0,37
3,54,10292,156.0,8.0,4282.0,203.0,16.0,3085.0,279.0,15.0,2464.0,40
4,54,10143,18.0,1.0,4282.0,2.0,0.0,3085.0,0.0,0.0,0.0,55


In [5]:
X_val.head()

Unnamed: 0,shop_id,item_id,target_item_lag_1,target_lag_1,target_shop_lag_1,target_item_lag_2,target_lag_2,target_shop_lag_2,target_item_lag_3,target_lag_3,target_shop_lag_3,item_category_id
1080263,37,3460,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28
1080264,37,4615,19.0,0.0,791.0,15.0,1.0,810.0,16.0,0.0,865.0,55
1080265,37,4870,189.0,1.0,791.0,182.0,0.0,810.0,214.0,1.0,865.0,23
1080266,37,4872,309.0,2.0,791.0,266.0,4.0,810.0,384.0,3.0,865.0,19
1080267,37,4873,224.0,2.0,791.0,0.0,0.0,0.0,0.0,0.0,0.0,20


## Models

Grid Search for Hyperparameter tuning

### Light Gradient Boosting

In [17]:
learning_rates = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]
best_rmse = 9999999999999
for lr in learning_rates:
    lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': lr, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0 
              }

    lgb_model = lgb.train(lgb_params, lgb.Dataset(X_train, label=clip40(y_train)), int(100 * (lr / 0.03)))
    pred_lgb_val = lgb_model.predict(X_val)
    score = rmse(clip20(y_val), clip20(pred_lgb_val))

    if score < best_rmse:
        best_rmse = score
        best_lr = lr
        best_lgb = lgb_model

In [19]:
best_lr

0.05

We train the best model with all the data.

In [20]:
X = X_train.append(X_val)
y = np.append(y_train, y_val)

In [21]:
best_lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': best_lr, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0 
              }
best_lgb = lgb.train(lgb_params, lgb.Dataset(X, label=clip40(y)), int(100 * (lr / 0.03)))

Save the model

In [22]:
filename = 'models/best_lgb.sav'
pickle.dump(best_lgb, open(filename, 'wb'))

### Random Forest

CV Iterator

In [12]:
X = X_train.append(X_val)
Y = np.concatenate([y_train, y_val])
train_ind=np.zeros(X.shape[0])
for i in range(0, len(X_train)):
    train_ind[i] = -1
ps = PredefinedSplit(test_fold=(train_ind))

Grid Search for Hyperparameter tuning

In [14]:
param_grid={'bootstrap':[0.7, 0.8], 'max_features':[4, 6, 8], 
            'max_depth' : [None, 4, 6, 8, 10, 12]}
gs = GridSearchCV(cv = ps, 
                  estimator = RandomForestRegressor(n_estimators=300, n_jobs=4), 
                  param_grid=param_grid, scoring='neg_mean_squared_error')

In [None]:
gs.fit(X, clip40(Y))
best_rf = gs.best_estimator_

In [12]:
best_rf = gs.best_estimator_
best_rf

RandomForestRegressor(bootstrap=0.7, criterion='mse', max_depth=12,
           max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=300, n_jobs=4, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [23]:
best_rf = RandomForestRegressor(bootstrap=0.7, criterion='mse', max_depth=12,
           max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=300, n_jobs=4, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

We train the best model with all the data.

In [8]:
best_rf = pickle.load(open('models/best_rf.sav', 'rb'))

In [24]:
from time import time
t = time()
best_rf.fit(X, clip40(y))
print(time() - t)

1190.7306106090546


In [25]:
X.head()

Unnamed: 0,shop_id,item_id,target_item_lag_1,target_lag_1,target_shop_lag_1,target_item_lag_2,target_lag_2,target_shop_lag_2,target_item_lag_3,target_lag_3,target_shop_lag_3,item_category_id,mean_enc_cat_id,var_enc_cat_id
0,54,10297,42.0,3.0,4282.0,2.0,0.0,3085.0,0.0,0.0,0.0,37,0.512281,2.742767
1,54,10298,369.0,21.0,4282.0,1309.0,119.0,3085.0,144.0,7.0,2464.0,40,1.120868,10.631319
2,54,10300,54.0,1.0,4282.0,361.0,31.0,3085.0,53.0,0.0,2464.0,37,0.512281,2.742767
3,54,10292,156.0,8.0,4282.0,203.0,16.0,3085.0,279.0,15.0,2464.0,40,1.120868,10.631319
4,54,10143,18.0,1.0,4282.0,2.0,0.0,3085.0,0.0,0.0,0.0,55,0.902726,2.715576


Save the model

In [26]:
filename = 'models/best_rf.sav'
pickle.dump(best_rf, open(filename, 'wb'))