In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

In [4]:
# Directory holding the input CSVs; change this one value to run the notebook elsewhere.
DATA_DIR = 'C:/datasets'

train = pd.read_csv(f'{DATA_DIR}/train_sales.csv')
test = pd.read_csv(f'{DATA_DIR}/test_sales.csv')

In [5]:
# Collapse the daily rows to one row per (month, shop, item) combination:
# the daily counts are summed and the price is averaged over the group.
group_keys = ['date_block_num', 'location', 'shop_type', 'shop_id',
              'item_category_id', 'item_category', 'subcat', 'item_id']
train_ds = (
    train.groupby(group_keys)
    .agg(item_cnt_monthly=('item_cnt_day', 'sum'),
         mean_item_price=('item_price', 'mean'))
    .reset_index()
)

In [6]:
def label_encode_data(data):
    """Swap the four categorical columns for integer label codes.

    ``location``, ``shop_type``, ``item_category`` and ``subcat`` are each
    encoded with a fresh ``LabelEncoder`` into ``loc_cd``, ``shop_type_cd``,
    ``item_cat_cd`` and ``subcat_cd``; the source columns are then removed.

    The frame passed in is modified in place and the same object is returned.
    """
    # (source column, code column): shop attributes first, then item attributes.
    encodings = (
        ("location", "loc_cd"),
        ("shop_type", "shop_type_cd"),
        ("item_category", "item_cat_cd"),
        ("subcat", "subcat_cd"),
    )

    for source_col, code_col in encodings:
        encoder = LabelEncoder()
        data[code_col] = encoder.fit_transform(data[source_col])

    data.drop(columns=[source_col for source_col, _ in encodings], inplace=True)
    return data

def one_hot_encode_data(data):
    """Return a copy of ``data`` with the categorical columns one-hot encoded.

    ``location``, ``shop_type``, ``item_category`` and ``subcat`` are expanded
    by ``pd.get_dummies``; every other column is passed through unchanged.
    """
    categorical_cols = ['location', 'shop_type', 'item_category', 'subcat']
    return pd.get_dummies(data, columns=categorical_cols)

def mean_encode_data(data):
    """Replace the four categorical columns with the mean target of their category.

    For each of ``location``, ``shop_type``, ``item_category`` and ``subcat``
    the mean of ``item_cnt_monthly`` per category value is computed over the
    whole frame and written to ``loc_cd``, ``shop_type_cd``, ``item_cat_cd``
    and ``subcat_cd`` respectively; the source columns are then removed.

    The frame passed in is modified in place and the same object is returned.
    """
    # (source column, encoded column): shop attributes first, then item attributes.
    encodings = (
        ('location', 'loc_cd'),
        ('shop_type', 'shop_type_cd'),
        ('item_category', 'item_cat_cd'),
        ('subcat', 'subcat_cd'),
    )

    for source_col, encoded_col in encodings:
        category_means = data.groupby(source_col)['item_cnt_monthly'].mean().to_dict()
        data[encoded_col] = data[source_col].map(category_means)

    data.drop(columns=[source_col for source_col, _ in encodings], inplace=True)
    return data

def novelty_feature(data):
    """Attach to every row the first month in which its item appears.

    The smallest ``date_block_num`` per ``item_id`` found in ``data`` is
    left-joined back on ``item_id`` as ``first_sales_date_block``.
    A new frame is returned; ``data`` itself is not modified.
    """
    first_seen = (
        data.groupby("item_id")["date_block_num"]
        .min()
        .rename("first_sales_date_block")
        .reset_index()
    )
    return data.merge(first_seen, how="left", on="item_id")

def lag_features(df, lags, col_list):
    """Add lagged copies of the given columns, matched per shop and item.

    For every column in ``col_list`` and every ``lag`` in ``lags`` a column
    ``<column>_lag_<lag>`` is added that holds the value the same
    (``shop_id``, ``item_id``) pair had ``lag`` months earlier. Rows with no
    matching earlier month get NaN (left join).

    Returns a new frame; the frame passed in is not modified.
    """
    join_keys = ["date_block_num", "shop_id", "item_id"]

    for col_name in col_list:
        base = df[join_keys + [col_name]]
        for lag in lags:
            lagged = base.copy()
            lagged.columns = join_keys + [f"{col_name}_lag_{lag}"]
            # Moving the month forward by `lag` lines an old value up with
            # the row that should receive it.
            lagged["date_block_num"] = lagged["date_block_num"] + lag
            df = df.merge(lagged, how="left", on=join_keys)

    return df

def last_halfyear_feathure(train_ds):
    """Add ``last_6month_cnt``: the row-wise mean of the lag-1 to lag-6 sales columns.

    NaN lags are skipped when averaging. The column is written onto the
    frame passed in, and that same frame is returned.
    """
    half_year_lags = [f"item_cnt_monthly_lag_{k}" for k in range(1, 7)]
    train_ds["last_6month_cnt"] = train_ds[half_year_lags].mean(axis=1, skipna=True)
    return train_ds

In [7]:
def primary_data_for_modeling(data, enc):
    """Build the modelling frame: first-sale month, encoding, lags, 6-month mean.

    Parameters
    ----------
    data : DataFrame
        Monthly aggregated sales.
    enc : str
        ``'OHE'``, ``'mean'`` or ``'label'`` selects the categorical encoding.
        Any other value leaves the categorical columns as they are.

    Returns
    -------
    DataFrame
        The feature frame with every remaining NaN replaced by 0.
    """
    encoders = {
        'OHE': one_hot_encode_data,
        'mean': mean_encode_data,
        'label': label_encode_data,
    }

    features = novelty_feature(data)

    encode = encoders.get(enc)
    if encode is not None:
        features = encode(features)

    features = lag_features(features, [1, 2, 3, 4, 5, 6, 12], ["item_cnt_monthly"])
    features = last_halfyear_feathure(features)

    # Lag columns are NaN where there is no matching earlier month; they become 0 here.
    return features.fillna(0)

Here I changed the cross-validation function so that it can be passed as the `cv` argument of the grid search.

Its overall structure is the same, except that it now yields the train and test indices of each fold.

In [8]:
def cross_validation_for_grid(data, month):
    """Yield expanding-window (train, test) index arrays, one fold per month.

    The first fold trains on every row with ``date_block_num <= month`` and
    tests on the rows of ``month + 1``. Each following fold moves the split
    one month forward; the last fold tests on the final month in ``data``.

    The arrays hold *positional* row indices, computed from the month values
    themselves. They are therefore valid for a frame whose index is not
    0..n-1 or whose rows are not sorted by month (the earlier version
    returned index labels taken through positional slices, which is only
    correct for a sorted frame with a default index).

    Parameters
    ----------
    data : DataFrame
        Must contain a ``date_block_num`` column.
    month : int
        Last month of the first training window. Must lie strictly between
        the smallest and largest ``date_block_num`` in ``data``; otherwise a
        message is printed and nothing is yielded.

    Yields
    ------
    (ndarray, ndarray)
        Train positions and test positions. If a month has no rows at all,
        its test array is empty.
    """
    blocks = data["date_block_num"].to_numpy()
    first_month, last_month = blocks.min(), blocks.max()

    if month >= last_month or month <= first_month:
        print("Cannot be splited")
        return

    while month < last_month:
        train_idx = np.flatnonzero(blocks <= month)
        test_idx = np.flatnonzero(blocks == month + 1)
        yield train_idx, test_idx
        month += 1

Next is a function that performs the hyperparameter search, since the task says to experiment with models(?).

In [9]:
def grid_search_func(train_ds, model, param_space, first_split_month=30):
    """Grid-search ``param_space`` for ``model`` with month-based CV and print the result.

    Parameters
    ----------
    train_ds : DataFrame
        Feature frame that includes the target column ``item_cnt_monthly``
        and the ``date_block_num`` column used for splitting.
    model : estimator
        Unfitted scikit-learn compatible regressor.
    param_space : dict
        Parameter grid passed to ``GridSearchCV``.
    first_split_month : int, default 30
        Last training month of the first fold; forwarded to
        ``cross_validation_for_grid``. The default is the value that was
        previously hard-coded.

    Notes
    -----
    No ``scoring`` is passed to ``GridSearchCV``, so the printed "Best score"
    comes from the estimator's own default scorer.
    """
    features = train_ds.drop('item_cnt_monthly', axis=1)
    target = train_ds['item_cnt_monthly']

    grid_search = GridSearchCV(
        model,
        param_space,
        cv=cross_validation_for_grid(train_ds, first_split_month),
        return_train_score=True,
        verbose=5,
        n_jobs=-1,
    )
    grid_search.fit(features, target)

    print(model)
    print("Best params: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

In [10]:
# Only months after block 11 are kept, to keep the feature build and the searches fast.
# NOTE(review): the lag features are computed on this filtered frame, so they cannot
# look back before month 12 (e.g. the 12-month lag is empty for months 12-23).
recent_months = train_ds.loc[train_ds["date_block_num"] > 11]
train_ds_pred = primary_data_for_modeling(recent_months, 'label')

In [11]:
# Drop the raw and aggregated frames; the cells below only use train_ds_pred.
# After this cell runs, earlier cells that read `train` or `train_ds` need the data reloaded.
del train, test, train_ds

**Decision Tree:**

In [12]:
# Decision tree: only the tree depth is searched (4 candidates).
param_DT = {'max_depth': [3, 5, 6, 8]}
grid_search_func(train_ds_pred, DecisionTreeRegressor(), param_DT)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
DecisionTreeRegressor()
Best params:  {'max_depth': 3}
Best score:  0.41414438632067885


**Random Forest:**

In [14]:
# Random forest: 4 depths x 2 ensemble sizes = 8 candidates.
# 'n_jobs' is in the grid with a single value, so it is set on every candidate
# estimator and also appears in the printed best params.
param_RF = {'max_depth': [3, 5, 6, 8],
           'n_estimators': [50, 100],
           'n_jobs':[-1]}

grid_search_func(train_ds_pred, RandomForestRegressor(), param_RF)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
RandomForestRegressor()
Best params:  {'max_depth': 5, 'n_estimators': 50, 'n_jobs': -1}
Best score:  0.44769956396558985


**SVR:**

In [None]:
# The SVR search below did not finish after more than 8 hours, so I did not run it to completion.

In [24]:
from sklearn.preprocessing import StandardScaler
def scaler(train_ds):
    """Standardise selected feature columns of ``train_ds`` in place.

    A ``StandardScaler`` is fitted on the listed columns of the whole frame
    and the scaled values are written back over those columns. The frame
    passed in is modified and the same object is returned.
    """
    # Each column appears once. The earlier list repeated 'shop_id', which made
    # the assignment below write that column twice with identical values.
    numeric = ['shop_id', 'item_category_id', 'mean_item_price', 'item_id',
               'loc_cd', 'shop_type_cd', 'item_cat_cd', 'subcat_cd']

    # Named differently from the function so the local does not shadow it.
    standardizer = StandardScaler()
    train_ds[numeric] = standardizer.fit_transform(train_ds[numeric])

    return train_ds

# scaler() writes the scaled values into the frame it receives and returns that same
# frame, so every later cell that reads train_ds_pred sees the scaled columns.
train_ds_pred = scaler(train_ds_pred)

In [None]:
# SVR: 3 values of C x 2 kernels = 6 candidates.
param_SVR = {'C': [0.1, 1, 10],
             'kernel': ['rbf', 'sigmoid']}

grid_search_func(train_ds_pred, SVR(cache_size=7000), param_SVR)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


**XGBoost:**

In [10]:
# XGBoost: 4 depths x 2 ensemble sizes = 8 candidates; 'n_jobs' is a single-value grid entry.
# NOTE(review): the execution count of this cell is lower than that of the scaler cell above,
# so it is unclear whether this result was produced on scaled or unscaled features - confirm.
param_XGB = {'max_depth': [3, 5, 6, 8],
             'n_estimators': [50, 100],
             'n_jobs':[-1]}
grid_search_func(train_ds_pred, xgb.XGBRegressor(), param_XGB)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)
Best params:  {'max_depth': 8, 'n_estimators': 50, 'n_jobs': -1}
Best score:  0.4247756435743307


**LGBM:**

In [12]:
# LightGBM: 4 depths x 2 ensemble sizes = 8 candidates; 'n_jobs' is a single-value grid entry.
# NOTE(review): the execution count of this cell is lower than that of the scaler cell above,
# so it is unclear whether this result was produced on scaled or unscaled features - confirm.
param_LGB = {'max_depth': [3, 5, 6, 8],
             'n_estimators': [50, 100],
             'n_jobs':[-1]}
grid_search_func(train_ds_pred, lgb.LGBMRegressor(), param_LGB)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
LGBMRegressor()
Best params:  {'max_depth': 3, 'n_estimators': 50, 'n_jobs': -1}
Best score:  0.42349810969927154


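The XGBoost and LightGBM regressors score lower than Random Forest here (0.425 and 0.423 versus 0.448). Note that no `scoring` is passed to the grid search, so this is the estimator's default score, for which higher is better.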