In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
import lightgbm
from sklearn.metrics import mean_squared_error
import warnings
import gc


warnings.filterwarnings('ignore')
np.random.seed(4590)

In [84]:
train = pd.read_csv('/Users/sinsakuokazaki/Project/pred-future-sales/\
input/competitive-data-science-predict-future-sales/train.csv')
test = pd.read_csv('/Users/sinsakuokazaki/Project/pred-future-sales/\
input/competitive-data-science-predict-future-sales/test.csv')

In [85]:
train = train[train.item_cnt_day < 1500]

In [87]:
train = train.drop(columns=['total_sales'])
test = test.drop(columns=["ID", "in_train", "combination_id"])
test["date_block_num"] = 34
test['month'] = 11
test['year'] = 2015
test['quarter'] = 4
data = pd.concat([train, test], join="outer")

In [89]:
data.isna().sum()

date_block_num             0
item_category_id           0
item_category_name         0
item_cnt_day          214200
item_id                    0
item_name                  0
month                      0
quarter                    0
shop_id                    0
shop_name                  0
year                       0
dtype: int64

In [90]:
def mean_encode(statistic, group, col_name, data):
    group_df = data.groupby(group).agg({'item_cnt_day': [statistic]})
    column = col_name.format(statistic)
    group_df.columns = [ column ]
    group_df.reset_index(inplace=True)
    return group_df, column

In [91]:
def lag_mean_encoding(lag, group_df, column, group, data):
    tmp = group_df.copy()
    tmp['date_block_num'] += lag
    new_name = column + str(lag)
    tmp = tmp.rename(index=str, columns={column: new_name})
    print(new_name)
    data = pd.merge(data, tmp, on=group, how='left')
    return data

In [92]:
groups = [["date_block_num"], ["date_block_num", "item_id"],
          ["date_block_num", "shop_id"],["date_block_num", "item_category_id"], 
          ["date_block_num", "item_category_id", "shop_id"]]

col_names = ['date_{}_item_cnt', 'date_item_{}_item_cnt', 'date_shop_{}_item_cnt',
             'date_category_{}_item_cnt', 'date_category_shop_{}_item_cnt']

for s in ["mean", "std", "median"]:
    for g, c in zip(groups, col_names):
        group_df, column = mean_encode(s, g, c, data)
        for l in [2, 3, 6, 12]:
            data = lag_mean_encoding(l, group_df, column, g, data)

date_mean_item_cnt2
date_mean_item_cnt3
date_mean_item_cnt6
date_mean_item_cnt12
date_item_mean_item_cnt2
date_item_mean_item_cnt3
date_item_mean_item_cnt6
date_item_mean_item_cnt12
date_shop_mean_item_cnt2
date_shop_mean_item_cnt3
date_shop_mean_item_cnt6
date_shop_mean_item_cnt12
date_category_mean_item_cnt2
date_category_mean_item_cnt3
date_category_mean_item_cnt6
date_category_mean_item_cnt12
date_category_shop_mean_item_cnt2
date_category_shop_mean_item_cnt3
date_category_shop_mean_item_cnt6
date_category_shop_mean_item_cnt12
date_std_item_cnt2
date_std_item_cnt3
date_std_item_cnt6
date_std_item_cnt12
date_item_std_item_cnt2
date_item_std_item_cnt3
date_item_std_item_cnt6
date_item_std_item_cnt12
date_shop_std_item_cnt2
date_shop_std_item_cnt3
date_shop_std_item_cnt6
date_shop_std_item_cnt12
date_category_std_item_cnt2
date_category_std_item_cnt3
date_category_std_item_cnt6
date_category_std_item_cnt12
date_category_shop_std_item_cnt2
date_category_shop_std_item_cnt3
date_category

In [95]:
data = data[data['date_block_num'] > 11]
data = data.fillna(0)

In [96]:
data.head()

Unnamed: 0,date_block_num,item_category_id,item_category_name,item_cnt_day,item_id,item_name,month,quarter,shop_id,shop_name,...,date_shop_median_item_cnt6,date_shop_median_item_cnt12,date_category_median_item_cnt2,date_category_median_item_cnt3,date_category_median_item_cnt6,date_category_median_item_cnt12,date_category_shop_median_item_cnt2,date_category_shop_median_item_cnt3,date_category_shop_median_item_cnt6,date_category_shop_median_item_cnt12
687724,12,19,Игры - PS3,1.0,27,"007 Legends [PS3, русская версия]",1,1,17,"Красноярск ТЦ ""Взлетка Плаза""",...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
687725,12,40,Кино - DVD,1.0,30,007: КООРДИНАТЫ «СКАЙФОЛЛ»,1,1,3,"Балашиха ТРК ""Октябрь-Киномир""",...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
687726,12,40,Кино - DVD,1.0,30,007: КООРДИНАТЫ «СКАЙФОЛЛ»,1,1,5,"Вологда ТРЦ ""Мармелад""",...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
687727,12,40,Кино - DVD,1.0,30,007: КООРДИНАТЫ «СКАЙФОЛЛ»,1,1,6,"Воронеж (Плехановская, 13)",...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
687728,12,40,Кино - DVD,4.0,30,007: КООРДИНАТЫ «СКАЙФОЛЛ»,1,1,7,"Воронеж ТРЦ ""Максимир""",...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [97]:
use_cols = ['date_block_num', 'item_category_id', 'item_id', 'shop_id',
            'item_cnt_day', 'date_mean_item_cnt2', 
            'date_mean_item_cnt3', 'date_mean_item_cnt6', 'date_mean_item_cnt12',
            'date_item_mean_item_cnt2', 'date_item_mean_item_cnt3',
            'date_item_mean_item_cnt6', 'date_item_mean_item_cnt12',
            'date_shop_mean_item_cnt2', 'date_shop_mean_item_cnt3',
            'date_shop_mean_item_cnt6', 'date_shop_mean_item_cnt12',
            'date_category_mean_item_cnt2', 'date_category_mean_item_cnt3',
            'date_category_mean_item_cnt6', 'date_category_mean_item_cnt12',
            'date_category_shop_mean_item_cnt2',
            'date_category_shop_mean_item_cnt3',
            'date_category_shop_mean_item_cnt6',
            'date_category_shop_mean_item_cnt12', 'date_std_item_cnt2',
            'date_std_item_cnt3', 'date_std_item_cnt6', 'date_std_item_cnt12',
            'date_item_std_item_cnt2', 'date_item_std_item_cnt3',
            'date_item_std_item_cnt6', 'date_item_std_item_cnt12',
            'date_shop_std_item_cnt2', 'date_shop_std_item_cnt3',
            'date_shop_std_item_cnt6', 'date_shop_std_item_cnt12',
            'date_category_std_item_cnt2', 'date_category_std_item_cnt3',
            'date_category_std_item_cnt6', 'date_category_std_item_cnt12',
            'date_category_shop_std_item_cnt2', 'date_category_shop_std_item_cnt3',
            'date_category_shop_std_item_cnt6', 'date_category_shop_std_item_cnt12',
            'date_median_item_cnt2', 'date_median_item_cnt3',
            'date_median_item_cnt6', 'date_median_item_cnt12',
            'date_item_median_item_cnt2', 'date_item_median_item_cnt3',
            'date_item_median_item_cnt6', 'date_item_median_item_cnt12',
            'date_shop_median_item_cnt2', 'date_shop_median_item_cnt3',
            'date_shop_median_item_cnt6', 'date_shop_median_item_cnt12',
            'date_category_median_item_cnt2', 'date_category_median_item_cnt3',
            'date_category_median_item_cnt6', 'date_category_median_item_cnt12',
            'date_category_shop_median_item_cnt2',
            'date_category_shop_median_item_cnt3',
            'date_category_shop_median_item_cnt6',
            'date_category_shop_median_item_cnt12',
            'month', 'quarter', 'year']

In [98]:
X_tra = data[data['date_block_num'] < 33][use_cols]
Y_tra = data[data['date_block_num'] < 33]['item_cnt_day']

X_val = data[data['date_block_num'] == 33][use_cols]
Y_val = data[data['date_block_num'] == 33]['item_cnt_day']

X_test = data[data['date_block_num'] == 34][use_cols]

In [99]:
params = {"max_depth": 7,
         "min_child_weight": 15,
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         "random_state": 4590}

In [100]:
tra_data = lightgbm.Dataset(X_tra, label=Y_tra)
val_data = lightgbm.Dataset(X_val, label=Y_val)

model = lightgbm.train(params, tra_data, num_boost_round=1000, \
                           valid_sets=[tra_data, val_data],  verbose_eval=100, \
                           early_stopping_rounds = 100)
pred = model.predict(X_val, num_iteration=model.best_iteration)
np.sqrt(mean_squared_error(pred, Y_val))

Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 3.56386	valid_1's rmse: 2.74513
[200]	training's rmse: 1.72857	valid_1's rmse: 1.12764
[300]	training's rmse: 1.15695	valid_1's rmse: 0.640871
[400]	training's rmse: 0.983271	valid_1's rmse: 0.487136
[500]	training's rmse: 0.916527	valid_1's rmse: 0.434813
[600]	training's rmse: 0.878106	valid_1's rmse: 0.41268
[700]	training's rmse: 0.847038	valid_1's rmse: 0.396337
[800]	training's rmse: 0.825177	valid_1's rmse: 0.384417
[900]	training's rmse: 0.808316	valid_1's rmse: 0.379895
Early stopping, best iteration is:
[869]	training's rmse: 0.813456	valid_1's rmse: 0.37833


0.3783298598382767