In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import os

Read input data

In [3]:
shops = pd.read_csv("data/shops.csv")
item_categories = pd.read_csv("data/item_categories.csv")
sales_train = pd.read_csv("data/sales_train.csv")
items = pd.read_csv("data/items.csv")

test = pd.read_csv("data/test.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

In [4]:
train_master = sales_train.merge(items, on='item_id').merge(shops, on='shop_id').merge(item_categories, on='item_category_id')
train_master['city'] = train_master['shop_name'].str.split(' ').map(lambda x : x[0])
train_master.head(2)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,shop_name,item_category_name,city
0,02.01.2013,0,59,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37,"Ярославль ТЦ ""Альтаир""",Кино - Blu-Ray,Ярославль
1,26.04.2013,3,59,944,150.0,1.0,2012 (BD),37,"Ярославль ТЦ ""Альтаир""",Кино - Blu-Ray,Ярославль


## Feature Engineering

In [5]:
# Aggregate the sales by month
train = sales_train.groupby(['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_day': ['sum']})
train.columns = ['item_cnt_sum']
train.reset_index(inplace=True)      

Notice that we do not have all the (date_block_num, shop_id, item_id) pairs, if the sales = 0. Therefore, we need to fill in the missing pairs.

In [6]:
from tqdm import tqdm_notebook
from itertools import product

def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float16)
    df[int_cols]   = df[int_cols].astype(np.int16)
    
    return df

In [7]:
months = train['date_block_num'].unique()

In [8]:
cartesian = []
for month in months:
    shops_in_month = train.loc[train['date_block_num']==month, 'shop_id'].unique()
    items_in_month = train.loc[train['date_block_num']==month, 'item_id'].unique()
    cartesian.append(np.array(list(product(*[shops_in_month, items_in_month, [month]])), dtype='int32'))
    
cartesian_df = pd.DataFrame(np.vstack(cartesian), columns = ['shop_id', 'item_id', 'date_block_num'], dtype=np.int32)
print(cartesian_df.shape)

(10913850, 3)


The generated cartesian_df dataframe will contain all the unique (shop_id, item_id, date_block_num) pairs

We merge it back to the train dataframe and fill NaN with zeros to include zeros in the sales record

In [9]:
new_train = cartesian_df.merge(train, on=['date_block_num','shop_id','item_id'], how='left').fillna(0)
new_train.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_sum
0,0,32,0,6.0
1,0,33,0,3.0
2,0,35,0,1.0
3,0,43,0,1.0
4,0,51,0,2.0


In [10]:
new_train = new_train.merge(items, on='item_id').merge(shops, on='shop_id').merge(item_categories, on='item_category_id')
new_train['city'] = new_train['shop_name'].str.split(' ').map(lambda x : x[0])

new_train.sort_values(by=['date_block_num', 'shop_id', 'item_id'], inplace=True)
new_train.reset_index(drop=True, inplace=True)

In [11]:
train_backup = new_train.copy()

In [12]:
new_train.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_sum,item_name,item_category_id,shop_name,item_category_name,city
0,0,19,0,0.0,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40,"!Якутск Орджоникидзе, 56 фран",Кино - DVD,!Якутск
1,0,27,0,0.0,"007 Legends [PS3, русская версия]",19,"!Якутск Орджоникидзе, 56 фран",Игры - PS3,!Якутск
2,0,28,0,0.0,"007 Legends [PС, Jewel, русская версия]",30,"!Якутск Орджоникидзе, 56 фран",Игры PC - Стандартные издания,!Якутск
3,0,29,0,0.0,"007 Legends [Xbox 360, русская версия]",23,"!Якутск Орджоникидзе, 56 фран",Игры - XBOX 360,!Якутск
4,0,32,0,6.0,1+1,40,"!Якутск Орджоникидзе, 56 фран",Кино - DVD,!Якутск


### Add features: item_id related features <br>
- item_id
- shop_id
- name of the item (?)
- last month's sale
- last year's sale
- whether is on promotion (?)

In [12]:
#new_train = train_backup.copy()

In [13]:
def lag_feature(df, df_shift, lags, on, col):
    tmp = df_shift[on + [col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = on + [col+'_lag_'+str(i)]
        shifted['date_block_num'] += i
        df = df.merge(shifted, on=on, how='left')
    return df

In [14]:
new_train = lag_feature(new_train, new_train, lags=[1, 3, 6, 12], on=['date_block_num','shop_id','item_id'], col='item_cnt_sum')
new_train.head(2)

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_sum,item_name,item_category_id,shop_name,item_category_name,city,item_cnt_sum_lag_1,item_cnt_sum_lag_3,item_cnt_sum_lag_6,item_cnt_sum_lag_12
0,0,19,0,0.0,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40,"!Якутск Орджоникидзе, 56 фран",Кино - DVD,!Якутск,,,,
1,0,27,0,0.0,"007 Legends [PS3, русская версия]",19,"!Якутск Орджоникидзе, 56 фран",Игры - PS3,!Якутск,,,,


### Add features: Categorical related features <br>
- item_category_id
- last month's sale for the same item_category_id
- last year's sale for the same item_category_id

In [None]:
#train_backup_2 = train.copy()

In [15]:
df = train_master.groupby(['date_block_num', 'shop_id', 'item_category_id']).agg({'item_cnt_day':['sum']})
df.columns = ['item_cnt_by_cat_sum']
df.reset_index(inplace=True)
df.head(2)

Unnamed: 0,date_block_num,shop_id,item_category_id,item_cnt_by_cat_sum
0,0,0,2,53.0
1,0,0,3,28.0


In [17]:
new_train = lag_feature(new_train, df, lags=[1,3,6,12], on=['date_block_num','shop_id','item_category_id'], col='item_cnt_by_cat_sum')
new_train.head(2)

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_sum,item_name,item_category_id,shop_name,item_category_name,city,item_cnt_sum_lag_1,item_cnt_sum_lag_3,item_cnt_sum_lag_6,item_cnt_sum_lag_12,item_cnt_by_cat_sum_lag_1,item_cnt_by_cat_sum_lag_3,item_cnt_by_cat_sum_lag_6,item_cnt_by_cat_sum_lag_12
0,0,19,0,0.0,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40,"!Якутск Орджоникидзе, 56 фран",Кино - DVD,!Якутск,,,,,,,,
1,0,27,0,0.0,"007 Legends [PS3, русская версия]",19,"!Якутск Орджоникидзе, 56 фран",Игры - PS3,!Якутск,,,,,,,,


### OBSOLETE: Fill in missing (date_block_num, shop_id, item_id) pairs

First, let us look at what (shop_id, item_id) pairs in the test set are in the sales record within a year. If not, then we will simply predict them to be ZEROs.

In [None]:
# calculate the sum of items sold in the past twelve months
df_sum_one_year = train[train.date_block_num >= 22].groupby(['shop_id', 'item_id']).agg({'item_cnt_sum':['sum']})
df_sum_one_year.columns = ['item_cnt_sum']
df_sum_one_year.reset_index(inplace=True)
df_sum_one_year.head()

In [None]:
test_nonzero = test.merge(df_sum_one_year, on=['shop_id', 'item_id'], how='left').fillna(0)
test_nonzero = test_nonzero[test_nonzero.item_cnt_sum > 0]
test_nonzero.head()

### Add features: month and city label

Add a month feature

In [18]:
new_train['month'] = new_train['date_block_num'] % 12

Label encode the categorical variables

In [19]:
from sklearn import preprocessing

In [20]:
le = preprocessing.LabelEncoder()
new_train['city_label'] = le.fit_transform(new_train['city'])
new_train.drop(['shop_name', 'city'], axis = 1, inplace=True)
new_train.head(2)

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_sum,item_name,item_category_id,item_category_name,item_cnt_sum_lag_1,item_cnt_sum_lag_3,item_cnt_sum_lag_6,item_cnt_sum_lag_12,item_cnt_by_cat_sum_lag_1,item_cnt_by_cat_sum_lag_3,item_cnt_by_cat_sum_lag_6,item_cnt_by_cat_sum_lag_12,month,city_label
0,0,19,0,0.0,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40,Кино - DVD,,,,,,,,,0,0
1,0,27,0,0.0,"007 Legends [PS3, русская версия]",19,Игры - PS3,,,,,,,,,0,0


In [23]:
new_train.to_pickle('train.pkl')

## Build an XGBoost Model

Training set and evaluation set split: <br>
- training set: date_block_num in range(12, 33)
- validation set: date_block_num == 33

In [35]:
x_train = new_train[(new_train.date_block_num >= 12) & (new_train.date_block_num <= 32)]
x_val = new_train[new_train.date_block_num == 33]

In [37]:
y_train = x_train[['item_cnt_sum']]
y_val = x_val[['item_cnt_sum']]
x_train = x_train[['shop_id','item_id','item_category_id','month','city_label',
                   'item_cnt_sum_lag_1','item_cnt_sum_lag_3','item_cnt_sum_lag_6','item_cnt_sum_lag_12']]
x_val = x_val[['shop_id','item_id','item_category_id','month','city_label',
                   'item_cnt_sum_lag_1','item_cnt_sum_lag_3','item_cnt_sum_lag_6','item_cnt_sum_lag_12']]

In [1]:
import xgboost as xgb

In [38]:
x_train.isna().sum()

shop_id                      0
item_id                      0
item_category_id             0
month                        0
city_label                   0
item_cnt_sum_lag_1     1288325
item_cnt_sum_lag_3     1658858
item_cnt_sum_lag_6     2201695
item_cnt_sum_lag_12    3280022
dtype: int64

In [39]:
x_val.isna().sum()

shop_id                     0
item_id                     0
item_category_id            0
month                       0
city_label                  0
item_cnt_sum_lag_1      72491
item_cnt_sum_lag_3      85775
item_cnt_sum_lag_6      98690
item_cnt_sum_lag_12    125555
dtype: int64

In [40]:
y_train.isna().sum()

item_cnt_sum    0
dtype: int64

In [41]:
y_val.isna().sum()

item_cnt_sum    0
dtype: int64

In [42]:
x_train = x_train.fillna(0.0)

In [43]:
x_val = x_val.fillna(0.0)

In [32]:
from xgboost import XGBRegressor
from xgboost import plot_importance

In [44]:
model = XGBRegressor(
    n_estimators=5000,
    max_depth=10,
    #min_child_weight=300, 
    colsample_bytree=0.5, 
    subsample=0.5)
    #eta=0.3,    
    #seed=42)

model.fit(
    x_train, 
    y_train, 
    eval_metric="rmse", 
    eval_set=[(x_train, y_train), (x_val, y_val)], 
    verbose=True, 
    early_stopping_rounds = 50)

[0]	validation_0-rmse:3.36269	validation_1-rmse:5.23473
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 50 rounds.
[1]	validation_0-rmse:3.24907	validation_1-rmse:5.15967
[2]	validation_0-rmse:3.14981	validation_1-rmse:5.08771
[3]	validation_0-rmse:3.07841	validation_1-rmse:5.04267
[4]	validation_0-rmse:2.96832	validation_1-rmse:4.97105
[5]	validation_0-rmse:2.91692	validation_1-rmse:4.92989
[6]	validation_0-rmse:2.82384	validation_1-rmse:4.88673
[7]	validation_0-rmse:2.78615	validation_1-rmse:4.88107
[8]	validation_0-rmse:2.71989	validation_1-rmse:4.84633
[9]	validation_0-rmse:2.65333	validation_1-rmse:4.82679
[10]	validation_0-rmse:2.60152	validation_1-rmse:4.79729
[11]	validation_0-rmse:2.59515	validation_1-rmse:4.79451
[12]	validation_0-rmse:2.5509	validation_1-rmse:4.77739
[13]	validation_0-rmse:2.51512	validation_1-rmse:4.7796
[14]	validation_0-rmse:2.48958	validation_1-rmse:4.7773

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=0.5, gamma=0, importance_type='gain',
             learning_rate=0.1, max_delta_step=0, max_depth=10,
             min_child_weight=1, missing=None, n_estimators=5000, n_jobs=1,
             nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
             subsample=0.5)