In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import os

Read input data

In [2]:
shops = pd.read_csv("data/shops.csv")
item_categories = pd.read_csv("data/item_categories.csv")
sales_train = pd.read_csv("data/sales_train.csv")
items = pd.read_csv("data/items.csv")

test = pd.read_csv("data/test.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

In [3]:
train_master = sales_train.merge(items, on='item_id').merge(shops, on='shop_id').merge(item_categories, on='item_category_id')
train_master['city'] = train_master['shop_name'].str.split(' ').map(lambda x : x[0])
train_master.head(2)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,shop_name,item_category_name,city
0,02.01.2013,0,59,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37,"Ярославль ТЦ ""Альтаир""",Кино - Blu-Ray,Ярославль
1,26.04.2013,3,59,944,150.0,1.0,2012 (BD),37,"Ярославль ТЦ ""Альтаир""",Кино - Blu-Ray,Ярославль


## Feature Engineering

In [4]:
# Aggregate the sales by month
train = sales_train.groupby(['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_day': ['sum']})
train.columns = ['item_cnt_sum']
train.reset_index(inplace=True)      

Notice that we do not have all the (date_block_num, shop_id, item_id) pairs, if the sales = 0. Therefore, we need to fill in the missing pairs.

In [5]:
from tqdm import tqdm_notebook
from itertools import product

def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float16)
    df[int_cols]   = df[int_cols].astype(np.int16)
    
    return df

In [6]:
months = train['date_block_num'].unique()

In [7]:
cartesian = []
for month in months:
    shops_in_month = train.loc[train['date_block_num']==month, 'shop_id'].unique()
    items_in_month = train.loc[train['date_block_num']==month, 'item_id'].unique()
    cartesian.append(np.array(list(product(*[shops_in_month, items_in_month, [month]])), dtype='int32'))
    
cartesian_df = pd.DataFrame(np.vstack(cartesian), columns = ['shop_id', 'item_id', 'date_block_num'], dtype=np.int32)
print(cartesian_df.shape)

(10913850, 3)


The generated cartesian_df dataframe will contain all the unique (shop_id, item_id, date_block_num) pairs

We merge it back to the train dataframe and fill NaN with zeros to include zeros in the sales record

In [8]:
new_train = cartesian_df.merge(train, on=['date_block_num','shop_id','item_id'], how='left').fillna(0)
new_train.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_sum
0,0,32,0,6.0
1,0,33,0,3.0
2,0,35,0,1.0
3,0,43,0,1.0
4,0,51,0,2.0


In [9]:
new_train = new_train.merge(items, on='item_id').merge(shops, on='shop_id').merge(item_categories, on='item_category_id')
new_train['city'] = new_train['shop_name'].str.split(' ').map(lambda x : x[0])

new_train.sort_values(by=['date_block_num', 'shop_id', 'item_id'], inplace=True)
new_train.reset_index(drop=True, inplace=True)

In [10]:
train_backup = new_train.copy()

In [11]:
new_train.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_sum,item_name,item_category_id,shop_name,item_category_name,city
0,0,19,0,0.0,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40,"!Якутск Орджоникидзе, 56 фран",Кино - DVD,!Якутск
1,0,27,0,0.0,"007 Legends [PS3, русская версия]",19,"!Якутск Орджоникидзе, 56 фран",Игры - PS3,!Якутск
2,0,28,0,0.0,"007 Legends [PС, Jewel, русская версия]",30,"!Якутск Орджоникидзе, 56 фран",Игры PC - Стандартные издания,!Якутск
3,0,29,0,0.0,"007 Legends [Xbox 360, русская версия]",23,"!Якутск Орджоникидзе, 56 фран",Игры - XBOX 360,!Якутск
4,0,32,0,6.0,1+1,40,"!Якутск Орджоникидзе, 56 фран",Кино - DVD,!Якутск


### Add features: item_id related features <br>
- item_id
- shop_id
- name of the item (?)
- last month's sale
- last year's sale
- whether is on promotion (?)

In [12]:
new_train = train_backup.copy()

In [13]:
def lag_feature(df, df_shift, lags, on, col):
    tmp = df_shift[on + [col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = on + [col+'_lag_'+str(i)]
        shifted['date_block_num'] += i
        df = df.merge(shifted, on=on, how='left')
    return df

In [14]:
new_train = lag_feature(new_train, new_train, lags=[1, 3, 6, 12], on=['date_block_num','shop_id','item_id'], col='item_cnt_sum')
new_train.head(2)

KeyboardInterrupt: 

### Add features: Categorical related features <br>
- item_category_id
- last month's sale for the same item_category_id
- last year's sale for the same item_category_id

In [None]:
train_backup_2 = train.copy()

In [None]:
df = train_master.groupby(['date_block_num', 'shop_id', 'item_category_id']).agg({'item_cnt_day':['sum']})
df.columns = ['item_cnt_by_cat_sum']
df.reset_index(inplace=True)
df.head(2)

In [None]:
train = lag_feature(train, df, lags=[1,3,6,12], on=['date_block_num','shop_id','item_category_id'], col='item_cnt_by_cat_sum')
train.head(2)

### OBSOLETE: Fill in missing (date_block_num, shop_id, item_id) pairs

First, let us look at what (shop_id, item_id) pairs in the test set are in the sales record within a year. If not, then we will simply predict them to be ZEROs.

In [None]:
# calculate the sum of items sold in the past twelve months
df_sum_one_year = train[train.date_block_num >= 22].groupby(['shop_id', 'item_id']).agg({'item_cnt_sum':['sum']})
df_sum_one_year.columns = ['item_cnt_sum']
df_sum_one_year.reset_index(inplace=True)
df_sum_one_year.head()

In [None]:
test_nonzero = test.merge(df_sum_one_year, on=['shop_id', 'item_id'], how='left').fillna(0)
test_nonzero = test_nonzero[test_nonzero.item_cnt_sum > 0]
test_nonzero.head()

### Add features: month and city label

Add a month feature

In [None]:
new_train['month'] = new_train['date_block_num'] % 12

Label encode the categorical variables

In [None]:
from sklearn import preprocessing

In [None]:
le = preprocessing.LabelEncoder()
new_train['city_label'] = le.fit_transform(new_train['city'])
new_train.drop(['shop_name', 'city'], axis = 1, inplace=True)
new_train.head(2)

## Build an XGBoost Model

Training set and evaluation set split: <br>
- training set: date_block_num in range(12, 33)
- validation set: date_block_num == 33

In [None]:
x_train = new_train[(new_train.date_block_num >= 12) & (new_train.date_block_num <= 32)]
x_val = new_train[new_train.date_block_num == 33]

In [None]:
y_train = x_train[['item_cnt_sum']]
y_val = y_train[['item_cnt_sum']]
x_train = x_train[['shop_id','item_id','item_category_id','month','city_label',
                   'item_cnt_sum_lag_1','item_cnt_sum_lag_3','item_cnt_sum_lag_6','item_cnt_sum_lag_12']]
x_val = x_val[['shop_id','item_id','item_category_id','month','city_label',
                   'item_cnt_sum_lag_1','item_cnt_sum_lag_3','item_cnt_sum_lag_6','item_cnt_sum_lag_12']]

In [None]:
import xgboost as xgb