# import libraries

In [77]:
import datetime
import warnings
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import xgboost
from xgboost import XGBRegressor
from xgboost import plot_importance

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

%matplotlib inline
sns.set(style="darkgrid")
pd.set_option('display.float_format', lambda x: '%.2f' % x)
warnings.filterwarnings("ignore")

from pandas.testing import assert_frame_equal

import lightgbm as lgb
from lightgbm import LGBMRegressor





# import data

In [6]:
train = pd.read_csv(r'C:\Users\pc\Desktop\Data Science Folder\Retail2\sales_train_v2.csv')
test = pd.read_csv(r'C:\Users\pc\Desktop\Data Science Folder\Retail2\test.csv')
submission = pd.read_csv(r'C:\Users\pc\Desktop\Data Science Folder\Retail2\sample_submission.csv')
items = pd.read_csv(r'C:\Users\pc\Desktop\Data Science Folder\Retail2\items.csv')
item_categories  = pd.read_csv(r'C:\Users\pc\Desktop\Data Science Folder\Retail2\item_categories.csv')
shops = pd.read_csv(r'C:\Users\pc\Desktop\Data Science Folder\Retail2\shops.csv')



# See basic information on data

In [7]:
print('\n******* train ********\n', 'shape: ', train.shape, '\n', train.columns, '\n',\
      train.describe(), '\n', train.isnull().sum() )

print('\n******* test ********\n', 'shape: ', test.shape, '\n', test.columns, '\n', \
      test.describe(), '\n', test.isnull().sum() )

print('\n******* submission ********\n', 'shape: ', submission.shape, '\n', submission.columns, '\n', \
      submission.describe(), '\n', submission.isnull().sum() )

print('\n******* items ********\n', 'shape: ', items.shape, '\n', items.columns, '\n', \
      items.describe(), '\n', items.isnull().sum() )




******* train ********
 shape:  (2935849, 6) 
 Index(['date', 'date_block_num', 'shop_id', 'item_id', 'item_price',
       'item_cnt_day'],
      dtype='object') 
        date_block_num    shop_id    item_id  item_price  item_cnt_day
count      2935849.00 2935849.00 2935849.00  2935849.00    2935849.00
mean            14.57      33.00   10197.23      890.85          1.24
std              9.42      16.23    6324.30     1729.80          2.62
min              0.00       0.00       0.00       -1.00        -22.00
25%              7.00      22.00    4476.00      249.00          1.00
50%             14.00      31.00    9343.00      399.00          1.00
75%             23.00      47.00   15684.00      999.00          1.00
max             33.00      59.00   22169.00   307980.00       2169.00 
 date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

******* test ********
 shape:  (214200, 3) 
 Index(['ID', 'shop_id', 

# merge to train DataFrame

In [8]:
train = train.merge(items, on='item_id', how='left')
train = train.merge(shops, on='shop_id', how='left')
train = train.merge(item_categories, on='item_category_id', how='left' )

train.columns



Index(['date', 'date_block_num', 'shop_id', 'item_id', 'item_price',
       'item_cnt_day', 'item_name', 'item_category_id', 'shop_name',
       'item_category_name'],
      dtype='object')

In [9]:
train.isnull().sum()


date                  0
date_block_num        0
shop_id               0
item_id               0
item_price            0
item_cnt_day          0
item_name             0
item_category_id      0
shop_name             0
item_category_name    0
dtype: int64

# choice: eliminate missing values in test from train (if not too many)

In [10]:
# if missing values aren't too many

x = train.shape

test_shop_ids = test['shop_id'].unique()
test_item_ids = test['item_id'].unique()
# Only shops that exist in test set.
train = train[train['shop_id'].isin(test_shop_ids)]
# Only items that exist in test set.
train = train[train['item_id'].isin(test_item_ids)]
# this is different than merge because merge keeps the same rows of test but the method above just verifies the existing
# of the value without changing anything


In [11]:
train.describe()


Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
count,1224439.0,1224439.0,1224439.0,1224439.0,1224439.0,1224439.0
mean,19.35,32.15,9614.87,1030.67,1.32,40.56
std,9.11,16.47,6299.87,1827.38,3.31,18.61
min,0.0,2.0,30.0,0.5,-16.0,2.0
25%,12.0,19.0,4181.0,299.0,1.0,25.0
50%,21.0,31.0,7856.0,549.0,1.0,38.0
75%,27.0,46.0,15229.0,1199.0,1.0,55.0
max,33.0,59.0,22167.0,59200.0,2169.0,83.0


In [12]:
print('Data set size before leaking:', x)
print('Data set size after leaking:', train.shape[0])

Data set size before leaking: (2935849, 10)
Data set size after leaking: 1224439


# reset index after dropping rows for future purpose

In [13]:
train = train.reset_index(drop=True)

# choice: negative values & nan values: complete with mean (or eliminate them if they are few)

In [14]:
# see how many negative values are in a particular column
print(train[ train['item_cnt_day']<0 ].count())

print(train.describe() )


date                  2941
date_block_num        2941
shop_id               2941
item_id               2941
item_price            2941
item_cnt_day          2941
item_name             2941
item_category_id      2941
shop_name             2941
item_category_name    2941
dtype: int64
       date_block_num    shop_id    item_id  item_price  item_cnt_day  \
count      1224439.00 1224439.00 1224439.00  1224439.00    1224439.00   
mean            19.35      32.15    9614.87     1030.67          1.32   
std              9.11      16.47    6299.87     1827.38          3.31   
min              0.00       2.00      30.00        0.50        -16.00   
25%             12.00      19.00    4181.00      299.00          1.00   
50%             21.00      31.00    7856.00      549.00          1.00   
75%             27.00      46.00   15229.00     1199.00          1.00   
max             33.00      59.00   22167.00    59200.00       2169.00   

       item_category_id  
count        1224439.00  
mean   

In [15]:
# see how many null values are there
train.isnull().sum()

date                  0
date_block_num        0
shop_id               0
item_id               0
item_price            0
item_cnt_day          0
item_name             0
item_category_id      0
shop_name             0
item_category_name    0
dtype: int64

In [16]:
# choice1: elimination
    
#train = train[ train['item_cnt_day']>0 ]

#train = train.dropna()
#train = train.dropna(subset=['col4', 'col5']) # drop rows having null values in certain columns

In [17]:
train

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,shop_name,item_category_name
0,02.01.2013,0,59,22154,999.00,1.00,ЯВЛЕНИЕ 2012 (BD),37,"Ярославль ТЦ ""Альтаир""",Кино - Blu-Ray
1,03.01.2013,0,25,2574,399.00,2.00,DEL REY LANA Born To Die The Paradise Editio...,55,"Москва ТРК ""Атриум""",Музыка - CD локального производства
2,05.01.2013,0,25,2574,399.00,1.00,DEL REY LANA Born To Die The Paradise Editio...,55,"Москва ТРК ""Атриум""",Музыка - CD локального производства
3,07.01.2013,0,25,2574,399.00,1.00,DEL REY LANA Born To Die The Paradise Editio...,55,"Москва ТРК ""Атриум""",Музыка - CD локального производства
4,08.01.2013,0,25,2574,399.00,2.00,DEL REY LANA Born To Die The Paradise Editio...,55,"Москва ТРК ""Атриум""",Музыка - CD локального производства
5,10.01.2013,0,25,2574,399.00,1.00,DEL REY LANA Born To Die The Paradise Editio...,55,"Москва ТРК ""Атриум""",Музыка - CD локального производства
6,11.01.2013,0,25,2574,399.00,2.00,DEL REY LANA Born To Die The Paradise Editio...,55,"Москва ТРК ""Атриум""",Музыка - CD локального производства
7,13.01.2013,0,25,2574,399.00,1.00,DEL REY LANA Born To Die The Paradise Editio...,55,"Москва ТРК ""Атриум""",Музыка - CD локального производства
8,16.01.2013,0,25,2574,399.00,1.00,DEL REY LANA Born To Die The Paradise Editio...,55,"Москва ТРК ""Атриум""",Музыка - CD локального производства
9,26.01.2013,0,25,2574,399.00,1.00,DEL REY LANA Born To Die The Paradise Editio...,55,"Москва ТРК ""Атриум""",Музыка - CD локального производства


In [18]:
# choice2: 1) with mean

#Mean = train[train.item_price>0].item_price.mean()
#train.loc[train.item_price<0, 'item_price'] = Mean

# choice2: 2) with groupby and mean

# **** df['col1'] et df['col2'] **** must not contain any null values or else there will be error

L = ['item_cnt_day'] # contains columns having negative values

for col in L:
    
    train.loc[ train[col]<0, col] = np.nan

    gb = train.groupby(by=['shop_id', 'item_id' ], as_index=False)[col].mean()

    for i in range(len(train)):
        if ( isnan( train[col][i]) ):
            train[col][i] = gb[ (gb['shop_id'] == train['shop_id'][i]) & ( gb['item_id'] == train['item_id'][i]) ][col]

train

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,shop_name,item_category_name
0,02.01.2013,0,59,22154,999.00,1.00,ЯВЛЕНИЕ 2012 (BD),37,"Ярославль ТЦ ""Альтаир""",Кино - Blu-Ray
1,03.01.2013,0,25,2574,399.00,2.00,DEL REY LANA Born To Die The Paradise Editio...,55,"Москва ТРК ""Атриум""",Музыка - CD локального производства
2,05.01.2013,0,25,2574,399.00,1.00,DEL REY LANA Born To Die The Paradise Editio...,55,"Москва ТРК ""Атриум""",Музыка - CD локального производства
3,07.01.2013,0,25,2574,399.00,1.00,DEL REY LANA Born To Die The Paradise Editio...,55,"Москва ТРК ""Атриум""",Музыка - CD локального производства
4,08.01.2013,0,25,2574,399.00,2.00,DEL REY LANA Born To Die The Paradise Editio...,55,"Москва ТРК ""Атриум""",Музыка - CD локального производства
5,10.01.2013,0,25,2574,399.00,1.00,DEL REY LANA Born To Die The Paradise Editio...,55,"Москва ТРК ""Атриум""",Музыка - CD локального производства
6,11.01.2013,0,25,2574,399.00,2.00,DEL REY LANA Born To Die The Paradise Editio...,55,"Москва ТРК ""Атриум""",Музыка - CD локального производства
7,13.01.2013,0,25,2574,399.00,1.00,DEL REY LANA Born To Die The Paradise Editio...,55,"Москва ТРК ""Атриум""",Музыка - CD локального производства
8,16.01.2013,0,25,2574,399.00,1.00,DEL REY LANA Born To Die The Paradise Editio...,55,"Москва ТРК ""Атриум""",Музыка - CD локального производства
9,26.01.2013,0,25,2574,399.00,1.00,DEL REY LANA Born To Die The Paradise Editio...,55,"Москва ТРК ""Атриум""",Музыка - CD локального производства


In [20]:
# afterwards dealing with null values

L = train.columns[train.isnull().any()]

for col in L:
    gb = train.groupby(by=['shop_id','item_id' ], as_index=False)[col].mean()
    for i in range(len(df)):
        if ( isnan(train[col][i]) ):
            train[col][i] = gb[ (gb['shop_id'] == train['shop_id'][i]) & ( gb['item_id'] == train['item_id'][i]) ][col]

In [21]:
#train = train.fillna( train.mean() )
train.isnull().sum()

date                  0
date_block_num        0
shop_id               0
item_id               0
item_price            0
item_cnt_day          0
item_name             0
item_category_id      0
shop_name             0
item_category_name    0
dtype: int64

# exculde columns having unnecessary names

In [23]:

train = train[['date', 'date_block_num', 'shop_id', 'item_category_id', 'item_id', 'item_price', 'item_cnt_day']]

train.head()

Unnamed: 0,date,date_block_num,shop_id,item_category_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,37,22154,999.0,1.0
1,03.01.2013,0,25,55,2574,399.0,2.0
2,05.01.2013,0,25,55,2574,399.0,1.0
3,07.01.2013,0,25,55,2574,399.0,1.0
4,08.01.2013,0,25,55,2574,399.0,2.0


# ***** encode str / boolean columns  ******

In [10]:
# train['type_code'] = LabelEncoder().fit_transform( train['type'] )


# Extract time based features

In [24]:
train['date'] = pd.to_datetime(train.date,format="%d.%m.%Y") # year in upper case

train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year

train.head()

Unnamed: 0,date,date_block_num,shop_id,item_category_id,item_id,item_price,item_cnt_day,month,year
0,2013-01-02,0,59,37,22154,999.0,1.0,1,2013
1,2013-01-03,0,25,55,2574,399.0,2.0,1,2013
2,2013-01-05,0,25,55,2574,399.0,1.0,1,2013
3,2013-01-07,0,25,55,2574,399.0,1.0,1,2013
4,2013-01-08,0,25,55,2574,399.0,2.0,1,2013


# group by specific time features

In [25]:
# .agg : Aggregate using one or more operations over the specified axis.
# date may appear sorted but in fact it isn't

train = train.sort_values('date')

# group by
# group by below will automatically eliminate date column !!! what will stay is only what is 
# in groupby and what is in the function after groupby
# if we don't include month and year in the group by they will be excluded to 
# group by should always be succeded by a function or an ".agg" , else change won't appear
train = train.groupby(by=['year', 'month', 'date_block_num',  'shop_id', 'item_category_id', 'item_id'], as_index=False).agg({'item_price':['sum', 'mean'], 'item_cnt_day':['sum', 'mean','count']})

train.head()


Unnamed: 0_level_0,year,month,date_block_num,shop_id,item_category_id,item_id,item_price,item_price,item_cnt_day,item_cnt_day,item_cnt_day
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,sum,mean,sum,mean,count
0,2013,1,0,2,2,5572,10730.0,1532.86,9.0,1.29,7
1,2013,1,0,2,2,5643,4775.21,2387.61,2.09,1.05,2
2,2013,1,0,2,5,5583,1188.3,594.15,2.0,1.0,2
3,2013,1,0,2,6,7893,5970.0,1990.0,3.0,1.0,3
4,2013,1,0,2,6,7894,1490.0,1490.0,1.0,1.0,1


In [26]:
# Rename features: must be by order , or else columns will have other names

train.columns = ['year', 'month', 'date_block_num', 'shop_id', 'item_category_id', 'item_id', 'item_price', 'mean_item_price', 'item_cnt', 'mean_item_cnt', 'transactions']

#print( train.head() )
print ( train.describe() )



           year     month  date_block_num   shop_id  item_category_id  \
count 600159.00 600159.00       600159.00 600159.00         600159.00   
mean    2014.22      6.54           20.16     32.06             42.49   
std        0.77      3.36            9.14     16.89             17.74   
min     2013.00      1.00            0.00      2.00              2.00   
25%     2014.00      4.00           13.00     19.00             30.00   
50%     2014.00      7.00           22.00     31.00             40.00   
75%     2015.00      9.00           28.00     47.00             55.00   
max     2015.00     12.00           33.00     59.00             83.00   

        item_id  item_price  mean_item_price  item_cnt  mean_item_cnt  \
count 600159.00   600159.00        600159.00 600159.00      600159.00   
mean   10014.88     2102.76           911.49      2.71           1.10   
std     6192.91     7012.47          1565.82     11.29           1.77   
min       30.00        0.50             0.50      

# eliminate unnecessary date columns


In [27]:

train = train.drop(['date_block_num'], axis = 1)

train.head()



Unnamed: 0,year,month,shop_id,item_category_id,item_id,item_price,mean_item_price,item_cnt,mean_item_cnt,transactions
0,2013,1,2,2,5572,10730.0,1532.86,9.0,1.29,7
1,2013,1,2,2,5643,4775.21,2387.61,2.09,1.05,2
2,2013,1,2,5,5583,1188.3,594.15,2.0,1.0,2
3,2013,1,2,6,7893,5970.0,1990.0,3.0,1.0,3
4,2013,1,2,6,7894,1490.0,1490.0,1.0,1.0,1


# empty_df strategy

In [15]:
# we shouldn't have missing records so we make a new DataFrame
# Build a data set with all the possible combinations of ['month', 'year','shop_id','item_id'] so we won't have missing records.

"""
shop_ids = train['shop_id'].unique()
item_ids = train['item_id'].unique()
empty_df = []
for i in range (2013,2015):
    for j in range(1,13):
        for shop in shop_ids:
            for item in item_ids:
                empty_df.append([i, j, shop, item])
#2015
for j in range(1,11):
    for shop in shop_ids:
        for item in item_ids:
            empty_df.append([2015, j, shop, item])

    
empty_df = pd.DataFrame(empty_df, columns=['year', 'month','shop_id','item_id'])

"""

"\nshop_ids = train['shop_id'].unique()\nitem_ids = train['item_id'].unique()\nempty_df = []\nfor i in range (2013,2015):\n    for j in range(1,13):\n        for shop in shop_ids:\n            for item in item_ids:\n                empty_df.append([i, j, shop, item])\n#2015\nfor j in range(1,11):\n    for shop in shop_ids:\n        for item in item_ids:\n            empty_df.append([2015, j, shop, item])\n\n    \nempty_df = pd.DataFrame(empty_df, columns=['year', 'month','shop_id','item_id'])\n\n"

In [16]:
#empty_df.head()

In [17]:
"""
# Merge the train set with the complete set (missing records will be filled with 0).
train = pd.merge(empty_df, train, on=['year', 'month','shop_id','item_id'], how='left')
train.fillna(0, inplace=True)

print( train.head() )

print( 'train:  ', train.shape ) # became 6 million rows
print( train.describe() )

"""

"\n# Merge the train set with the complete set (missing records will be filled with 0).\ntrain = pd.merge(empty_df, train, on=['year', 'month','shop_id','item_id'], how='left')\ntrain.fillna(0, inplace=True)\n\nprint( train.head() )\n\nprint( 'train:  ', train.shape ) # became 6 million rows\n\n"

# Checking for outliers if you can delete rows without effecting result on test


In [29]:
train.shape

(600159, 10)

In [30]:
# we don't have problem with item_price
train = train[(train['item_cnt'] >= 0) & (train['item_cnt'] <= 20) & (train['item_price']< 400000)]

#if there is a problem with query : check null values: they give false result

print( train.shape )


(594043, 10)


# shift 

In [31]:
    

train['item_cnt_month'] =  train.sort_values(['year','month']).groupby(['shop_id', 'item_id'])['item_cnt'].shift(-1)

print ( train.head(5) )
print( train.isnull().sum())



   year  month  shop_id  item_category_id  item_id  item_price  \
0  2013      1        2                 2     5572    10730.00   
1  2013      1        2                 2     5643     4775.21   
2  2013      1        2                 5     5583     1188.30   
3  2013      1        2                 6     7893     5970.00   
4  2013      1        2                 6     7894     1490.00   

   mean_item_price  item_cnt  mean_item_cnt  transactions  item_cnt_month  
0          1532.86      9.00           1.29             7            1.00  
1          2387.61      2.09           1.05             2            5.00  
2           594.15      2.00           1.00             2            1.00  
3          1990.00      3.00           1.00             3            2.00  
4          1490.00      1.00           1.00             1            2.00  
year                     0
month                    0
shop_id                  0
item_category_id         0
item_id                  0
item_price  

# unitary item price

In [33]:


train['item_price_unit'] = train['item_price'] // train['item_cnt']

train['item_price_unit'].fillna(0, inplace=True)

print( train.isnull().sum())




year                     0
month                    0
shop_id                  0
item_category_id         0
item_id                  0
item_price               0
mean_item_price          0
item_cnt                 0
mean_item_cnt            0
transactions             0
item_cnt_month      111292
item_price_unit          0
dtype: int64


# Group based features.

# min and max item price

In [34]:
# always year before month in sort_values
gp_item_price = train.sort_values(['year', 'month']).groupby(['item_id'], as_index=False).agg({'item_price':[np.min, np.max]})

gp_item_price

Unnamed: 0_level_0,item_id,item_price,item_price
Unnamed: 0_level_1,Unnamed: 1_level_1,amin,amax
0,30,129.00,4389.00
1,31,160.00,9786.00
2,32,75.00,5235.00
3,33,133.00,2994.00
4,38,1586.00,4798.00
5,42,249.00,747.00
6,45,299.00,1196.00
7,51,127.00,2241.00
8,53,298.70,1196.00
9,57,155.00,897.00


In [35]:
gp_item_price.columns = ['item_id', 'hist_min_item_price', 'hist_max_item_price']
gp_item_price

Unnamed: 0,item_id,hist_min_item_price,hist_max_item_price
0,30,129.00,4389.00
1,31,160.00,9786.00
2,32,75.00,5235.00
3,33,133.00,2994.00
4,38,1586.00,4798.00
5,42,249.00,747.00
6,45,299.00,1196.00
7,51,127.00,2241.00
8,53,298.70,1196.00
9,57,155.00,897.00


## what we group by is what we mege on .

In [36]:
train = pd.merge(train, gp_item_price, on='item_id', how='left')
train

Unnamed: 0,year,month,shop_id,item_category_id,item_id,item_price,mean_item_price,item_cnt,mean_item_cnt,transactions,item_cnt_month,item_price_unit,hist_min_item_price,hist_max_item_price
0,2013,1,2,2,5572,10730.00,1532.86,9.00,1.29,7,1.00,1192.00,1300.00,18979.50
1,2013,1,2,2,5643,4775.21,2387.61,2.09,1.05,2,5.00,2281.00,2036.00,35260.00
2,2013,1,2,5,5583,1188.30,594.15,2.00,1.00,2,1.00,594.00,367.00,5592.00
3,2013,1,2,6,7893,5970.00,1990.00,3.00,1.00,3,2.00,1990.00,895.00,24169.50
4,2013,1,2,6,7894,1490.00,1490.00,1.00,1.00,1,2.00,1490.00,1192.00,25880.00
5,2013,1,2,6,7895,2697.00,899.00,4.00,1.33,3,1.00,674.00,449.50,8447.00
6,2013,1,2,6,7956,13780.00,6890.00,2.00,1.00,2,2.00,6890.00,4032.00,35230.00
7,2013,1,2,19,1409,1398.50,1398.50,1.00,1.00,1,1.00,1398.00,248.00,5596.00
8,2013,1,2,19,1467,899.00,899.00,1.00,1.00,1,1.00,899.00,599.50,5994.00
9,2013,1,2,19,3076,1399.00,1399.00,1.00,1.00,1,3.00,1399.00,399.50,12559.76


In [37]:

train['price_increase'] = train['item_price'] - train['hist_min_item_price']
train['price_decrease'] = train['hist_max_item_price'] - train['item_price']

train.head()

Unnamed: 0,year,month,shop_id,item_category_id,item_id,item_price,mean_item_price,item_cnt,mean_item_cnt,transactions,item_cnt_month,item_price_unit,hist_min_item_price,hist_max_item_price,price_increase,price_decrease
0,2013,1,2,2,5572,10730.0,1532.86,9.0,1.29,7,1.0,1192.0,1300.0,18979.5,9430.0,8249.5
1,2013,1,2,2,5643,4775.21,2387.61,2.09,1.05,2,5.0,2281.0,2036.0,35260.0,2739.21,30484.79
2,2013,1,2,5,5583,1188.3,594.15,2.0,1.0,2,1.0,594.0,367.0,5592.0,821.3,4403.7
3,2013,1,2,6,7893,5970.0,1990.0,3.0,1.0,3,2.0,1990.0,895.0,24169.5,5075.0,18199.5
4,2013,1,2,6,7894,1490.0,1490.0,1.0,1.0,1,2.0,1490.0,1192.0,25880.0,298.0,24390.0


In [38]:
train.isnull().sum()

year                        0
month                       0
shop_id                     0
item_category_id            0
item_id                     0
item_price                  0
mean_item_price             0
item_cnt                    0
mean_item_cnt               0
transactions                0
item_cnt_month         111292
item_price_unit             0
hist_min_item_price         0
hist_max_item_price         0
price_increase              0
price_decrease              0
dtype: int64

# rolling window

In [39]:
# min_prediods= 1 =) we don't have null values ( at least
# one observation required to have a value)
# Min value 
f_min = lambda x: x.rolling(window=3, min_periods=1).min() 
# Max value
f_max = lambda x: x.rolling(window=3, min_periods=1).max()
# Mean value
f_mean = lambda x: x.rolling(window=3, min_periods=1).mean()
# Standard deviation
# std: dispersion of a dataset relative to its mean and is calculated as the square root of the variance
f_std = lambda x: x.rolling(window=3, min_periods=1).std()

function_list = [f_min, f_max, f_mean, f_std]
function_name = ['min', 'max', 'mean', 'std']
for i in range(len(function_list)):
    train[('item_cnt_%s' % function_name[i])] = train.groupby(['shop_id', 'item_category_id', 'item_id'])['item_cnt'].apply(function_list[i])

# Fill the empty std features with 0
train['item_cnt_std'].fillna(0, inplace=True)

print(train.head())

print(train.isnull().sum())

   year  month  shop_id  item_category_id  item_id  item_price  \
0  2013      1        2                 2     5572    10730.00   
1  2013      1        2                 2     5643     4775.21   
2  2013      1        2                 5     5583     1188.30   
3  2013      1        2                 6     7893     5970.00   
4  2013      1        2                 6     7894     1490.00   

   mean_item_price  item_cnt  mean_item_cnt  transactions  item_cnt_month  \
0          1532.86      9.00           1.29             7            1.00   
1          2387.61      2.09           1.05             2            5.00   
2           594.15      2.00           1.00             2            1.00   
3          1990.00      3.00           1.00             3            2.00   
4          1490.00      1.00           1.00             1            2.00   

   item_price_unit  hist_min_item_price  hist_max_item_price  price_increase  \
0          1192.00              1300.00             18979.50

# item_cnt_shifted%s' % lag

In [40]:
lag_list = [1, 2, 3]

for lag in lag_list:
    ft_name = ('item_cnt_shifted%s' % lag)
    train[ft_name] = train.sort_values(['year', 'month']).groupby(['shop_id', 'item_category_id', 'item_id'])['item_cnt'].shift(lag)
    # Fill the empty shifted features with 0
    train[ft_name].fillna(0, inplace=True)

train.head()
    

Unnamed: 0,year,month,shop_id,item_category_id,item_id,item_price,mean_item_price,item_cnt,mean_item_cnt,transactions,...,hist_max_item_price,price_increase,price_decrease,item_cnt_min,item_cnt_max,item_cnt_mean,item_cnt_std,item_cnt_shifted1,item_cnt_shifted2,item_cnt_shifted3
0,2013,1,2,2,5572,10730.0,1532.86,9.0,1.29,7,...,18979.5,9430.0,8249.5,9.0,9.0,9.0,0.0,0.0,0.0,0.0
1,2013,1,2,2,5643,4775.21,2387.61,2.09,1.05,2,...,35260.0,2739.21,30484.79,2.09,2.09,2.09,0.0,0.0,0.0,0.0
2,2013,1,2,5,5583,1188.3,594.15,2.0,1.0,2,...,5592.0,821.3,4403.7,2.0,2.0,2.0,0.0,0.0,0.0,0.0
3,2013,1,2,6,7893,5970.0,1990.0,3.0,1.0,3,...,24169.5,5075.0,18199.5,3.0,3.0,3.0,0.0,0.0,0.0,0.0
4,2013,1,2,6,7894,1490.0,1490.0,1.0,1.0,1,...,25880.0,298.0,24390.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0


# item trend

In [41]:
train['item_trend'] = train['item_cnt']

for lag in lag_list:
    ft_name = ('item_cnt_shifted%s' % lag)
    train['item_trend'] -= train[ft_name]

train['item_trend'] /= len(lag_list) + 1

train.head()


Unnamed: 0,year,month,shop_id,item_category_id,item_id,item_price,mean_item_price,item_cnt,mean_item_cnt,transactions,...,price_increase,price_decrease,item_cnt_min,item_cnt_max,item_cnt_mean,item_cnt_std,item_cnt_shifted1,item_cnt_shifted2,item_cnt_shifted3,item_trend
0,2013,1,2,2,5572,10730.0,1532.86,9.0,1.29,7,...,9430.0,8249.5,9.0,9.0,9.0,0.0,0.0,0.0,0.0,2.25
1,2013,1,2,2,5643,4775.21,2387.61,2.09,1.05,2,...,2739.21,30484.79,2.09,2.09,2.09,0.0,0.0,0.0,0.0,0.52
2,2013,1,2,5,5583,1188.3,594.15,2.0,1.0,2,...,821.3,4403.7,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.5
3,2013,1,2,6,7893,5970.0,1990.0,3.0,1.0,3,...,5075.0,18199.5,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.75
4,2013,1,2,6,7894,1490.0,1490.0,1.0,1.0,1,...,298.0,24390.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.25


In [42]:
train.describe()

Unnamed: 0,year,month,shop_id,item_category_id,item_id,item_price,mean_item_price,item_cnt,mean_item_cnt,transactions,...,price_increase,price_decrease,item_cnt_min,item_cnt_max,item_cnt_mean,item_cnt_std,item_cnt_shifted1,item_cnt_shifted2,item_cnt_shifted3,item_trend
count,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0,...,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0
mean,2014.22,6.53,32.07,42.52,10014.66,1921.7,904.68,2.11,1.06,1.9,...,1345.85,7130.28,1.62,3.01,2.25,0.79,1.85,1.63,1.44,-0.7
std,0.77,3.36,16.9,17.69,6181.83,5737.56,1545.69,2.31,0.29,1.72,...,5017.91,12997.7,1.76,3.1,2.22,1.27,2.39,2.42,2.42,1.32
min,2013.0,1.0,2.0,2.0,30.0,0.5,0.5,1.0,1.0,1.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,-12.5
25%,2014.0,4.0,19.0,30.0,4418.0,299.0,299.0,1.0,1.0,1.0,...,104.02,1196.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,-1.0
50%,2014.0,7.0,31.0,40.0,9171.0,700.0,454.0,1.0,1.0,1.0,...,329.0,3354.7,1.0,2.0,1.33,0.58,1.0,1.0,1.0,-0.5
75%,2015.0,9.0,47.0,55.0,15334.0,1798.0,1099.0,2.0,1.0,2.0,...,1054.0,8196.5,1.0,3.0,2.33,1.0,2.0,2.0,2.0,0.0
max,2015.0,12.0,59.0,83.0,22167.0,366860.0,42990.0,20.0,20.0,19.0,...,348869.0,348869.0,20.0,20.0,20.0,13.44,20.0,20.0,20.0,5.0


# fill null values with 0

In [43]:
train = train.fillna(0)

# add shop_mean

In [44]:
# Shop mean encoding
gp_shop_mean = train.groupby(['shop_id']).agg({'item_cnt_month': ['mean']})
print( gp_shop_mean ) # groupby eliminate all columns but shop_id and item_cnt_month: mean
gp_shop_mean.columns = ['shop_mean']
print( gp_shop_mean )
gp_shop_mean.reset_index(inplace=True)
print( gp_shop_mean )

train = pd.merge(train, gp_shop_mean, on=['shop_id'], how='left')
train.head()


        item_cnt_month
                  mean
shop_id               
2                 1.49
3                 1.30
4                 1.42
5                 1.40
6                 1.60
7                 1.69
10                1.12
12                1.43
14                1.44
15                1.66
16                1.57
18                1.55
19                1.62
21                1.44
22                1.66
24                1.66
25                2.17
26                1.46
28                2.21
31                2.51
34                0.95
35                1.57
36                0.00
37                1.33
38                1.65
39                1.00
41                1.49
42                1.83
44                1.24
45                1.32
46                1.66
47                1.72
48                1.35
49                1.15
50                1.61
52                1.40
53                1.62
55                3.13
56                1.48
57                1.92
58         

Unnamed: 0,year,month,shop_id,item_category_id,item_id,item_price,mean_item_price,item_cnt,mean_item_cnt,transactions,...,price_decrease,item_cnt_min,item_cnt_max,item_cnt_mean,item_cnt_std,item_cnt_shifted1,item_cnt_shifted2,item_cnt_shifted3,item_trend,shop_mean
0,2013,1,2,2,5572,10730.0,1532.86,9.0,1.29,7,...,8249.5,9.0,9.0,9.0,0.0,0.0,0.0,0.0,2.25,1.49
1,2013,1,2,2,5643,4775.21,2387.61,2.09,1.05,2,...,30484.79,2.09,2.09,2.09,0.0,0.0,0.0,0.0,0.52,1.49
2,2013,1,2,5,5583,1188.3,594.15,2.0,1.0,2,...,4403.7,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.5,1.49
3,2013,1,2,6,7893,5970.0,1990.0,3.0,1.0,3,...,18199.5,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.75,1.49
4,2013,1,2,6,7894,1490.0,1490.0,1.0,1.0,1,...,24390.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.25,1.49


# add item_mean, shop_item_mean, year_mean, month_mean

In [45]:
# Item mean encoding.
gp_item_mean = train.groupby(['item_id']).agg({'item_cnt_month': ['mean']})
gp_item_mean.columns = ['item_mean']
gp_item_mean.reset_index(inplace=True)
train = pd.merge(train, gp_item_mean, on=['item_id'], how='left')

# Shop with item mean encoding.
gp_shop_item_mean = train.groupby(['shop_id', 'item_id']).agg({'item_cnt_month': ['mean']})
gp_shop_item_mean.columns = ['shop_item_mean']
gp_shop_item_mean.reset_index(inplace=True)
train = pd.merge(train, gp_shop_item_mean, on=['shop_id', 'item_id'], how='left')

# Year mean encoding.
gp_year_mean = train.groupby(['year']).agg({'item_cnt_month': ['mean']})
gp_year_mean.columns = ['year_mean']
gp_year_mean.reset_index(inplace=True)
train = pd.merge(train, gp_year_mean, on=['year'], how='left')

# Month mean encoding.
gp_month_mean = train.groupby(['month']).agg({'item_cnt_month': ['mean']})
gp_month_mean.columns = ['month_mean']
gp_month_mean.reset_index(inplace=True)
train = pd.merge(train, gp_month_mean, on=['month'], how='left')

print( train.shape )
print( train.head() )
del(gp_shop_mean, gp_shop_item_mean, gp_year_mean, gp_month_mean )


(594043, 29)
   year  month  shop_id  item_category_id  item_id  item_price  \
0  2013      1        2                 2     5572    10730.00   
1  2013      1        2                 2     5643     4775.21   
2  2013      1        2                 5     5583     1188.30   
3  2013      1        2                 6     7893     5970.00   
4  2013      1        2                 6     7894     1490.00   

   mean_item_price  item_cnt  mean_item_cnt  transactions     ...      \
0          1532.86      9.00           1.29             7     ...       
1          2387.61      2.09           1.05             2     ...       
2           594.15      2.00           1.00             2     ...       
3          1990.00      3.00           1.00             3     ...       
4          1490.00      1.00           1.00             1     ...       

   item_cnt_std  item_cnt_shifted1  item_cnt_shifted2  item_cnt_shifted3  \
0          0.00               0.00               0.00               0.00   

In [46]:
train.describe()

Unnamed: 0,year,month,shop_id,item_category_id,item_id,item_price,mean_item_price,item_cnt,mean_item_cnt,transactions,...,item_cnt_std,item_cnt_shifted1,item_cnt_shifted2,item_cnt_shifted3,item_trend,shop_mean,item_mean,shop_item_mean,year_mean,month_mean
count,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0,...,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0,594043.0
mean,2014.22,6.53,32.07,42.52,10014.66,1921.7,904.68,2.11,1.06,1.9,...,0.79,1.85,1.63,1.44,-0.7,1.69,1.69,1.69,1.69,1.69
std,0.77,3.36,16.9,17.69,6181.83,5737.56,1545.69,2.31,0.29,1.72,...,1.27,2.39,2.42,2.42,1.32,0.36,1.03,1.39,0.39,0.33
min,2013.0,1.0,2.0,2.0,30.0,0.5,0.5,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,-12.5,0.0,0.0,0.0,1.24,1.03
25%,2014.0,4.0,19.0,30.0,4418.0,299.0,299.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,-1.0,1.44,1.05,0.91,1.24,1.51
50%,2014.0,7.0,31.0,40.0,9171.0,700.0,454.0,1.0,1.0,1.0,...,0.58,1.0,1.0,1.0,-0.5,1.62,1.36,1.27,1.96,1.73
75%,2015.0,9.0,47.0,55.0,15334.0,1798.0,1099.0,2.0,1.0,2.0,...,1.0,2.0,2.0,2.0,0.0,1.83,2.09,2.0,1.96,1.85
max,2015.0,12.0,59.0,83.0,22167.0,366860.0,42990.0,20.0,20.0,19.0,...,13.44,20.0,20.0,20.0,5.0,3.13,13.07,15.0,2.12,2.61


# making train_set and validation_set

In [47]:
train2 = train.copy()

In [48]:
# test set : without first 3 months

#print( train14[-1:][['month', 'year']])


"""
# for pandas dataframes we use & for and | for or
train_set = train14[ np.logical_or( (train14['month']>= 3 & train14['year']==2013), (train14['year'] == 2014),(train14['month']<= 3 & train14['year']==2015)) ]
train_set.shape
"""

L1 = train2[ (train2['year']==2013) & (train2['month']>=4) ]
L2 = train2[ train2['year']==2014 ]
L3 = train2[ (train2['year']==2015) & (train2['month']<=4) ]

L1 = L1.append(L2)
L1 = L1.append(L3)

train_set = L1.copy()
print( train_set.shape )

del(L1,L2,L3)

validation_set = train2[(train2['year']==2015) & (train2['month'] >=5)]
print ( validation_set.shape)

print( 'remember we have excluded first 3 months')





(411547, 29)
(157049, 29)
remember we have excluded first 3 months


# X_train, Y_train, X_validation, Y_validation, X_test

In [49]:
X_train = train_set.drop(['item_cnt_month'], axis=1)
Y_train = train_set['item_cnt_month'].astype(int)
X_validation = validation_set.drop(['item_cnt_month'], axis=1)
Y_validation = validation_set['item_cnt_month'].astype(int)

In [50]:
int_features = ['shop_id', 'item_id', 'year', 'month']

X_train[int_features] = X_train[int_features].astype('int32')
X_validation[int_features] = X_validation[int_features].astype('int32')


In [51]:
print( 'train shape:  ', train2.shape )
print('train_set shape: ', train_set.shape )
print( 'validation_set shape: ' , validation_set.shape )

#X_test2 = pd.merge(test, train4, on=['shop_id', 'item_id'], how='left')
#print( X_test2.shape )


latest_records = pd.concat([train_set, validation_set]).drop_duplicates(subset=['shop_id', 'item_id'], keep='last')

X_test = pd.merge(test, latest_records, on=['shop_id', 'item_id'], how='left', suffixes=['', '_'])
print('X_test shape:   ', X_test.shape )



train shape:   (594043, 29)
train_set shape:  (411547, 29)
validation_set shape:  (157049, 29)
X_test shape:    (214200, 30)


In [52]:
X_test['year'] = 2015
X_test['month'] = 9
X_test.drop('item_cnt_month', axis=1, inplace=True)
X_test[int_features] = X_test[int_features].astype('int32')
X_test = X_test[X_train.columns]


# check if test contain null values after merge

In [53]:
X_test.isnull().sum()

year                        0
month                       0
shop_id                     0
item_category_id       104076
item_id                     0
item_price             104076
mean_item_price        104076
item_cnt               104076
mean_item_cnt          104076
transactions           104076
item_price_unit        104076
hist_min_item_price    104076
hist_max_item_price    104076
price_increase         104076
price_decrease         104076
item_cnt_min           104076
item_cnt_max           104076
item_cnt_mean          104076
item_cnt_std           104076
item_cnt_shifted1      104076
item_cnt_shifted2      104076
item_cnt_shifted3      104076
item_trend             104076
shop_mean              104076
item_mean              104076
shop_item_mean         104076
year_mean              104076
month_mean             104076
dtype: int64

In [54]:
X_test.fillna(X_test.mean(), inplace=True)
X_test

Unnamed: 0,year,month,shop_id,item_category_id,item_id,item_price,mean_item_price,item_cnt,mean_item_cnt,transactions,...,item_cnt_std,item_cnt_shifted1,item_cnt_shifted2,item_cnt_shifted3,item_trend,shop_mean,item_mean,shop_item_mean,year_mean,month_mean
0,2015,9,5,19.00,5037,749.50,749.50,1.00,1.00,1.00,...,1.15,3.00,1.00,1.00,-1.00,1.40,2.40,1.33,1.24,1.44
1,2015,9,5,43.83,5320,1347.61,963.37,1.38,1.03,1.31,...,0.40,1.18,1.02,0.88,-0.42,1.60,1.23,0.98,1.39,1.49
2,2015,9,5,19.00,5233,1199.00,1199.00,1.00,1.00,1.00,...,1.15,3.00,1.00,2.00,-1.25,1.40,2.22,1.40,1.24,1.03
3,2015,9,5,23.00,5232,599.00,599.00,1.00,1.00,1.00,...,0.00,0.00,0.00,0.00,0.25,1.40,0.83,0.00,1.24,1.51
4,2015,9,5,43.83,5268,1347.61,963.37,1.38,1.03,1.31,...,0.40,1.18,1.02,0.88,-0.42,1.60,1.23,0.98,1.39,1.49
5,2015,9,5,23.00,5039,1499.00,1499.00,1.00,1.00,1.00,...,1.15,1.00,3.00,1.00,-1.00,1.40,2.14,1.50,1.24,1.03
6,2015,9,5,20.00,5041,7998.00,3999.00,2.00,1.00,2.00,...,0.71,3.00,0.00,0.00,-0.25,1.40,0.88,1.00,1.24,1.03
7,2015,9,5,55.00,5046,349.00,349.00,1.00,1.00,1.00,...,2.31,1.00,5.00,1.00,-1.50,1.40,1.69,1.75,1.24,1.74
8,2015,9,5,55.00,5319,897.00,299.00,3.00,1.00,3.00,...,0.58,2.00,3.00,4.00,-1.50,1.40,3.04,3.00,1.24,1.44
9,2015,9,5,43.83,5003,1347.61,963.37,1.38,1.03,1.31,...,0.40,1.18,1.02,0.88,-0.42,1.60,1.23,0.98,1.39,1.49


# drop unnecessary columns: try dropping and see the result

In [57]:
# I'm dropping "item_category_id", we don't have it on test set and would be a little hard to create categories for items that exist only on test set.
X_train.drop(['item_category_id'], axis=1, inplace=True)
X_validation.drop(['item_category_id'], axis=1, inplace=True)
X_test.drop(['item_category_id'], axis=1, inplace=True)

In [58]:
print('\n\n\n', X_train.head() )
print( '\n\n\n', X_validation.head() )
print( '\n\n\n', X_test.head() )




        year  month  shop_id  item_id  item_price  mean_item_price  item_cnt  \
25447  2013      4        2     5572     2980.00          1490.00      2.00   
25448  2013      4        2     5581      499.00           499.00      1.00   
25449  2013      4        2     7893     9350.00          1870.00      5.00   
25450  2013      4        2     2754     1999.00          1999.00      1.00   
25451  2013      4        2     2919      899.00           899.00      1.00   

       mean_item_cnt  transactions  item_price_unit     ...      item_cnt_std  \
25447           1.00             2          1490.00     ...              0.58   
25448           1.00             1           499.00     ...              0.00   
25449           1.00             5          1870.00     ...              2.08   
25450           1.00             1          1999.00     ...              0.00   
25451           1.00             1           899.00     ...              0.00   

       item_cnt_shifted1  item_cnt

# catboost

In [59]:
"""
import catboost
from catboost import Pool
from catboost import CatBoostRegressor



cat_features = [0, 1, 7, 8]

catboost_model = CatBoostRegressor(
    iterations=500,
    max_ctr_complexity=4,
    random_seed=0,
    od_type='Iter',
    od_wait=25,
    verbose=50,
    depth=4
)

catboost_model.fit(
    X_train, Y_train,
    cat_features=cat_features,
    eval_set=(X_validation, Y_validation)
)

catboost_train_pred = catboost_model.predict(X_train)
catboost_val_pred = catboost_model.predict(X_validation)
catboost_test_pred = catboost_model.predict(X_test)

from sklearn.metrics import mean_squared_error 

rmse = np.sqrt(mean_squared_error(catboost_train_pred, Y_train) )
print(rmse)
"""

ModuleNotFoundError: No module named 'catboost'

# random forest regressor

In [60]:
#rf_features = ['shop_id', 'item_id', 'item_cnt', 'transactions', 'year',
 #              'item_cnt_mean', 'item_cnt_std', 'item_cnt_shifted1', 
  #             'shop_mean', 'item_mean', 'item_trend', 'mean_item_cnt']


rf_features = X_train.columns


rf_train = X_train[rf_features]
rf_val = X_validation[rf_features]
rf_test = X_test[rf_features]



In [61]:
rf_model = RandomForestRegressor(n_estimators=50, max_depth=7, random_state=0, n_jobs=-1)
rf_model.fit(rf_train, Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=7,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [62]:

rf_train_pred = rf_model.predict(rf_train)
rf_val_pred = rf_model.predict(rf_val)
rf_test_pred = rf_model.predict(rf_test)

rmse = np.sqrt(mean_squared_error(rf_train_pred, Y_train))
print('rmse of train : ' , rmse)
               
rmse = np.sqrt(mean_squared_error(rf_val_pred, Y_validation))
print('rmse of validation : ' , rmse)



rmse of train :  1.3794975829697087
rmse of validation :  1.411538357456815


In [63]:
print( rf_model.score(X_train, Y_train) )
print( rf_model.score(X_validation, Y_validation) )

0.6039575336096015
0.33711358436530636


# Linear Regression

In [64]:
#lr_features = ['item_cnt', 'item_cnt_shifted1', 'item_trend', 'mean_item_cnt', 'shop_mean']

lr_features = X_train.columns


lr_train = X_train[lr_features]
lr_val = X_validation[lr_features]
lr_test = X_test[lr_features]

In [65]:
lr_scaler = MinMaxScaler()
lr_scaler.fit(lr_train)
lr_train = lr_scaler.transform(lr_train)
lr_val = lr_scaler.transform(lr_val)
lr_test = lr_scaler.transform(lr_test)
lr_model = LinearRegression(n_jobs=-1)
lr_model.fit(lr_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [66]:
lr_train_pred = lr_model.predict(lr_train)
lr_val_pred = lr_model.predict(lr_val)
lr_test_pred = lr_model.predict(lr_test)

In [67]:
rmse = np.sqrt(mean_squared_error(lr_train_pred, Y_train))
print('rmse of train : ' , rmse)
               
rmse = np.sqrt(mean_squared_error(lr_val_pred, Y_validation))
print('rmse of validation : ' , rmse)

print( lr_model.score(lr_train, Y_train) )
print( lr_model.score(lr_val, Y_validation) )

rmse of train :  1.4787845019334536
rmse of validation :  1.4699445111674136
0.544897051202116
0.28112127246930063


# KNN regressor

In [65]:
# Use only part of features on KNN.
#knn_features = ['item_cnt', 'item_cnt_mean', 'item_cnt_std', 'item_cnt_shifted1',
#               'item_cnt_shifted2', 'shop_mean', 'shop_item_mean', 
#              'item_trend', 'mean_item_cnt']

"""
knn_features = X_train.columns



# Subsample train set (using the whole data was taking too long).
#X_train_sampled = X_train[:100000]
#Y_train_sampled = Y_train[:100000]

knn_train = X_train[knn_features]
knn_val = X_validation[knn_features]
knn_test = X_test[knn_features]
"""

'\nknn_features = X_train.columns\n\n\n\n# Subsample train set (using the whole data was taking too long).\n#X_train_sampled = X_train[:100000]\n#Y_train_sampled = Y_train[:100000]\n\nknn_train = X_train[knn_features]\nknn_val = X_validation[knn_features]\nknn_test = X_test[knn_features]\n'

In [66]:
"""
knn_scaler = MinMaxScaler()
knn_scaler.fit(knn_train)
knn_train = knn_scaler.transform(knn_train)
knn_val = knn_scaler.transform(knn_val)
knn_test = knn_scaler.transform(knn_test)

"""

'\nknn_scaler = MinMaxScaler()\nknn_scaler.fit(knn_train)\nknn_train = knn_scaler.transform(knn_train)\nknn_val = knn_scaler.transform(knn_val)\nknn_test = knn_scaler.transform(knn_test)\n\n'

In [67]:

#knn_model = KNeighborsRegressor(n_neighbors=9, leaf_size=13, n_jobs=-1)
#knn_model.fit(knn_train, Y_train)

In [68]:
#knn_train_pred = knn_model.predict(knn_train)
#knn_val_pred = knn_model.predict(knn_val)
#knn_test_pred = knn_model.predict(knn_test)

In [69]:
#rmse = np.sqrt(mean_squared_error(knn_train_pred, Y_train_sampled))
#print('rmse of train : ' , rmse)
               
#rmse = np.sqrt(mean_squared_error(knn_val_pred, Y_validation))
#print('rmse of validation : ' , rmse)

# lightgbm

In [68]:
lgbm_model=LGBMRegressor(
        n_estimators=200,
        learning_rate=0.03,
        num_leaves=32,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.073,
        min_split_gain=0.0222415,
        min_child_weight=40)



In [69]:
lgbm_model.fit(X_train, Y_train)


lightgbm_train_pred = lgbm_model.predict(X_train)
lightgbm_val_pred = lgbm_model.predict(X_validation)
lightgbm_test_pred = lgbm_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(Y_train, lightgbm_train_pred ))
print('rmse of train : ' , rmse)
               
rmse = np.sqrt(mean_squared_error(Y_validation, lightgbm_val_pred))
print('rmse of validation : ' , rmse)

rmse of train :  1.3182918897851479
rmse of validation :  1.3381554041488075


In [70]:
print( lgbm_model.score(X_train, Y_train) )
print( lgbm_model.score(X_validation, Y_validation) )

0.6383212198656871
0.40424615308472756


# xgboost

In [73]:
# Use only part of features on XGBoost.
#xgb_features = ['item_cnt','item_cnt_mean', 'item_cnt_std', 'item_cnt_shifted1', 
 #               'item_cnt_shifted2', 'item_cnt_shifted3', 'shop_mean', 
  #              'shop_item_mean', 'item_trend', 'mean_item_cnt']

"""
xgb_features = X_train.columns


xgb_train = X_train[xgb_features]
xgb_val = X_validation[xgb_features]
xgb_test = X_test[xgb_features]

xgb_model = XGBRegressor(max_depth=8, 
                         n_estimators=500, 
                         min_child_weight=1000,  
                         colsample_bytree=0.7, 
                         subsample=0.7, 
                         eta=0.3, 
                         seed=0)
xgb_model.fit(xgb_train, 
              Y_train, 
              eval_metric="rmse", 
              eval_set=[(xgb_train, Y_train), (xgb_val, Y_validation)], 
              verbose=20, 
              early_stopping_rounds=20)

"""

'\nxgb_features = X_train.columns\n\n\nxgb_train = X_train[xgb_features]\nxgb_val = X_validation[xgb_features]\nxgb_test = X_test[xgb_features]\n\nxgb_model = XGBRegressor(max_depth=8, \n                         n_estimators=500, \n                         min_child_weight=1000,  \n                         colsample_bytree=0.7, \n                         subsample=0.7, \n                         eta=0.3, \n                         seed=0)\nxgb_model.fit(xgb_train, \n              Y_train, \n              eval_metric="rmse", \n              eval_set=[(xgb_train, Y_train), (xgb_val, Y_validation)], \n              verbose=20, \n              early_stopping_rounds=20)\n\n'

In [74]:
"""
xgb_train_pred = xgb_model.predict(xgb_train)
xgb_val_pred = xgb_model.predict(xgb_val)
xgb_test_pred = xgb_model.predict(xgb_test)
"""

'\nxgb_train_pred = xgb_model.predict(xgb_train)\nxgb_val_pred = xgb_model.predict(xgb_val)\nxgb_test_pred = xgb_model.predict(xgb_test)\n'

In [75]:
"""
from sklearn.metrics import mean_squared_error 

rmse = np.sqrt(mean_squared_error(xgb_train_pred, Y_train))
print('rmse of train : ' , rmse)
               
rmse = np.sqrt(mean_squared_error(xgb_val_pred, Y_validation))
print('rmse of validation : ' , rmse)

"""

"\nfrom sklearn.metrics import mean_squared_error \n\nrmse = np.sqrt(mean_squared_error(xgb_train_pred, Y_train))\nprint('rmse of train : ' , rmse)\n               \nrmse = np.sqrt(mean_squared_error(xgb_val_pred, Y_validation))\nprint('rmse of validation : ' , rmse)\n\n"

# DMatrix of xgboost

In [74]:
def XGB_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=2017, num_rounds=500):
    param = {}
    param['objective'] = 'reg:linear'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['eval_metric'] = 'rmse'
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())

    xgtrain = xgboost.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgboost.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgboost.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgboost.DMatrix(test_X)
        model = xgboost.train(plst, xgtrain, num_rounds)
        
    return model    
    
    
dmatrix_model = XGB_regressor(train_X = X_train, train_y = Y_train, test_X = X_validation, test_y = Y_validation)

[0]	train-rmse:2.42796	test-rmse:1.70156
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 20 rounds.
[1]	train-rmse:2.27296	test-rmse:1.60103
[2]	train-rmse:2.13837	test-rmse:1.52772
[3]	train-rmse:2.0217	test-rmse:1.4695
[4]	train-rmse:1.92131	test-rmse:1.42242
[5]	train-rmse:1.83542	test-rmse:1.38522
[6]	train-rmse:1.76886	test-rmse:1.36573
[7]	train-rmse:1.70408	test-rmse:1.34271
[8]	train-rmse:1.6496	test-rmse:1.32623
[9]	train-rmse:1.60882	test-rmse:1.31982
[10]	train-rmse:1.56813	test-rmse:1.31002
[11]	train-rmse:1.53484	test-rmse:1.30614
[12]	train-rmse:1.50608	test-rmse:1.30206
[13]	train-rmse:1.48502	test-rmse:1.30303
[14]	train-rmse:1.46418	test-rmse:1.30163
[15]	train-rmse:1.44629	test-rmse:1.30193
[16]	train-rmse:1.4302	test-rmse:1.30204
[17]	train-rmse:1.41624	test-rmse:1.30286
[18]	train-rmse:1.40572	test-rmse:1.30298
[19]	train-rmse:1.39543	test-rmse:1.30327
[20]	train-rmse:1.38743	test-rm

In [75]:
# Use only part of features on XGBoost.
#xgb_features = ['item_cnt','item_cnt_mean', 'item_cnt_std', 'item_cnt_shifted1', 
 #               'item_cnt_shifted2', 'item_cnt_shifted3', 'shop_mean', 
  #              'shop_item_mean', 'item_trend', 'mean_item_cnt']

xgb_features = X_train.columns


xgb_train = X_train[xgb_features]
xgb_val = X_validation[xgb_features]
xgb_test = X_test[xgb_features]

dmatrix_train_pred = dmatrix_model.predict(xgboost.DMatrix( xgb_train ), ntree_limit = dmatrix_model.best_ntree_limit)
dmatrix_val_pred = dmatrix_model.predict(xgboost.DMatrix( xgb_val ), ntree_limit = dmatrix_model.best_ntree_limit)
dmatrix_test_pred = dmatrix_model.predict(xgboost.DMatrix( xgb_test ), ntree_limit = dmatrix_model.best_ntree_limit)

In [76]:
rmse = np.sqrt(mean_squared_error(Y_train, lightgbm_train_pred ))
print('rmse of train : ' , rmse)
               
rmse = np.sqrt(mean_squared_error(Y_validation, lightgbm_val_pred))
print('rmse of validation : ' , rmse)

rmse of train :  1.3182918897851479
rmse of validation :  1.3381554041488075


# lgb Dataset

In [79]:
params = {
        'nthread': 10,
         'max_depth': 5,
#         'max_depth': 9,
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression_l1',
        'metric': 'rmse', # this is abs(a-e)/max(1,a)
#         'num_leaves': 39,
        'num_leaves': 64,
        'learning_rate': 0.2,
       'feature_fraction': 0.9,
#         'feature_fraction': 0.8108472661400657,
#         'bagging_fraction': 0.9837558288375402,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'lambda_l1': 3.097758978478437,
        'lambda_l2': 2.9482537987198496,
#       'lambda_l1': 0.06,
#       'lambda_l2': 0.1,
        'verbose': 1,
        'min_child_weight': 6.996211413900573,
        'min_split_gain': 0.037310344962162616,
        }
    
lgbdataset_train = lgb.Dataset(X_train,Y_train)
lgbdataset_valid = lgb.Dataset(X_validation,Y_validation)

lgbdataset_model = lgb.train(params, lgbdataset_train, 3000, valid_sets=[lgbdataset_train, lgbdataset_valid],early_stopping_rounds=50, verbose_eval=50)

lgbdataset_pred_train = lgbdataset_model.predict(X_train)
lgbdataset_pred_val = lgbdataset_model.predict(X_validation)

rmse = np.sqrt(mean_squared_error(lgbdataset_pred_train, Y_train))
print('rmse of train : ' , rmse)
               
rmse = np.sqrt(mean_squared_error(lgbdataset_pred_val, Y_validation))
print('rmse of validation : ' , rmse)



Training until validation scores don't improve for 50 rounds.
[50]	training's rmse: 1.44329	valid_1's rmse: 1.29491
[100]	training's rmse: 1.4225	valid_1's rmse: 1.29078
[150]	training's rmse: 1.41764	valid_1's rmse: 1.28997
[200]	training's rmse: 1.40562	valid_1's rmse: 1.28684
[250]	training's rmse: 1.40364	valid_1's rmse: 1.28432
[300]	training's rmse: 1.39368	valid_1's rmse: 1.28786
Early stopping, best iteration is:
[268]	training's rmse: 1.40102	valid_1's rmse: 1.28343
rmse of train :  1.4010200422173864
rmse of validation :  1.2834304122971392


# submission

In [76]:
prediction_df = pd.DataFrame(test['ID'], columns=['ID'])
prediction_df['item_cnt_month'] = lightgbm_test_pred
prediction_df.to_csv(r'C:\Users\pc\Desktop\Data Science Folder\Retail2\submission.csv', index=False)
prediction_df.head(10)

Unnamed: 0,ID,item_cnt_month
0,0,1.16
1,1,0.87
2,2,1.21
3,3,0.04
4,4,0.87
5,5,1.27
6,6,1.06
7,7,1.45
8,8,2.2
9,9,0.87
