In [0]:
##   ~~~ REQUIREMENTS: ~~~

# This notebook was created and run in Google Colab (https://colab.research.google.com).
# Runtime type was set to GPU. 

# This notebook is ready to run. 
# All computation as it is (with loaded models) takes ~ 4 minutes 46 seconds.

In [2]:
## Imports:

import numpy                   as np
import pandas                  as pd

from   itertools               import product
from   time                    import time

!pip install catboost
import catboost
from   catboost                import CatBoostRegressor, Pool

import sklearn
from   sklearn.preprocessing   import LabelEncoder
from   sklearn.model_selection import KFold
from   sklearn.linear_model    import LinearRegression

import joblib
from   google.colab            import files


# Print your packages versions:
print('\nImported packages:')
for p in [np, pd, catboost, sklearn]:
    print (p.__name__, p.__version__)

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/14/14/35b211f9b1c779faba8061b696760f6da743a15d7b215cd6babb211ced0c/catboost-0.18-cp36-none-manylinux1_x86_64.whl (62.9MB)
[K     |████████████████████████████████| 62.9MB 5.6MB/s 
Installing collected packages: catboost
Successfully installed catboost-0.18

Imported packages:
numpy 1.17.3
pandas 0.25.2
catboost 0.18
sklearn 0.21.3


In [0]:
## Packages versions should be:
# numpy 1.17.3
# pandas 0.25.2
# catboost 0.18
# sklearn 0.21.3

In [4]:
# Check if GPU is available:   (should output True and some Warning about Tensorflow upgrade)
import tensorflow as tf
tf.test.is_gpu_available()

True

In [0]:
##                 ~~~~***~~ LOAD DATA ~~***~~~~
##==============================================================================
    
train           = pd.read_csv('https://raw.githubusercontent.com/Ritchizh/PredictFutureSales_notebook/master/Raw_Data/sales_train.csv.gz')
test            = pd.read_csv('https://raw.githubusercontent.com/Ritchizh/PredictFutureSales_notebook/master/Raw_Data/test.csv.gz')
items           = pd.read_csv('https://raw.githubusercontent.com/Ritchizh/PredictFutureSales_notebook/master/Raw_Data/items.csv')
cats            = pd.read_csv('https://raw.githubusercontent.com/Ritchizh/PredictFutureSales_notebook/master/Raw_Data/item_categories.csv')
shops           = pd.read_csv('https://raw.githubusercontent.com/Ritchizh/PredictFutureSales_notebook/master/Raw_Data/shops.csv')
sample_submission = pd.read_csv('https://raw.githubusercontent.com/Ritchizh/PredictFutureSales_notebook/master/Raw_Data/sample_submission.csv.gz')

In [0]:
##              ~~~~***~~ DATA EDA AND PREPROCESSING ~~***~~~~
##==============================================================================
# Remove outliers:

train = train[train.item_price<100000]
train = train[train.item_cnt_day<1001]

##==============================================================================
# Fill negative item price with median value:

median = train[(train.shop_id==32)&(train.item_id==2973)&(train.date_block_num==4)&(train.item_price>0)].item_price.median()
train.loc[train.item_price<0, 'item_price'] = median

##==============================================================================
# Several shops are duplicates of each other.

train.loc[train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57

train.loc[train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58

train.loc[train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

##==============================================================================
# Shops/Cats/Items preprocessing:


# Includes simple feature extraction from text - extracting subtypes from splitted strings.


# Each shop_name starts with the city name. Replace shop_names with city_codes (label encoding).
# Each category contains type and subtype in its name. Replace category_names with cat and subcat codes.


shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
shops = shops[['shop_id','city_code']]

cats['split']        = cats['item_category_name'].str.split('-')
cats['type']         = cats['split'].map(lambda x: x[0].strip())
cats['type_code']    = LabelEncoder().fit_transform(cats['type'])
cats['subtype']      = cats['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
cats['subtype_code'] = LabelEncoder().fit_transform(cats['subtype'])
cats = cats[['item_category_id','type_code', 'subtype_code']]

items.drop(['item_name'], axis=1, inplace=True)

In [0]:
## Reformat to TRAINTEST - to make Train set similar to Test set.
# This involves forming a grid of unique pairs of ('shop_id','item_id) for each month and
# filling zeros for unknown sales.

##==============================================================================

# Add TRAIN pairs to TRAINTEST:

traintest = []
for i in range(34):
    sales = train[train.date_block_num==i]
    traintest.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique()))  ))
# 'itertools.product' is equivalent to nested for-loops. 
# For example, product(A, B) returns the same as ((x,y) for x in A for y in B)

cols   = ['date_block_num','shop_id','item_id']
traintest = pd.DataFrame(np.vstack(traintest), columns=cols)

# Specifying data types to make it more compact:
traintest['date_block_num'] = traintest['date_block_num'].astype(np.int8)
traintest['shop_id']        = traintest['shop_id'].astype(np.int8)
traintest['item_id']        = traintest['item_id'].astype(np.int16)

traintest.sort_values(cols,inplace=True)


# Form aggregated TARGET values with sum[cnt] for each month, shop, item:
group = train.groupby(['date_block_num','shop_id','item_id']).agg({'item_cnt_day': ['sum']})
group.columns = ['item_cnt_month']
group.reset_index(inplace=True)

traintest = pd.merge(traintest, group, on=cols, how='left')
traintest['item_cnt_month'] = (traintest['item_cnt_month']
                                .fillna(0)
                                .clip(0,20)          # clip
                                .astype(np.float16)) # to preserve float format

# Add TEST pairs to TRAINTEST:

test['date_block_num'] = 34
test = test.drop('ID', axis=1)


test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

traintest = pd.concat([traintest, test], ignore_index=True, sort=False, keys=cols)
traintest.fillna(0, inplace=True) # 34 month - here the new item_ids (not present in Train) get 0 sales

In [0]:
# Add Shops/Items/Cats and other features:

traintest = pd.merge(traintest, shops, on=['shop_id'], how='left')
traintest = pd.merge(traintest, items, on=['item_id'], how='left')
traintest = pd.merge(traintest, cats,  on=['item_category_id'], how='left')
traintest['city_code']        = traintest['city_code'].astype(np.int8)
traintest['item_category_id'] = traintest['item_category_id'].astype(np.int8)
traintest['type_code']        = traintest['type_code'].astype(np.int8)
traintest['subtype_code']     = traintest['subtype_code'].astype(np.int8)


# Add the month number as a new feature:
traintest['month'] = traintest['date_block_num'] % 12


# Add number of workdays and holidays in each month as new features:
hol_num  = [14, 8,  11, 8,  12, 11, 8,  10, 8,  8,  12, 8]
work_num = [17, 20, 20, 22, 19, 19, 23, 21, 22, 23, 18, 23]

def func_hols(ser):
  return (hol_num[ser])

def func_works(ser):
  return (work_num[ser])

traintest['month_hols']  = traintest['month'].apply(func_hols)
traintest['month_hols']  = traintest['month_hols'].astype(np.int8)

traintest['month_works'] = traintest['month'].apply(func_works)
traintest['month_works'] = traintest['month_works'].astype(np.int8)

In [0]:
## (This cell takes 3 minutes 34 secs to compute)

##          ~*~ Getting trends via Mean Target encoding:  ~*~

# Take the closest non-zero price, compare it with the average price. 
# If an item costs less than in the past - it is a positive trend, otherwise - negative.


def trend_feature(df, lagstep, col):
    tmp = df[['date_block_num','shop_id','item_id',col]]
    for i in lagstep:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id', col+'_lag_'+str(i)]
        shifted['date_block_num'] += i
        df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df
  

traintest = trend_feature(traintest, [1,2,3,4,5,6], 'item_cnt_month')           # [1,2,3,4,5,6] - are the periods
traintest['item_cnt_month_lag_1'] = traintest['item_cnt_month_lag_1'].fillna(0)
traintest['item_cnt_month_lag_2'] = traintest['item_cnt_month_lag_2'].fillna(0)
traintest['item_cnt_month_lag_3'] = traintest['item_cnt_month_lag_3'].fillna(0)
traintest['item_cnt_month_lag_4'] = traintest['item_cnt_month_lag_4'].fillna(0)
traintest['item_cnt_month_lag_5'] = traintest['item_cnt_month_lag_5'].fillna(0)
traintest['item_cnt_month_lag_6'] = traintest['item_cnt_month_lag_6'].fillna(0)


# Mean monthly cnt for each item:

group = traintest.groupby(['date_block_num', 'item_id']).agg({'item_cnt_month': ['mean']}) # Target encoding!
group.columns = [ 'monthly_byitem_itemcnt_mean' ]
group.reset_index(inplace=True)

traintest = pd.merge(traintest, group, on=['date_block_num','item_id'], how='left')
traintest['monthly_byitem_itemcnt_mean'] = traintest['monthly_byitem_itemcnt_mean'].astype(np.float16)


traintest = trend_feature(traintest, [1,2,3,4,5,6], 'monthly_byitem_itemcnt_mean')

traintest['monthly_byitem_itemcnt_mean_lag_1'] = traintest['monthly_byitem_itemcnt_mean_lag_1'].fillna(0)
traintest['monthly_byitem_itemcnt_mean_lag_2'] = traintest['monthly_byitem_itemcnt_mean_lag_2'].fillna(0)
traintest['monthly_byitem_itemcnt_mean_lag_3'] = traintest['monthly_byitem_itemcnt_mean_lag_3'].fillna(0)
traintest['monthly_byitem_itemcnt_mean_lag_4'] = traintest['monthly_byitem_itemcnt_mean_lag_4'].fillna(0)
traintest['monthly_byitem_itemcnt_mean_lag_5'] = traintest['monthly_byitem_itemcnt_mean_lag_5'].fillna(0)
traintest['monthly_byitem_itemcnt_mean_lag_6'] = traintest['monthly_byitem_itemcnt_mean_lag_6'].fillna(0)

traintest.drop(['monthly_byitem_itemcnt_mean'], axis=1, inplace=True) # not to overfit!


# Mean monthly cnt for each shop:

group = traintest.groupby(['date_block_num', 'shop_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'monthly_byshop_itemcnt_mean' ]
group.reset_index(inplace=True)

traintest = pd.merge(traintest, group, on=['date_block_num','shop_id'], how='left')
traintest['monthly_byshop_itemcnt_mean'] = traintest['monthly_byshop_itemcnt_mean'].astype(np.float16)

traintest = trend_feature(traintest, [1,2,3,4,5,6], 'monthly_byshop_itemcnt_mean')

traintest['monthly_byshop_itemcnt_mean_lag_1'] = traintest['monthly_byshop_itemcnt_mean_lag_1'].fillna(0)
traintest['monthly_byshop_itemcnt_mean_lag_2'] = traintest['monthly_byshop_itemcnt_mean_lag_2'].fillna(0)
traintest['monthly_byshop_itemcnt_mean_lag_3'] = traintest['monthly_byshop_itemcnt_mean_lag_3'].fillna(0)
traintest['monthly_byshop_itemcnt_mean_lag_4'] = traintest['monthly_byshop_itemcnt_mean_lag_4'].fillna(0)
traintest['monthly_byshop_itemcnt_mean_lag_5'] = traintest['monthly_byshop_itemcnt_mean_lag_5'].fillna(0)
traintest['monthly_byshop_itemcnt_mean_lag_6'] = traintest['monthly_byshop_itemcnt_mean_lag_6'].fillna(0)


traintest.drop(['monthly_byshop_itemcnt_mean'], axis=1, inplace=True) # not to overfit!

In [10]:
# Rename to save traintest unchanged (just in case).
# Take a look at the data.

data = traintest
data.tail()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,item_category_id,type_code,subtype_code,month,month_hols,month_works,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_4,item_cnt_month_lag_5,item_cnt_month_lag_6,monthly_byitem_itemcnt_mean_lag_1,monthly_byitem_itemcnt_mean_lag_2,monthly_byitem_itemcnt_mean_lag_3,monthly_byitem_itemcnt_mean_lag_4,monthly_byitem_itemcnt_mean_lag_5,monthly_byitem_itemcnt_mean_lag_6,monthly_byshop_itemcnt_mean_lag_1,monthly_byshop_itemcnt_mean_lag_2,monthly_byshop_itemcnt_mean_lag_3,monthly_byshop_itemcnt_mean_lag_4,monthly_byshop_itemcnt_mean_lag_5,monthly_byshop_itemcnt_mean_lag_6
11127999,34,45,18454,0.0,20,55,13,2,10,12,18,1.0,0.0,0.0,0.0,0.0,0.0,0.045441,0.023254,0.071411,0.279053,0.441895,0.59082,0.126709,0.128784,0.139038,0.126831,0.119446,0.137573
11128000,34,45,16188,0.0,20,64,14,42,10,12,18,0.0,0.0,0.0,0.0,0.0,0.0,0.02272,0.069763,0.0,0.0,0.0,0.0,0.126709,0.128784,0.0,0.0,0.0,0.0
11128001,34,45,15757,0.0,20,55,13,2,10,12,18,0.0,0.0,0.0,0.0,0.0,0.0,0.113647,0.069763,0.095215,0.093018,0.186035,0.25,0.126709,0.128784,0.139038,0.126831,0.119446,0.137573
11128002,34,45,19648,0.0,20,40,11,4,10,12,18,0.0,0.0,0.0,0.0,0.0,0.0,0.045441,0.069763,0.166626,0.046509,0.093018,0.090881,0.126709,0.128784,0.139038,0.126831,0.119446,0.137573
11128003,34,45,969,0.0,20,37,11,1,10,12,18,0.0,0.0,0.0,0.0,0.0,0.0,0.068176,0.116272,0.023804,0.046509,0.046509,0.068176,0.126709,0.128784,0.139038,0.126831,0.119446,0.137573


In [0]:
# Separate X_test (because it won't be used for mean encoding):

X_test = data[data.date_block_num == 34].drop(['date_block_num', 'item_cnt_month'], axis=1) # 'date_block_num' is not a useful feature

data = data[data.date_block_num < 34]

In [12]:

#     ~~~~ *** ~~ K-fold Mean Encoding for the TRAIN set: ~~***~~~~


kf = KFold(n_splits=5, shuffle=False)

# Add mean target values for each item_id, shop_id, item_category_id - as new features:


# (Gives out some Warnings because of slicing, but works anyway)


data['kf_enc_item_id']    = 'NaN'
data['kf_enc_shop_id']    = 'NaN'
data['kf_enc_itemcat_id'] = 'NaN'

global_mean = data['item_cnt_month'].values.mean()

for tr_ind, val_ind in kf.split(data):
    X_tr, X_val = data.iloc[tr_ind], data.iloc[val_ind]    
    data['kf_enc_item_id']    = X_val['item_id'].map(X_tr.groupby('item_id').item_cnt_month.mean())
    data['kf_enc_shop_id']    = X_val['shop_id'].map(X_tr.groupby('shop_id').item_cnt_month.mean())
    data['kf_enc_itemcat_id'] = X_val['item_category_id'].map(X_tr.groupby('item_category_id').item_cnt_month.mean())
    
            
data['kf_enc_item_id'].fillna(global_mean, inplace=True)     # Fill NaNs
data['kf_enc_shop_id'].fillna(global_mean, inplace=True)     # Fill NaNs
data['kf_enc_itemcat_id'].fillna(global_mean, inplace=True)  # Fill NaNs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value

In [0]:
# Map encodings to Test:

kf_enc__item_dict = dict(zip(data.item_id, data.kf_enc_item_id))
X_test['kf_enc_item_id'] = X_test['item_id'].map(kf_enc__item_dict) 
X_test['kf_enc_item_id'].fillna(global_mean, inplace=True)  # Fill NaNs
X_test['kf_enc_item_id'] = X_test['kf_enc_item_id'].astype(np.float16)


kf_enc__shop_dict = dict(zip(data.shop_id, data.kf_enc_shop_id))
X_test['kf_enc_shop_id'] = X_test['shop_id'].map(kf_enc__shop_dict) 
X_test['kf_enc_shop_id'].fillna(global_mean, inplace=True)  # Fill NaNs
X_test['kf_enc_shop_id'] = X_test['kf_enc_shop_id'].astype(np.float16)


kf_enc__cat_dict = dict(zip(data.item_category_id, data.kf_enc_itemcat_id))
X_test['kf_enc_itemcat_id'] = X_test['item_category_id'].map(kf_enc__cat_dict) 
X_test['kf_enc_itemcat_id'].fillna(global_mean, inplace=True)  # Fill NaNs
X_test['kf_enc_itemcat_id'] = X_test['kf_enc_itemcat_id'].astype(np.float16)


In [0]:
# Separate X_train:

Y_train = data['item_cnt_month']
X_train = data.drop(['date_block_num', 'item_cnt_month'], axis=1)

In [15]:
##  ~~ Summarize data before training: ~~

print('\nX_train info:\n')
X_train.info()  


# Data columns (total 30 columns):
# dtypes: float16(21), int16(1), int8(8)
# memory usage: 624.5 MB


X_train info:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10913804 entries, 0 to 10913803
Data columns (total 30 columns):
shop_id                              int8
item_id                              int16
city_code                            int8
item_category_id                     int8
type_code                            int8
subtype_code                         int8
month                                int8
month_hols                           int8
month_works                          int8
item_cnt_month_lag_1                 float16
item_cnt_month_lag_2                 float16
item_cnt_month_lag_3                 float16
item_cnt_month_lag_4                 float16
item_cnt_month_lag_5                 float16
item_cnt_month_lag_6                 float16
monthly_byitem_itemcnt_mean_lag_1    float16
monthly_byitem_itemcnt_mean_lag_2    float16
monthly_byitem_itemcnt_mean_lag_3    float16
monthly_byitem_itemcnt_mean_lag_4    float16
monthly_byitem_itemcnt_mean_lag_5    flo

In [16]:
print('\nX_test info:\n')
X_test.info()

# Data columns (total 30 columns):
# dtypes: float16(21), int16(1), int8(8)
# memory usage: 12.3 MB


X_test info:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214200 entries, 10913804 to 11128003
Data columns (total 30 columns):
shop_id                              214200 non-null int8
item_id                              214200 non-null int16
city_code                            214200 non-null int8
item_category_id                     214200 non-null int8
type_code                            214200 non-null int8
subtype_code                         214200 non-null int8
month                                214200 non-null int8
month_hols                           214200 non-null int8
month_works                          214200 non-null int8
item_cnt_month_lag_1                 214200 non-null float16
item_cnt_month_lag_2                 214200 non-null float16
item_cnt_month_lag_3                 214200 non-null float16
item_cnt_month_lag_4                 214200 non-null float16
item_cnt_month_lag_5                 214200 non-null float16
item_cnt_month_lag_6                 

In [0]:

##                    ~~~~***~~ TRAINING ~~***~~~~
##==============================================================================

## (Models were first validated (Holdout scheme) to choose roughly optimal hyperparameters.
## First-level models validation cells are ommitted in this version of notebook for clarity.)  

In [0]:
##                 ~~~~***~~ MODEL 1 - CATBOOST ~~***~~~~
##==============================================================================

In [0]:
## This cell computes for 7 minutes 5 seconds. 

## Uncomment this cell to train Model_cat from scratch. Otherwise a pretrained model is loaded in the next cell.


# train_pool = Pool(X_train, Y_train)

# ## X_test is already created!

# ##==============================================================================
# # MODEL - Catboost:

# train_pool = Pool(X_train, Y_train)
# Model_cat  = CatBoostRegressor(iterations=1067, 
#                                loss_function='RMSE', 
#                                boosting_type='Ordered',
#                                task_type='GPU',
#                                verbose= True)

# Model_cat.fit(train_pool)

In [19]:
## Load model:

!wget https://raw.githubusercontent.com/Ritchizh/PredictFutureSales_notebook/master/Models/Model_cat.bin

Model_cat = CatBoostRegressor(iterations=1067, 
                      loss_function='RMSE', 
                      boosting_type='Ordered',
                      task_type='GPU',
                      verbose= True)
Model_cat.load_model('Model_cat.bin')

--2019-10-28 09:24:54--  https://raw.githubusercontent.com/Ritchizh/PredictFutureSales_notebook/master/Models/Model_cat.bin
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1137032 (1.1M) [application/octet-stream]
Saving to: ‘Model_cat.bin’


2019-10-28 09:24:55 (15.4 MB/s) - ‘Model_cat.bin’ saved [1137032/1137032]



<catboost.core.CatBoostRegressor at 0x7f59dd2d79e8>

In [0]:
# Predict:

pred_cat = Model_cat.predict(X_test).clip(0, 20)

In [0]:
##            ~~~~***~~ MODEL 2 - Linear Regression ~~***~~~~
##==============================================================================

In [0]:
## This cell computes for 13 seconds.

## Uncomment this cell to train Model_Lr from scratch. Otherwise a pretrained model is loaded in the next cell.


# Model_Lr = LinearRegression(n_jobs=-1)
# Model_Lr.fit(X_train, Y_train)


In [22]:
## Load model:

!wget https://raw.githubusercontent.com/Ritchizh/PredictFutureSales_notebook/master/Models/Model_Lr.pkl
Model_Lr = joblib.load('Model_Lr.pkl') 


--2019-10-28 09:24:56--  https://raw.githubusercontent.com/Ritchizh/PredictFutureSales_notebook/master/Models/Model_Lr.pkl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 794 [application/octet-stream]
Saving to: ‘Model_Lr.pkl’


2019-10-28 09:24:56 (136 MB/s) - ‘Model_Lr.pkl’ saved [794/794]



In [0]:
# Predict:

pred_lr = Model_Lr.predict(X_test).clip(0, 20)

In [0]:
##               ~~~~***~~ ENSEMBLING - Stacking ~~***~~~~
##==============================================================================

## * Stacking is done as in Week 4 Programming assignment.

## For validation scheme f) from the reading material (Week 4) is used. 
## Here, duration T is equal to month and M=15.

In [0]:
# Concatenate test predictions to get test meta-features
X_test_level2 = np.c_[pred_cat, pred_lr]

dates = traintest['date_block_num']

dates_train = dates[dates <  34]
dates_test  = dates[dates == 34]

dates_train_level2 = dates_train[dates_train.isin([27, 28, 29, 30, 31, 32, 33])]

y_train_level2 = Y_train[dates_train.isin([27, 28, 29, 30, 31, 32, 33])]

In [0]:
## This cell computes for 56 minutes 41 seconds.

## Uncomment this cell to compute X_train_level2 from scratch. Otherwise precomputed data is loaded in the next cell.


# X_train_level2 = np.zeros([y_train_level2.shape[0], 2])

# # Now fill `X_train_level2` with metafeatures
# for cur_block_num in [27, 28, 29, 30, 31, 32, 33]:
    
#     print(cur_block_num)
    
    
#     # Split X_train:
#     cur_X_train = X_train.loc[dates_train < cur_block_num]
#     cur_y_train = Y_train.loc[dates_train < cur_block_num]
    
#     cur_X_pred  = X_train.loc[dates_train == cur_block_num]
        
    
#     # -- Model 1 - Catboost --
    
#     model_cat = CatBoostRegressor(iterations=1067, 
#                       loss_function='RMSE', 
#                       boosting_type='Ordered',
#                       task_type='GPU')
    

#     model_cat.fit(cur_X_train, cur_y_train)
#     cur_cat_pred = model_cat.predict(cur_X_pred)
    
  
#     # -- Model 2 - Linear Regression --

#     lr = LinearRegression()
#     lr.fit(cur_X_train.values, cur_y_train)
#     cur_lr_pred = lr.predict(cur_X_pred.values)
    
#     # Store preds:
#     X_train_level2[dates_train_level2 == cur_block_num, 0] = cur_cat_pred
#     X_train_level2[dates_train_level2 == cur_block_num, 1] = cur_lr_pred
    
  

In [0]:
# Load precomputed X_train_level2:

X_train_level2 = pd.read_csv('https://raw.githubusercontent.com/Ritchizh/PredictFutureSales_notebook/master/Models/meta_X_train.csv.gz', compression='gzip', header=None)


In [27]:
##              ~~~~***~~ META MODEL ~~***~~~~

# Linear Regression:

meta_lr = LinearRegression()
meta_lr.fit(X_train_level2, y_train_level2)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [0]:
test_preds = meta_lr.predict(X_test_level2)

In [0]:
##                 ~~~~***~~ SUBMISSION ~~***~~~~
##==============================================================================

submission = pd.DataFrame({"ID":sample_submission["ID"],
                           "item_cnt_month":test_preds})

In [30]:
# SAVE FILE TO PC:

submission.to_csv("Predict_Sales_submit.csv", index=False, header=True) 



time elapsed:  0:04:46


In [0]:
# files.download("Predict_Sales_submit.csv")