In [1]:
import numpy as np 
import pandas as pd 
import random as rnd
from tqdm import tqdm_notebook #gives a progress bar

# visualizatoin
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
plt.style.use('ggplot')

# Models
import catboost
from catboost import Pool
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import xgboost as xgb
from xgboost import plot_importance
from xgboost import XGBRegressor

# Datasets
#traintest = pd.read_csv('../input/predict-future-sales-feature-engineering/Traintest.csv')
#train = pd.read_csv('../input/predict-future-sales-feature-engineering/Train_set.csv')
#val = pd.read_csv('../input/predict-future-sales-feature-engineering/Val_set.csv')
#first_level = pd.read_csv('../input/predict-future-sales-model-validation/Ensemble_pred.csv')

This is code that I copied to downsize the data because the kernel was not commiting the data at its original size. 

In [2]:
from joblib import Parallel, delayed

class Reducer:
    """
    Class that takes a dict of increasingly big numpy datatypes to transform
    the data of a pandas dataframe to in order to save memory usage.
    """
    memory_scale_factor = 1024**2  # memory in MB

    def __init__(self, conv_table=None):
        """
        :param conv_table: dict with np.dtypes-strings as keys
        """
        if conv_table is None:
            self.conversion_table = \
                {'int': [np.int8, np.int16, np.int32, np.int64],
                 'uint': [np.uint8, np.uint16, np.uint32, np.uint64],
                 'float': [ np.float32, ]}
        else:
            self.conversion_table = conv_table

    def _type_candidates(self, k):
        for c in self.conversion_table[k]:
            i = np.iinfo(c) if 'int' in k else np.finfo(c)
            yield c, i

    def reduce(self, df, verbose=False):
        """Takes a dataframe and returns it with all data transformed to the
        smallest necessary types.

        :param df: pandas dataframe
        :param verbose: If True, outputs more information
        :return: pandas dataframe with reduced data types
        """
        ret_list = Parallel(n_jobs=-1)(delayed(self._reduce)
                                                (df[c], c, verbose) for c in
                                                df.columns)

        return pd.concat(ret_list, axis=1)

    def _reduce(self, s, colname, verbose):

        # skip NaNs
        if s.isnull().any():
            if verbose:
                print(colname, 'has NaNs - Skip..')
            return s

        # detect kind of type
        coltype = s.dtype
        if np.issubdtype(coltype, np.integer):
            conv_key = 'int' if s.min() < 0 else 'uint'
        elif np.issubdtype(coltype, np.floating):
            conv_key = 'float'
        else:
            if verbose:
                print(colname, 'is', coltype, '- Skip..')
            print(colname, 'is', coltype, '- Skip..')
            return s

        # find right candidate
        for cand, cand_info in self._type_candidates(conv_key):
            if s.max() <= cand_info.max and s.min() >= cand_info.min:

                if verbose:
                    print('convert', colname, 'to', str(cand))
                return s.astype(cand)

        # reaching this code is bad. Probably there are inf, or other high numbs
        print(("WARNING: {} " 
               "doesn't fit the grid with \nmax: {} "
               "and \nmin: {}").format(colname, s.max(), s.min()))
        print('Dropping it..')

In [3]:
def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    reducer = Reducer()
    df = reducer.reduce(df)
    return df

In [4]:
traintest = import_data('../input/predict-future-sales-feature-engineering/Traintest.csv')
train = import_data('../input/predict-future-sales-feature-engineering/Train_set.csv')
val = import_data('../input/predict-future-sales-feature-engineering/Val_set.csv')
first_level = import_data('../input/predict-future-sales-model-validation/Ensemble_pred.csv')

### Prepare Train and Validation Data

We need to prepare this because we use this data to train the boosting models

In [5]:
train.head()

Unnamed: 0,Month,date_block_num,item_category_id,item_cnt_month,item_id,item_price,m_num,shop_id,Revenue,year,...,prev_month_itm_cnt,6_mnth_mvg_avg,12_mnth_mvg_avg,shop_mean,item_mean,shop_item_mean,year_mean,month_mean,item_price_mean,shop_item_price_mean
0,Jan,0,2.0,9.0,5572,10730.0,1.0,2,96570.0,2013,...,0.0,9.0,9.0,0.11552,1.136364,1.060606,0.154522,0.208116,1590.135864,1505.449951
1,Jan,0,2.0,1.0,5643,2390.0,1.0,2,2390.0,2013,...,0.0,1.0,1.0,0.11552,1.785714,1.424242,0.154522,0.208116,4042.562744,3299.081787
2,Jan,0,5.0,2.0,5583,1188.300049,1.0,2,2376.600098,2013,...,0.0,2.0,2.0,0.11552,0.547619,0.242424,0.154522,0.208116,364.164062,163.100006
3,Jan,0,6.0,3.0,7893,5970.0,1.0,2,17910.0,2013,...,0.0,3.0,3.0,0.11552,2.975469,2.909091,0.154522,0.208116,5166.950195,5220.36377
4,Jan,0,6.0,1.0,7894,1490.0,1.0,2,1490.0,2013,...,0.0,1.0,1.0,0.11552,3.562771,1.484848,0.154522,0.208116,4874.250977,2264.818115


In [6]:
train.dtypes

Month                    object
date_block_num            uint8
item_category_id        float32
item_cnt_month          float32
item_id                  uint16
item_price              float32
m_num                   float32
shop_id                   uint8
Revenue                 float32
year                     uint16
max_item_price          float32
shop_max_price          float32
max_item_cnt            float32
shop_max_cnt            float32
prev_month_itm_cnt      float32
6_mnth_mvg_avg          float32
12_mnth_mvg_avg         float32
shop_mean               float32
item_mean               float32
shop_item_mean          float32
year_mean               float32
month_mean              float32
item_price_mean         float32
shop_item_price_mean    float32
dtype: object

In [7]:
val.drop(['Unnamed: 0'], axis=1, inplace=True)

In [8]:
val.head()

Unnamed: 0,Month,date_block_num,item_category_id,item_cnt_month,item_id,item_price,m_num,shop_id,Revenue,year,...,prev_month_itm_cnt,6_mnth_mvg_avg,12_mnth_mvg_avg,shop_mean,item_mean,shop_item_mean,year_mean,month_mean,item_price_mean,shop_item_price_mean
0,Oct,33,0.0,0.0,5572,0.0,10.0,2,0.0,2015,...,0.0,0.333333,0.416667,0.11552,1.136364,1.060606,0.272309,0.218626,1590.135864,1505.449951
1,Oct,33,0.0,0.0,5643,0.0,10.0,2,0.0,2015,...,0.0,0.0,0.833333,0.11552,1.785714,1.424242,0.272309,0.218626,4042.562744,3299.081787
2,Oct,33,0.0,0.0,5583,0.0,10.0,2,0.0,2015,...,0.0,0.666667,0.416667,0.11552,0.547619,0.242424,0.272309,0.218626,364.164062,163.100006
3,Oct,33,0.0,0.0,7893,0.0,10.0,2,0.0,2015,...,1.0,1.166667,1.666667,0.11552,2.975469,2.909091,0.272309,0.218626,5166.950195,5220.36377
4,Oct,33,6.0,4.0,7894,9160.0,10.0,2,36640.0,2015,...,1.0,1.333333,1.333333,0.11552,3.562771,1.484848,0.272309,0.218626,4874.250977,2264.818115


In [9]:
# Drop features that can't be used in models
train.drop(['Month', 'item_category_id', 'item_price', 'Revenue'], axis=1, inplace=True)
val.drop(['Month', 'item_category_id', 'item_price', 'Revenue'], axis=1, inplace=True)

In [10]:
train.head()

Unnamed: 0,date_block_num,item_cnt_month,item_id,m_num,shop_id,year,max_item_price,shop_max_price,max_item_cnt,shop_max_cnt,prev_month_itm_cnt,6_mnth_mvg_avg,12_mnth_mvg_avg,shop_mean,item_mean,shop_item_mean,year_mean,month_mean,item_price_mean,shop_item_price_mean
0,0,9.0,5572,1.0,2,2013,18979.5,274298.5625,17.0,96.0,0.0,9.0,9.0,0.11552,1.136364,1.060606,0.154522,0.208116,1590.135864,1505.449951
1,0,1.0,5643,1.0,2,2013,35260.0,274298.5625,23.0,96.0,0.0,1.0,1.0,0.11552,1.785714,1.424242,0.154522,0.208116,4042.562744,3299.081787
2,0,2.0,5583,1.0,2,2013,5592.0,274298.5625,9.0,96.0,0.0,2.0,2.0,0.11552,0.547619,0.242424,0.154522,0.208116,364.164062,163.100006
3,0,3.0,7893,1.0,2,2013,42630.0,274298.5625,38.0,96.0,0.0,3.0,3.0,0.11552,2.975469,2.909091,0.154522,0.208116,5166.950195,5220.36377
4,0,1.0,7894,1.0,2,2013,31290.0,274298.5625,37.0,96.0,0.0,1.0,1.0,0.11552,3.562771,1.484848,0.154522,0.208116,4874.250977,2264.818115


### Prepare Full Train and Test Data

In [11]:
print(traintest.shape)
traintest.head()

(6948502, 17)


Unnamed: 0,Month,date_block_num,item_category_id,item_cnt_month,item_id,item_price,m_num,shop_id,Revenue,year,max_item_price,shop_max_price,max_item_cnt,shop_max_cnt,prev_month_itm_cnt,6_mnth_mvg_avg,12_mnth_mvg_avg
0,Jan,0,2.0,9.0,5572,10730.0,1.0,2,96570.0,2013,18979.5,274298.5625,17.0,96.0,0.0,9.0,9.0
1,Jan,0,2.0,1.0,5643,2390.0,1.0,2,2390.0,2013,35260.0,274298.5625,23.0,96.0,0.0,1.0,1.0
2,Jan,0,5.0,2.0,5583,1188.3,1.0,2,2376.6,2013,5592.0,274298.5625,9.0,96.0,0.0,2.0,2.0
3,Jan,0,6.0,3.0,7893,5970.0,1.0,2,17910.0,2013,42630.0,274298.5625,38.0,96.0,0.0,3.0,3.0
4,Jan,0,6.0,1.0,7894,1490.0,1.0,2,1490.0,2013,31290.0,274298.5625,37.0,96.0,0.0,1.0,1.0


In [12]:
# Split traintest into train and test data
# drop month
traintest.drop('Month', axis=1,inplace=True)
train_final = traintest.query('date_block_num >= 0 and date_block_num < 34').copy()
test_final = traintest.query('date_block_num == 34').copy()
train_final.head()

Unnamed: 0,date_block_num,item_category_id,item_cnt_month,item_id,item_price,m_num,shop_id,Revenue,year,max_item_price,shop_max_price,max_item_cnt,shop_max_cnt,prev_month_itm_cnt,6_mnth_mvg_avg,12_mnth_mvg_avg
0,0,2.0,9.0,5572,10730.0,1.0,2,96570.0,2013,18979.5,274298.5625,17.0,96.0,0.0,9.0,9.0
1,0,2.0,1.0,5643,2390.0,1.0,2,2390.0,2013,35260.0,274298.5625,23.0,96.0,0.0,1.0,1.0
2,0,5.0,2.0,5583,1188.3,1.0,2,2376.6,2013,5592.0,274298.5625,9.0,96.0,0.0,2.0,2.0
3,0,6.0,3.0,7893,5970.0,1.0,2,17910.0,2013,42630.0,274298.5625,38.0,96.0,0.0,3.0,3.0
4,0,6.0,1.0,7894,1490.0,1.0,2,1490.0,2013,31290.0,274298.5625,37.0,96.0,0.0,1.0,1.0


In [13]:
test_final.head()

Unnamed: 0,date_block_num,item_category_id,item_cnt_month,item_id,item_price,m_num,shop_id,Revenue,year,max_item_price,shop_max_price,max_item_cnt,shop_max_cnt,prev_month_itm_cnt,6_mnth_mvg_avg,12_mnth_mvg_avg
6734302,34,,,5037,,11.0,5,,2015,25990.0,217310.0,23.0,155.0,0.0,1.2,1.0
6734303,34,,,5320,,11.0,5,,2015,6296.064453,217310.0,10.016115,155.0,0.0,0.0,0.0
6734304,34,,,5233,,11.0,5,,2015,7191.75,217310.0,10.0,155.0,1.0,1.4,0.909091
6734305,34,,,5232,,11.0,5,,2015,4796.0,217310.0,6.0,155.0,0.0,0.2,0.090909
6734306,34,,,5268,,11.0,5,,2015,6296.064453,217310.0,10.016115,155.0,0.0,0.0,0.0


### Mean encoding entire training set

In [14]:
# Mean encoding
# Use mean encoding to create shop mean
shop_mean = train_final.groupby('shop_id').item_cnt_month.mean()
train_final['shop_mean'] = train_final['shop_id'].map(shop_mean)
test_final['shop_mean'] = test_final['shop_id'].map(shop_mean)

# item mean
item_mean = train_final.groupby('item_id').item_cnt_month.mean()
train_final['item_mean'] = train_final['item_id'].map(item_mean)
test_final['item_mean'] = test_final['item_id'].map(item_mean)

# shop/item mean 
# had to use different format
shop_item_mean = train_final.groupby(['shop_id', 'item_id']).agg({'item_cnt_month': ['mean']})
shop_item_mean.columns = ['shop_item_mean']
shop_item_mean.reset_index(inplace=True)
train_final = pd.merge(train_final, shop_item_mean, on =['shop_id', 'item_id'], how='left')
test_final = pd.merge(test_final, shop_item_mean, on=['shop_id', 'item_id'], how='left')

# year mean
year_mean = train_final.groupby('year').item_cnt_month.mean()
train_final['year_mean'] = train_final['year'].map(year_mean)
test_final['year_mean'] = test_final['year'].map(year_mean)
# month mean
month_mean = train_final.groupby('m_num').item_cnt_month.mean()
train_final['month_mean'] = train_final['m_num'].map(month_mean)
test_final['month_mean'] = test_final['m_num'].map(month_mean)

# item price mean
item_price_mean = train_final.groupby('item_id').item_price.mean()
train_final['item_price_mean'] = train_final['item_id'].map(item_price_mean)
test_final['item_price_mean'] = test_final['item_id'].map(item_price_mean)

# shop_item_price_mean
shop_item_price_mean = train_final.groupby(['shop_id', 'item_id']).agg({'item_price': ['mean']})
shop_item_price_mean.columns = ['shop_item_price_mean']
shop_item_price_mean.reset_index(inplace=True)
train_final = pd.merge(train_final, shop_item_price_mean, on =['shop_id', 'item_id'], how='left')
test_final = pd.merge(test_final, shop_item_price_mean, on=['shop_id', 'item_id'], how='left')


In [15]:
shop_mean.head(30)

shop_id
2     0.116581
3     0.103534
4     0.148662
5     0.151424
6     0.305021
7     0.230773
10    0.081300
12    0.191970
14    0.158797
15    0.227798
16    0.196980
18    0.204852
19    0.237521
21    0.229628
22    0.194518
24    0.218220
25    0.663687
26    0.207272
28    0.542839
31    0.859844
34    0.032948
35    0.235955
36    0.002039
37    0.156152
38    0.193721
39    0.076492
41    0.155160
42    0.428628
44    0.133594
45    0.132446
Name: item_cnt_month, dtype: float64

In [16]:
train_final.head()

Unnamed: 0,date_block_num,item_category_id,item_cnt_month,item_id,item_price,m_num,shop_id,Revenue,year,max_item_price,...,prev_month_itm_cnt,6_mnth_mvg_avg,12_mnth_mvg_avg,shop_mean,item_mean,shop_item_mean,year_mean,month_mean,item_price_mean,shop_item_price_mean
0,0,2.0,9.0,5572,10730.0,1.0,2,96570.0,2013,18979.5,...,0.0,9.0,9.0,0.116581,1.105042,1.029412,0.154522,0.208116,1546.784559,1461.172059
1,0,2.0,1.0,5643,2390.0,1.0,2,2390.0,2013,35260.0,...,0.0,1.0,1.0,0.116581,1.752801,1.382353,0.154522,0.208116,3988.173716,3202.05
2,0,5.0,2.0,5583,1188.3,1.0,2,2376.6,2013,5592.0,...,0.0,2.0,2.0,0.116581,0.546919,0.235294,0.154522,0.208116,364.222269,158.302941
3,0,6.0,3.0,7893,5970.0,1.0,2,17910.0,2013,42630.0,...,0.0,3.0,3.0,0.116581,2.934874,2.823529,0.154522,0.208116,5142.180392,5066.823529
4,0,6.0,1.0,7894,1490.0,1.0,2,1490.0,2013,31290.0,...,0.0,1.0,1.0,0.116581,3.523109,1.558824,0.154522,0.208116,4867.039704,2467.617647


In [17]:
test_final.head()

Unnamed: 0,date_block_num,item_category_id,item_cnt_month,item_id,item_price,m_num,shop_id,Revenue,year,max_item_price,...,prev_month_itm_cnt,6_mnth_mvg_avg,12_mnth_mvg_avg,shop_mean,item_mean,shop_item_mean,year_mean,month_mean,item_price_mean,shop_item_price_mean
0,34,,,5037,,11.0,5,,2015,25990.0,...,0.0,1.2,1.0,0.151424,0.80112,0.382353,0.27422,0.255791,1275.788127,624.647059
1,34,,,5320,,11.0,5,,2015,6296.064453,...,0.0,0.0,0.0,0.151424,,,0.27422,0.255791,,
2,34,,,5233,,11.0,5,,2015,7191.75,...,1.0,1.4,0.909091,0.151424,0.341737,0.294118,0.27422,0.255791,244.425718,229.147059
3,34,,,5232,,11.0,5,,2015,4796.0,...,0.0,0.2,0.090909,0.151424,0.098739,0.029412,0.27422,0.255791,70.51722,17.617647
4,34,,,5268,,11.0,5,,2015,6296.064453,...,0.0,0.0,0.0,0.151424,,,0.27422,0.255791,,


### Fill missing values with average value of those features

In [18]:
# fill in item_mean, shop_item_mean, and shop_item_price_mean with average value of those columns
train_final['item_mean'].fillna((train_final['item_mean'].mean()), inplace=True)
test_final['item_mean'].fillna((train_final['item_mean'].mean()), inplace=True)

train_final['shop_item_mean'].fillna((train_final['shop_item_mean'].mean()), inplace=True)
test_final['shop_item_mean'].fillna((train_final['shop_item_mean'].mean()), inplace=True)

train_final['item_price_mean'].fillna((train_final['item_price_mean'].mean()), inplace=True)
test_final['item_price_mean'].fillna((train_final['item_price_mean'].mean()), inplace=True)


train_final['shop_item_price_mean'].fillna((train_final['shop_item_price_mean'].mean()), inplace=True)
test_final['shop_item_price_mean'].fillna((train_final['shop_item_price_mean'].mean()), inplace=True)

In [19]:
test_final.head()

Unnamed: 0,date_block_num,item_category_id,item_cnt_month,item_id,item_price,m_num,shop_id,Revenue,year,max_item_price,...,prev_month_itm_cnt,6_mnth_mvg_avg,12_mnth_mvg_avg,shop_mean,item_mean,shop_item_mean,year_mean,month_mean,item_price_mean,shop_item_price_mean
0,34,,,5037,,11.0,5,,2015,25990.0,...,0.0,1.2,1.0,0.151424,0.80112,0.382353,0.27422,0.255791,1275.788127,624.647059
1,34,,,5320,,11.0,5,,2015,6296.064453,...,0.0,0.0,0.0,0.151424,0.222416,0.222416,0.27422,0.255791,185.125394,185.125394
2,34,,,5233,,11.0,5,,2015,7191.75,...,1.0,1.4,0.909091,0.151424,0.341737,0.294118,0.27422,0.255791,244.425718,229.147059
3,34,,,5232,,11.0,5,,2015,4796.0,...,0.0,0.2,0.090909,0.151424,0.098739,0.029412,0.27422,0.255791,70.51722,17.617647
4,34,,,5268,,11.0,5,,2015,6296.064453,...,0.0,0.0,0.0,0.151424,0.222416,0.222416,0.27422,0.255791,185.125394,185.125394


In [20]:
# Drop features that we can't use in test set
train_final.drop(['item_category_id','item_price', 'Revenue'], axis=1, inplace=True)
test_final.drop(['item_category_id', 'item_price', 'Revenue'], axis=1, inplace=True)
test_final.head()

Unnamed: 0,date_block_num,item_cnt_month,item_id,m_num,shop_id,year,max_item_price,shop_max_price,max_item_cnt,shop_max_cnt,prev_month_itm_cnt,6_mnth_mvg_avg,12_mnth_mvg_avg,shop_mean,item_mean,shop_item_mean,year_mean,month_mean,item_price_mean,shop_item_price_mean
0,34,,5037,11.0,5,2015,25990.0,217310.0,23.0,155.0,0.0,1.2,1.0,0.151424,0.80112,0.382353,0.27422,0.255791,1275.788127,624.647059
1,34,,5320,11.0,5,2015,6296.064453,217310.0,10.016115,155.0,0.0,0.0,0.0,0.151424,0.222416,0.222416,0.27422,0.255791,185.125394,185.125394
2,34,,5233,11.0,5,2015,7191.75,217310.0,10.0,155.0,1.0,1.4,0.909091,0.151424,0.341737,0.294118,0.27422,0.255791,244.425718,229.147059
3,34,,5232,11.0,5,2015,4796.0,217310.0,6.0,155.0,0.0,0.2,0.090909,0.151424,0.098739,0.029412,0.27422,0.255791,70.51722,17.617647
4,34,,5268,11.0,5,2015,6296.064453,217310.0,10.016115,155.0,0.0,0.0,0.0,0.151424,0.222416,0.222416,0.27422,0.255791,185.125394,185.125394


In [21]:
train_final = train_final.query('date_block_num > 5').copy()
train_final.head()

Unnamed: 0,date_block_num,item_cnt_month,item_id,m_num,shop_id,year,max_item_price,shop_max_price,max_item_cnt,shop_max_cnt,prev_month_itm_cnt,6_mnth_mvg_avg,12_mnth_mvg_avg,shop_mean,item_mean,shop_item_mean,year_mean,month_mean,item_price_mean,shop_item_price_mean
1188421,6,1.0,5572,7.0,2,2013,18979.5,274298.5625,17.0,96.0,2.0,1.5,2.571429,0.116581,1.105042,1.029412,0.154522,0.198165,1546.784559,1461.172059
1188422,6,3.0,5643,7.0,2,2013,35260.0,274298.5625,23.0,96.0,2.0,1.666667,1.571429,0.116581,1.752801,1.382353,0.154522,0.198165,3988.173716,3202.05
1188423,6,0.0,5583,7.0,2,2013,5592.0,274298.5625,9.0,96.0,0.0,0.166667,0.428571,0.116581,0.546919,0.235294,0.154522,0.198165,364.222269,158.302941
1188424,6,6.0,7893,7.0,2,2013,42630.0,274298.5625,38.0,96.0,3.0,3.0,3.0,0.116581,2.934874,2.823529,0.154522,0.198165,5142.180392,5066.823529
1188425,6,1.0,7894,7.0,2,2013,31290.0,274298.5625,37.0,96.0,1.0,0.833333,0.857143,0.116581,3.523109,1.558824,0.154522,0.198165,4867.039704,2467.617647


## Train Test Split

In [22]:
# prepare train and test data
X_train = train_final.drop(['item_cnt_month', 'date_block_num'], axis=1)
y_train = train_final['item_cnt_month']
#val
X_test = test_final.drop(['item_cnt_month', 'date_block_num'], axis=1)

In [23]:
X_test.head()

Unnamed: 0,item_id,m_num,shop_id,year,max_item_price,shop_max_price,max_item_cnt,shop_max_cnt,prev_month_itm_cnt,6_mnth_mvg_avg,12_mnth_mvg_avg,shop_mean,item_mean,shop_item_mean,year_mean,month_mean,item_price_mean,shop_item_price_mean
0,5037,11.0,5,2015,25990.0,217310.0,23.0,155.0,0.0,1.2,1.0,0.151424,0.80112,0.382353,0.27422,0.255791,1275.788127,624.647059
1,5320,11.0,5,2015,6296.064453,217310.0,10.016115,155.0,0.0,0.0,0.0,0.151424,0.222416,0.222416,0.27422,0.255791,185.125394,185.125394
2,5233,11.0,5,2015,7191.75,217310.0,10.0,155.0,1.0,1.4,0.909091,0.151424,0.341737,0.294118,0.27422,0.255791,244.425718,229.147059
3,5232,11.0,5,2015,4796.0,217310.0,6.0,155.0,0.0,0.2,0.090909,0.151424,0.098739,0.029412,0.27422,0.255791,70.51722,17.617647
4,5268,11.0,5,2015,6296.064453,217310.0,10.016115,155.0,0.0,0.0,0.0,0.151424,0.222416,0.222416,0.27422,0.255791,185.125394,185.125394


# Models

### Boosting Models Split

In [24]:
# drop train months
train = train.query('date_block_num >= 5').copy()
train.head()

Unnamed: 0,date_block_num,item_cnt_month,item_id,m_num,shop_id,year,max_item_price,shop_max_price,max_item_cnt,shop_max_cnt,prev_month_itm_cnt,6_mnth_mvg_avg,12_mnth_mvg_avg,shop_mean,item_mean,shop_item_mean,year_mean,month_mean,item_price_mean,shop_item_price_mean
990357,5,2.0,5572,6.0,2,2013,18979.5,274298.5625,17.0,96.0,2.0,2.833333,2.833333,0.11552,1.136364,1.060606,0.154522,0.198324,1590.135864,1505.449951
990358,5,2.0,5643,6.0,2,2013,35260.0,274298.5625,23.0,96.0,5.0,1.333333,1.333333,0.11552,1.785714,1.424242,0.154522,0.198324,4042.562744,3299.081787
990359,5,0.0,5583,6.0,2,2013,5592.0,274298.5625,9.0,96.0,0.0,0.5,0.5,0.11552,0.547619,0.242424,0.154522,0.198324,364.164062,163.100006
990360,5,3.0,7893,6.0,2,2013,42630.0,274298.5625,38.0,96.0,1.0,2.5,2.5,0.11552,2.975469,2.909091,0.154522,0.198324,5166.950195,5220.36377
990361,5,1.0,7894,6.0,2,2013,31290.0,274298.5625,37.0,96.0,0.0,0.833333,0.833333,0.11552,3.562771,1.484848,0.154522,0.198324,4874.250977,2264.818115


In [25]:
# just used by xgboost and catboost
bx_train = train.drop(['date_block_num', 'item_cnt_month'], axis=1)
by_train = train['item_cnt_month']
bx_val = val.drop(['date_block_num', 'item_cnt_month'], axis=1)
by_val = val['item_cnt_month']

#### XGBoost

In [26]:
# XGB
my_model = XGBRegressor(n_estimators=150, max_depth = 8,
                        eta = 0.2, seed = 0, min_child_weight = 1000, 
                        subsample = 0.7, colsample_bytree = 0.7)

my_model.fit(bx_train, by_train, 
             early_stopping_rounds = 10,
             eval_set = [(bx_val, by_val)],
             eval_metric = 'rmse',
             verbose=20)

xgb_test = my_model.predict(X_test)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[0]	validation_0-rmse:1.52852
Will train until validation_0-rmse hasn't improved in 10 rounds.
[20]	validation_0-rmse:1.0526
[40]	validation_0-rmse:1.03881
Stopping. Best iteration:
[32]	validation_0-rmse:1.03481



#### CatBoost

In [27]:
# Catboost
cat_model = CatBoostRegressor(n_estimators=300, depth=8, verbose=40, random_seed=0)
cat_model.fit(bx_train, by_train, eval_set = (bx_val, by_val))
cat_test = cat_model.predict(X_test)

0:	learn: 1.8563789	test: 1.5624893	best: 1.5624893 (0)	total: 977ms	remaining: 4m 52s
40:	learn: 1.1275670	test: 1.0156125	best: 1.0156125 (40)	total: 32.1s	remaining: 3m 23s
80:	learn: 0.9799085	test: 0.9632318	best: 0.9622846 (78)	total: 1m 3s	remaining: 2m 52s
120:	learn: 0.9326561	test: 0.9554245	best: 0.9550891 (114)	total: 1m 36s	remaining: 2m 22s
160:	learn: 0.9031421	test: 0.9492998	best: 0.9491121 (158)	total: 2m 9s	remaining: 1m 51s
200:	learn: 0.8808784	test: 0.9442076	best: 0.9442076 (200)	total: 2m 41s	remaining: 1m 19s
240:	learn: 0.8618967	test: 0.9449711	best: 0.9433764 (205)	total: 3m 14s	remaining: 47.6s
280:	learn: 0.8477860	test: 0.9444742	best: 0.9433764 (205)	total: 3m 47s	remaining: 15.4s
299:	learn: 0.8402862	test: 0.9410344	best: 0.9404865 (292)	total: 4m 3s	remaining: 0us

bestTest = 0.9404865465
bestIteration = 292

Shrink model to first 293 iterations.


#### Random Forest

In [28]:
# Random Forest
n_est = 100
rf_model = RandomForestRegressor(n_estimators=n_est, max_depth=5, random_state=0)
rf_model.fit(X_train, y_train)
rf_test = rf_model.predict(X_test)

#### KNN

In [29]:
knn_model = KNeighborsRegressor(n_neighbors = 8)
knn_model.fit(X_train, y_train)
knn_test = knn_model.predict(X_test)

#### Linear Regression

In [30]:
# Scaling
LR_scaler = MinMaxScaler()
LR_scaler.fit(X_train)
LR_train = LR_scaler.transform(X_train)
LR_test = LR_scaler.transform(X_test)

In [31]:
# Linear Regression
lr_model = LinearRegression(n_jobs=-1)
lr_model.fit(LR_train, y_train)
lr_test = lr_model.predict(LR_test)

# Ensemble

### Create New Dataset with Predictions from Models

* This will be a simple ensembling technique. We will use the predictions from the above models as the 1st level to feed into the second level, which will ensemble them.
* We make each models outputs (predictions) a feature
* Then we will use a linear regression model to get predictions from the first level models

In [32]:
# This is the first_level predictions from the Models notebook
print(first_level.shape)
first_level.head(10)

(198068, 6)


Unnamed: 0,xgb,catboost,random_forest,knn,linear_regression,label
0,0.27659,0.471972,0.528588,0.428571,0.603149,0.0
1,0.0,0.014001,0.017761,0.714286,0.306099,0.0
2,0.761488,0.796042,0.537921,0.142857,0.536396,0.0
3,0.718346,1.250241,1.251059,3.0,1.762702,0.0
4,0.967628,1.294118,1.251059,2.285714,1.467538,4.0
5,1.185085,1.399071,1.251059,1.142857,1.789657,3.0
6,0.168875,0.284169,0.022278,0.714286,0.407693,0.0
7,0.328686,0.226562,0.022278,0.0,0.181348,0.0
8,0.128178,0.313981,0.528588,1.0,0.797558,0.0
9,0.625365,0.843892,1.156165,2.571429,1.356138,0.0


In [33]:
# The outputs from the models will be the training data for this new model
first_level_test = pd.DataFrame(xgb_test, columns=['xgb'])
first_level_test['catboost'] = cat_test
first_level_test['random_forest'] = rf_test
first_level_test['linear_regression'] = lr_test
first_level_test['knn'] = knn_test
# This validation is if we want to test the model; we don't need to test linear regression
#first_level['label'] = Y_validation.values
first_level_test.head(20)

Unnamed: 0,xgb,catboost,random_forest,linear_regression,knn
0,1.125357,1.399866,1.128799,0.861908,0.5
1,0.022567,0.046292,0.0,0.14796,0.125
2,1.18511,1.303754,1.552275,1.206797,0.5
3,0.315608,0.222836,0.219909,0.211959,0.0
4,0.022567,0.046292,0.0,0.147956,0.125
5,0.252675,0.324288,0.407494,0.398147,0.875
6,1.088691,1.099822,1.059297,1.17369,0.625
7,0.12487,0.217286,0.219909,0.073001,0.75
8,1.925328,1.973668,2.836224,0.771746,2.125
9,0.013818,0.053165,0.0,0.082505,0.0


In [34]:
stack_model = LinearRegression(n_jobs=-1)

In [35]:
# drop label column
first_level.drop('label', axis=1, inplace=True)

For some reason I'm having to drop the label column before putting in EX_train

In [36]:
EX_train = first_level
EY_train = by_val

In [37]:
EX_train.head()

Unnamed: 0,xgb,catboost,random_forest,knn,linear_regression
0,0.27659,0.471972,0.528588,0.428571,0.603149
1,0.0,0.014001,0.017761,0.714286,0.306099
2,0.761488,0.796042,0.537921,0.142857,0.536396
3,0.718346,1.250241,1.251059,3.0,1.762702
4,0.967628,1.294118,1.251059,2.285714,1.467538


In [38]:
stack_model.fit(EX_train, EY_train)

Ensemble_Pred = stack_model.predict(first_level_test)

In [39]:
preds = list(map(lambda x: min(20,max(x,0)), list(Ensemble_Pred)))
ensemble_df = pd.DataFrame({'ID':test_final.index,'item_cnt_month': preds })
ensemble_df.head()

Unnamed: 0,ID,item_cnt_month
0,0,1.789209
1,1,0.0
2,2,1.718037
3,3,0.279751
4,4,0.0


In [40]:
ensemble_df.to_csv("PFS_Submission.csv", index=False)