# Capstone Two: Modeling

In this step of my capstone, I am going to be assessing different models (and tuning hyperparamteters) in order to find the best model I can. 

1. Import training and testing files and libraries. 
2. Use the last month's sales as a benchmark. 
3. Machine learning

### 1. Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import os
from library.sb_utils import save_file

In [2]:
training = pd.read_csv('./data/training_data_feature_engineered.csv')
testing = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv("./data/sample_submission.csv")

In [3]:
testing.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [4]:
training.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id,datetime,April,August,...,June,March,May,November,October,September,2013,2014,2015,holiday
0,2013-01-02,0,59,22154,999.0,1.0,37,2013-01-02,0,0,...,0,0,0,0,0,0,1,0,0,1
1,2013-01-03,0,25,2552,899.0,1.0,58,2013-01-03,0,0,...,0,0,0,0,0,0,1,0,0,1
2,2013-01-05,0,25,2552,899.0,-1.0,58,2013-01-05,0,0,...,0,0,0,0,0,0,1,0,0,1
3,2013-01-06,0,25,2554,1709.05,1.0,58,2013-01-06,0,0,...,0,0,0,0,0,0,1,0,0,1
4,2013-01-15,0,25,2555,1099.0,1.0,56,2013-01-15,0,0,...,0,0,0,0,0,0,1,0,0,0


In [5]:
# I want to group by monthly sales, since our final prediction will be for the enitre month (Nov 2015). 
months_years =['April','August','December','February','January',\
               'July','June','March','May','November','October','September', '2013', '2014', '2015']
grouped = training.groupby(['item_id','shop_id','item_category_id','date_block_num'])
agg = grouped.agg({'item_cnt_day':'sum', 'item_price':'mean','holiday':'sum'}).reset_index()
agg = agg.rename(columns = {'item_cnt_day' : 'item_cnt_month', 'item_price':'item_month_avg_price','holiday':'num_holidays'})
agg.head()

Unnamed: 0,item_id,shop_id,item_category_id,date_block_num,item_cnt_month,item_month_avg_price,num_holidays
0,0,54,40,20,1.0,58.0,0
1,1,55,76,15,2.0,4490.0,0
2,1,55,76,18,1.0,4490.0,0
3,1,55,76,19,1.0,4490.0,0
4,1,55,76,20,1.0,4490.0,0


In [6]:
training = training.merge(agg, on=["shop_id", "item_id","date_block_num",'item_category_id'], how='left')

In [7]:
training = training.drop(['date','datetime'], axis=1)

In [8]:
training.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id,April,August,December,February,...,November,October,September,2013,2014,2015,holiday,item_cnt_month,item_month_avg_price,num_holidays
0,0,59,22154,999.0,1.0,37,0,0,0,0,...,0,0,0,1,0,0,1,1.0,999.0,1
1,0,25,2552,899.0,1.0,58,0,0,0,0,...,0,0,0,1,0,0,1,0.0,899.0,2
2,0,25,2552,899.0,-1.0,58,0,0,0,0,...,0,0,0,1,0,0,1,0.0,899.0,2
3,0,25,2554,1709.05,1.0,58,0,0,0,0,...,0,0,0,1,0,0,1,1.0,1709.05,1
4,0,25,2555,1099.0,1.0,56,0,0,0,0,...,0,0,0,1,0,0,0,1.0,1099.0,0


### 2. Use last months sales as a benchmark. 

In [9]:
prev_month_selector = (training['October'] == 1) & (training['2015'] == 1)
last_month = training[prev_month_selector]
groups = last_month[["shop_id", "item_id",'item_cnt_month']].groupby(by=['shop_id','item_id'])
groups = groups.agg({'item_cnt_month':'sum'}).reset_index()

In [10]:
merged = testing.merge(groups, on=["shop_id", "item_id"], how="left")[["ID", "item_cnt_month"]] 
print(testing.shape)
print(groups.shape)
print(merged.shape)
merged.isna().sum()

(214200, 3)
(31531, 3)
(214200, 2)


ID                     0
item_cnt_month    185520
dtype: int64

In [11]:
merged.item_cnt_month = merged.item_cnt_month.fillna(0)
merged.head()
merged.shape

(214200, 2)

In [12]:
datapath = './data'
save_file(merged, 'baseline.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)n

Please re-run this cell with a new filename.


Sumbitting this to the Kaggle competition gives me a RMSE of 4.56066, which is a bad score. However, this does give us a very rudimentary baseline. 

### 3. Machine Learning

In [13]:
# I want to get all the columns into the testing data 
columns = training.columns
columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_price', 'item_cnt_day',
       'item_category_id', 'April', 'August', 'December', 'February',
       'January', 'July', 'June', 'March', 'May', 'November', 'October',
       'September', '2013', '2014', '2015', 'holiday', 'item_cnt_month',
       'item_month_avg_price', 'num_holidays'],
      dtype='object')

In [14]:
# we have ID, shop_id, item_id
testing['date_block_num'] = 34

In [15]:
# for item price, we have use the mean item price for that item id for that shop
grouped = training.groupby(by = ['item_id','shop_id'])
result = grouped.agg({'item_price':'mean'})
result = result.reset_index()
testing = testing.merge(result, on=['item_id', 'shop_id'], how='left')
testing.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,item_price
0,0,5,5037,34,1633.692308
1,1,5,5320,34,
2,2,5,5233,34,865.666667
3,3,5,5232,34,599.0
4,4,5,5268,34,


In [16]:
# fill item price NA with average price for that item
grouped = training.groupby(by = ['item_id'])
result = grouped.agg({'item_price':'mean'})
result = result.reset_index()
testing = testing.merge(result, on=['item_id'], how='left')
testing['item_price_x'].fillna(testing['item_price_y'])
testing.drop('item_price_y', axis=1, inplace=True)
testing.rename(columns={'item_price_x':'item_price'}, inplace=True)
testing.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,item_price
0,0,5,5037,34,1633.692308
1,1,5,5320,34,
2,2,5,5233,34,865.666667
3,3,5,5232,34,599.0
4,4,5,5268,34,


In [17]:
# item_cnt_month
testing = pd.merge(testing, training[['item_id', 'item_cnt_month']], on = 'item_id', how='left')

In [21]:
# Get item categories
testing = pd.merge(testing, training[['item_id', 'item_month_avg_price']], on = 'item_id', how='left')

MemoryError: Unable to allocate 980. GiB for an array with shape (131546493456,) and data type int64

In [22]:
testing[testing.item_id==5320].head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,item_price,item_cnt_month
1089,1,5,5320,34,,
1447453,5101,4,5320,34,,
2893817,10201,6,5320,34,,
4340181,15301,3,5320,34,,
5786545,20401,2,5320,34,,


In [23]:
# Fill the rest in manually 
testing[['November', '2015', 'num_holidays', 'April', 'August', 'December', 'February',
       'January', 'July', 'June', 'March', 'May', 'October',
       'September', '2013', '2014']] = [1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0]

In [24]:
for item in training.columns: 
    if item not in testing.columns: 
        print(item)

item_cnt_day
item_category_id
holiday
item_month_avg_price


In [32]:
testing.fillna(0, inplace=True)

In [33]:
training.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_category_id,April,August,December,February,January,...,May,November,October,September,2013,2014,2015,item_cnt_month,item_month_avg_price,num_holidays
0,0,59,22154,999.0,37,0,0,0,0,1,...,0,0,0,0,1,0,0,1.0,999.0,1
1,0,25,2552,899.0,58,0,0,0,0,1,...,0,0,0,0,1,0,0,0.0,899.0,2
2,0,25,2552,899.0,58,0,0,0,0,1,...,0,0,0,0,1,0,0,0.0,899.0,2
3,0,25,2554,1709.05,58,0,0,0,0,1,...,0,0,0,0,1,0,0,1.0,1709.05,1
4,0,25,2555,1099.0,56,0,0,0,0,1,...,0,0,0,0,1,0,0,1.0,1099.0,0


In [34]:
training.drop(['item_cnt_day','holiday','item_month_avg_price'], axis=1, inplace=True) #

KeyError: "['item_cnt_day' 'holiday'] not found in axis"

#### Let's start the actual learning! 

In [35]:
X = training.drop(['item_cnt_month'], axis=1)
y = training['item_cnt_month']

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [37]:
model = xgb.XGBRegressor()
model.fit(X_train, y_train, eval_metric='rmse')

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [38]:
y_pred = model.predict(X_test)

In [39]:
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 17.61%
