# Capstone Two: Modeling

In this step of my capstone, I am going to be assessing different models (and tuning hyperparamteters) in order to find the best model I can. 

1. Import training and testing files and libraries. 
2. Use the last month's sales as a benchmark. 
3. Machine learning

### 1. Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import os
from library.sb_utils import save_file

In [2]:
training = pd.read_csv('./data/training_data_feature_engineered.csv')
testing = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv("./data/sample_submission.csv")

In [3]:
testing.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [4]:
training.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id,datetime,day_of_month,Friday,...,June,March,May,November,October,September,2013,2014,2015,holiday
0,2013-01-02,0,59,22154,999.0,1.0,37,2013-01-02,2,0,...,0,0,0,0,0,0,1,0,0,1
1,2013-01-03,0,25,2552,899.0,1.0,58,2013-01-03,3,0,...,0,0,0,0,0,0,1,0,0,1
2,2013-01-05,0,25,2552,899.0,-1.0,58,2013-01-05,5,0,...,0,0,0,0,0,0,1,0,0,1
3,2013-01-06,0,25,2554,1709.05,1.0,58,2013-01-06,6,0,...,0,0,0,0,0,0,1,0,0,1
4,2013-01-15,0,25,2555,1099.0,1.0,56,2013-01-15,15,0,...,0,0,0,0,0,0,1,0,0,0


In [5]:
# I want to group by monthly sales, since our final prediction will be for the enitre month (Nov 2015). 
grouped = training.groupby(['item_id','shop_id','date_block_num'])
agg = grouped.agg({'item_cnt_day':'sum', 'item_price':'mean','holiday':'mean'}).reset_index()
agg = agg.rename(columns = {'item_cnt_day' : 'item_cnt_month', 'item_price':'item_month_avg_price','holiday':'percent_holidays'})
agg.head()

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_month,item_month_avg_price,percent_holidays
0,0,54,20,1.0,58.0,0.0
1,1,55,15,2.0,4490.0,0.0
2,1,55,18,1.0,4490.0,0.0
3,1,55,19,1.0,4490.0,0.0
4,1,55,20,1.0,4490.0,0.0


In [6]:
training = training.merge(agg, on=["shop_id", "item_id","date_block_num"], how='left')

In [7]:
training.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id,datetime,day_of_month,Friday,...,November,October,September,2013,2014,2015,holiday,item_cnt_month,item_month_avg_price,percent_holidays
0,2013-01-02,0,59,22154,999.0,1.0,37,2013-01-02,2,0,...,0,0,0,1,0,0,1,1.0,999.0,1.0
1,2013-01-03,0,25,2552,899.0,1.0,58,2013-01-03,3,0,...,0,0,0,1,0,0,1,0.0,899.0,1.0
2,2013-01-05,0,25,2552,899.0,-1.0,58,2013-01-05,5,0,...,0,0,0,1,0,0,1,0.0,899.0,1.0
3,2013-01-06,0,25,2554,1709.05,1.0,58,2013-01-06,6,0,...,0,0,0,1,0,0,1,1.0,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0,56,2013-01-15,15,0,...,0,0,0,1,0,0,0,1.0,1099.0,0.0


### 2. Use last months sales as a benchmark. 

In [8]:
prev_month_selector = (training['October'] == 1) & (training['2015'] == 1)
last_month = training[prev_month_selector]
groups = last_month[["shop_id", "item_id",'item_cnt_month']].groupby(by=['shop_id','item_id'])
groups = groups.agg({'item_cnt_month':'sum'}).reset_index()

In [9]:
merged = testing.merge(groups, on=["shop_id", "item_id"], how="left")[["ID", "item_cnt_month"]] 
# In this step, testing is correct num rows but merged isn't. why
print(testing.shape)
print(groups.shape)
print(merged.shape)
merged.isna().sum()

(214200, 3)
(31531, 3)
(214200, 2)


ID                     0
item_cnt_month    185520
dtype: int64

In [10]:
merged.item_cnt_month = merged.item_cnt_month.fillna(0)
merged.head()
merged.shape

(214200, 2)

In [11]:
#baseline = merged.set_index("ID")
datapath = './data'
save_file(merged, 'baseline.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)n

Please re-run this cell with a new filename.


Sumbitting this to the Kaggle competition gives me a RMSE of 4.56066, which is a bad score. However, this does give us a very rudimentary baseline. 

### 3. Machine Learning

Let's add the features we have in our training set (that we added in feature engineering) to our testing set so that our ML algorithms can use them! 

In [15]:
# testing['November'] = 1
# testing['2015'] = 1
# testing['percent_holiday'] = 1. / 30
# testing['item_category_id'] 
# testing['date_block_num'] = 34

aggg = pd.merge(testing, training, on = ['item_id'])[['shop_id','item_id','item_category_id']]

MemoryError: Unable to allocate 10.9 GiB for an array with shape (24, 60732042) and data type int64

In [16]:
aggg.head()

NameError: name 'aggg' is not defined

In [17]:
testing.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [18]:
training.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id,datetime,day_of_month,Friday,...,November,October,September,2013,2014,2015,holiday,item_cnt_month,item_month_avg_price,percent_holidays
0,2013-01-02,0,59,22154,999.0,1.0,37,2013-01-02,2,0,...,0,0,0,1,0,0,1,1.0,999.0,1.0
1,2013-01-03,0,25,2552,899.0,1.0,58,2013-01-03,3,0,...,0,0,0,1,0,0,1,0.0,899.0,1.0
2,2013-01-05,0,25,2552,899.0,-1.0,58,2013-01-05,5,0,...,0,0,0,1,0,0,1,0.0,899.0,1.0
3,2013-01-06,0,25,2554,1709.05,1.0,58,2013-01-06,6,0,...,0,0,0,1,0,0,1,1.0,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0,56,2013-01-15,15,0,...,0,0,0,1,0,0,0,1.0,1099.0,0.0


In [60]:
# drop categoricals
training = training.drop(['date','datetime'], axis=1)
training.dtypes

date_block_num            int64
shop_id                   int64
item_id                   int64
item_price              float64
item_cnt_day            float64
item_category_id          int64
day_of_month              int64
Friday                    int64
Monday                    int64
Saturday                  int64
Sunday                    int64
Thursday                  int64
Tuesday                   int64
Wednesday                 int64
April                     int64
August                    int64
December                  int64
February                  int64
January                   int64
July                      int64
June                      int64
March                     int64
May                       int64
November                  int64
October                   int64
September                 int64
2013                      int64
2014                      int64
2015                      int64
holiday                   int64
item_cnt_month          float64
item_mon

In [61]:
X = training.drop('item_cnt_month', axis=1)
y = training['item_cnt_month']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [62]:
model = xgb.XGBRegressor()
model.fit(X_train, y_train, eval_metric='rmse')

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [63]:
y_pred = model.predict(X_test)

In [64]:
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 20.18%


In [65]:
testing_predict = model.predict(testing)

ValueError: Feature shape mismatch, expected: 32, got 7

In [None]:
testing.head()