# 0.0 Imports

In [30]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import sklearn.preprocessing as pp
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import pickle as pkl

## 0.1 Datasets

In [2]:
features = pd.read_csv('../data/raw/features.csv')
stores = pd.read_csv('../data/raw/stores.csv')
train = pd.read_csv('../data/raw/train.csv')

# 1.0 Data Description

## 1.1 Looking Datasets

In [3]:
features.head()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False
1,1,2010-02-12,38.51,2.548,,,,,,211.24217,8.106,True
2,1,2010-02-19,39.93,2.514,,,,,,211.289143,8.106,False
3,1,2010-02-26,46.63,2.561,,,,,,211.319643,8.106,False
4,1,2010-03-05,46.5,2.625,,,,,,211.350143,8.106,False


In [4]:
stores.head()

Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875


In [5]:
train.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.9,False


## 1.2 Data Dimension

In [6]:
train.shape

(421570, 5)

In [7]:
stores.shape

(45, 3)

In [8]:
features.shape

(8190, 12)

## 1.3 Change Column Names

In [None]:
temp = []
for i in features.columns:
    temp.append(i.lower())
features.columns = temp

In [None]:
temp = []
for i in stores.columns:
    temp.append(i.lower())
stores.columns = temp

In [None]:
temp = []
for i in train.columns:
    temp.append(i.lower())
train.columns = temp

## 1.4 Merging Data

In [10]:
df = train.merge(features.drop('isholiday', axis=1), on=['store', 'date'], how='left')

df = df.merge(stores, on='store', how='left')

In [11]:
df.head()

Unnamed: 0,store,dept,date,weekly_sales,isholiday,temperature,fuel_price,markdown1,markdown2,markdown3,markdown4,markdown5,cpi,unemployment,type,size
0,1,1,2010-02-05,24924.5,False,42.31,2.572,,,,,,211.096358,8.106,A,151315
1,1,1,2010-02-12,46039.49,True,38.51,2.548,,,,,,211.24217,8.106,A,151315
2,1,1,2010-02-19,41595.55,False,39.93,2.514,,,,,,211.289143,8.106,A,151315
3,1,1,2010-02-26,19403.54,False,46.63,2.561,,,,,,211.319643,8.106,A,151315
4,1,1,2010-03-05,21827.9,False,46.5,2.625,,,,,,211.350143,8.106,A,151315


## 1.5 Check Na

In [12]:
df.isna().sum() / df.shape[0] 

store           0.000000
dept            0.000000
date            0.000000
weekly_sales    0.000000
isholiday       0.000000
temperature     0.000000
fuel_price      0.000000
markdown1       0.642572
markdown2       0.736110
markdown3       0.674808
markdown4       0.679847
markdown5       0.640790
cpi             0.000000
unemployment    0.000000
type            0.000000
size            0.000000
dtype: float64

## 1.6 Removing Nas

In [13]:
df.drop('markdown1 markdown2 markdown3 markdown4 markdown5'.split(), axis=1, inplace=True)

## 1.7 Datatypes

In [14]:
df.dtypes

store             int64
dept              int64
date             object
weekly_sales    float64
isholiday          bool
temperature     float64
fuel_price      float64
cpi             float64
unemployment    float64
type             object
size              int64
dtype: object

## 1.8 Changing Datatypes

In [15]:
df['date'] = pd.to_datetime(df['date'])

df['isholiday'] = df['isholiday'].astype(int)

In [16]:
df['type'].value_counts()

type
A    215478
B    163495
C     42597
Name: count, dtype: int64

In [17]:
df['type'] = df['type'].apply(lambda x: 0 if x == 'A' else (1 if x == 'B' else 2))

## 1.9 Descriptive Statistical

# .0 Feature Engineering

In [18]:
df = df.groupby(['store', 'date']).agg({'weekly_sales':'sum', 'isholiday':'first', 'temperature':'first', 'fuel_price':'first', 'cpi':'first', 'unemployment':'first', 'type':'first', 'size':'first'}).reset_index()

# .0 Data Preparation

## Target

In [19]:
pp_weekly_sales = pp.MinMaxScaler()
pp_weekly_sales = pp_weekly_sales.fit(df[['weekly_sales']].values)
df['weekly_sales'] = pp_weekly_sales.transform(df[['weekly_sales']].values)

## Features

### Categorical Attributes

In [20]:
# 'isholiday',
map_isholiday = df.groupby('isholiday').agg({'weekly_sales':'mean'})['weekly_sales'] # target encoding
df['isholiday'] = df['isholiday'].map(map_isholiday)

# 'type',
map_type = df.groupby('type').agg({'weekly_sales':'mean'})['weekly_sales'] # target encoding
df['type'] = df['type'].map(map_type)

### Numerical Attributes

In [21]:
# 'temperature',
pp_temperature = pp.MinMaxScaler()
pp_temperature = pp_temperature.fit(df[['temperature']].values)
df['temperature'] = pp_temperature.transform(df[['temperature']].values)

# 'fuel_price',
pp_fuel_price = pp.MinMaxScaler()
pp_fuel_price = pp_fuel_price.fit(df[['fuel_price']].values)
df['fuel_price'] = pp_fuel_price.transform(df[['fuel_price']].values)

# 'cpi',
pp_cpi = pp.MinMaxScaler()
pp_cpi = pp_cpi.fit(df[['cpi']].values)
df['cpi'] = pp_cpi.transform(df[['cpi']].values)

# 'unemployment',
pp_unemployment = pp.MinMaxScaler()
pp_unemployment = pp_unemployment.fit(df[['unemployment']].values)
df['unemployment'] = pp_unemployment.transform(df[['unemployment']].values)

# 'size']
pp_size = pp.MinMaxScaler()
pp_size = pp_size.fit(df[['size']].values)
df['size'] = pp_size.transform(df[['size']].values)

## Splits

In [22]:
train = df[df['date'] < df['date'].max() - pd.Timedelta(6, 'w')]
test = df[df['date'] >= df['date'].max() - pd.Timedelta(6, 'w')]

In [23]:
X_train = train.drop(['date', 'weekly_sales', 'store'], axis=1)
y_train = train['weekly_sales']

X_train_array = X_train.values
y_train_array = np.array(y_train).reshape(-1, 1)

In [24]:
X_test = test.drop(['date', 'weekly_sales', 'store'], axis=1)
y_test = test['weekly_sales']

X_test_array = X_test.values
y_test_array = np.array(y_test).reshape(-1, 1)

# .0 ML Modeling

## Linear Regression

In [25]:
lr_model = LinearRegression()
lr_model = lr_model.fit(X_train, y_train)
predicts = lr_model.predict(X_test)

test['predicts'] = predicts
test['absolute_error'] = np.abs(test['predicts'] - test['weekly_sales'])
mae = test['absolute_error'].mean()
test['proportional_absolute_error'] = test['absolute_error'] / test['weekly_sales']
mape = test['proportional_absolute_error'].mean()

mape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['predicts'] = predicts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['absolute_error'] = np.abs(test['predicts'] - test['weekly_sales'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['proportional_absolute_error'] = test['absolute_error'] / test['weekly_sales']


0.3951442206249105

## Random Forest

In [26]:
rf_model = RandomForestRegressor(max_depth=7)
rf_model = rf_model.fit(X_train, y_train)
predicts = rf_model.predict(X_test)

test['predicts'] = predicts
test['absolute_error'] = np.abs(test['predicts'] - test['weekly_sales'])
mae = test['absolute_error'].mean()
test['proportional_absolute_error'] = test['absolute_error'] / test['weekly_sales']
mape = test['proportional_absolute_error'].mean()

mape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['predicts'] = predicts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['absolute_error'] = np.abs(test['predicts'] - test['weekly_sales'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['proportional_absolute_error'] = test['absolute_error'] / test['weekly_sales']


0.1513102134818894

## XGBoost Regressor

In [27]:
xgb_model = XGBRegressor(max_depth=7)
xgb_model = xgb_model.fit(X_train, y_train)
predicts = xgb_model.predict(X_test)

test['predicts'] = predicts
test['absolute_error'] = np.abs(test['predicts'] - test['weekly_sales'])
mae = test['absolute_error'].mean()
test['proportional_absolute_error'] = test['absolute_error'] / test['weekly_sales']
mape = test['proportional_absolute_error'].mean()

mape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['predicts'] = predicts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['absolute_error'] = np.abs(test['predicts'] - test['weekly_sales'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['proportional_absolute_error'] = test['absolute_error'] / test['weekly_sales']


0.13032780535369384

# .0 API