### Imports

In [85]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from utils import clean_data, train_val_set

In [2]:
# Reload imported modules (e.g. the local `utils` script) automatically before
# each cell executes, so edits saved to those files take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

### Load data

In [3]:
# List the raw input files available in the data directory.
%ls ../data/

calendar.csv                 sample_submission.csv
m5-forecasting-accuracy.zip  sell_prices.csv
sales_train_validation.csv


In [18]:
# Read the three raw input tables; every path is relative to the notebook's
# working directory. The files are read in the order listed.
calendar, sell_prices, sales = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/calendar.csv',
        '../data/sell_prices.csv',
        '../data/sales_train_validation.csv',
    )
)

### Tidy dataframe

In [19]:
# Preview the raw sales table: wide format, one row per `id` and one column
# per day (d_1 ... d_1913).
sales.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [20]:
# Columns that identify a series (kept as-is when tidying)
id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
# Columns expected in the tidy result: the identifiers plus day and sales
ret_cols = [*id_vars, 'day', 'sales']

# `clean_data` comes from the local `utils` module; presumably it reshapes the
# wide d_* columns into ('day', 'sales') rows -- confirm in utils.clean_data.
sales = clean_data(sales, id_vars, 'day', 'sales', ret_cols)
sales.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,sales
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1,0


### Baseline - Random Forest

#### Feature Engineering

In [56]:
# Work on an independent copy so the tidy `sales` frame is not modified by the
# Random Forest feature engineering below.
sales_rf = sales.copy()
sales_rf.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,sales
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1,0


In [57]:
# Subsample: keep only the rows whose day is >= 1500.
# The literal 1500 is also passed to train_val_set in the cross-validation
# cell -- presumably the same start day; keep the two in sync if so.
# `.copy()` detaches the filtered rows from the parent frame; without it the
# column assignments in the next cell operate on a slice and pandas emits
# SettingWithCopyWarning.
sales_rf = sales_rf[sales_rf['day'] >= 1500].copy()
# Sanity check: smallest day left after filtering
sales_rf['day'].min()

1880

In [58]:
# Reduce the string identifiers to integer codes:
#   store_id -> first run of digits      (e.g. 'CA_1' -> 1)
#   item_id  -> trailing three digits    (e.g. 'HOBBIES_1_001' -> 1)
# The codes are therefore only unique in combination with state_id / dept_id.
# Raw strings keep `\d` a regex token instead of an invalid string escape,
# which newer Python versions warn about.
# NOTE: the columns are overwritten in place, so re-running this cell without
# first re-running the cells above fails (`.str` is unavailable on int columns).
sales_rf['store_id'] = sales_rf['store_id'].str.extract(r'(\d+)', expand=False).astype(int)
sales_rf['item_id'] = sales_rf['item_id'].str.extract(r'(\d{3}$)', expand=False).astype(int)
# Drop unused features: the row identifier `id` is not used as a model input
sales_rf_ft = sales_rf.drop('id', axis=1)
sales_rf_ft.head()

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,day,sales
57290710,1,HOBBIES_1,HOBBIES,1,CA,1880,2
57290711,2,HOBBIES_1,HOBBIES,1,CA,1880,0
57290712,3,HOBBIES_1,HOBBIES,1,CA,1880,0
57290713,4,HOBBIES_1,HOBBIES,1,CA,1880,1
57290714,5,HOBBIES_1,HOBBIES,1,CA,1880,3


In [81]:
# Split the categorical features from the target / time columns
cat_vars = sales_rf_ft.drop(['sales', 'day'], axis=1)
other_vars = sales_rf_ft[['sales', 'day']]

# One hot encoding (drop='first' omits the first category of every feature)
one_hot = OneHotEncoder(drop='first')
cat_vars_oh = one_hot.fit_transform(cat_vars).toarray()
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older installations.
feature_names_fn = getattr(one_hot, 'get_feature_names_out', None) or one_hot.get_feature_names
# Reuse the original index so the concat below aligns the rows correctly
cat_vars_oh = pd.DataFrame(cat_vars_oh,
                           columns=feature_names_fn(cat_vars.columns),
                           index=cat_vars.index)

# Append the target (`sales`) and the `day` column again
sales_rf_ft_oh = pd.concat([cat_vars_oh, other_vars], axis=1)
sales_rf_ft_oh.head()

Unnamed: 0,item_id_2,item_id_3,item_id_4,item_id_5,item_id_6,item_id_7,item_id_8,item_id_9,item_id_10,item_id_11,...,dept_id_HOUSEHOLD_2,cat_id_HOBBIES,cat_id_HOUSEHOLD,store_id_2,store_id_3,store_id_4,state_id_TX,state_id_WI,sales,day
57290710,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2,1880
57290711,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1880
57290712,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1880
57290713,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1880
57290714,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3,1880


#### Cross-Validation

In [None]:
# Evaluate the baseline on several train/validation splits and average the MAE.
mae_list = []
# `fold` is the 1-based counter used for logging; `n_iter` is the value
# (0, 30, 60, 90) handed to train_val_set. Printing `n_iter+1` labelled the
# folds 1, 31, 61, 91.
for fold, n_iter in enumerate(range(0, 91, 30), start=1):
    print(20*'-')
    print(f'Iteration {fold} (offset {n_iter})')
    # Create split (argument semantics are defined in utils.train_val_set)
    train_x, train_y, val_x, val_y = train_val_set(sales_rf_ft_oh, 'day', 28,
                                                   n_iter, 1500, 'sales')
    # Train model; a fixed random_state makes the reported errors reproducible
    rf = RandomForestRegressor(n_estimators=10, n_jobs=-1,
                               random_state=0).fit(train_x, train_y)
    # Predict
    pred_y = rf.predict(val_x)
    # Evaluate
    mae = mean_absolute_error(val_y, pred_y)
    print(f'Error: {mae:.2f}')
    mae_list.append(mae)

# Average error across all folds
mean_mae = np.mean(mae_list)
print(f'Mean MAE: {mean_mae:.3f}')

--------------------
Iteration 1
Training set from day 1880 to 1884 with 152450 observations.
Validation set from day 1885 to 1913 with 884210 observations.


### Submission

In [None]:
# Train model
# Build the split for the submission model with the local `train_val_set` helper.
# NOTE(review): this passes `sales_rf`, which still contains the raw string
# columns (`id`, `dept_id`, `cat_id`, `state_id`), whereas the cross-validation
# cell uses the one-hot encoded `sales_rf_ft_oh` -- confirm which frame is
# intended. The fifth argument is also 1 here versus 1500 in the CV cell,
# although `sales_rf` was filtered to day >= 1500 -- verify.
train_x, train_y, val_x, val_y = train_val_set(sales_rf, 'day', 0, 0, 1, 'sales')