# AML Project - Random Forest Regression ####



### Loading Data & Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
import time
import os
from itertools import product
from pylab import rcParams
from sklearn.metrics import mean_squared_log_error
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess, Fourier

In [2]:
import os

# Show the full path of every file found under the Kaggle input directory.
for found in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(found)

# Folder holding the competition CSV files. The trailing slash matters:
# later cells build paths with plain string concatenation.
datafolder = '../input/store-sales-time-series-forecasting/'

In [3]:
# Load the six raw competition tables with pandas' default parsing
# (no date parsing, no dtype overrides).
oil, holidays_events, stores, train, test, transactions = (
    pd.read_csv(os.path.join(datafolder, csv_name))
    for csv_name in (
        'oil.csv',
        'holidays_events.csv',
        'stores.csv',
        'train.csv',
        'test.csv',
        'transactions.csv',
    )
)

# Preprocessing Data


### Creating a Calendar of Events

clean up oil data

simple event handling (only national holidays)

unpack sales and promotions

In [5]:


# Daily calendar covering 2013-01-01 through 2017-08-31; every later
# feature lookup slices this frame by date.
calendar = pd.DataFrame(index=pd.date_range('2013-01-01', '2017-08-31'))

# Oil prices indexed by date. `infer_datetime_format` is deprecated in
# pandas 2.x (strict format inference is the default), so it is omitted.
df_oil = pd.read_csv(os.path.join(datafolder, 'oil.csv'),
                     parse_dates=['date'], index_col='date')

# Left merge keeps every calendar day; days absent from oil.csv become NaN.
calendar = calendar.merge(df_oil, how='left', left_index=True, right_index=True)

# Despite its name, `ma_oil` is not a moving average: missing prices are
# linearly interpolated, then ffill/bfill cover anything left at the ends.
calendar['ma_oil'] = calendar['dcoilwtico'].interpolate('linear').ffill().bfill()

# Day of week as an integer, Monday=0 ... Sunday=6.
calendar['dofw'] = calendar.index.dayofweek
calendar

In [7]:


# Only national-level rows are used; regional and local ones are ignored.
df_events = pd.read_csv(os.path.join(datafolder, 'holidays_events.csv'),
                        parse_dates=['date'])
holidays_short = (
    df_events[df_events.locale == 'National']
    .set_index('date')
    .sort_index()
)
# Collapse to a single row per date (first non-null value of each column).
holidays_short = holidays_short.groupby(holidays_short.index).first()[['type', 'transferred']]

# Remove columns left by a previous run of this cell; otherwise the merge
# below would create `type_x` / `type_y` and the lookups would break.
calendar = calendar.drop(columns=['type', 'transferred'], errors='ignore')

calendar['holiday'] = False

# Weekends (Saturday=5, Sunday=6) are free days by default. The original
# condition compared the freshly created boolean `holiday` column with 4,
# which never matched, so weekends were never flagged.
calendar.loc[calendar.index.dayofweek > 4, 'holiday'] = True
calendar = calendar.merge(holidays_short, how='left', left_index=True, right_index=True)

# Event types that count as a day off.
free = ['Bridge', 'Transfer', 'Holiday']
calendar.loc[calendar['type'].isin(free), 'holiday'] = True
# 'Work Day' rows override the weekend default.
calendar.loc[calendar['type'] == 'Work Day', 'holiday'] = False
# Exception: a holiday that was transferred is not a day off on its
# original date.
calendar.loc[(calendar['type'] == 'Holiday') & (calendar['transferred'] == True), 'holiday'] = False

calendar

In [8]:
def _load_store_family_frame(csv_name, value_cols):
    """Read a competition CSV indexed by (store_nbr, family, date).

    Parameters
    ----------
    csv_name : str
        File name inside ``datafolder`` (for example ``'train.csv'``).
    value_cols : list of str
        Numeric columns to load in addition to the three key columns; each
        is read as ``float32``.

    Returns
    -------
    pandas.DataFrame
        Frame holding ``value_cols``, with a sorted MultiIndex of
        ``store_nbr`` and ``family`` (both categorical) and ``date``
        (daily periods).
    """
    key_cols = ['store_nbr', 'family', 'date']
    dtypes = {'store_nbr': 'category', 'family': 'category'}
    dtypes.update({col: 'float32' for col in value_cols})
    # `infer_datetime_format` is deprecated in pandas 2.x and is left out.
    frame = pd.read_csv(os.path.join(datafolder, csv_name),
                        usecols=key_cols + list(value_cols),
                        dtype=dtypes,
                        parse_dates=['date'])
    frame['date'] = frame['date'].dt.to_period('D')
    return frame.set_index(key_cols).sort_index()


# Each file is parsed once and then split into the single-purpose frames
# the rest of the notebook expects.
train_long = _load_store_family_frame('train.csv', ['sales', 'onpromotion'])
promo_train = train_long[['onpromotion']]
df_train = train_long[['sales']]

test_long = _load_store_family_frame('test.csv', ['onpromotion'])
promo_test = test_long[['onpromotion']]
# Index-only frame: the (store_nbr, family, date) keys with no value column.
df_test = test_long.drop(columns='onpromotion')

In [9]:
# Window of training data used for modelling.
start_date = '2017-04-01'
end_date = '2017-08-15'

# Window used to slice the test-set promotions.
test_start = '2017-08-16'
test_end = '2017-08-31'

# Wide promotions table: one row per day in the modelling window and one
# column per (store_nbr, family) pair.
promo_1 = promo_train.unstack(['store_nbr', 'family']).loc[start_date:end_date]

# Days before 2017-08-01 are used for fitting, the rest for validation.
before_holdout = promo_1.index < '2017-08-01'
in_holdout = promo_1.index >= '2017-08-01'
promo_1_train = promo_1.loc[before_holdout].copy()
promo_1_test = promo_1.loc[in_holdout].copy()

# Same wide layout for the test-set promotions.
promo_2 = promo_test.unstack(['store_nbr', 'family']).loc[test_start:test_end]

promo_2

In [None]:
start_test = '2017-08-16'
end_test = '2017-08-31'

# The original cell used `dp` before the modelling cell that creates it, so
# it failed on a fresh kernel. It is built here with the same settings.
# `promo_1` comes from train.csv over start_date:end_date, so its daily
# index is the one the training target is built on.
dp = DeterministicProcess(index=promo_1.index,
                          constant=False,
                          order=1,
                          seasonal=False,
                          additional_terms=[CalendarFourier(freq='W', order=4)],
                          drop=True)

# Calendar rows for the forecast horizon; their count fixes the number of
# out-of-sample steps instead of a hard-coded 16.
calendar_test = calendar.loc[start_test:end_test]
test = dp.out_of_sample(steps=len(calendar_test))

# Extensions: the same calendar-derived features added to the training matrix.
test['oil']  = calendar_test['ma_oil'].values
test['dofw'] = calendar_test['dofw'].values
test['wd']   = calendar_test['holiday'].values

# No national level events in this period, so every event dummy is zero.
# NOTE(review): these names and their order must match the `type_*` dummy
# columns of the training matrix - verify against X.columns after fitting.
test[['type_Additional', 'type_Event', 'type_Holiday', 'type_Transfer']] = 0

## Test & Define Model

Test on all data

In [10]:
from sklearn.ensemble import RandomForestRegressor

# Wide target: one row per day in the modelling window, one column per
# (store_nbr, family) pair.
y = df_train.unstack(['store_nbr', 'family']).loc[start_date:end_date]

# Deterministic features: a first-order trend plus calendar Fourier terms.
fourier = CalendarFourier(freq='W', order=4)
dp = DeterministicProcess(
    index=y.index,
    order=1,
    constant=False,
    seasonal=False,
    additional_terms=[fourier],
    drop=True,
)
X = dp.in_sample()

# Extensions: copy calendar-derived columns in by position (`.values`), so
# the calendar's DatetimeIndex need not match X's index type.
cal_window = calendar.loc[start_date:end_date]
for feature, source in (('oil', 'ma_oil'),
                        ('dofw', 'dofw'),
                        ('wd', 'holiday'),
                        ('type', 'type')):
    X[feature] = cal_window[source].values

# One-hot encode the event type.
X = pd.get_dummies(X, columns=['type'], drop_first=False)

# Fit a single multi-output forest on the whole window and predict the
# same rows (in-sample predictions).
model = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=1)
model.fit(X, y)
in_sample_pred = model.predict(X)
y_pred = pd.DataFrame(in_sample_pred, index=X.index, columns=y.columns)

In [12]:
# Chronological split: rows before the cut-off are used for fitting, rows
# from the cut-off onwards are held out for evaluation.
holdout_start = '2017-08-01'
X_train, X_test = (X.loc[X.index < holdout_start].copy(),
                   X.loc[X.index >= holdout_start].copy())
Y_train, Y_test = (y.loc[y.index < holdout_start].copy(),
                   y.loc[y.index >= holdout_start].copy())

# Feature names, kept so the frames can be rebuilt after scaling.
cols = X_train.columns

from sklearn import metrics
from sklearn import preprocessing

def scale(X_1, X_2):
    """Standardise two feature sets using statistics learned from the first.

    A ``StandardScaler`` is fitted on ``X_1`` only and then applied to both
    inputs, so ``X_2`` is transformed with ``X_1``'s statistics.

    Returns the transformed ``X_1`` and the transformed ``X_2``, in that
    order.
    """
    fitted = preprocessing.StandardScaler().fit(X_1)
    return fitted.transform(X_1), fitted.transform(X_2)

# Keep the date index: the frames are rebuilt from the scaler output below,
# and rebuilding without `index=` replaced the dates with 0..n-1.
train_index, test_index = X_train.index, X_test.index

# NOTE(review): tree ensembles do not need standardised inputs; the scaling
# is kept so the features match those used by the following cells.
X_train_scaled, X_test_scaled = scale(X_train, X_test)

X_train = pd.DataFrame(X_train_scaled, columns=cols, index=train_index)
X_test = pd.DataFrame(X_test_scaled, columns=cols, index=test_index)

# Single multi-output forest fitted on the training part only.
model = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=1)
model.fit(X_train, Y_train)
y_pred = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=Y_test.columns)

# Long format, one row per (date, store_nbr, family). Both frames are
# stacked from identical columns, so rows line up by position after
# reset_index.
y_pred   = y_pred.stack(['store_nbr', 'family']).reset_index()
y_target = Y_test.stack(['store_nbr', 'family']).reset_index().copy()
# RMSLE is undefined for negative values, hence the clip at zero.
y_target['sales_pred'] = y_pred['sales'].clip(0.)

# Overall RMSLE, then RMSLE per product family.
print('total: ', np.sqrt(metrics.mean_squared_log_error(y_target['sales'].values, y_target['sales_pred'].values)))
y_target.groupby('family').apply(lambda r: np.sqrt(mean_squared_log_error(r['sales'].values, r['sales_pred'].values)))

Implementing a model that predicts each store / family combination separately

In [15]:
from joblib import Parallel, delayed
import warnings

from sklearn.linear_model import Ridge
from sklearn.ensemble     import RandomForestRegressor

class CustomRegressor():
    """One random forest per target column, each with its own promo feature.

    ``fit`` trains a separate ``RandomForestRegressor`` for every column of
    ``y``. Each forest sees the shared feature matrix ``X`` plus one extra
    column, ``promo``, taken from the matching column of ``promo``.
    ``predict`` applies the forests column by column in the same way.

    Parameters
    ----------
    n_jobs : int, default -1
        Passed to ``joblib.Parallel`` when fitting the per-column models.
    verbose : int, default 0
        Passed to ``joblib.Parallel``.

    Attributes
    ----------
    estimators_ : list or None
        Fitted models in column order; ``None`` until ``fit`` is called.
    """

    def __init__(self, n_jobs=-1, verbose=0):
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.estimators_ = None

    def _estimator_(self, X, y, promo):
        """Fit and return one forest on ``X`` extended with a ``promo`` column."""
        # Silences FutureWarning in whichever process runs this call.
        warnings.simplefilter(action='ignore', category=FutureWarning)
        # NOTE(review): every forest asks for n_jobs=-1 while fit() already
        # parallelises across columns - check for CPU oversubscription.
        model = RandomForestRegressor(n_estimators = 50, n_jobs=-1, random_state=1)
        # assign() returns a new frame, so the caller's X is left untouched
        # (the original added the column to the shared frame in place).
        model.fit(X.assign(promo=promo.values), y)
        return model

    def make_prediction(self, model, X, promo, print_X = False):
        """Predict with ``model`` on ``X`` extended with a ``promo`` column.

        ``print_X`` is accepted for backward compatibility and is unused.
        """
        # Non-mutating for the same reason as in _estimator_.
        return model.predict(X.assign(promo=promo.values))

    def fit(self, X, y, promo):
        """Fit one model per column of ``y`` and return ``self``.

        Raises
        ------
        ValueError
            If ``promo`` and ``y`` do not have the same number of columns.
        """
        if promo.shape[1] != y.shape[1]:
            raise ValueError(
                'promo has %d columns but y has %d' % (promo.shape[1], y.shape[1]))
        self.estimators_ = Parallel(n_jobs=self.n_jobs,
                                    verbose=self.verbose,
                                    )(delayed(self._estimator_
                                             )(X, y.iloc[:, i], promo.iloc[:, i]) for i in range(y.shape[1]))
        return self

    def predict(self, X, promo):
        """Return an array shaped like ``promo`` with one prediction column per model.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If ``promo`` has a different number of columns than there are
            fitted models.
        """
        if self.estimators_ is None:
            raise RuntimeError('CustomRegressor.predict called before fit')
        if len(self.estimators_) != promo.shape[1]:
            raise ValueError(
                'promo has %d columns but %d models were fitted'
                % (promo.shape[1], len(self.estimators_)))

        y_pred = np.zeros(promo.shape)
        for i in range(promo.shape[1]):
            y_pred[:, i] = self.make_prediction(self.estimators_[i], X, promo.iloc[:, i])
        return y_pred

In [16]:

# Train the per-column regressor on the pre-holdout rows, then predict the
# holdout rows using the promotions observed for those days.
model = CustomRegressor(n_jobs=-1, verbose=0)
model.fit(X_train, Y_train, promo_1_train)
print('fitted')

holdout_pred = model.predict(X_test, promo_1_test)
y_pred = pd.DataFrame(holdout_pred, index=X_test.index, columns=Y_test.columns)


In [17]:
# Long format, one row per (date, store_nbr, family). The stacked result
# gets its own name so `y_pred` stays wide and this cell can be re-run
# (the original rebound `y_pred`, so a second run failed on .stack()).
y_pred_long = y_pred.stack(['store_nbr', 'family']).reset_index()
y_target = Y_test.stack(['store_nbr', 'family']).reset_index().copy()
# RMSLE is undefined for negative values, hence the clip at zero.
y_target['sales_pred'] = y_pred_long['sales'].clip(0.)

# Overall RMSLE, then RMSLE per product family. The original computed the
# per-family table twice and discarded the first result.
print('total: ', np.sqrt(metrics.mean_squared_log_error(y_target['sales'].values, y_target['sales_pred'].values)))
y_target.groupby('family').apply(lambda r: np.sqrt(mean_squared_log_error(r['sales'].values, r['sales_pred'].values)))

# Create a Submission

In [20]:
# Refit on the full modelling window, then forecast the test horizon using
# the test-set promotions.
model = CustomRegressor(n_jobs=-1, verbose=0)
model.fit(X, y, promo_1)
print('fitted')

horizon_pred = model.predict(test, promo_2)
sales_pred = pd.DataFrame(horizon_pred, index=test.index, columns=y.columns)
# Long format: one value per (date, store_nbr, family).
sales_pred = sales_pred.stack(['store_nbr', 'family'])

sales_pred

In [21]:
# Fill the sample submission with the forecasts and write it to disk.
# NOTE(review): values are assigned by position, so this assumes the row
# order of `sales_pred` matches the id order in sample_submission.csv -
# confirm before submitting.
submission_template = datafolder + 'sample_submission.csv'
My_submission = pd.read_csv(submission_template, index_col='id')
My_submission.sales = sales_pred.values
My_submission.to_csv('submission.csv')

In [22]:
# Display the filled-in submission frame as a final visual check.
My_submission