## Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn')
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')
import time
import datetime
import xgboost as xgb
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
# data_path = os.path.join(os.getcwd(), 'F21_proj2_data')
train_ini = pd.read_csv('train_ini.csv')

In [3]:
train_ini.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2/5/2010,24924.5,False
1,1,1,2/12/2010,46039.49,True


## feature preprocessing

In [4]:
def feature_preprocessing_xtrain(train_ini):
    train_ini['Date']= pd.to_datetime(train_ini['Date'])
    train_ini['Date'] = pd.to_datetime(train_ini['Date'], format='%m%d%y')
    
#     ! this issue is not present in Python
    # substract one day from 2010 to make same weeks in subsequent years
#     mapped = train_ini.loc[train_ini['Date'].dt.year == 2010, :]['Date'].map(lambda x: x- pd.to_timedelta(1, unit='d'))
#     for i, v in zip(mapped.index, mapped):
#         train_ini.at[i, 'Date'] = v
#     train_ini.loc[train_ini['Date'].dt.year == 2010, 'Date'] = mapped
    
#     # again convert date to datetime to avoid .dt accessor error
#     train_ini['Date']= pd.to_datetime(train_ini['Date'])
#     train_ini['Date'] = pd.to_datetime(train_ini['Date'], format='%m%d%y')

    train_ini['week'] = train_ini['Date'].dt.week
    train_ini['year'] = train_ini['Date'].dt.year
    lbl = preprocessing.LabelEncoder()
    train_ini['IsHoliday'] = lbl.fit_transform(train_ini['IsHoliday'].astype(str))
    
    return train_ini

In [5]:
train_ini.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2/5/2010,24924.5,False
1,1,1,2/12/2010,46039.49,True


In [6]:
X = feature_preprocessing_xtrain(train_ini)
X.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,week,year
0,1,1,2010-02-05,24924.5,0,5,2010
1,1,1,2010-02-12,46039.49,1,6,2010


In [7]:
X.describe()

Unnamed: 0,Store,Dept,Weekly_Sales,IsHoliday,week,year
count,164115.0,164115.0,164115.0,164115.0,164115.0,164115.0
mean,22.151406,44.131889,16079.708833,0.089644,25.061688,2010.142802
std,12.775748,30.388825,22885.720761,0.285673,15.376328,0.349872
min,1.0,1.0,-4988.94,0.0,1.0,2010.0
25%,11.0,18.0,2176.0,0.0,10.0,2010.0
50%,22.0,37.0,7800.92,0.0,24.0,2010.0
75%,33.0,72.0,20271.11,0.0,39.0,2010.0
max,45.0,99.0,693099.36,1.0,52.0,2011.0


## Naive Model Predicting Mean Sales of a particular weeks

In [19]:
train = pd.read_csv('train_ini.csv', parse_dates=['Date'])
test = pd.read_csv('test.csv', parse_dates=['Date'])

In [20]:
def mypredict(train, test, next_fold, t):
    
    if not t==1:
        train = train.append(next_fold)
        X = feature_preprocessing_xtrain(train)
    else:
        X = feature_preprocessing_xtrain(train)
       
    X1 = feature_preprocessing_xtrain(test)
    pred_model = X.groupby(by='week').mean()['Weekly_Sales']
    prediction = X1.merge(pred_model, how='inner', left_on='week', right_on=pred_model.index)
    Y1_Pred = prediction.Weekly_Sales.values
    test['Weekly_Pred'] = Y1_Pred
    
    return train, test

## XG boost model

In [21]:
def mypredict(train, test, next_fold, t):
    
    if not t==1:
        train = train.append(next_fold)
        trn = feature_preprocessing_xtrain(train)
    else:
        trn = feature_preprocessing_xtrain(train)
    
    try: test = test.drop(columns=['Weekly_Pred'])
    except: pass
            
    trn = trn.drop(columns=['Date'])
    X_train, X_test, y_train, y_test = train_test_split(trn.loc[:, ~trn.columns.str.contains('Weekly_Sales')],
                                                        trn.Weekly_Sales,
                                                        test_size=0.20, random_state=42)
    # dtrain = xgb.DMatrix(X_train, label=y_train)
    dtrain = xgb.DMatrix(trn.loc[:, ~trn.columns.str.contains('Weekly_Sales')], label=trn.Weekly_Sales)
    dtest = xgb.DMatrix(X_test, label=y_test)
    param = {'max_depth': 4, 'eta': 0.01, 'min_child_weight': 6, 'subsample': 0.5, 'reg_alpha':100,
         'objective': 'reg:squarederror', 'min_child_weight':8}

    param['nthread'] = 6
    param['eval_metric'] = 'rmse'

    num_round = 500
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    # final model with best parameters
    bst = xgb.train(param, dtrain, num_round, evallist, verbose_eval=False)
    
    Xtest = feature_preprocessing_xtrain(test)
    testXgb = feature_preprocessing_xtrain(Xtest)
    testXgb = testXgb.drop(columns=['Date'])

    dtest = xgb.DMatrix(testXgb)
    ypred = bst.predict(dtest)
    ypred = np.array(ypred).flatten()

    test['Weekly_Pred'] = ypred
    
    return train, test

In [22]:
trn = feature_preprocessing_xtrain(train)
trn = trn.drop(columns=['Date'])
X_train, X_test, y_train, y_test = train_test_split(trn.loc[:, ~trn.columns.str.contains('Weekly_Sales')],
                                                        trn.Weekly_Sales,
                                                        test_size=0.20, random_state=42)

In [23]:
X_train

Unnamed: 0,Store,Dept,IsHoliday,week,year
25738,7,59,0,8,2011
31486,9,13,0,49,2010
3194,1,79,0,7,2011
52177,14,41,0,7,2011
135626,37,8,0,51,2010
...,...,...,...,...,...
119879,32,25,0,19,2010
103694,27,90,1,6,2010
131932,35,72,0,44,2010
146867,40,46,0,49,2010


In [24]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [25]:
param = {'max_depth': 4, 'eta': 0.01, 'min_child_weight': 6, 'subsample': 0.5, 'reg_alpha':100,
         'objective': 'reg:squarederror', 'min_child_weight':8}

param['nthread'] = 6
param['eval_metric'] = 'rmsle'

num_round = 500
evallist = [(dtest, 'eval'), (dtrain, 'train')]
# final model with best parameters
bst = xgb.train(param, dtrain, num_round, evallist, verbose_eval=False)

try: test = test.drop(columns=['Weekly_Pred'])
except: pass

test = feature_preprocessing_xtrain(test)
test = test.drop(columns=['Date'])

dtest = xgb.DMatrix(test)
ypred = bst.predict(dtest)
ypred = np.array(ypred).flatten()

## Linear Model

In [26]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn import metrics, linear_model

def mypredict(train, test, next_fold, t):
    if not t==1:
        train = train.append(next_fold)
        trn = feature_preprocessing_xtrain(train)
    else:
        trn = feature_preprocessing_xtrain(train)
    
    try: test = test.drop(columns=['Weekly_Pred'])
    except: pass
            
    trn = trn.drop(columns=['Date'])
    X_train = trn.loc[:, ~trn.columns.str.contains('Weekly_Sales')]
    y_train = trn.Weekly_Sales
    ols = LinearRegression()
    ols.fit(X_train, y_train)
    
    Xtest = feature_preprocessing_xtrain(test)
    Xtest = Xtest.drop(columns=['Date'])
    ypred = ols.predict(Xtest)
    
    test['Weekly_Pred'] = ypred
    
    return train, test

In [27]:
trn = feature_preprocessing_xtrain(train)
trn = trn.drop(columns=['Date'])
X_train, X_test, y_train, y_test = train_test_split(trn.loc[:, ~trn.columns.str.contains('Weekly_Sales')],
                                                        trn.Weekly_Sales,
                                                        test_size=0.20, random_state=42)

In [28]:
ols = LinearRegression()
ols.fit(X_train, y_train.values)
ols.predict(X_test)

array([11634.64231523, 15220.45038092, 15781.88675337, ...,
       21043.78815978, 18735.49344503, 19664.26836725])

In [45]:
X_train

Unnamed: 0,Store,Dept,IsHoliday,week,year
25738,7,59,0,8,2011
31486,9,13,0,49,2010
3194,1,79,0,7,2011
52177,14,41,0,7,2011
135626,37,8,0,51,2010
...,...,...,...,...,...
119879,32,25,0,19,2010
103694,27,90,1,6,2010
131932,35,72,0,44,2010
146867,40,46,0,49,2010


## Evaluation Function

In [44]:
### import numpy as np
import pandas as pd

# from mymain import mypredict

train = pd.read_csv('train_ini.csv', parse_dates=['Date'])
test = pd.read_csv('test.csv', parse_dates=['Date'])

# save weighed mean absolute error WMAE
n_folds = 10
next_fold = None
wae = []

# time-series prediction 
for t in range(1, n_folds+1):
    # print(f'Fold{t}...')
    # *** THIS IS YOUR PREDICTION FUNCTION ***
    train, test_pred = mypredict(train, test, next_fold, t)
    # Load fold file
    # You should add this to your training data in the next call to mypredict()
    fold_file = 'fold_{t}.csv'.format(t=t)
    next_fold = pd.read_csv(fold_file, parse_dates=['Date'])

    # extract predictions matching up to the current fold
    scoring_df = next_fold.merge(test_pred, on=['Date', 'Store', 'Dept'], how='left')

    # extract weights and convert to numpy arrays for wae calculation
    weights = scoring_df['IsHoliday_x'].apply(lambda is_holiday:5 if is_holiday else 1).to_numpy()
    actuals = scoring_df['Weekly_Sales'].to_numpy()
    preds = scoring_df['Weekly_Pred'].fillna(0).to_numpy()
    
    wae_instance = (np.sum(weights * np.abs(actuals - preds)) / np.sum(weights)).item()
    wae.append(wae_instance)
    print(wae_instance)

print(wae)
print(sum(wae)/len(wae))

14332.542158350181
14759.890814100454
15098.574394454674
15068.262744738791
17240.71097483717
14776.891004124669
14701.698508156118
15127.080413921754
15307.968421067553
15400.910425153415
[14332.542158350181, 14759.890814100454, 15098.574394454674, 15068.262744738791, 17240.71097483717, 14776.891004124669, 14701.698508156118, 15127.080413921754, 15307.968421067553, 15400.910425153415]
15181.452985890477


In [None]:
# with number of rounds = 5000
[4713.686419060166, 4997.887138808549, 4844.425608900387, 4721.690773909452, 6688.060078294849, 4807.609787338735, 4785.34139969684, 5039.192475650916, 4857.694405702477, 4799.428414907879]
5025.501650227025

In [None]:
# with number of rounds = 1000
[6475.931203797537, 6833.356723244547, 6688.537245104582, 6556.591980000012, 9114.843811862369, 6863.636067508511, 6676.668861953455, 6977.805055598435, 6740.46028862626, 6628.768390490926]
6955.659962818662

In [None]:
[7842.773835080328, 8276.67656489298, 8142.196970558118, 7987.359785391791, 10522.152161564818, 8404.686196797189, 8093.2138132779755, 8445.98773733721, 8173.810432185905, 8062.895157242394]
8395.17526543287