## Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn')
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')
import time
import datetime
import xgboost as xgb
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
# data_path = os.path.join(os.getcwd(), 'F21_proj2_data')
train_ini = pd.read_csv('train_ini.csv')

In [3]:
train_ini.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2/5/2010,24924.5,False
1,1,1,2/12/2010,46039.49,True


## feature preprocessing

In [116]:
def feature_preprocessing_xtrain(df):
    df['Date']= pd.to_datetime(df['Date'])
    df['Date'] = pd.to_datetime(df['Date'], format='%m%d%y')
    df['week'] = df['Date'].dt.week
    df['year'] = df['Date'].dt.year
    lbl = preprocessing.LabelEncoder()
    df['IsHoliday'] = lbl.fit_transform(df['IsHoliday'].astype(str))
    
    df.sort_values(['Store','Dept','Date'], ignore_index=True, ascending=True, inplace=True)
    # Creating a column which has the previous week sales as a separate column
    df['Last_Week_Sales'] = df.groupby(['Store','Dept'])['Weekly_Sales'].shift(1)

    col = 'Weekly_Sales'
    for order in [2,3,4]:
        df[f'{col}_D{order}'] = df.groupby(['Store','Dept'])[col].diff(periods=order)
    
#     df = df.dropna(subset = list(df.columns))
    df = df.fillna(0)
    
    return df

In [71]:
train_ini.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,week,year,Last_Week_Sales,Weekly_Sales_D2,Weekly_Sales_D3,Weekly_Sales_D4
0,1,1,2010-02-05,24924.5,0,5,2010,,,,
1,1,1,2010-02-12,46039.49,1,6,2010,24924.5,,,


In [72]:
X = feature_preprocessing_xtrain(train_ini)
X.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,week,year,Last_Week_Sales,Weekly_Sales_D2,Weekly_Sales_D3,Weekly_Sales_D4
0,1,1,2010-02-05,24924.5,0,5,2010,,,,
1,1,1,2010-02-12,46039.49,1,6,2010,24924.5,,,


In [73]:
len(train_ini)

164115

In [74]:
len(X)

164115

In [75]:
X.describe()

Unnamed: 0,Store,Dept,Weekly_Sales,IsHoliday,week,year,Last_Week_Sales,Weekly_Sales_D2,Weekly_Sales_D3,Weekly_Sales_D4
count,164115.0,164115.0,164115.0,164115.0,164115.0,164115.0,160909.0,157753.0,154629.0,151524.0
mean,22.151406,44.131889,16079.708833,0.089644,25.061688,2010.142802,16125.664326,-33.61821,-40.524727,-27.245359
std,12.775748,30.388825,22885.720761,0.285673,15.376328,0.349872,22921.207355,9312.592141,9861.778903,9691.719676
min,1.0,1.0,-4988.94,0.0,1.0,2010.0,-4988.94,-527691.73,-494193.99,-398479.16
25%,11.0,18.0,2176.0,0.0,10.0,2010.0,2197.67,-777.58,-858.59,-875.1425
50%,22.0,37.0,7800.92,0.0,24.0,2010.0,7836.99,2.79,3.6,2.345
75%,33.0,72.0,20271.11,0.0,39.0,2010.0,20333.1,864.35,939.57,952.5575
max,45.0,99.0,693099.36,1.0,52.0,2011.0,693099.36,552453.05,572012.33,595772.54


In [76]:
X

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,week,year,Last_Week_Sales,Weekly_Sales_D2,Weekly_Sales_D3,Weekly_Sales_D4
0,1,1,2010-02-05,24924.50,0,5,2010,,,,
1,1,1,2010-02-12,46039.49,1,6,2010,24924.50,,,
2,1,1,2010-02-19,41595.55,0,7,2010,46039.49,16671.05,,
3,1,1,2010-02-26,19403.54,0,8,2010,41595.55,-26635.95,-5520.96,
4,1,1,2010-03-05,21827.90,0,9,2010,19403.54,-19767.65,-24211.59,-3096.60
...,...,...,...,...,...,...,...,...,...,...,...
164110,45,98,2011-01-28,77.00,0,4,2011,2.00,35.00,-45.50,2.45
164111,45,98,2011-02-04,57.25,0,5,2011,77.00,55.25,15.25,-65.25
164112,45,98,2011-02-11,388.80,1,6,2011,57.25,311.80,386.80,346.80
164113,45,98,2011-02-18,284.50,0,7,2011,388.80,227.25,207.50,282.50


## Naive Model Predicting Mean Sales of a particular weeks

In [77]:
train = pd.read_csv('train_ini.csv', parse_dates=['Date'])
test = pd.read_csv('test.csv', parse_dates=['Date'])

In [78]:
def mypredict(train, test, next_fold, t):
    
    if not t==1:
        train = train.append(next_fold)
        X = feature_preprocessing_xtrain(train)
    else:
        X = feature_preprocessing_xtrain(train)
       
    X1 = feature_preprocessing_xtrain(test)
    pred_model = X.groupby(by='week').mean()['Weekly_Sales']
    prediction = X1.merge(pred_model, how='inner', left_on='week', right_on=pred_model.index)
    Y1_Pred = prediction.Weekly_Sales.values
    test['Weekly_Pred'] = Y1_Pred
    
    return train, test

## XG boost model

In [None]:
def mypredict(train, test, next_fold, t):
    
    if not t==1:
        train = train.append(next_fold)
        trn = feature_preprocessing_xtrain(train)
    else:
        trn = feature_preprocessing_xtrain(train)
    
    try: test = test.drop(columns=['Weekly_Pred'])
    except: pass
            
    trn = trn.drop(columns=['Date', 'IsHoliday'])
    X_train, X_test, y_train, y_test = train_test_split(trn.loc[:, trn.columns != 'Weekly_Sales'],
                                                        trn.Weekly_Sales,
                                                        test_size=0.20, random_state=42)
    
    # dtrain = xgb.DMatrix(X_train, label=y_train)
    dtrain = xgb.DMatrix(trn.loc[:, trn.columns != 'Weekly_Sales'], label=trn.Weekly_Sales)
    dtest = xgb.DMatrix(X_test, label=y_test)
    param = {'max_depth': 4, 'eta': 0.01, 'min_child_weight': 6, 'subsample': 0.5, 'reg_alpha':100,
         'objective': 'reg:squarederror', 'min_child_weight':8}

    param['nthread'] = 6
    param['eval_metric'] = 'rmse'

    num_round = 500
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(param, dtrain, num_round, evallist, verbose_eval=False)
    
    test_fold_file = f'fold_{t}.csv'
    test_fold = pd.read_csv(test_fold_file, parse_dates=['Date'])
    testXgb_1 = feature_preprocessing_xtrain(test_fold.copy())
    testXgb = testXgb_1.drop(columns=['Date','IsHoliday', 'Weekly_Sales'])
    dtest = xgb.DMatrix(testXgb)
    ypred = bst.predict(dtest)
    ypred = np.array(ypred).flatten()
    
#     print(len(ypred), len(test_fold))
#     test_pred = test.loc[test.Date.isin(test_fold.Date),:]
#     test_pred = test_pred.reset_index(drop=True)
    # print(len(ypred), len(test_pred), len(testXgb))
#     test_pred = testXgb_1[['Store', 'Dept', 'Date', 'IsHoliday']]
#     test_pred['Weekly_Pred'] = ypred
    test_fold_file = f'fold_{t}.csv'
    test_fold = pd.read_csv(test_fold_file, parse_dates=['Date'])
    test_fold['Weekly_Pred'] = ypred
    print(test_fold.head())
    
    return train, test_fold

In [22]:
trn = feature_preprocessing_xtrain(train)
trn = trn.drop(columns=['Date'])
X_train, X_test, y_train, y_test = train_test_split(trn.loc[:, ~trn.columns.str.contains('Weekly_Sales')],
                                                        trn.Weekly_Sales,
                                                        test_size=0.20, random_state=42)

In [23]:
X_train

Unnamed: 0,Store,Dept,IsHoliday,week,year
25738,7,59,0,8,2011
31486,9,13,0,49,2010
3194,1,79,0,7,2011
52177,14,41,0,7,2011
135626,37,8,0,51,2010
...,...,...,...,...,...
119879,32,25,0,19,2010
103694,27,90,1,6,2010
131932,35,72,0,44,2010
146867,40,46,0,49,2010


In [24]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [25]:
param = {'max_depth': 4, 'eta': 0.01, 'min_child_weight': 6, 'subsample': 0.5, 'reg_alpha':100,
         'objective': 'reg:squarederror', 'min_child_weight':8}

param['nthread'] = 6
param['eval_metric'] = 'rmsle'

num_round = 500
evallist = [(dtest, 'eval'), (dtrain, 'train')]
# final model with best parameters
bst = xgb.train(param, dtrain, num_round, evallist, verbose_eval=False)

try: test = test.drop(columns=['Weekly_Pred'])
except: pass

test = feature_preprocessing_xtrain(test)
test = test.drop(columns=['Date'])

dtest = xgb.DMatrix(test)
ypred = bst.predict(dtest)
ypred = np.array(ypred).flatten()

## Linear Model

In [107]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn import metrics, linear_model

def mypredict(train, test, next_fold, t):
    if not t==1:
        train = train.append(next_fold)
        trn = feature_preprocessing_xtrain(train)
    else:
        trn = feature_preprocessing_xtrain(train)
    
    try: test = test.drop(columns=['Weekly_Pred'])
    except: pass
            
    trn = trn.drop(columns=['Date', 'IsHoliday'])
    
    X_train = trn.loc[:, trn.columns != 'Weekly_Sales']
    y_train = trn.Weekly_Sales
    ols = LinearRegression()
    ols.fit(X_train, y_train)
    
    test_fold_file = f'fold_{t}.csv'
    test_fold = pd.read_csv(test_fold_file, parse_dates=['Date'])
    Xtest = feature_preprocessing_xtrain(test_fold)
    Xtest = Xtest.drop(columns=['Date', 'IsHoliday', 'Weekly_Sales'])
    ypred = ols.predict(Xtest)
    
    test_pred = test.loc[test.Date.isin(test_fold.Date),:]
    test_pred = test_pred.reset_index(drop=True)
    test_pred['Weekly_Pred'] = ypred
    
    return train, test_pred

In [27]:
trn = feature_preprocessing_xtrain(train)
trn = trn.drop(columns=['Date'])
X_train, X_test, y_train, y_test = train_test_split(trn.loc[:, ~trn.columns.str.contains('Weekly_Sales')],
                                                        trn.Weekly_Sales,
                                                        test_size=0.20, random_state=42)

In [28]:
ols = LinearRegression()
ols.fit(X_train, y_train.values)
ols.predict(X_test)

array([11634.64231523, 15220.45038092, 15781.88675337, ...,
       21043.78815978, 18735.49344503, 19664.26836725])

In [45]:
X_train

Unnamed: 0,Store,Dept,IsHoliday,week,year
25738,7,59,0,8,2011
31486,9,13,0,49,2010
3194,1,79,0,7,2011
52177,14,41,0,7,2011
135626,37,8,0,51,2010
...,...,...,...,...,...
119879,32,25,0,19,2010
103694,27,90,1,6,2010
131932,35,72,0,44,2010
146867,40,46,0,49,2010


## Evaluation Function

In [131]:
### import numpy as np
import pandas as pd

# from mymain import mypredict

train = pd.read_csv('train_ini.csv', parse_dates=['Date'])
test = pd.read_csv('test.csv', parse_dates=['Date'])

# save weighed mean absolute error WMAE
n_folds = 10
next_fold = None
wae = []

# time-series prediction 
for t in range(1, n_folds+1):
    # print(f'Fold{t}...')
    # *** THIS IS YOUR PREDICTION FUNCTION ***
    train, test_pred = mypredict(train, test, next_fold, t)
    # Load fold file
    # You should add this to your training data in the next call to mypredict()
    fold_file = 'fold_{t}.csv'.format(t=t)
    next_fold = pd.read_csv(fold_file, parse_dates=['Date'])

    # extract predictions matching up to the current fold
    scoring_df = next_fold.merge(test_pred, on=['Date', 'Store', 'Dept'], how='left')
    print(scoring_df.head())
    # extract weights and convert to numpy arrays for wae calculation
    weights = scoring_df['IsHoliday_x'].apply(lambda is_holiday:5 if is_holiday else 1).to_numpy()
    actuals = scoring_df['Weekly_Sales'].to_numpy()
    preds = scoring_df['Weekly_Pred'].fillna(0).to_numpy()
    
    wae_instance = (np.sum(weights * np.abs(actuals - preds)) / np.sum(weights)).item()
    wae.append(wae_instance)
    print(wae_instance)

print(wae)
print(sum(wae)/len(wae))

Index(['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday', 'Weekly_Pred'], dtype='object')
   Store  Dept       Date  Weekly_Sales_x  IsHoliday_x  Weekly_Sales_y  \
0      1     1 2011-03-04        20327.61        False        20327.61   
1      1     1 2011-03-11        21280.40        False        21280.40   
2      1     1 2011-03-18        20334.23        False        20334.23   
3      1     1 2011-03-25        20881.10        False        20881.10   
4      1     1 2011-04-01        20398.09        False        20398.09   

   IsHoliday_y   Weekly_Pred  
0        False  15553.936523  
1        False  19289.484375  
2        False  20205.025391  
3        False  19134.136719  
4        False  19822.677734  


KeyError: 'Weekly_Sales'

In [None]:
# with nrounds = 500
2827.697977069535
2968.303541860396
2798.6941481225213
2720.074218652663
3545.9587414860875
2493.448357754363
3017.5936183542276
2851.880174822556
2881.765244208638
6841.875936641704
[2827.697977069535, 2968.303541860396, 2798.6941481225213, 2720.074218652663, 3545.9587414860875, 2493.448357754363, 3017.5936183542276, 2851.880174822556, 2881.765244208638, 6841.875936641704]
3294.7291958972687

In [14]:
# with number of rounds = 5000
[4713.686419060166, 4997.887138808549, 4844.425608900387, 4721.690773909452, 6688.060078294849, 4807.609787338735, 4785.34139969684, 5039.192475650916, 4857.694405702477, 4799.428414907879]
5025.501650227025

5025.501650227025

In [None]:
# with number of rounds = 1000
[6475.931203797537, 6833.356723244547, 6688.537245104582, 6556.591980000012, 9114.843811862369, 6863.636067508511, 6676.668861953455, 6977.805055598435, 6740.46028862626, 6628.768390490926]
6955.659962818662

In [None]:
[7842.773835080328, 8276.67656489298, 8142.196970558118, 7987.359785391791, 10522.152161564818, 8404.686196797189, 8093.2138132779755, 8445.98773733721, 8173.810432185905, 8062.895157242394]
8395.17526543287