In [None]:
# Setup feedback system
import pandas as pd
from learntools.core import binder
binder.bind(globals())
from learntools.time_series.ex3 import *

# Setup notebook
from pathlib import Path
from learntools.time_series.style import *  # plot style settings
from learntools.time_series.utils import plot_periodogram, seasonal_plot

from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import matplotlib as mlp
from math import sqrt
import numpy as np
import seaborn as sns
import tensorflow as tf
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from xgboost import XGBRegressor
from tensorflow import keras
from scipy import stats


from sklearn.linear_model import ElasticNet, Lasso
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# List input data files available in read-only "../input/" directory
import os
# Walk the competition's input directory and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input/store-sales-time-series-forecasting'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
# Training hyperparameters. They are not referenced by the cells visible here;
# presumably consumed by later model cells — TODO confirm.
BATCH_SIZE, WINDOW_SIZE, EPOCHS = 32, 32, 100

# Label bounds used to slice the date index (e.g. `df.loc[START:END]`).
# Both are '2017', so the slice spans that whole year.
START, END = '2017', '2017'

In [None]:
# Directory holding the competition's CSV files
comp_dir = Path('../input/store-sales-time-series-forecasting')

# Raw, untyped copies of the auxiliary tables.
# train.csv is intentionally not read here: the following cell loads it into
# `train_df` with explicit dtypes, so an untyped read at this point would parse
# the file a second time only to be overwritten immediately.
oil_df = pd.read_csv(comp_dir / 'oil.csv')
holidays_events_df = pd.read_csv(comp_dir / 'holidays_events.csv')
stores_df = pd.read_csv(comp_dir / 'stores.csv')
test_df = pd.read_csv(comp_dir / 'test.csv')
transactions_df = pd.read_csv(comp_dir / 'transactions.csv')

In [None]:
# Load only the columns used for modelling, with compact dtypes
train_df = pd.read_csv(
    comp_dir / 'train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
    dtype={'store_nbr': 'category', 'family': 'category', 'sales': 'float32'},
    parse_dates=['date'],
    infer_datetime_format=True,
)

# Convert timestamps to daily periods and add a per-series key of the form
# "S<store_nbr>_<family>"
train_df = train_df.assign(
    date=train_df['date'].dt.to_period('D'),
    item="S" + train_df['store_nbr'].astype(str) + "_" + train_df['family'].astype(str),
)

# Wide table: one row per date, one column per item, cell values are sales
df = train_df.pivot(index='date', columns='item', values='sales')

# Long table, indexed and sorted by date
train_df = train_df.set_index(['date']).sort_index()

In [None]:
# Load the test rows with compact dtypes
df_test = pd.read_csv(
    comp_dir / 'test.csv',
    dtype={'store_nbr': 'category', 'family': 'category', 'onpromotion': 'uint32'},
    parse_dates=['date'],
    infer_datetime_format=True,
)

# Same preparation as the training table: daily periods, a
# "S<store_nbr>_<family>" item key, then index and sort by date
df_test = (
    df_test
    .assign(
        date=lambda t: t['date'].dt.to_period('D'),
        item=lambda t: "S" + t['store_nbr'].astype(str) + "_" + t['family'].astype(str),
    )
    .set_index(['date'])
    .sort_index()
)

In [None]:
# Holiday/event calendar, read with categorical text columns and indexed by
# daily period
holidays_events = (
    pd.read_csv(
        comp_dir / "holidays_events.csv",
        dtype={
            'type': 'category',
            'locale': 'category',
            'locale_name': 'category',
            'description': 'category',
            'transferred': 'bool',
        },
        parse_dates=['date'],
        infer_datetime_format=True,
    )
    .set_index('date')
    .to_period('D')
)

In [None]:
# Dates to add to the sales table (25 December of 2013-2016)
missing_dates = ['2013-12-25', '2014-12-25', '2015-12-25', '2016-12-25']

# Parse the date strings with an explicit format
date_objects = pd.to_datetime(missing_dates, format='%Y-%m-%d')

# Column-less frame whose daily PeriodIndex (named 'date') carries those dates
missing_df = (
    pd.DataFrame({'date': date_objects})
    .assign(date=lambda frame: frame['date'].dt.to_period('D'))
    .set_index(['date'])
    .sort_index()
)

# Add the dates to the main table as all-zero rows.
# Reindexing on the union of both indexes (instead of concatenating) keeps this
# cell idempotent: re-running it cannot append the same dates a second time.
# fillna(0) also zero-fills any other gaps in the table, as before.
df = (
    df.reindex(df.index.union(missing_df.index))
    .fillna(0)
    .sort_index()
    .rename_axis(index='date', columns='item')
)

In [None]:
# National and regional holidays from 2016 through 2017-08-16, keeping only the
# description column and dropping categories that no longer occur
holidays = holidays_events.query("locale in ['National', 'Regional']")
holidays = holidays.loc['2016':'2017-08-16', ['description']]
holidays = holidays.assign(
    description=lambda x: x.description.cat.remove_unused_categories()
)

# One float indicator column per remaining holiday description
holidays = pd.get_dummies(holidays, dtype=float)

# NOTE(review): both slices are label-based, so a row dated 2017-08-01 would
# land in both parts — confirm this overlap is intended.
holidays_train = holidays[:"2017-08-01"]
holidays_test = holidays["2017-08-01":"2017-08-15"]

display(holidays_test)

In [None]:
# Items whose sales are zero on every date in the wide table
zero_sales = df.columns[(df == 0).all()].tolist()

# Flag rows of those items with available = 0, all other rows with 1
for frame in (train_df, df_test):
    frame['available'] = np.where(frame['item'].isin(zero_sales), 0, 1)

In [None]:
# Display the wide sales table (dates as rows, one column per item)
df

In [None]:
# Display the long-format training table, indexed by date
train_df

In [None]:
# Target series: the wide sales table restricted to START..END
y = df.loc[START:END]

from sklearn import preprocessing


# X_1: Features for Linear Regression
# Constant, linear trend, seasonal terms and 4th-order monthly Fourier terms,
# all derived from the target's date index
fourier = CalendarFourier(freq='M', order=4)
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=1,
    seasonal=True,
    additional_terms=[fourier],
    drop=True,
)

X_1 = dp.in_sample()
# Indicator for 1 January
X_1['NewYear'] = X_1.index.dayofyear == 1


# X_2: Features for XGBoost
# Everything in the long table except the target and the store/item keys,
# which leaves the family and promotion columns (plus any other columns
# previously added to train_df)
X_2 = train_df.loc[START:END].drop(columns=['sales', 'store_nbr', 'item'])

# Label encoding for 'family'
le = LabelEncoder()  # from sklearn.preprocessing
X_2['family'] = le.fit_transform(X_2['family'])

# Day of the month as a seasonality feature
X_2["day"] = X_2.index.day
# Holiday indicator columns, matched on date; dates without a holiday get 0.0
X_2 = X_2.join(holidays_train, on='date').fillna(0.0)

# Chronological split: training through 2017-07-01, validation from 2017-07-02
train_end, valid_start = "2017-07-01", "2017-07-02"
y_train, y_valid = y[:train_end], y[valid_start:]
X1_train, X1_valid = X_1[:train_end], X_1[valid_start:]
X2_train, X2_valid = X_2.loc[:train_end], X_2.loc[valid_start:]

In [None]:
# Empty metric tables; the model cells append one row per fitted model to each
val_performance, performance = pd.DataFrame(), pd.DataFrame()

In [None]:
# Ordinary least squares on the deterministic features. X_1 already contains a
# constant column, hence fit_intercept=False.
model_linear = LinearRegression(fit_intercept=False)
history = model_linear.fit(X1_train, y_train)

# In-sample fit and validation forecast, shaped like the target frames
y_fit = pd.DataFrame(model_linear.predict(X1_train), index=X1_train.index, columns=y_train.columns)
y_pred = pd.DataFrame(model_linear.predict(X1_valid), index=X1_valid.index, columns=y_valid.columns)

# Record performance: pass 0 scores the training split, pass 1 the validation split.
# NOTE(review): the training row is appended to `val_performance` and the
# validation row to `performance` — readers of those tables must account for this.
for pass_no, (actual, predicted) in enumerate([(y_train, y_fit), (y_valid, y_pred)]):
    mse = mean_squared_error(actual, predicted)
    res = pd.DataFrame(
        [{
            'MAE': mean_absolute_error(actual, predicted),
            'MSE': mse,
            'RMSE': sqrt(mse),
            'R2': r2_score(actual, predicted),
        }],
        index=['Classic_Linear'],
    )
    print(res)
    if pass_no == 0:
        val_performance = pd.concat([val_performance, res])
    else:
        performance = pd.concat([performance, res])

In [None]:
# Learning curve for the linear model: 5-fold cross-validation scored with
# negated MSE.
# NOTE(review): train_sizes spans only 1%-10% of each fold's training rows —
# confirm this is intended rather than 10%-100%.
# NOTE(review): an integer cv does not respect time order; consider a
# time-series-aware splitter — TODO confirm.
train_sizes, train_scores, val_scores = learning_curve(
    model_linear, X1_train, y_train, train_sizes=np.linspace(0.1,0.01,10)[::-1], cv=5, scoring='neg_mean_squared_error')

# The scorer returns negated MSE; flip the sign so the curves show plain MSE.
# The standard deviation is unaffected by the sign.
train_scores_mean = -np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
val_scores_mean = -np.mean(val_scores, axis=1)
val_scores_std = np.std(val_scores, axis=1)

# Plot the learning curve: mean across folds with a +/- one-std band
plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training Score')
plt.plot(train_sizes, val_scores_mean, 'o-', color='g', label='Validation Score')
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2, color='r')
plt.fill_between(train_sizes, val_scores_mean - val_scores_std,
                 val_scores_mean + val_scores_std, alpha=0.2, color='g')
plt.xlabel('Training Set Size')
# The plotted values are the sign-flipped scores, i.e. positive MSE
plt.ylabel('Mean Squared Error')
plt.title('Learning Curve for Linear Regression')
plt.legend(loc='best')
plt.grid(True)
plt.show()

In [None]:
# Six example series (columns 100 to 105 of the target frame)
items = y.columns[100:106]

# Actual values, one subplot per item
axs = y.loc[:, items].plot(
    subplots=True, sharex=True, figsize=(11, 9), **plot_params, alpha=0.5)

# Overlay the in-sample fit (C0) and the validation forecast (C3) on the same axes
for frame, color in ((y_fit, 'C0'), (y_pred, 'C3')):
    _ = frame.loc[:, items].plot(subplots=True, sharex=True, color=color, ax=axs)

# Replace each legend with an empty one and label the y-axis with the item name
for ax, item in zip(axs, items):
    ax.legend([])
    ax.set_ylabel(item)

In [None]:
# Lasso regression on the same deterministic features. X_1 already contains a
# constant column, hence fit_intercept=False.
model_lasso = Lasso(fit_intercept=False)
history = model_lasso.fit(X1_train, y_train)

# In-sample fit and validation forecast, shaped like the target frames
y_fit = pd.DataFrame(model_lasso.predict(X1_train), index=X1_train.index, columns=y_train.columns)
y_pred = pd.DataFrame(model_lasso.predict(X1_valid), index=X1_valid.index, columns=y_valid.columns)

# Record performance: pass 0 scores the training split, pass 1 the validation split.
# NOTE(review): the training row is appended to `val_performance` and the
# validation row to `performance` — readers of those tables must account for this.
for pass_no, (actual, predicted) in enumerate([(y_train, y_fit), (y_valid, y_pred)]):
    mse = mean_squared_error(actual, predicted)
    res = pd.DataFrame(
        [{
            'MAE': mean_absolute_error(actual, predicted),
            'MSE': mse,
            'RMSE': sqrt(mse),
            'R2': r2_score(actual, predicted),
        }],
        index=['Classic_Lasso'],
    )
    print(res)
    if pass_no == 0:
        val_performance = pd.concat([val_performance, res])
    else:
        performance = pd.concat([performance, res])

In [None]:
# Learning curve for the Lasso model: 5-fold cross-validation scored with
# negated MSE.
# NOTE(review): train_sizes spans only 1%-10% of each fold's training rows —
# confirm this is intended rather than 10%-100%.
# NOTE(review): an integer cv does not respect time order; consider a
# time-series-aware splitter — TODO confirm.
train_sizes, train_scores, val_scores = learning_curve(
    model_lasso, X1_train, y_train, train_sizes=np.linspace(0.1,0.01,10)[::-1], cv=5, scoring='neg_mean_squared_error')

# The scorer returns negated MSE; flip the sign so the curves show plain MSE.
# The standard deviation is unaffected by the sign.
train_scores_mean = -np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
val_scores_mean = -np.mean(val_scores, axis=1)
val_scores_std = np.std(val_scores, axis=1)

# Plot the learning curve: mean across folds with a +/- one-std band
plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training Score')
plt.plot(train_sizes, val_scores_mean, 'o-', color='g', label='Validation Score')
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2, color='r')
plt.fill_between(train_sizes, val_scores_mean - val_scores_std,
                 val_scores_mean + val_scores_std, alpha=0.2, color='g')
plt.xlabel('Training Set Size')
# The plotted values are the sign-flipped scores, i.e. positive MSE
plt.ylabel('Mean Squared Error')
plt.title('Learning Curve for Lasso Regression')
plt.legend(loc='best')
plt.grid(True)
plt.show()

In [None]:
# Grouped bar chart of MAE per model.
# The metric-recording loops store the training-split row in `val_performance`
# and the validation-split row in `performance`, so the bars are labelled by
# what each table actually contains. No test-set score is computed.
x = np.arange(len(performance))
width = 0.3
metric_name = 'mean_absolute_error'
train_values = val_performance['MAE'].tolist()
valid_values = performance['MAE'].tolist()

plt.ylabel(metric_name)
plt.bar(x - 0.17, train_values, width, label='Training')
plt.bar(x + 0.17, valid_values, width, label='Validation')
# Tick labels come from the recorded model names rather than a hard-coded list
plt.xticks(ticks=x, labels=performance.index.tolist(),
           rotation=0)
_ = plt.legend()

In [None]:
# Grouped bar chart of MSE per model.
# The metric-recording loops store the training-split row in `val_performance`
# and the validation-split row in `performance`, so the bars are labelled by
# what each table actually contains. No test-set score is computed.
x = np.arange(len(performance))
width = 0.3
metric_name = 'mean_squared_error'
train_values = val_performance['MSE'].tolist()
valid_values = performance['MSE'].tolist()

plt.ylabel(metric_name)
plt.bar(x - 0.17, train_values, width, label='Training')
plt.bar(x + 0.17, valid_values, width, label='Validation')
# Tick labels come from the recorded model names rather than a hard-coded list
plt.xticks(ticks=x, labels=performance.index.tolist(),
           rotation=0)
_ = plt.legend()

In [None]:
# Grouped bar chart of RMSE per model.
# The metric-recording loops store the training-split row in `val_performance`
# and the validation-split row in `performance`, so the bars are labelled by
# what each table actually contains. No test-set score is computed.
x = np.arange(len(performance))
width = 0.3
metric_name = 'root_mean_squared_error'
train_values = val_performance['RMSE'].tolist()
valid_values = performance['RMSE'].tolist()

plt.ylabel(metric_name)
plt.bar(x - 0.17, train_values, width, label='Training')
plt.bar(x + 0.17, valid_values, width, label='Validation')
# Tick labels come from the recorded model names rather than a hard-coded list
plt.xticks(ticks=x, labels=performance.index.tolist(),
           rotation=0)
_ = plt.legend()

In [None]:
# The metric-recording loops store the rows scored on the training split in
# `val_performance` and the rows scored on the validation split in
# `performance`, so each heading is paired with the table that actually holds
# those metrics.
print("Training Performance")
print(val_performance.head())

print("\n\nValidation Performance")
print(performance.head())

In [None]:
# Create features for test set: the 16 periods following the end of the
# deterministic process's index, with the index named 'date'
X1_test = dp.out_of_sample(steps=16).rename_axis('date')
# Same 1 January indicator as in the training features
X1_test['NewYear'] = X1_test.index.dayofyear == 1

In [None]:
# Build the submission file from the plain linear model's forecasts.
# NOTE(review): predictions are written as-is; a linear model can produce
# negative values — confirm whether they should be clipped at zero.

# Forecast every item for the test horizon: rows are dates, columns are items
y_submit = pd.DataFrame(model_linear.predict(X1_test), index=X1_test.index, columns=y.columns)
# Wide -> long: one row per (date, item) pair holding the predicted value
y_submit = pd.DataFrame(y_submit.stack(['item']))
# Keep only predictions whose date and item also occur in the test table
y_submit = pd.merge(y_submit, df_test, how='inner', on=['date','item']).reset_index()
# Drop everything except the prediction and the test row id
y_submit.drop(['date', 'item', 'store_nbr', 'family', 'onpromotion','available'], axis=1, inplace=True)
# Positional rename — assumes the prediction column precedes `id` after the merge; TODO confirm
y_submit.columns = ['sales', 'id']
y_submit.to_csv('submission.csv', index=False)

In [None]:
# Hyperparameters for the gradient-boosted tree model
xgb_params = dict(
    max_depth=10,            # maximum depth of each tree - try 2 to 10
    learning_rate=0.01,      # effect of each tree - try 0.0001 to 0.1
    n_estimators=1000,       # number of trees (that is, boosting rounds) - try 1000 to 8000
    min_child_weight=1,      # minimum sum of instance weights required in a leaf - try 1 to 10
    colsample_bytree=0.7,    # fraction of features (columns) per tree - try 0.2 to 1.0
    subsample=0.7,           # fraction of instances (rows) per tree - try 0.2 to 1.0
    reg_alpha=0.5,           # L1 regularization (like LASSO) - try 0.0 to 10.0
    reg_lambda=1.0,          # L2 regularization (like Ridge) - try 0.0 to 10.0
    num_parallel_tree=1,
)

# Hyperparameters for a default-sized MLP regressor
mlp_params = dict(
    alpha=0.0001,            # regularization strength
    early_stopping=True,
)

# Hyperparameters for a wider MLP regressor: 512 hidden units, tanh activation
# and stronger regularization
mlp_deep_params = dict(
    hidden_layer_sizes=512,
    activation='tanh',
    early_stopping=True,
    alpha=0.01,
)

In [None]:
# Shape of the latest in-sample fit (in the visible cells, last assigned by the Lasso cell)
y_fit.shape

In [None]:
# Shape of the latest validation forecast (in the visible cells, last assigned by the Lasso cell)
y_pred.shape