In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and echo the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Root folder of the competition data. Kept WITHOUT a trailing slash: the
# loaders that use it append '/<file>.csv', so a trailing '/' here produced
# '...forecasting//train.csv'-style paths.
work_dir = '/kaggle/input/store-sales-time-series-forecasting'

In [None]:
# Setup notebook
from pathlib import Path
from learntools.time_series.style import *  # plot style settings
from learntools.time_series.utils import plot_periodogram, seasonal_plot
from learntools.time_series.utils import plot_lags, make_lags, make_leads

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from statsmodels.graphics.tsaplots import plot_pacf
from xgboost import XGBRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import RobustScaler, StandardScaler, Normalizer, MinMaxScaler
from sklearn.model_selection import train_test_split

import catboost as cb
from catboost import CatBoostRegressor
import lightgbm as lgb

# **EDA**

In [None]:
# Load the training table and convert `date` from string to datetime.
train_data = (
    pd.read_csv(work_dir + '/train.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [None]:
train_data.head()

In [None]:
# Load the holiday/event calendar and convert `date` to datetime so it can be
# joined on `date` later in `merge_data`.
holidays_data = (
    pd.read_csv(work_dir + '/holidays_events.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
holidays_data.head()

In [None]:
# Load the transactions table and convert `date` to datetime
# (joined on `date` + `store_nbr` in `merge_data`).
transactions_data = (
    pd.read_csv(work_dir + '/transactions.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
transactions_data.head()

In [None]:
train_data.isna().sum()

In [None]:
# Put the target on a log1p scale; later cells undo this with np.expm1 when
# scoring and when writing the submission.
# NOTE: re-running this cell applies the transform a second time.
train_data = train_data.assign(sales=np.log1p(train_data['sales']))

In [None]:
# Store metadata; joined onto the sales rows by `store_nbr` in `merge_data`.
stores_path = work_dir + '/stores.csv'
stores_data = pd.read_csv(stores_path)
stores_data.head()

In [None]:
stores_data.shape

**Sales variation per store**

In [None]:
# `train_data['sales']` was log1p-transformed in an earlier cell. Convert it
# back to unit sales before aggregating: a sum of log values is not
# "sales per month", which is what the plot title promises.
sales_per_store = (
    train_data
    .assign(sales=lambda frame: np.expm1(frame['sales']))
    .groupby(['date', 'store_nbr'])
    .aggregate({'sales': 'sum'})
    .unstack()          # one column per store
    .resample('1M')     # daily totals -> monthly totals
    .sum()
)
ax = sales_per_store.plot(figsize=(21, 5))
ax.legend(bbox_to_anchor=(0.9, -.2), ncol=9)
ax.set_title("Sales per month for each store");

**Sales variation with product family**

In [None]:
# `train_data['sales']` is on a log1p scale at this point (transformed in an
# earlier cell); undo that before summing so the chart really shows monthly
# sales per product family rather than a sum of logarithms.
family_sales_per_month = (
    train_data
    .assign(sales=lambda frame: np.expm1(frame['sales']))
    .groupby(['date', 'family'])
    .aggregate({'sales': 'sum'})
    .unstack()          # one column per product family
    .resample('1M')     # daily totals -> monthly totals
    .sum()
)
ax = family_sales_per_month.plot(figsize=(21, 5))
ax.legend(bbox_to_anchor=(1, -.2), ncol=6)
ax.set_title("Sales per month for product family");

**Oil prices**

In [None]:
# Daily oil price series (column `dcoilwtico`), joined by `date` in `merge_data`.
oil_path = work_dir + '/oil.csv'
oil_data = pd.read_csv(oil_path)
oil_data.head()

In [None]:
oil_data.isna().sum()

In [None]:
# Carry the last known value forward over gaps. `DataFrame.ffill()` replaces the
# deprecated `fillna(method='ffill', inplace=True)` and keeps the cell free of
# in-place mutation. A gap at the very start of the series is not filled here;
# `fill_na` later back-fills `dcoilwtico` after the merge.
oil_data = oil_data.ffill()
oil_data['date'] = pd.to_datetime(oil_data['date'])

In [None]:
# Line chart of the (forward-filled) oil price over time.
ax = sns.lineplot(data=oil_data, x='date', y='dcoilwtico')
ax.set_title('Oil price');

**Checking Seasonal trends**

In [None]:
# Re-read train.csv with compact dtypes for the seasonality study. This is a
# fresh read, so `sales` here is NOT affected by the log1p transform applied
# to `train_data`.
# `infer_datetime_format=True` was dropped: it is deprecated in pandas 2.x
# and `parse_dates` alone parses the column.
store_sales = pd.read_csv(
    work_dir + '/train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
    },
    parse_dates=['date'],
)
# Daily periods make partial-string selection such as .loc['2017'] possible
store_sales['date'] = store_sales.date.dt.to_period('D')
store_sales = store_sales.set_index(['store_nbr', 'family', 'date']).sort_index()
store_sales.head()

In [None]:
# Mean sales over all (store, family) series for each day ...
daily_mean = store_sales.groupby('date').mean()
# ... squeezed from a one-column frame to a Series and restricted to 2017.
average_sales = daily_mean.squeeze().loc['2017']
average_sales.head()

In [None]:
# Tag each day with its week number and weekday, then draw one line per week
# across the days of the week.
X = average_sales.to_frame().assign(
    week=lambda frame: frame.index.week,
    day=lambda frame: frame.index.dayofweek,
)
seasonal_plot(X, y='sales', period='week', freq='day');

**Trying out a simple model with seasonality**

In [None]:
# Target series for the simple trend + seasonality model
y = average_sales.copy()

# Monthly calendar Fourier terms of order 4
fourier = CalendarFourier(freq="M", order=4)

dp = DeterministicProcess(
    index=y.index,
    constant=True,               # intercept column
    order=1,                     # linear time trend
    seasonal=True,               # seasonal dummies (period taken from the index frequency)
    additional_terms=[fourier],  # plus the Fourier terms defined above
    drop=True,                   # drop perfectly collinear columns
)

# Design matrix covering the same dates as `y`
X = dp.in_sample()

In [None]:
# Fit trend + seasonal terms and compute the in-sample fit.
# (The fitted series used to be built twice; the first copy was immediately
# overwritten, so only the surviving, unnamed version is kept.)
model = LinearRegression().fit(X, y)
y_pred = pd.Series(model.predict(X), index=X.index)

# Plot observed averages against the fitted seasonal curve
ax = y.plot(**plot_params, alpha=0.5, title="Average Sales", ylabel="items sold")
ax = y_pred.plot(ax=ax, label="Seasonal")
ax.legend();

**Comparing deseasonalized plot with original to check if all seasonality is accounted for**

In [None]:
# Residual left after removing the fitted trend + seasonal component
y_deseason = y.sub(y_pred)

# Periodogram before (top) and after (bottom) deseasonalizing, on shared axes
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, sharey=True, figsize=(10, 7))
ax1 = plot_periodogram(y, ax=ax1)
ax2 = plot_periodogram(y_deseason, ax=ax2)
ax1.set_title("Product Sales Frequency Components")
ax2.set_title("Deseasonalized");

**Sales variation with holidays**

In [None]:
# Holiday calendar with categorical dtypes for the EDA below.
# The path is built from `work_dir` like every other loader (it was hard-coded),
# and the deprecated `infer_datetime_format=True` argument is dropped because
# `parse_dates` alone parses the column.
holidays_events = pd.read_csv(
    work_dir + '/holidays_events.csv',
    dtype={
        'type': 'category',
        'locale': 'category',
        'locale_name': 'category',
        'description': 'category',
        'transferred': 'bool',
    },
    parse_dates=['date'],
)
# Daily PeriodIndex so the calendar lines up with the index of `y_deseason`
holidays_events = holidays_events.set_index('date').to_period('D')

# National and regional holidays in the 2017 training window, description only
holidays = (
    holidays_events
    .query("locale in ['National', 'Regional']")
    .loc['2017':'2017-08-15', ['description']]
    .assign(description=lambda x: x.description.cat.remove_unused_categories())
)

display(holidays)

In [None]:
# Deseasonalized average sales for 2017 ...
ax = y_deseason.plot(**plot_params)
# ... with one marker per national/regional holiday date, placed at that
# day's deseasonalized value.
plt.plot_date(holidays.index, y_deseason[holidays.index], color='C3')
ax.set_title('National and Regional Holidays');

We can thus infer that sales show weekly, biweekly, and monthly seasonality across all stores, and that they also depend on oil prices and holidays.

# **Functions for feature generation**

In [None]:
# Function for adding lag features
def lag_features(df, lags):
    """Add lagged copies of `sales` for every (store_nbr, family) series.

    For each `lag` a column named ``sales_t-<lag>`` is added, holding the
    value of `sales` `lag` rows earlier within the same store/family group
    (NaN where no earlier row exists).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `store_nbr`, `family` and `sales`. Rows are assumed to be
        in chronological order within each group. The frame is modified in place.
    lags : iterable of int
        Row offsets to shift by.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the new columns appended.
    """
    # Build the grouping once and use the vectorized group-wise shift instead of
    # calling a Python lambda per group through `transform` for every lag.
    sales_by_series = df.groupby(["store_nbr", "family"])["sales"]
    for lag in lags:
        df[f"sales_t-{lag}"] = sales_by_series.shift(lag)
    return df

In [None]:
# Function for rolling average features
def roll_mean_features(df, windows):
    """Add noisy, shifted rolling-mean features of `sales` per (store_nbr, family).

    For each `window` a column ``sales_roll_mean_<window>`` is added. Within each
    store/family group the sales are shifted by 16 rows (presumably to match the
    16-day forecast horizon -- confirm), averaged over a triangular-weighted
    rolling window (at least 7 observations required), and then perturbed by
    ``random_noise(df)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `store_nbr`, `family` and `sales`; modified in place.
    windows : iterable of int
        Rolling window lengths.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the new columns added.
    """
    series_keys = ["store_nbr", "family"]
    for window in windows:
        smoothed = df.groupby(series_keys)['sales'].transform(
            lambda sales: sales.shift(16)
                               .rolling(window=window, min_periods=7, win_type="triang")
                               .mean()
        )
        # A fresh noise vector is drawn for every window
        df[f'sales_roll_mean_{window}'] = smoothed + random_noise(df)
    return df

In [None]:
# Random Noise
def random_noise(df, scale=2.0, seed=None):
    """Return one zero-mean Gaussian noise value per row of `df`.

    Parameters
    ----------
    df : sized
        Only ``len(df)`` is used.
    scale : float, default 2.0
        Standard deviation of the noise (the previously hard-coded value).
    seed : int or None, default None
        When None the draw comes from NumPy's global random state, exactly as
        before. When given, a dedicated generator seeded with this value is
        used so the result is reproducible.

    Returns
    -------
    numpy.ndarray
        1-D float array of shape ``(len(df),)``.
    """
    shape = (len(df),)
    if seed is None:
        return np.random.normal(scale=scale, size=shape)
    return np.random.default_rng(seed).normal(scale=scale, size=shape)


In [None]:
# Function for ewm features
def ewm_features(df, alphas, lags):
    """Add exponentially weighted mean features of lagged `sales`.

    For every (alpha, lag) pair a column named
    ``sales_ewm_alpha_<alpha without the dot>_lag_<lag>`` is added. Within each
    (store_nbr, family) group the sales are shifted by `lag` rows and smoothed
    with ``ewm(alpha=alpha).mean()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `store_nbr`, `family` and `sales`; modified in place.
    alphas : iterable of float
        Smoothing factors.
    lags : iterable of int
        Row offsets applied before smoothing.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the new columns added.
    """
    series_keys = ["store_nbr", "family"]
    for alpha in alphas:
        alpha_tag = str(alpha).replace(".", "")   # e.g. 0.95 -> "095"
        for lag in lags:
            column = f"sales_ewm_alpha_{alpha_tag}_lag_{lag}"
            df[column] = df.groupby(series_keys)['sales'].transform(
                lambda sales: sales.shift(lag).ewm(alpha=alpha).mean()
            )
    return df

In [None]:
# Calendar features
def create_date_features(df):
    """Add calendar columns derived from the datetime column `date`.

    Adds `month`, `day_of_month`, `day_of_year`, `week_of_year` (ISO week
    number), `day_of_week` (Monday = 0) and `year`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime64 column named `date` without missing values.
        The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the new columns added.
    """
    dates = df['date'].dt
    df['month'] = dates.month
    df['day_of_month'] = dates.day
    df['day_of_year'] = dates.dayofyear
    # `.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week` gives the
    # same ISO week number. Cast from its nullable UInt32 to a plain integer.
    df['week_of_year'] = dates.isocalendar().week.astype('int64')
    df['day_of_week'] = dates.dayofweek
    df['year'] = dates.year
    return df

In [None]:
# Function to fill missing values
def fill_na(df):
    """Fill the gaps left by the left-joins on the holiday and oil tables.

    * `holiday_type`, `locale` -> 'Common'
    * `description`            -> 'Unknown'
    * `transferred`            -> False
    * `dcoilwtico`             -> next available value (back-fill); trailing
      gaps with no later value stay NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the five columns above; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object.
    """
    constant_fills = {
        'holiday_type': 'Common',
        'locale': 'Common',
        'description': 'Unknown',
        'transferred': False,
    }
    for column, fill_value in constant_fills.items():
        df[column] = df[column].fillna(fill_value)
    # `Series.bfill()` replaces the deprecated `fillna(method='backfill')`
    df['dcoilwtico'] = df['dcoilwtico'].bfill()
    return df

In [None]:
# To merge all the created features 
def merge_data(df):
    """Left-join store, transaction, holiday and oil information onto `df`.

    Uses the notebook-level frames `stores_data`, `transactions_data`,
    `holidays_data` and `oil_data`. The stores' `type` column is renamed to
    `store_type` and the holidays' `type` column to `holiday_type`.

    Parameters
    ----------
    df : pandas.DataFrame
        Sales rows with at least `id`, `date` and `store_nbr`.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per `id`.
    """
    with_stores = (
        df.merge(stores_data, on="store_nbr", how="left")
          .rename(columns={"type": "store_type"})
    )
    with_transactions = with_stores.merge(
        transactions_data, on=["date", "store_nbr"], how="left"
    )
    with_holidays = (
        with_transactions
        .merge(holidays_data, on="date", how="left")
        # A date listed more than once in `holidays_data` multiplies rows;
        # keep only the first match for each `id`.
        .drop_duplicates(subset="id")
        .rename(columns={"type": "holiday_type"})
    )
    return with_holidays.merge(oil_data, on="date", how="left")


In [None]:
# Columns kept from the merged table before feature engineering
use_cols = [
    'date', 'store_nbr', 'family', 'sales', 'onpromotion', 'cluster',
    'holiday_type', 'locale', 'description', 'transferred', 'dcoilwtico',
]

In [None]:
# Load the rows to be forecast and convert `date` to datetime.
test_data = (
    pd.read_csv(work_dir + '/test.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
test_data.head()

In [None]:
# Keep the submission ids. `date` was already converted to datetime when
# test.csv was loaded, so the second conversion that used to follow is dropped.
test_id = test_data['id']

# **Adding features to complete dataset**

In [None]:
# Stack train and test so lag/rolling features for the test window can look
# back into the training history, then enrich with the auxiliary tables.
data = merge_data(pd.concat([train_data, test_data], axis=0))
# Take an explicit copy: the helpers below add columns in place, and doing that
# on a column-slice of the merged frame can trigger SettingWithCopyWarning.
data = data[use_cols].copy()
data = fill_na(data)
data = create_date_features(data)
# Lags 1..22 (same list as before, written as one range) plus longer look-backs
data = lag_features(data, lags=[*range(1, 23), 30, 31, 90, 180, 364])
data = roll_mean_features(data, [16, 17, 18, 30])

In [None]:
# Exponentially weighted means of sales lagged by 1, 7 and 30 rows, for four
# smoothing factors.
data = ewm_features(
    data,
    alphas=[0.95, 0.9, 0.8, 0.5],
    lags=[1, 7, 30],
)

In [None]:
# Cast the categorical features once each (`store_nbr` and `family` used to be
# cast twice).
categorical_cols = ['store_nbr', 'family', 'cluster', 'holiday_type', 'locale', 'description']
data = data.astype({column: 'category' for column in categorical_cols})

In [None]:
data.head()

**Splitting test and train datasets according to dates**

In [None]:
# Drop 2013 (presumably because the longest lag, 364 days, has no history
# there -- confirm), then split at the end of the training period.
data = data.query("date > '2013-12-31'")
# `drop(columns=...)` returns a new frame; the previous `drop(inplace=True)` on
# the result of `query` raised SettingWithCopyWarning.
X = data.query("date <= '2017-08-15'").drop(columns=['date'])
X_test = data.query("date > '2017-08-15'").drop(columns=['date', 'year'])

In [None]:
# `X` contains category columns. Older pandas silently skipped them in `corr()`,
# but pandas >= 2.0 raises on non-numeric data, so select the numeric/boolean
# columns explicitly (this works on both).
corr = X.select_dtypes(include=['number', 'bool']).corr()
corr['sales'].sort_values(ascending=False)

**Sales vs. Other Features: Correlation Heatmap**

In [None]:
# Heatmap of the pairwise feature correlations computed above
heatmap_size = (20, 15)
fig, ax = plt.subplots(figsize=heatmap_size)
sns.heatmap(corr, ax=ax)

In [None]:
# Separate the (log1p-scaled) target from the predictors. `year` is removed from
# the training features as well; it was already dropped from `X_test`.
# NOTE: this cell is not re-runnable without rebuilding `X` / `X_test` first.
Y = X['sales']
X = X.drop(columns=['sales', 'year'])
X_test = X_test.drop(columns=['sales'])

In [None]:
# Time-ordered hold-out: the rows of `X` follow the chronological order of
# train.csv (assumed -- confirm), so with shuffle=False the last 20% of rows,
# i.e. the most recent period, become the validation set. A random shuffle
# would put days from the same period (and their lag features) on both sides of
# the split and make the validation score optimistic.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, shuffle=False)

In [None]:
X_train.dtypes

# **Models**

In [None]:
def recursive_predict(model, features=None):
    """Forecast days 16..31 one day at a time, feeding predictions back as lags.

    For each `day_of_month` from 16 to 31 the rows of that day are scored,
    negative predictions are clipped to 0, and the predictions are written into
    the ``sales_t-<k - day>`` column of every later day ``k`` so that later
    days see them as lag features. This relies on every day holding the same
    series in the same row order.

    Parameters
    ----------
    model : object
        Anything with ``predict(DataFrame) -> array-like``.
    features : pandas.DataFrame, optional
        Test feature frame containing `day_of_month` and the ``sales_t-<lag>``
        columns. Defaults to the notebook-level `X_test`.

    Returns
    -------
    numpy.ndarray
        Float predictions, concatenated day by day in row order.

    Notes
    -----
    The lag updates are applied to a private copy, so neither `X_test` nor a
    frame passed in is modified (previously the global `X_test` was overwritten).
    """
    frame = (X_test if features is None else features).copy()
    daily_predictions = []
    for day in range(16, 32):
        day_rows = frame[frame["day_of_month"] == day]
        if day_rows.empty:
            # Nothing to score for this day
            continue
        pred = np.clip(np.asarray(model.predict(day_rows), dtype=float), 0, None)
        daily_predictions.append(pred)
        # Today's prediction is the (k - day)-day lag of every later day k
        for k in range(day + 1, 32):
            later_rows = frame["day_of_month"] == k
            if later_rows.any():
                frame.loc[later_rows, f"sales_t-{k - day}"] = pred
    if not daily_predictions:
        return np.array([])
    return np.concatenate(daily_predictions, axis=0)

In [None]:
# LightGBM settings. The number of boosting rounds and the early-stopping
# patience are supplied through this dict rather than as `lgb.train` arguments.
lgb_params = dict(
    metric='mse',
    boosting_type='gbdt',
    num_leaves=8,
    learning_rate=0.2,
    max_depth=7,
    verbose=0,
    num_boost_round=5000,
    early_stopping_rounds=200,
    nthread=-1,
    force_col_wise=True,
)

# The validation set references the training set
train_dataset = lgb.Dataset(X_train, label=Y_train, feature_name='auto')
val_dataset = lgb.Dataset(X_val, label=Y_val, reference=train_dataset, feature_name='auto')

In [None]:
# Train with both sets monitored; metrics are logged every 100 rounds.
monitored_sets = [train_dataset, val_dataset]
model = lgb.train(
    lgb_params,
    train_dataset,
    valid_sets=monitored_sets,
    verbose_eval=100,
)

In [None]:
# Score the hold-out set at the best iteration; negative predictions are set to 0.
Y_val_pred = np.clip(model.predict(X_val, num_iteration=model.best_iteration), 0, None)

# RMSE on the log1p scale the model was trained on
print(np.sqrt(mean_squared_error(Y_val, Y_val_pred)))
# Mean squared log error after mapping both back to unit sales
print(mean_squared_log_error(np.expm1(Y_val), np.expm1(Y_val_pred)))

In [None]:
rec_pred_lgb = recursive_predict(model)

In [None]:
# Rebuild a pristine test feature frame from `data`. `drop(columns=...)` returns
# a new frame instead of mutating the `query` result in place, which raised
# SettingWithCopyWarning.
X_test = data.query("date > '2017-08-15'").drop(columns=['date', 'year', 'sales'])

In [None]:
# Positions of the category-typed columns, passed to CatBoost as categorical features
is_categorical = (X_train.dtypes == 'category').to_numpy()
cat_indices = np.flatnonzero(is_categorical)

train_dataset_cb = cb.Pool(X_train, Y_train, cat_features=cat_indices)
val_dataset_cb = cb.Pool(X_val, Y_val, cat_features=cat_indices)

# Small CatBoost model (100 trees, depth 7) evaluated on the validation pool
model_cb = CatBoostRegressor(
    loss_function='RMSE',
    cat_features=cat_indices,
    max_depth=7,
    n_estimators=100,
)
model_cb.fit(train_dataset_cb, eval_set=val_dataset_cb)

In [None]:
# Score CatBoost on the hold-out set; negative predictions are set to 0.
Y_val_pred_cb = np.clip(model_cb.predict(X_val), 0, None)

# RMSE on the log1p scale the model was trained on
print(np.sqrt(mean_squared_error(Y_val, Y_val_pred_cb)))
# Mean squared log error after mapping both back to unit sales
print(mean_squared_log_error(np.expm1(Y_val), np.expm1(Y_val_pred_cb)))

In [None]:
rec_pred_cb = recursive_predict(model_cb)

In [None]:
# Rebuild a pristine test feature frame from `data`. `drop(columns=...)` returns
# a new frame instead of mutating the `query` result in place, which raised
# SettingWithCopyWarning.
X_test = data.query("date > '2017-08-15'").drop(columns=['date', 'year', 'sales'])

In [None]:
# Map the LightGBM recursive forecast back from the log1p scale to unit sales
# and write the submission file.
submission_sales = np.expm1(rec_pred_lgb)
final_submission = pd.DataFrame({'id': test_id, 'sales': submission_sales})
final_submission.to_csv('submission.csv', index=False)