# Introduction

This notebook performs a sales forecast on the data of the [Tabular Playground Series - September 2022](https://www.kaggle.com/competitions/tabular-playground-series-sep-2022) Kaggle competition.

In [None]:
# Import Standard Libraries
import numpy as np
import pandas as pd
import plotly.express as ex

from plotly.subplots import make_subplots
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Read Data

In [None]:
# Read train and test data.
# Both files are loaded with the same options: the first column becomes the
# index and 'date' is parsed into datetime values.
csv_options = {'parse_dates': ['date'], 'index_col': 0}

train_data = pd.read_csv('./../../data/2022_09/train.csv', **csv_options)
test_data = pd.read_csv('./../../data/2022_09/test.csv', **csv_options)

# Exploratory Data Analysis

## Null Values

In [None]:
# Datasets to inspect, in the order of the subplot titles below
datasets = {'train': train_data, 'test': test_data}

# Define figure: one subplot row per dataset
figure = make_subplots(rows=2,
                       cols=1,
                       subplot_titles=('Train Null Values Percentage',
                                       'Test Null Values Percentage'))

# Fetch Train & Test Data (subplot rows are 1-based, hence start=1)
for row, (name, data) in enumerate(datasets.items(), start=1):

    # Share of missing values per column, expressed as a percentage.
    # This replaces printing the full long-format isna() frame, which has one
    # line per cell of the dataset.
    null_percentage = data.isna().mean().mul(100)

    # Draw the percentages as a bar chart in this dataset's subplot
    figure.add_bar(x=null_percentage.index.tolist(),
                   y=null_percentage.tolist(),
                   name=name,
                   row=row,
                   col=1)

figure.show()

In [None]:
# Toy frame with a single missing value (last row of column 'B'), used to try
# out the null-value reshaping on something small
sample_data = pd.DataFrame({
    'A': [1, 2, 3, 4],
    'B': [5, 6, 7, np.nan],
    'C': [9, 10, 11, 12],
})

In [None]:
# Display the toy frame
sample_data

In [None]:
# Reshape the boolean null mask to long format: one row per cell, with the
# column name in 'Feature' and the null flag in 'Missing'
pd.melt(sample_data.isna(), var_name='Feature', value_name='Missing')

In [None]:
# Long-format null mask of the toy frame: one row per cell
missing_long = sample_data.isna().melt(var_name='Feature', value_name='Missing')

# Histogram with the feature on the y axis, coloured by the null flag
ex.histogram(data_frame=missing_long, y='Feature', color='Missing')

# Data Preparation

## Product Normalization

In [None]:
# Replace spaces and special characters in the product names.
# `regex=False` pins a literal replacement: the default value of `regex` in
# `Series.str.replace` differs between pandas versions, so stating it keeps the
# result identical everywhere. Re-running the cell is harmless because the
# replaced characters no longer occur afterwards.
for data in (train_data, test_data):
    for character in (' ', ':'):
        data['product'] = data['product'].str.replace(character, '_', regex=False)

## Features and Labels Definition

In [None]:
# Feature columns, grouped by type
numerical_features = ['date_day', 'date_month', 'date_year', 'date_dayofweek']
categorical_features = ['country', 'store', 'product']

# Target column(s)
labels = ['num_sold']

## Numerical Features

Since there are no NaN values in the data, it is possible to skip the Fill NaN values step.

In [None]:
# Numerical features pipeline: standard scaling is its only step
numerical_steps = [('numerical_scaler', StandardScaler())]
numerical_features_pipeline = Pipeline(steps=numerical_steps)

## Categorical Features

Since there are no NaN values in the data, it is possible to skip the Fill NaN values step.

In [None]:
# Categorical features pipeline: one-hot encoding is its only step.
# The encoder is configured with handle_unknown='ignore'.
categorical_steps = [
    ('categorical_one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_features_pipeline = Pipeline(steps=categorical_steps)

## Bundle Data Preprocessing Steps

In [None]:
# Bundle the data preprocessing steps: each entry pairs a name, a pipeline and
# the list of columns that pipeline is applied to
preprocessing_steps = [
    ('numerical_preprocessing', numerical_features_pipeline, numerical_features),
    ('categorical_preprocessing', categorical_features_pipeline, categorical_features),
]
data_preprocessor = ColumnTransformer(transformers=preprocessing_steps)

# Train & Test Split

Use the data from 2020 as the validation set.

In [None]:
# Calendar features expected by `numerical_features`, mapped to the pandas
# datetime accessor attribute that produces each of them
date_feature_attributes = {
    'date_day': 'day',
    'date_month': 'month',
    'date_year': 'year',
    'date_dayofweek': 'dayofweek',
}

# Derive every calendar feature that is not already a column of the training
# data from the parsed 'date' column. Without this step the column selection
# below fails with a KeyError on a fresh kernel. `assign` returns a new frame,
# so `train_data` itself is left untouched.
missing_date_features = {
    feature: getattr(train_data['date'].dt, attribute)
    for feature, attribute in date_feature_attributes.items()
    if feature not in train_data.columns
}
train_features = train_data.assign(**missing_date_features)

# Define X and y for the training set
X = train_features[numerical_features + categorical_features]
y = train_features[labels]

In [None]:
# Split training data into train and validation: rows from 2020 are held out.
# X and y are selected with the same boolean mask, so they stay aligned
# whatever the index looks like. Passing index labels to `.iloc` is only
# correct when the index happens to be exactly 0..n-1.
is_validation = X['date_year'] == 2020

X_train = X.loc[~is_validation]
y_train = y.loc[~is_validation]
X_test = X.loc[is_validation]
y_test = y.loc[is_validation]

In [None]:
# Define a Stratified Shuffle Splitter: 5 splits, 30% of the samples held out
# in each, with a fixed random state.
# NOTE(review): no cell visible in this notebook uses this splitter yet.
stratified_kfold = StratifiedShuffleSplit(n_splits=5, test_size=.3, random_state=0)

# Model

In [None]:
# Define function to compute the SMAPE (source: https://www.kaggle.com/code/jcaliz/tps-sep22-eda-baseline-you-were-looking-for)
def smape(y_true, y_pred):
    """Compute the Symmetric Mean Absolute Percentage Error, in percent.

    Each element contributes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``;
    an element where both values are zero contributes 0. NaN contributions are
    ignored by the final mean.

    Args:
        y_true: Actual values (array-like, Series or DataFrame).
        y_pred: Predicted values (array-like, Series or DataFrame).

    Returns:
        The mean of the per-element errors as a float.

    Raises:
        ValueError: If the flattened inputs cannot be broadcast together.
    """
    # Flatten both inputs to plain 1-D float arrays. Otherwise a (n,) target
    # paired with (n, 1) predictions silently broadcasts to an (n, n) matrix,
    # and pandas inputs are aligned on their index instead of by position.
    actual = np.asarray(y_true, dtype=float).ravel()
    forecast = np.asarray(y_pred, dtype=float).ravel()

    # Still allows a single constant prediction; raises on a length mismatch
    actual, forecast = np.broadcast_arrays(actual, forecast)

    denominator = (np.abs(actual) + np.abs(forecast)) / 200.0

    # Divide only where the denominator is non-zero. The remaining entries keep
    # the initial 0.0, which avoids the 0/0 RuntimeWarning.
    diff = np.zeros_like(denominator)
    np.divide(np.abs(actual - forecast), denominator, out=diff, where=denominator != 0)

    return np.nanmean(diff)

In [None]:
# Names of the evaluation metrics tracked for every model
metrics = [
    'smape',
    'mse',
]

In [None]:
# Initialize DataFrame of model performance: an empty frame with one column
# per entry of `metrics`
performance = pd.DataFrame(columns=metrics)

## Linear Regression

In [None]:
%%time

# Baseline model
model_lr = LinearRegression()

# Chain the preprocessing with the regressor in a single pipeline
pipe_lr = Pipeline([('data_preprocessing', data_preprocessor),
                    ('linear_regression', model_lr)])

# Fit on the training split, then predict the number of sold items for the
# held-out split
pipe_lr.fit(X_train, y_train)
predictions_lr = pipe_lr.predict(X_test)

# Score the predictions, rounded to two decimals
smape_lr = round(smape(y_test, predictions_lr), 2)
mse_lr = round(mean_squared_error(y_test, predictions_lr), 2)

# Report both scores, followed by a blank separator
for template, score in (('SMAPE: {}%', smape_lr), ('MSE: {}', mse_lr)):
    print(template.format(score))
print('\n')

In [None]:
# Update 'performance' DataFrame: set the 'linear_regression' row to the two
# scores computed for the linear regression model
performance.loc['linear_regression'] = [smape_lr, mse_lr]

## Linear Regression - Cross-Validation

# Model Comparison