# Store Sales - Model Training

This notebook explores different models for time series forecasting on the data contained in `data/store_sales`.

**NOTE:** Make sure to also download `train.csv` from the following Kaggle competition: [Store Sales - Time Series Forecasting](https://www.kaggle.com/competitions/store-sales-time-series-forecasting/data).

# Setup Notebooks

## Imports

In [19]:
# Import Standard Libraries
import os
import pandas as pd
from pathlib import Path
from dynaconf import Dynaconf
from statsmodels.tsa.deterministic import DeterministicProcess
from sklearn.linear_model import LinearRegression

# Import Package Modules
from src.exploratory_data_analysis.exploratory_data_analysis_utils import (
    set_plot_characteristics
)
from src.data_preparation.data_preparation_utils import (
    group_avg_column_by_frequency
)

## Define Configurations

In [2]:
# Resolve the project root: two directory levels above the current working directory
root_path = Path.cwd().parents[1]

# Load the store sales settings file, selecting the 'eda' environment
config_file = root_path / 'configuration' / 'store_sales_config.toml'
eda_config = Dynaconf(
    settings_files=[config_file],
    environment=True,
    env='eda',
)

# Pull out the section holding the data file locations
data_paths = eda_config['data_paths']

## Set Plot Characteristics

In [4]:
# Apply the plot settings stored in the EDA configuration
plot_settings = eda_config['plot_settings']
set_plot_characteristics(plot_settings)

[09/23/2024 16:33:23 - exploratory_data_analysis_utils] INFO - set_plot_characteristics - Start
[09/23/2024 16:33:23 - exploratory_data_analysis_utils] INFO - set_plot_characteristics - Creating theme parameters
[09/23/2024 16:33:23 - exploratory_data_analysis_utils] INFO - set_plot_characteristics - End


# Read Data

In [5]:
# Shared options for every file that carries a `date` column in YYYY-MM-DD format
date_csv_options = {'parse_dates': ['date'], 'date_format': '%Y-%m-%d'}

# Load each dataset from the locations listed in the configuration
# (the stores file is read without date parsing)
holidays_data = pd.read_csv(root_path / data_paths['holidays_data'], **date_csv_options)
oil_data = pd.read_csv(root_path / data_paths['oil_data'], **date_csv_options)
stores_data = pd.read_csv(root_path / data_paths['stores_data'])
transactions_data = pd.read_csv(root_path / data_paths['transactions_data'], **date_csv_options)
train_data = pd.read_csv(root_path / data_paths['train_data'], **date_csv_options)
test_data = pd.read_csv(root_path / data_paths['test_data'], **date_csv_options)

# Model Training

## Deterministic Process with Linear Regression

### Data Preparation

In [7]:
# Aggregate the `transactions` column at weekly frequency ('W'), keyed on `date`,
# using the project helper
transactions_data_weekly = group_avg_column_by_frequency(
    data=transactions_data,
    key='date',
    column='transactions',
    frequency='W',
)

[09/23/2024 16:36:27 - data_preparation_utils] INFO - group_avg_column_by_frequency - Start
[09/23/2024 16:36:27 - data_preparation_utils] INFO - group_avg_column_by_frequency - key: date | frequency: W | column: transactions
[09/23/2024 16:36:27 - data_preparation_utils] INFO - group_avg_column_by_frequency - End


In [11]:
# Build the weekly DatetimeIndex with its frequency declared up front, so the
# frequency is part of the index handed to DeterministicProcess instead of being
# patched onto the process's index after construction.
# Raises ValueError if the dates do not conform to a weekly ('W') frequency.
transactions_weekly_index = pd.DatetimeIndex(transactions_data_weekly['date'], freq='W')

# Create the DeterministicProcess object: a constant column plus a
# first-order (linear) time trend; `drop=True` drops perfectly collinear terms
transactions_dp = DeterministicProcess(index=transactions_weekly_index,
                                       constant=True,
                                       order=1,
                                       drop=True)

In [14]:
# Target: the weekly `transactions` column
transactions_y = transactions_data_weekly['transactions']

# Features: the in-sample deterministic terms generated by the process
transactions_x = transactions_dp.in_sample()

In [17]:
# Align the target with the features by giving it the features' index.
# `set_axis` returns a new Series, so the column held by
# `transactions_data_weekly` is not mutated in place and the cell can be
# re-run safely. Raises ValueError if the lengths differ.
transactions_y = transactions_y.set_axis(transactions_x.index)

### Model Training

In [20]:
# Create and fit the linear regression in a single step (`fit` returns the estimator).
# The intercept is disabled in the model because the deterministic process was
# created with `constant=True`, so the features already carry a constant column.
transactions_model_lr = LinearRegression(fit_intercept=False).fit(transactions_x, transactions_y)