# AT2 - Retail dataset modelling for prediction - SGD Regression 

# 1. Loading the dataset

## 1.1. Launch commands to automatically reload modules

In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so imported
# modules are reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

## 1.2. Import the packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# To ensure plots are displayed in Jupyter notebooks
%matplotlib inline

## 1.3. Load the training, validation and test datasets into dataframes

In [3]:
# Read the pre-processed parquet files, one features/target pair per split.
# Training split
X_train, y_train = (
    pd.read_parquet('../../data/processed/X_train.parquet'),
    pd.read_parquet('../../data/processed/y_train.parquet'),
)
# Validation split
X_val, y_val = (
    pd.read_parquet('../../data/processed/X_val.parquet'),
    pd.read_parquet('../../data/processed/y_val.parquet'),
)
# Test split
X_test, y_test = (
    pd.read_parquet('../../data/processed/X_test.parquet'),
    pd.read_parquet('../../data/processed/y_test.parquet'),
)

# 2.0 Installing the custom package with pip

In [4]:
! pip install -i https://test.pypi.org/simple/ my_krml_pine==2024.0.1.18

Looking in indexes: https://test.pypi.org/simple/


## 2.1 Import custom functions

In [5]:
from my_krml_pine.models.performance import print_regressor_scores
from my_krml_pine.models.null import NullRegressor

# 3.0 Baseline Model

In [6]:
# Instantiate the NullRegressor imported from my_krml_pine and keep it as base_model;
# its scores are the baseline the SGD pipeline is compared against further down.
base_model = NullRegressor()

In [7]:
# Call fit_predict() on the training target and keep the returned predictions in y_base.
# NOTE(review): presumably NullRegressor predicts a constant derived from y_train
# (e.g. its mean) — confirm against the my_krml_pine implementation.
y_base = base_model.fit_predict(y_train)

In [8]:
# Report RMSE and MAE of the baseline predictions against the training target
print_regressor_scores(
    set_name='Training',
    y_actuals=y_train,
    y_preds=y_base,
)

RMSE Training: 10.44
MAE Training: 5.31


In [9]:
# Baseline on the validation set: predict first, then report RMSE and MAE
y_base_val = base_model.predict(y_val)
print_regressor_scores(y_preds=y_base_val, y_actuals=y_val, set_name='Validation')

RMSE Validation: 11.04
MAE Validation: 5.15


In [10]:
# Baseline on the testing set: predict first, then report RMSE and MAE
y_base_test = base_model.predict(y_test)
print_regressor_scores(y_preds=y_base_test, y_actuals=y_test, set_name='Testing')

RMSE Testing: 11.31
MAE Testing: 5.23


# 4.0 Build Pipeline and Modelling 

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import SGDRegressor
from sklearn.compose import ColumnTransformer

In [12]:
# Per the original author's note, every feature is of type object, so only a
# categorical branch is defined and no numerical transformation is applied.
# handle_unknown='ignore' lets the encoder transform categories it did not see during fit
# instead of raising an error.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline(steps=[('onehot', onehot_encoder)])

In [13]:
# Send every column of X_train through the categorical (one-hot) transformer.
# remainder='drop' discards any column that is not listed in a transformer.
categorical_columns = X_train.columns
preprocessor = ColumnTransformer(
    transformers=[('categorical', categorical_transformer, categorical_columns)],
    remainder='drop',
)

In [14]:
# Assemble sgd_pipe in two steps: the preprocessor, then a linear regressor trained with
# stochastic gradient descent. scikit-learn's SGDRegressor estimates the gradient and
# updates the weights one training sample at a time (not in mini-batches and not on the
# entire dataset at once), which keeps it practical for large-scale datasets.
# penalty='elasticnet' combines L1 and L2 regularisation; random_state=42 makes runs repeatable.
sgd_regressor = SGDRegressor(penalty='elasticnet', random_state=42)
sgd_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('sgd', sgd_regressor),
    ]
)

In [15]:
# Fit sgd_pipe with the training dataset.
# y_train is loaded as a single-column DataFrame; flatten it to a 1-D array so the
# regressor receives a target of shape (n_samples,) and scikit-learn does not have to
# convert the column vector itself (which emits a DataConversionWarning).
sgd_pipe.fit(X_train, y_train.to_numpy().ravel())

  y = column_or_1d(y, warn=True)


In [16]:
# Predict on the training features with the fitted pipeline and store the result in
# train_preds, which the following cell scores against y_train.
train_preds = sgd_pipe.predict(X_train)

In [17]:
# Report RMSE and MAE of the SGD pipeline's predictions on the training set
print_regressor_scores(
    set_name='Training',
    y_actuals=y_train,
    y_preds=train_preds,
)

RMSE Training: 10.40
MAE Training: 5.28


In [18]:
# Validation set: predict with the fitted pipeline, then report RMSE and MAE
val_preds = sgd_pipe.predict(X_val)
print_regressor_scores(
    set_name='Validation',
    y_actuals=y_val,
    y_preds=val_preds,
)

RMSE Validation: 11.02
MAE Validation: 4.92


In [19]:
# Test set: predict with the fitted pipeline, then report RMSE and MAE
test_preds = sgd_pipe.predict(X_test)
print_regressor_scores(
    set_name='Test',
    y_actuals=y_test,
    y_preds=test_preds,
)

RMSE Test: 11.35
MAE Test: 4.92


In [20]:
# Persist the fitted pipeline (preprocessor + SGD regressor) into the models folder
# using joblib's dump; the call is left as the last expression so the written
# path list is displayed as the cell output.
from joblib import dump

model_path = '../../models/Predictive/sgd_pipeline.joblib'
dump(sgd_pipe, model_path)

['../../models/Predictive/sgd_pipeline.joblib']