# Predict required stock beforehand with Elastic Net Regularization

We only have data from 15 festivals, so regular ML methods are unusable because they would overfit. According to [this post](https://www.quora.com/What-are-the-best-machine-learning-models-for-extremely-small-datasets-20-instances), topology- and geometry-based regression models such as homotopy LASSO and dgLARS work well on extremely small datasets ([source](https://psyarxiv.com/v8jgk/)).

In [56]:
import pandas as pd
import numpy as np
from joblib import dump
from datetime import datetime

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet

from sklearn.exceptions import ConvergenceWarning
from warnings import simplefilter
simplefilter("ignore", category=FutureWarning)
simplefilter("ignore", category=ConvergenceWarning)

In [57]:
def grid_search_model(pgrid, regressor, X, y):
    """Tune an sklearn regressor with an exhaustive, leave-one-out grid search.

    Parameters
    ----------
    pgrid : dict
        Parameter grid handed to ``GridSearchCV`` (parameter name -> candidates).
    regressor : callable
        Estimator class (not an instance); it is instantiated without arguments.
    X : numpy.ndarray
        Feature matrix. The number of rows is used as the number of CV folds,
        so every sample is held out exactly once.
    y : numpy.ndarray
        Target values aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(model, best_params, score)`` where ``model`` is the best
        configuration fitted on all of ``X``/``y``, ``best_params`` is the
        winning parameter dict and ``score`` is the best mean CV score
        multiplied by 100 (a negated percentage error: closer to 0 is better).
    """
    grid = GridSearchCV(
        regressor(),
        pgrid,
        scoring="neg_mean_absolute_percentage_error",
        cv=X.shape[0],
        verbose=1,
    )
    grid.fit(X, y)
    # GridSearchCV refits the winning configuration on the full data set by
    # default (refit=True), so fitting a second model by hand is redundant.
    return grid.best_estimator_, grid.best_params_, grid.best_score_ * 100


def inspect_cross_validation(regressor, params, X, y):
    """Look at the leave-one-out prediction of each event for each product.

    For every row ``i`` a fresh ``regressor(**params)`` is trained on all other
    rows and asked to predict row ``i``. The function prints the mean absolute
    percentage error per product and overall.

    Parameters
    ----------
    regressor : callable
        Estimator class exposing ``fit`` and ``predict``.
    params : dict
        Keyword arguments used to instantiate ``regressor``.
    X : numpy.ndarray
        Feature matrix, one row per event.
    y : numpy.ndarray
        Target matrix with exactly two columns (product A, product B).

    Returns
    -------
    pandas.DataFrame
        One row per held-out event with the predicted values, the required
        (true) values and the absolute percentage error of both products.
    """
    columns = [
        "Pred A",
        "Pred B",
        "Req A",
        "Req B",
        "Error A (%)",
        "Error B (%)",
    ]

    rows = []
    for i in range(X.shape[0]):
        # Hold out event i and train on every other event.
        X_train = np.delete(X, i, axis=0)
        y_train = np.delete(y, i, axis=0)
        x_test = X[i].reshape(1, -1)

        model = regressor(**params)
        model.fit(X_train, y_train)

        # hstack keeps predictions and true values in one float array.
        pred_a, pred_b, req_a, req_b = np.hstack([model.predict(x_test)[0], y[i]])
        rows.append(
            {
                "Pred A": pred_a,
                "Pred B": pred_b,
                "Req A": req_a,
                "Req B": req_b,
                "Error A (%)": abs((req_a - pred_a) / req_a) * 100,
                "Error B (%)": abs((req_b - pred_b) / req_b) * 100,
            }
        )

    # DataFrame.append was removed in pandas 2.0; build the frame once from
    # the collected records instead of growing it row by row.
    res = pd.DataFrame(rows, columns=columns)

    mean_a = res["Error A (%)"].mean()
    mean_b = res["Error B (%)"].mean()
    print("Mean absolute error product A:", round(mean_a, 2), end="%\n")
    print("Mean absolute error product B:", round(mean_b, 2), end="%\n")
    print(
        "Mean absolute error total:    ",
        round(np.mean([mean_a, mean_b]), 2),
        end="%\n",
    )

    return res


def save_model(model, name="model", timestamp=True):
    """Save the model to a joblib file in ``../../models/forecast``.

    Parameters
    ----------
    model : object
        The fitted model to serialise with ``joblib.dump``.
    name : str
        Base file name, without extension.
    timestamp : bool
        When true, the current local date and time are appended to the file
        name as ``_MM-DD-YYYY_HH-MM-SS``.
    """
    # Use "-" rather than ":" between the time fields: colons are not allowed
    # in file names on Windows, so the previous format could not be written there.
    suffix = ("_" + datetime.now().strftime("%m-%d-%Y_%H-%M-%S")) if timestamp else ""
    dump(model, f"../../models/forecast/{name}{suffix}.joblib")

In [58]:
# Load the processed events table.
df = pd.read_csv("../../data/processed/events.csv")

# Columns used as model inputs.
lbl_X = [
    "Male/female ratio",
    "Median age",
    "Mean age",
    "Max age",
    "Min age",
    "Mode age",
    "Std dev age",
    "Netherlands",
    "France",
    "Germany",
    "Belgium",
    "UK",
    "Italy",
    "Spain",
    "month",
    "day",
    "weather_tavg",
    "weather_tmin",
    "weather_tmax",
    "weather_prcp",
    "weather_wdir",
    "weather_wspd",
    "weather_wpgt",
    "weather_pres",
    "inflation",
    "benzineprijs",
    "dieselprijs",
]
# Columns the model has to predict (two targets, fitted jointly).
lbl_y = ["stock_beer", "stock_water"]

# Plain numpy arrays for scikit-learn.
X = df[lbl_X].to_numpy()
y = df[lbl_y].to_numpy()

## Elastic net regularization 
Elastic net regularization should work with small datasets ([source](https://en.wikipedia.org/wiki/Elastic_net_regularization)). Because the dataset is so small, an exhaustive grid search with leave-one-out cross-validation is affordable, so we use it to optimize the parameters.

In [65]:
# The grid holds 5 x 200 x 10 = 10,000 candidate configurations, and each one
# is fitted once per cross-validation fold, so this takes a long time to train.
elnet_pgrid = {
    "max_iter": np.linspace(1, 5, 5, dtype=int),
    "alpha": np.linspace(60, 70, 200),
    "l1_ratio": np.linspace(0.25, 0.75, 10),
}
elnet_model, elnet_params, elnet_score = grid_search_model(
    elnet_pgrid, ElasticNet, X, y
)
print(elnet_params)

Fitting 7 folds for each of 10000 candidates, totalling 70000 fits
{'alpha': 62.61306532663317, 'l1_ratio': 0.6388888888888888, 'max_iter': 3}


In [66]:
# Per-event leave-one-out predictions for the tuned parameters.
inspect_cross_validation(regressor=ElasticNet, params=elnet_params, X=X, y=y)

Mean absolute error product A: 110.3%
Mean absolute error product B: 262.95%
Mean absolute error total:     186.63%


Unnamed: 0,Pred A,Pred B,Req A,Req B,Error A (%),Error B (%)
0,44900.619637,8473.995943,40509.0,20220.0,10.841096,58.091019
1,27735.901955,19772.903314,3942.0,1283.0,603.599745,1441.14601
2,35817.132009,12816.048958,38403.0,7181.0,6.733505,78.471647
3,23775.821926,16610.850945,40461.0,9149.0,41.237681,81.559197
4,26672.703663,23252.918578,26372.0,10784.0,1.140238,115.624245
5,28927.180086,8517.147999,20876.0,24070.0,38.56668,64.615089
6,43391.931371,128.480944,25523.0,127.0,70.011093,1.166098


In [None]:
# Persist the tuned model under a timestamped file name.
save_model(elnet_model, name="ElasticNetReg")