# Simple GLM

In [1]:
import os
from pathlib import Path

import altair as alt
import numpy as np
import pandas as pd
from quantcore.glm import GeneralizedLinearRegressor
from quantcore.learn.metrics import gini_score
from quantcore.learn.plotting import make_plots
from quantcore.learn.preprocessing import Categorizer, FeatureSelector
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.pipeline import Pipeline

from feature_engineering import apply_all_transformations
from metrics import root_mean_squared_percentage_error
from process_data import load_test, load_train, process_data

# Notebook-wide display settings.
pd.set_option("display.float_format", "{:.3f}".format)  # three decimals for floats
pd.set_option("display.max_columns", None)  # show every column of wide frames
alt.data_transformers.enable("json")  # to allow for large plots

DataTransformerRegistry.enable('json')

## 1. Data Loading and Feature Engineering

### 1.1 Load

In [4]:
# Files that must be present before the notebook can run.
raw_files = ["raw_data/train.csv", "raw_data/test.csv", "raw_data/store.csv"]
processed_files = ["processed_data/train.parquet", "processed_data/test.parquet"]

# Fail early when the raw inputs are missing.
if not all(os.path.exists(p) for p in raw_files):
    raise FileNotFoundError("Please download raw data into 'raw_data' folder")

# Build the processed parquet files only when they do not exist yet.
if not all(os.path.exists(p) for p in processed_files):
    # Use print(): a bare string inside an `if` body is never displayed by the notebook.
    print("Processed data not found. Processing data from raw data...")
    process_data()
    print("Done")

# Stack train and test rows, tagging each with its origin, ordered by store and date.
df = pd.concat(
    [
        load_train().assign(sample="train"),
        load_test().assign(sample="test"),
    ],
    ignore_index=True,
).sort_values(["store", "date"])

# Keep only the first 10% of the sorted rows.
# NOTE(review): presumably a shortcut for faster iteration; because the frame is sorted
# by store, this keeps the leading stores only, not a random sample -- confirm intent.
df = df.iloc[: int(0.1 * len(df))]
df.head()

Unnamed: 0,store,day_of_week,date,sales,customers,open,promo,state_holiday,school_holiday,year,month,store_type,assortment,competition_distance,competition_open_since_month,competition_open_since_year,promo2,promo2_since_week,promo2_since_year,promo_interval,sample,id
0,1,2,2013-01-01,0.0,0.0,False,0,a,1,2013,1,c,a,1270.0,9.0,2008.0,0,,,,train,
1,1,3,2013-01-02,5530.0,668.0,True,0,0,1,2013,1,c,a,1270.0,9.0,2008.0,0,,,,train,
2,1,4,2013-01-03,4327.0,578.0,True,0,0,1,2013,1,c,a,1270.0,9.0,2008.0,0,,,,train,
3,1,5,2013-01-04,4486.0,619.0,True,0,0,1,2013,1,c,a,1270.0,9.0,2008.0,0,,,,train,
4,1,6,2013-01-05,4997.0,635.0,True,0,0,1,2013,1,c,a,1270.0,9.0,2008.0,0,,,,train,


### 1.2 Feature Engineering

In [5]:
# Replace df with the output of apply_all_transformations (imported from feature_engineering).
# NOTE(review): df is overwritten with its own transformed version, so re-running only this
# cell would pass already-transformed data to the helper again -- confirm that is harmless.
df = apply_all_transformations(df)
df.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,store,day_of_week,date,sales,customers,open,promo,state_holiday,school_holiday,year,month,store_type,assortment,competition_distance,competition_open_since_month,competition_open_since_year,promo2,promo2_since_week,promo2_since_year,promo_interval,sample,id,age_quantile,competition_open,count,log_sales,open_lag_1,open_lag_2,open_lag_3,open_lead_1,open_lead_2,open_lead_3,promo_lag_1,promo_lag_2,promo_lag_3,promo_lead_1,promo_lead_2,promo_lead_3,school_holiday_lag_1,school_holiday_lag_2,school_holiday_lag_3,school_holiday_lead_1,school_holiday_lead_2,school_holiday_lead_3,state_holiday_lag_1,state_holiday_lag_2,state_holiday_lag_3,state_holiday_lead_1,state_holiday_lead_2,state_holiday_lead_3,store_day_of_week,store_month,store_school_holiday,store_state_holiday,store_year,zscore
0,1,2,2013-01-01,0.0,0.0,False,0,a,1,2013,1,c,a,1270.0,9.0,2008.0,0,,,,train,,-1,1.0,0,-inf,1.0,1.0,1.0,True,True,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0,0,0,0,0,0,1_2,1_1,1_1,1_True,1_2013,
1,1,3,2013-01-02,5530.0,668.0,True,0,0,1,2013,1,c,a,1270.0,9.0,2008.0,0,,,,train,,-1,1.0,1,8.618,False,1.0,1.0,True,True,True,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,a,0,0,0,0,0,1_3,1_1,1_1,1_False,1_2013,
2,1,4,2013-01-03,4327.0,578.0,True,0,0,1,2013,1,c,a,1270.0,9.0,2008.0,0,,,,train,,-1,1.0,2,8.373,True,False,1.0,True,True,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,a,0,0,0,0,1_4,1_1,1_1,1_False,1_2013,
3,1,5,2013-01-04,4486.0,619.0,True,0,0,1,2013,1,c,a,1270.0,9.0,2008.0,0,,,,train,,-1,1.0,3,8.409,True,True,False,True,False,True,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,a,0,0,0,1_5,1_1,1_1,1_False,1_2013,
4,1,6,2013-01-05,4997.0,635.0,True,0,0,1,2013,1,c,a,1270.0,9.0,2008.0,0,,,,train,,-1,1.0,4,8.517,True,True,True,False,True,True,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,0,0,0,0,1_6,1_1,1_1,1_False,1_2013,


In [None]:
# Start and end timestamps of the validation period.
validation_window = [pd.Timestamp("2015-06-15"), pd.Timestamp("2015-07-31")]

In [None]:
# Rows from the labelled sample with strictly positive sales; shared by both selections.
has_positive_sales = df["sample"].eq("train") & df["sales"].gt(0)
window_start, window_end = validation_window

# Training rows: dated before the validation window and with |zscore| below 5.
select_train = (
    has_positive_sales & df["date"].lt(window_start) & df["zscore"].abs().lt(5)
).to_numpy()

# Validation rows: dated on or after the window start and strictly before its end.
select_val = (
    has_positive_sales & df["date"].ge(window_start) & df["date"].lt(window_end)
).to_numpy()

In [None]:
# Number of rows selected for training and for validation.
(select_train.sum(), select_val.sum())

In [None]:
# Baseline model: only year, month, day_of_week and store, all treated as categorical.
categorical_features = {"store", "day_of_week", "month", "year"}
numeric_features = set()

pipeline_year_month_day_store = Pipeline(
    [
        # keep only the columns the model uses
        ("select", FeatureSelector(categorical_features | numeric_features)),
        ("categorize", Categorizer(categorical_features)),
        (
            "estimate",
            # normal family with a single, very small penalty (l1_ratio=0.0)
            GeneralizedLinearRegressor(
                family="normal",
                scale_predictors=True,
                l1_ratio=0.0,
                alphas=[1e-8],
            ),
        ),
    ]
)

In [None]:
# Fit the baseline on the training rows with log_sales as the target.
train_rows = df.loc[select_train]
pipeline_year_month_day_store.fit(train_rows, train_rows["log_sales"])

# Store the baseline prediction (log scale) as "offset" for every row where `open` is True.
is_open = df["open"]
df.loc[is_open, "offset"] = pipeline_year_month_day_store.predict(df.loc[is_open])


In [None]:
# Inspect the baseline offset. It was only assigned on rows where `open` is True,
# so on a fresh run the remaining rows are expected to be NaN.
df["offset"]

In [None]:
# Validation RMSPE of the baseline alone; the offset is on the log scale, hence the exp.
val_rows = df.loc[select_val]
root_mean_squared_percentage_error(val_rows["sales"], np.exp(val_rows["offset"]))

In [None]:
# Columns that enter with three lags and three leads each
# (e.g. "open_lag_1", ..., "state_holiday_lead_3").
shifted_bases = ["open", "promo", "school_holiday", "state_holiday"]
shifted_features = {
    f"{base}_{direction}_{step}"
    for base in shifted_bases
    for direction in ("lag", "lead")
    for step in (1, 2, 3)
}

# Full categorical feature set: the shifted columns plus contemporaneous and
# store-interaction columns.
categorical_features = shifted_features | {
    "age_quantile",
    "competition_open",
    "promo",
    "school_holiday",
    "state_holiday",
    "store_day_of_week",
    "store_month",
    "store_school_holiday",
    "store_state_holiday",
    "store_year",
}

numeric_features = set()

pipeline = Pipeline(
    [
        ("select", FeatureSelector(categorical_features | numeric_features)),
        (
            "categorize",
            Categorizer(categorical_features, unseen_method="most_frequent"),
        ),
        (
            "estimate",
            # fitted over a grid of penalties; one is chosen on the validation window later
            GeneralizedLinearRegressor(
                family="normal",
                scale_predictors=True,
                warm_start=True,
                alpha_search=True,
                l1_ratio=0.0,  # only ridge for now
                alphas=[1e-4, 1e-3, 1e-2, 5e-2, 1e-1, 5e-1, 1, 10],
            ),
        ),
    ]
)

In [None]:
# Fit the GLM on the training rows, passing the baseline prediction to the
# "estimate" step as an offset.
train_rows = df.loc[select_train]
_ = pipeline.fit(
    train_rows,
    train_rows["log_sales"],
    estimate__offset=train_rows["offset"],
)

In [None]:
# Peek at the first ten entries of the fitted estimator's feature_names_.
pipeline["estimate"].feature_names_[:10]

In [None]:
# Evaluate every penalty of the fitted alpha grid on the validation rows.

# Loop-invariant pieces: the open mask, the open rows and their offsets do not
# depend on alpha. The "pred" column written inside the loop is not one of the
# pipeline's selected features, so reusing `open_rows` across iterations is safe.
is_open = df["open"]
open_rows = df.loc[is_open]
open_offset = open_rows["offset"].to_numpy()

metric_records = []
for alpha_index, alpha in enumerate(pipeline["estimate"]._alphas):

    # predict for open and closed stores separately
    df.loc[is_open, "pred"] = np.exp(
        pipeline.predict(open_rows, alpha_index=alpha_index) + open_offset
    )
    df.loc[~is_open, "pred"] = 0.0

    # per-store rescaling so that the mean prediction over the store's last
    # 100 training rows matches the mean of the observed sales exactly
    adjustment = (
        df.loc[select_train, ["store", "sales", "pred"]]
        .groupby("store")
        .apply(lambda x: x["sales"].tail(100).mean() / x["pred"].tail(100).mean())
    )

    pred_adj = df["pred"] * adjustment[df["store"]].to_numpy()
    sales_val = df.loc[select_val, "sales"]
    pred_adj_val = pred_adj[select_val]

    metric_records.append(
        {
            "alpha_index": alpha_index,
            "alpha": alpha,
            "gini": gini_score(sales_val, pred_adj_val),
            # take the root explicitly: the `squared=False` keyword is deprecated
            # and no longer accepted by recent scikit-learn releases
            "rmse": np.sqrt(mean_squared_error(sales_val, pred_adj_val)),
            "rmspe": root_mean_squared_percentage_error(sales_val, pred_adj_val),
            "rmsle": np.sqrt(mean_squared_log_error(sales_val, pred_adj_val)),
            "bias": pred_adj_val.mean() / sales_val.mean(),
        }
    )

# One row per alpha, in grid order.
metrics = pd.DataFrame(metric_records)
metrics

In [None]:
# Choose the penalty with the lowest validation RMSPE.
ranked_by_rmspe = metrics.sort_values("rmspe")
best_alpha_index = ranked_by_rmspe["alpha_index"].iloc[0]
best_alpha = pipeline["estimate"]._alphas[best_alpha_index]

In [None]:
# Store the prediction obtained with the selected penalty.
is_open = df["open"]
open_rows = df.loc[is_open]
log_pred_open = (
    pipeline.predict(open_rows, alpha_index=best_alpha_index)
    + open_rows["offset"].to_numpy()
)
df.loc[is_open, "pred"] = np.exp(log_pred_open)
df.loc[~is_open, "pred"] = 0.0  # closed rows get a prediction of zero


def _tail_mean_ratio(store_rows):
    """Return mean sales divided by mean prediction over the last 100 rows of a store."""
    recent = store_rows.tail(100)
    return recent["sales"].mean() / recent["pred"].mean()


# Per-store factor that makes the mean prediction over the last 100 training
# rows match the mean of the observed sales exactly.
adjustment = (
    df.loc[select_train, ["store", "sales", "pred"]]
    .groupby("store")
    .apply(_tail_mean_ratio)
)

df["pred_adj"] = df["pred"] * adjustment[df["store"]].to_numpy()

In [None]:
# Frame used for plotting: rows where `open` is True that belong to the
# training or validation selection.
df_plot = df.assign(
    # relabel validation rows so the plot can facet train against val
    sample=lambda x: x["sample"].where(~select_val, "val"),
    # fill missing outcomes with the mean so that the plots look pretty
    sales=lambda x: x["sales"].fillna(x["sales"].mean()),
).loc[lambda x: x["open"] & (select_train | select_val)]


def _predict_glm(x):
    """GLM prediction on the sales scale, including offset and per-store adjustment."""
    log_pred = (
        pipeline.predict(x, alpha_index=best_alpha_index) + x["offset"].to_numpy()
    )
    return np.exp(log_pred) * adjustment[x["store"]].to_numpy()


def _predict_baseline(x):
    """Baseline prediction on the sales scale."""
    return np.exp(pipeline_year_month_day_store.predict(x))


# Predictors compared in the plots, keyed by the label passed to make_plots.
pipelines = {"glm": _predict_glm, "baseline": _predict_baseline}

plots = make_plots(
    df=df_plot,
    sample_for_pd=1000,
    pipelines=pipelines,
    features=["year"],  # pipeline["select"].columns_,
    outcome_column="sales",
    facet_column="sample",  # the plotter breaks without it
    facet_order=["train", "val", "test"],
    output="altair",
)

In [None]:
# Display the plot that make_plots produced for the "year" feature.
plots["year"]