# Simple GLM

In [1]:
import os
from pathlib import Path

import altair as alt
import numpy as np
import pandas as pd
from quantcore.glm import GeneralizedLinearRegressor
from quantcore.learn.metrics import gini_score
from quantcore.learn.plotting import make_plots
from quantcore.learn.preprocessing import Categorizer, FeatureSelector
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.pipeline import Pipeline

from rossmann import load_test, load_train, root_mean_squared_percentage_error
from rossmann.feature_engineering import (
    compute_age_quantile,
    compute_competition_open,
    compute_open_lag,
    compute_open_lead,
    compute_promo_lag,
    compute_promo_lead,
    compute_school_holiday_lag,
    compute_school_holiday_lead,
    compute_state_holiday_lag,
    compute_state_holiday_lead,
    compute_store_day_of_week,
    compute_store_month,
    compute_store_school_holiday,
    compute_store_state_holiday,
    compute_store_year,
    compute_zscore,
)

from process_data import process_data

# Notebook-wide display configuration.
# Show floats in pandas output with three decimal places.
pd.set_option("display.float_format", "{:.3f}".format)
# Switch Altair to its "json" data transformer (to allow for large plots).
alt.data_transformers.enable("json")

DataTransformerRegistry.enable('json')

In [2]:
# Fail fast when the raw inputs are missing; they must be downloaded by hand.
raw_files = ["raw_data/train.csv", "raw_data/test.csv", "raw_data/store.csv"]
if not all(os.path.exists(p) for p in raw_files):
    raise Exception("Please download raw data into 'raw_data' folder")

# Run the processing step when either processed parquet file is absent.
processed_files = ["processed_data/train.parquet", "processed_data/test.parquet"]
if not all(os.path.exists(p) for p in processed_files):
    # A bare string expression inside an `if` body is never displayed by the
    # notebook, so the status messages are printed explicitly.
    print("Processed data not found. Processing data from raw data...")
    process_data()
    print("Done")

In [3]:
# Stack the train and test sets (tagging each row's origin in `sample`),
# order the rows by store and date, then keep only the first 10% of rows.
df = (
    pd.concat(
        [
            loader().assign(sample=label)
            for label, loader in (("train", load_train), ("test", load_test))
        ],
        ignore_index=True,
    )
    .sort_values(["store", "date"])
    .pipe(lambda frame: frame.iloc[: int(0.1 * len(frame))])
)
df.head()

Unnamed: 0,store,day_of_week,date,sales,customers,open,promo,state_holiday,school_holiday,year,...,assortment,competition_distance,competition_open_since_month,competition_open_since_year,promo2,promo2_since_week,promo2_since_year,promo_interval,sample,id
0,1,2,2013-01-01,0.0,0.0,False,0,a,1,2013,...,a,1270.0,9.0,2008.0,0,,,,train,
1,1,3,2013-01-02,5530.0,668.0,True,0,0,1,2013,...,a,1270.0,9.0,2008.0,0,,,,train,
2,1,4,2013-01-03,4327.0,578.0,True,0,0,1,2013,...,a,1270.0,9.0,2008.0,0,,,,train,
3,1,5,2013-01-04,4486.0,619.0,True,0,0,1,2013,...,a,1270.0,9.0,2008.0,0,,,,train,
4,1,6,2013-01-05,4997.0,635.0,True,0,0,1,2013,...,a,1270.0,9.0,2008.0,0,,,,train,


In [4]:
# Feature engineering: every statement adds one column to `df`.
df["age_quantile"] = compute_age_quantile(df, 5)
df["competition_open"] = compute_competition_open(df)
df["count"] = df.groupby("store")[["date"]].transform("cumcount")

# Rows with zero sales make np.log return -inf and emit a divide-by-zero
# RuntimeWarning. The values are left as they are (the training selection
# later keeps only rows with sales > 0); only the warning is silenced.
with np.errstate(divide="ignore"):
    df["log_sales"] = np.log(df["sales"])

# Lagged / leading indicators, three steps in each direction.
# NOTE(review): the lead helpers receive negative steps while the lag helpers
# receive positive ones -- confirm this matches the helpers' shift convention.
df["open_lag_1"] = compute_open_lag(df)
df["open_lag_2"] = compute_open_lag(df, 2)
df["open_lag_3"] = compute_open_lag(df, 3)
df["open_lead_1"] = compute_open_lead(df)
df["open_lead_2"] = compute_open_lead(df, -2)
df["open_lead_3"] = compute_open_lead(df, -3)
df["promo_lag_1"] = compute_promo_lag(df)
df["promo_lag_2"] = compute_promo_lag(df, 2)
df["promo_lag_3"] = compute_promo_lag(df, 3)
df["promo_lead_1"] = compute_promo_lead(df)
df["promo_lead_2"] = compute_promo_lead(df, -2)
df["promo_lead_3"] = compute_promo_lead(df, -3)
df["school_holiday_lag_1"] = compute_school_holiday_lag(df)
df["school_holiday_lag_2"] = compute_school_holiday_lag(df, 2)
df["school_holiday_lag_3"] = compute_school_holiday_lag(df, 3)
df["school_holiday_lead_1"] = compute_school_holiday_lead(df)
df["school_holiday_lead_2"] = compute_school_holiday_lead(df, -2)
df["school_holiday_lead_3"] = compute_school_holiday_lead(df, -3)
df["state_holiday_lag_1"] = compute_state_holiday_lag(df)
df["state_holiday_lag_2"] = compute_state_holiday_lag(df, 2)
df["state_holiday_lag_3"] = compute_state_holiday_lag(df, 3)
df["state_holiday_lead_1"] = compute_state_holiday_lead(df)
df["state_holiday_lead_2"] = compute_state_holiday_lead(df, -2)
df["state_holiday_lead_3"] = compute_state_holiday_lead(df, -3)

# Store-level interaction columns.
df["store_day_of_week"] = compute_store_day_of_week(df)
df["store_month"] = compute_store_month(df)
df["store_school_holiday"] = compute_store_school_holiday(df)
df["store_state_holiday"] = compute_store_state_holiday(df)
df["store_year"] = compute_store_year(df)

# Used further down to drop training rows with |zscore| >= 5.
df["zscore"] = compute_zscore(df, window=150)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [5]:
# Preview the first rows of the frame now that the feature columns are added.
df.head()

Unnamed: 0,store,day_of_week,date,sales,customers,open,promo,state_holiday,school_holiday,year,...,state_holiday_lag_3,state_holiday_lead_1,state_holiday_lead_2,state_holiday_lead_3,store_day_of_week,store_month,store_school_holiday,store_state_holiday,store_year,zscore
0,1,2,2013-01-01,0.0,0.0,False,0,a,1,2013,...,0,0,0,0,1_2,1_1,1_1,1_True,1_2013,
1,1,3,2013-01-02,5530.0,668.0,True,0,0,1,2013,...,0,0,0,0,1_3,1_1,1_1,1_False,1_2013,
2,1,4,2013-01-03,4327.0,578.0,True,0,0,1,2013,...,0,0,0,0,1_4,1_1,1_1,1_False,1_2013,
3,1,5,2013-01-04,4486.0,619.0,True,0,0,1,2013,...,a,0,0,0,1_5,1_1,1_1,1_False,1_2013,
4,1,6,2013-01-05,4997.0,635.0,True,0,0,1,2013,...,0,0,0,0,1_6,1_1,1_1,1_False,1_2013,


In [6]:
# (rows, columns) of the working frame.
df.shape

(105829, 56)

In [7]:
# [start, end] of the hold-out period used for validation.
validation_window = [pd.Timestamp("2015-06-15"), pd.Timestamp("2015-07-31")]

In [8]:
# Boolean row masks (NumPy arrays aligned with df's row order).

# Training rows: labelled data with positive sales, dated before the
# validation window, and with |zscore| below 5.
select_train = (
    (df["sample"] == "train")
    & (df["sales"] > 0)
    & (df["date"] < validation_window[0])
    & (df["zscore"].abs() < 5)
).to_numpy()

# Validation rows: labelled data with positive sales inside the window.
# The window start is included; the window end date itself is excluded.
select_val = (
    (df["sample"] == "train")
    & (df["sales"] > 0)
    & (df["date"] >= validation_window[0])
    & (df["date"] < validation_window[1])
).to_numpy()

In [9]:
# Number of rows selected for training and for validation, respectively.
(select_train.sum(), select_val.sum())

(65996, 4446)

In [10]:
# Baseline model: a normal-family GLM on year, month, day of week and store,
# all treated as categoricals. Its predictions are used later as an offset.
categorical_features = {"year", "month", "day_of_week", "store"}
numeric_features = set()

# NOTE(review): `alphas` is passed while `alpha_search` is left at its
# default -- confirm the estimator honours it in that mode (otherwise the
# penalty may need to be given through `alpha`).
pipeline_year_month_day_store = Pipeline(
    steps=[
        ("select", FeatureSelector(categorical_features | numeric_features)),
        ("categorize", Categorizer(categorical_features)),
        (
            "estimate",
            GeneralizedLinearRegressor(
                family="normal", scale_predictors=True, l1_ratio=0.0, alphas=[1e-8]
            ),
        ),
    ]
)

In [11]:
# Fit the baseline on the training rows, with log sales as the target.
train_rows = df.loc[select_train]
pipeline_year_month_day_store.fit(train_rows, train_rows["log_sales"])

# Store the baseline prediction as `offset` for every row where the store is
# open; rows where it is closed are not assigned (they show as NaN).
is_open = df["open"]
df.loc[is_open, "offset"] = pipeline_year_month_day_store.predict(df.loc[is_open])


In [12]:
# Inspect the baseline offset column (log scale).
df["offset"]

0          NaN
1        8.548
2        8.550
3        8.578
4        8.448
          ... 
101792     NaN
101793     NaN
101794   8.843
101795   8.713
101796     NaN
Name: offset, Length: 105829, dtype: float64

In [18]:
# Validation RMSPE of the baseline alone: exponentiate the log-scale offset
# to get back to the sales scale before scoring.
val_rows = df.loc[select_val]
root_mean_squared_percentage_error(val_rows["sales"], np.exp(val_rows["offset"]))

0.3081770078583425

In [19]:
# Main model: every engineered feature enters as a categorical.
# The first set lists the stand-alone columns; the comprehension generates
# the "<base>_lag_<k>" / "<base>_lead_<k>" names for k = 1..3.
categorical_features = {
    "age_quantile",
    "competition_open",
    "promo",
    "school_holiday",
    "state_holiday",
    "store_day_of_week",
    "store_month",
    "store_school_holiday",
    "store_state_holiday",
    "store_year",
} | {
    f"{base}_{direction}_{step}"
    for base in ("open", "promo", "school_holiday", "state_holiday")
    for direction in ("lag", "lead")
    for step in (1, 2, 3)
}

numeric_features = set()

# Select -> categorize -> ridge-penalised GLM fitted over a grid of alphas.
pipeline = Pipeline(
    steps=[
        ("select", FeatureSelector(categorical_features | numeric_features)),
        (
            "categorize",
            Categorizer(categorical_features, unseen_method="most_frequent"),
        ),
        (
            "estimate",
            GeneralizedLinearRegressor(
                family="normal",
                scale_predictors=True,
                warm_start=True,
                alpha_search=True,
                l1_ratio=0.0,  # only ridge for now
                alphas=[1e-4, 1e-3, 1e-2, 5e-2, 1e-1, 5e-1, 1, 10],
            ),
        ),
    ]
)

In [20]:
# Fit the main pipeline on the training rows. The baseline prediction is
# forwarded to the "estimate" step as an offset. Assigning to `_` keeps the
# fitted pipeline's repr out of the cell output.
train_rows = df.loc[select_train]
_ = pipeline.fit(
    train_rows,
    train_rows["log_sales"],
    estimate__offset=train_rows["offset"],
)

In [21]:
# Peek at the first ten feature names recorded by the fitted estimator.
pipeline["estimate"].feature_names_[:10]

['store_year__100_2013',
 'store_year__100_2014',
 'store_year__100_2015',
 'store_year__101_2013',
 'store_year__101_2014',
 'store_year__101_2015',
 'store_year__102_2013',
 'store_year__102_2014',
 'store_year__102_2015',
 'store_year__103_2013']

In [22]:
# Score every alpha of the fitted estimator on the validation rows and
# collect one record of metrics per alpha.
# Side effects: df["pred"] and `adjustment` are overwritten on every pass and
# hold the values for the last alpha once the loop has finished.
is_open = df["open"]  # loop-invariant mask of rows where the store is open
metric_records = []
for alpha_index, alpha in enumerate(pipeline["estimate"]._alphas):

    # predict for open and closed stores separately; the model works on the
    # log scale on top of the baseline offset, hence offset + exp
    df.loc[is_open, "pred"] = np.exp(
        pipeline.predict(df.loc[is_open], alpha_index=alpha_index)
        + df.loc[is_open, "offset"].to_numpy()
    )
    df.loc[~is_open, "pred"] = 0.0

    # per-store rescaling factor so that the mean prediction over each
    # store's last 100 training rows equals the mean of the actual sales
    adjustment = (
        df.loc[select_train, ["store", "sales", "pred"]]
        .groupby("store")
        .apply(lambda x: x["sales"].tail(100).mean() / x["pred"].tail(100).mean())
    )

    pred_adj = df["pred"] * adjustment[df["store"]].to_numpy()
    sales_val = df.loc[select_val, "sales"]
    pred_adj_val = pred_adj[select_val]

    metric_records.append(
        {
            "alpha_index": alpha_index,
            "alpha": alpha,
            "gini": gini_score(sales_val, pred_adj_val),
            # take the root explicitly: the `squared=False` argument of
            # mean_squared_error is deprecated in recent scikit-learn releases
            "rmse": np.sqrt(mean_squared_error(sales_val, pred_adj_val)),
            "rmspe": root_mean_squared_percentage_error(sales_val, pred_adj_val),
            "rmsle": np.sqrt(mean_squared_log_error(sales_val, pred_adj_val)),
            "bias": pred_adj_val.mean() / sales_val.mean(),
        }
    )

# one row per alpha, in path order
metrics = pd.DataFrame(metric_records)
metrics

Unnamed: 0,alpha_index,alpha,gini,rmse,rmspe,rmsle,bias
0,0,0.0,0.948,964.512,0.13,0.123,1.013
1,1,0.001,0.948,965.735,0.13,0.124,1.012
2,2,0.01,0.948,965.105,0.13,0.123,1.011
3,3,0.05,0.948,947.876,0.128,0.122,1.005
4,4,0.1,0.949,935.512,0.126,0.122,1.002
5,5,0.5,0.948,926.008,0.129,0.125,0.993
6,6,1.0,0.944,955.733,0.138,0.133,0.992
7,7,10.0,0.881,1307.334,0.213,0.192,0.997


In [23]:
# Pick the alpha with the lowest validation RMSPE.
metrics_by_rmspe = metrics.sort_values("rmspe")
best_alpha_index = metrics_by_rmspe["alpha_index"].iloc[0]
best_alpha = pipeline["estimate"]._alphas[best_alpha_index]

In [24]:
# Save the prediction of the best alpha.
# Open rows: model output plus baseline offset, mapped back from the log
# scale. Closed rows: predicted sales are set to zero.
is_open = df["open"]
log_pred_open = (
    pipeline.predict(df.loc[is_open], alpha_index=best_alpha_index)
    + df.loc[is_open, "offset"].to_numpy()
)
df.loc[is_open, "pred"] = np.exp(log_pred_open)
df.loc[~is_open, "pred"] = 0.0

# Per-store rescaling factor: ratio of mean actual sales to mean prediction
# over the last 100 training rows of each store.
train_preds = df.loc[select_train, ["store", "sales", "pred"]]
adjustment = train_preds.groupby("store").apply(
    lambda x: x["sales"].tail(100).mean() / x["pred"].tail(100).mean()
)

df["pred_adj"] = df["pred"] * adjustment[df["store"]].to_numpy()

In [25]:
# Rows to plot: open store-days that belong to the train or validation masks.
plot_rows = df["open"] & (select_train | select_val)

df_plot = df.assign(
    # relabel validation rows so the plot facets by train / val
    sample=df["sample"].where(~select_val, "val"),
    # fill missings in the outcome with the mean so that the plots look pretty
    sales=df["sales"].fillna(df["sales"].mean()),
).loc[plot_rows]

# Prediction callables handed to the plotter, keyed by display name.
pipelines = {
    # final model: exp(model output + offset), rescaled per store
    "glm": lambda x: (
        np.exp(
            pipeline.predict(x, alpha_index=best_alpha_index)
            + x["offset"].to_numpy()
        )
        * adjustment[x["store"]].to_numpy()
    ),
    # baseline model on its own, mapped back from the log scale
    "baseline": lambda x: np.exp(pipeline_year_month_day_store.predict(x)),
}

plots = make_plots(
    df=df_plot,
    sample_for_pd=1000,
    pipelines=pipelines,
    features=["year"],  # pipeline["select"].columns_,
    outcome_column="sales",
    facet_column="sample",  # the plotter breaks without it
    facet_order=["train", "val", "test"],
    output="altair",
)

In [26]:
# Display the plot produced for the "year" feature.
plots["year"]