# Setup

In [30]:
# Clone Repo
!git clone -b temp https://Philst4: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@github.com/Philst4/Store-Sales.git

Cloning into 'Store-Sales'...
remote: Enumerating objects: 490, done.[K
remote: Counting objects: 100% (226/226), done.[K
remote: Compressing objects: 100% (143/143), done.[K
remote: Total 490 (delta 144), reused 154 (delta 83), pack-reused 264 (from 1)[K
Receiving objects: 100% (490/490), 7.43 MiB | 29.48 MiB/s, done.
Resolving deltas: 100% (290/290), done.


In [31]:
# Navigate to root directory of project
# NOTE(review): this is a relative `cd`, so it depends on the current working
# directory. The saved output shows /content/Store-Sales/Store-Sales, presumably
# because the clone + cd cells were re-run without restarting the runtime.
%cd Store-Sales

/content/Store-Sales/Store-Sales


In [32]:
# Check project structure
# (the saved output lists config.yaml, environment.yml, experiment_configs,
# notebooks, scratch.py, scripts and src)
!ls

config.yaml	 experiment_configs  scratch.py  src
environment.yml  notebooks	     scripts


In [33]:
# Mount to GDrive (for reading and writing data)
# Colab-only. Later cells read from and write to
# /content/drive/MyDrive/store_sales_data, so this must run before them.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
# Read in raw data from GDrive to working environment; check our data
!mkdir ./data
!cp -r /content/drive/MyDrive/store_sales_data/raw ./data/raw
!ls data

raw


In [35]:
# Confirm the raw CSV files were copied into the working directory
!ls data/raw

holidays_events.csv  sample_submission.csv  test.csv   transactions.csv
oil.csv		     stores.csv		    train.csv


In [36]:
# data/clean is only created later, in the "Clean/Process Raw Data" section,
# so this listing is expected to fail at this point on a fresh run.
!ls data/clean

ls: cannot access 'data/clean': No such file or directory


In [37]:
import pandas as pd
import numpy as np

# Load the raw training data, parse `date` into a real datetime column and
# add `log_sales` = log(1 + sales) next to the raw `sales` column.
train = (
    pd.read_csv("./data/raw/train.csv")
    .assign(
        date=lambda frame: pd.to_datetime(frame['date'], format="%Y-%m-%d"),
        log_sales=lambda frame: np.log1p(frame['sales']),
    )
)

# Quick sanity checks: (rows, columns), number of distinct dates, column dtypes
print(train.shape)
print(train['date'].nunique())
train.dtypes

(3000888, 7)
1684


Unnamed: 0,0
id,int64
date,datetime64[ns]
store_nbr,int64
family,object
sales,float64
onpromotion,int64
log_sales,float64


In [None]:
def calc_daily_stats(
    df,
    cols_to_roll,
    group_cols,
    supported_stats,
    quantiles
):
    """Summarise `cols_to_roll` per date (and per `group_cols` combination).

    Parameters
    ----------
    df : DataFrame
        Must contain a 'date' column plus every column named in
        `cols_to_roll` and `group_cols`.
    cols_to_roll : list of str
        Columns to aggregate.
    group_cols : list of str
        Extra grouping columns; may be empty to aggregate per date only.
    supported_stats : list of str
        Aggregation names passed straight to pandas `agg` (e.g. 'mean').
    quantiles : list of str
        Labels such as 'q25' or 'q99.9'; everything after the first
        character is parsed as a percentage.

    Returns
    -------
    DataFrame
        Indexed by 'date'. Columns are `group_cols` followed by one
        '<col>_<stat>' column per (column, statistic) pair, with a
        '_wrt_<group cols joined by _>' suffix when `group_cols` is non-empty.
    """
    ordered = df.sort_values(by=group_cols + ['date'])

    # One callable per quantile label. `q=label` binds the label at definition
    # time; without it every lambda would see the last label of the loop.
    quantile_fns = {
        label: (lambda x, q=label: np.quantile(x, float(q[1:]) / 100))
        for label in quantiles
    }
    aggregations = supported_stats + list(quantile_fns.values())

    per_day = ordered.groupby(['date'] + group_cols, observed=True)[cols_to_roll]
    daily_stats = per_day.agg(aggregations).reset_index().set_index('date')

    # Replace the (column, statistic) MultiIndex with flat names. The order
    # mirrors `agg`: for each column, named stats first, then the quantiles.
    suffix = f"_wrt_{'_'.join(group_cols)}" if group_cols else ""
    stat_labels = supported_stats + quantiles
    daily_stats.columns = [
        *group_cols,
        *(f"{col}_{stat}{suffix}" for col in cols_to_roll for stat in stat_labels),
    ]

    return daily_stats

# Demo configuration: summarise raw and log sales across the whole data set
cols_to_roll = ['sales', 'log_sales']
group_cols = []  # no extra grouping -> one row of statistics per date
supported_stats = ['mean', 'std', 'min', 'max']
quantiles = ['q0.1', 'q1', 'q5', 'q25', 'q50', 'q75', 'q95', 'q99', 'q99.9']

daily_stats = calc_daily_stats(
    df=train,
    cols_to_roll=cols_to_roll,
    group_cols=group_cols,
    supported_stats=supported_stats,
    quantiles=quantiles,
)
print(daily_stats.shape)
daily_stats.head(10)

In [None]:
def roll_daily_stats(daily_stats, group_cols, lag=16, window=1):
    """Smooth daily statistics with a rolling mean, then lag them.

    Parameters
    ----------
    daily_stats : DataFrame
        Indexed by date (index named 'date'), holding `group_cols` plus the
        statistic columns, e.g. the output of `calc_daily_stats`.
    group_cols : list of str
        Columns identifying independent series; rolling and shifting are done
        separately per group. May be empty.
    lag : int
        Number of rows each (per-group) series is shifted by. The shift is
        positional, not calendar based.
    window : int
        Rolling window length in rows; partial windows are allowed
        (`min_periods=1`).

    Returns
    -------
    DataFrame
        With 'date' and `group_cols` as ordinary columns, and every statistic
        column renamed to '<name>_lag<lag>_window<window>'. Missing values,
        including the rows emptied by the shift, are filled with 0.
    """
    ordered = daily_stats.sort_values(by=group_cols).sort_index()

    # Everything that is not a grouping key is a statistic to be rolled
    stat_cols = [name for name in ordered.columns if name not in group_cols]

    if not group_cols:
        smoothed = ordered[stat_cols].rolling(window=window, min_periods=1).mean()
        lagged = smoothed.shift(lag)
    else:
        smoothed = (
            ordered
            .groupby(group_cols, group_keys=False)[stat_cols]
            .rolling(window=window, min_periods=1)
            .mean()
        )
        # Shift inside each group so values never leak across groups
        lagged = smoothed.groupby(level=group_cols).shift(lag)

    rolling_stats = lagged.fillna(0).reset_index()

    # Tag every statistic column with its lag/window; keys keep their names
    key_cols = ['date'] + group_cols
    tag = f"_lag{lag}_window{window}"
    rolling_stats.columns = [
        name if name in key_cols else name + tag
        for name in list(rolling_stats.columns)
    ]
    return rolling_stats


# Roll the ungrouped daily statistics using the default lag and window
rolling_stats = roll_daily_stats(
    daily_stats=daily_stats,
    group_cols=group_cols,
)
print(group_cols)
print(rolling_stats.shape)
rolling_stats.head(20)

In [None]:
def compute_rolling_stats(
        df,
        cols_to_roll,
        group_cols,
        supported_stats,
        quantiles,
        lag=16,
        window=1
    ):
    """Run the full pipeline: daily aggregation followed by rolling + lagging.

    Convenience wrapper that feeds the output of `calc_daily_stats` into
    `roll_daily_stats`. `df`, `cols_to_roll`, `group_cols`, `supported_stats`
    and `quantiles` are forwarded to the former; `group_cols`, `lag` and
    `window` to the latter. Returns whatever `roll_daily_stats` returns.
    """
    return roll_daily_stats(
        calc_daily_stats(df, cols_to_roll, group_cols, supported_stats, quantiles),
        group_cols,
        lag=lag,
        window=window,
    )

# Same result as the two-step version above, through the one-call wrapper
rolling_stats = compute_rolling_stats(
    df=train,
    cols_to_roll=cols_to_roll,
    group_cols=group_cols,
    supported_stats=supported_stats,
    quantiles=quantiles,
    lag=16,
    window=1,
)
print(group_cols)
print(rolling_stats.shape)
rolling_stats.head(20)

In [None]:
# Repeat the demo grouped by store, restricted to stores 1-3 to keep it small
cols_to_roll = ['sales', 'log_sales']
group_cols = ['store_nbr']
supported_stats = ['mean', 'std', 'min', 'max']
quantiles = ['q0.1', 'q1', 'q5', 'q25', 'q50', 'q75', 'q95', 'q99', 'q99.9']

daily_stats = calc_daily_stats(
    df=train[train['store_nbr'].isin([1, 2, 3])],
    cols_to_roll=cols_to_roll,
    group_cols=group_cols,
    supported_stats=supported_stats,
    quantiles=quantiles,
)
print(daily_stats.shape)
daily_stats.head(10)

In [None]:
# Roll the per-store daily statistics; view ordered by date, then store
rolling_stats = roll_daily_stats(
    daily_stats=daily_stats,
    group_cols=group_cols,
)
print(group_cols)
print(rolling_stats.shape)
rolling_stats.sort_values(by=['date'] + group_cols).head(60)

In [None]:
# One-call version of the per-store pipeline for stores 1-3
rolling_stats = compute_rolling_stats(
    df=train[train['store_nbr'].isin([1, 2, 3])],
    cols_to_roll=cols_to_roll,
    group_cols=group_cols,
    supported_stats=supported_stats,
    quantiles=quantiles,
    lag=16,
    window=1,
)
print(group_cols)
print(rolling_stats.shape)
rolling_stats.sort_values(by=['date'] + group_cols).head(60)


# Clean/Process Raw Data

In [None]:
if False:
    !python scripts/process_data.py
else:
    !cp -r /content/drive/MyDrive/store_sales_data/clean2 ./data/clean

In [None]:
# data/ should now contain both raw and clean
!ls data

In [None]:
# Raw inputs copied from Drive
!ls data/raw

In [None]:
# Clean data produced or restored by the cell above; a later cell reads
# ./data/clean/manifest.json from here.
!ls data/clean

In [None]:
# Write clean data back to drive
!rm -r /content/drive/MyDrive/store_sales_data/clean
!cp -r ./data/clean /content/drive/MyDrive/store_sales_data/

In [None]:
!cp -r ./data/clean /content/drive/MyDrive/store_sales_data/clean2

In [None]:
from src.io_utils import load_and_merge_from_manifest

# Load a small sample of the merged clean data described by the manifest.
# NOTE(review): `sample=0.001` is presumably a 0.1% row fraction, and `ddf`
# looks like a lazy (dask-style) frame given the `.compute()` call below;
# confirm both against src/io_utils.
ddf = load_and_merge_from_manifest("./data/clean/manifest.json", sample=0.001)
df = ddf.compute()

# Inspect schema, size and the first few rows of the materialised sample
print(df.dtypes)
print(df.shape)
df.head()

# Tune Model

In [None]:
!pip install optuna -q
!pip install mlflow -q

In [None]:
# Restore the saved studies database (optuna_studies.db) from Drive into the
# project root.
!cp -r /content/drive/MyDrive/store_sales_data/optuna_studies.db ./optuna_studies.db
# Restoring mlruns is currently disabled:
#!cp -r /content/drive/MyDrive/store_sales_data/mlruns ./mlruns
!ls

In [None]:
# Model tuning coarse search
# Disabled by default (`if False`); change the condition to True to launch
# the tuning script with the settings below.
if False:
    !python scripts/tune_model.py --sample 0.025 --n_trials 15 --n_backtests 8 --valset_size 16 --n_jobs -1
# Tuning with 2.5% sample is about 75_000 training samples.

In [None]:
# Write the studies database back to drive
# (only optuna_studies.db is copied here; mlruns is not written back)
!cp -r ./optuna_studies.db /content/drive/MyDrive/store_sales_data/optuna_studies.db

# Fit Best Model

In [29]:
# Train with the best trial's hyperparameters (per the script's log output).
# NOTE(review): the saved output below ends in a truncated traceback raised
# during "Training model on chunk...", so the last recorded run did not finish.
# Re-run and inspect the full error before relying on the model.
!python scripts/train_best.py

Loading experiment config from 'experiment_configs.xgb'...

--- Training using following trial.... ---
Best trial number: 14
Best value (objective/loss): 0.5287987552583218
Best hyperparameters:
 * n_estimators: 1682
 * max_depth: 7
 * learning_rate: 0.018289572550180128
 * subsample: 0.6463003563392685
 * colsample_bytree: 0.6741625687416009
 * reg_lambda: 4.574415260540854
 * gamma: 0.05638242999262061
 * min_child_weight: 10
 * seed: 42
 * objective: reg:squarederror
 * eval_metric: rmse
 * tree_method: hist
 * enable_categorical: True
 * device: cpu
 * max_bin: 256
 * early_stopping_rounds: 100


 -- Training Iteration 1/10 (sampling 10.00% of data) --
Locating 'main data' chunk...
Locating 'secondary_data' chunks...: 100% 3/3 [00:00<00:00, 16.88it/s]
Locating 'rolling_stats' chunks...: 100% 35/35 [00:02<00:00, 17.09it/s]
Loading chunk into memory...
Splitting train/test...
Training model on chunk...
Traceback (most recent call last):
  File "/content/Store-Sales/scripts/train_best

# Make Submission

In [None]:
# Generate the submission via the project script.
# NOTE(review): presumably requires the model produced by scripts/train_best.py;
# confirm, since the last recorded training run ended in a traceback.
!python scripts/make_submission.py