# Setup

In [1]:
# Clone Repo
!git clone -b temp https://Philst4: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@github.com/Philst4/Store-Sales.git

Cloning into 'Store-Sales'...
remote: Enumerating objects: 444, done.[K
remote: Counting objects: 100% (180/180), done.[K
remote: Compressing objects: 100% (111/111), done.[K
remote: Total 444 (delta 111), reused 127 (delta 69), pack-reused 264 (from 1)[K
Receiving objects: 100% (444/444), 7.39 MiB | 34.54 MiB/s, done.
Resolving deltas: 100% (257/257), done.


In [2]:
# Navigate to root directory of project
# The path is relative, so this assumes the current directory is the one the
# repo was cloned into.
%cd Store-Sales

/content/Store-Sales


In [3]:
# Check project structure: list the contents of the current directory
!ls

config.yaml	 experiment_configs  scratch.py  src
environment.yml  notebooks	     scripts


In [4]:
# Mount to GDrive (for reading and writing data)
# Drive is mounted at /content/drive; other cells in this notebook read and
# write under /content/drive/MyDrive/store_sales_data.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Read in raw data from GDrive to working environment; check our data
!mkdir ./data
!cp -r /content/drive/MyDrive/store_sales_data/raw ./data/raw
!ls data

raw


In [6]:
# List the raw files that were copied into ./data/raw
!ls data/raw

holidays_events.csv  sample_submission.csv  test.csv   transactions.csv
oil.csv		     stores.csv		    train.csv


In [7]:
# Check for processed data. In the recorded run ./data/clean did not exist yet
# at this point, so `ls` reported an error.
!ls data/clean

ls: cannot access 'data/clean': No such file or directory


In [8]:
import pandas as pd
import numpy as np

# Load the raw training data, parse `date` as a datetime, and append a
# log1p-transformed copy of `sales` as `log_sales`.
train = pd.read_csv("./data/raw/train.csv").assign(
    date=lambda frame: pd.to_datetime(frame['date'], format="%Y-%m-%d"),
    log_sales=lambda frame: np.log1p(frame['sales']),
)

# Sanity checks: frame shape, number of distinct dates, column dtypes
print(train.shape)
print(train.date.nunique())
train.dtypes

(3000888, 7)
1684


Unnamed: 0,0
id,int64
date,datetime64[ns]
store_nbr,int64
family,object
sales,float64
onpromotion,int64
log_sales,float64


In [9]:
def calc_daily_stats(
    df,
    cols_to_roll,
    group_cols,
    supported_stats,
    quantiles
):
    """Aggregate `cols_to_roll` per date (and per group) into summary statistics.

    Parameters
    ----------
    df : DataFrame with a `date` column, every column in `group_cols`, and
        every column in `cols_to_roll`.
    cols_to_roll : list of column names to aggregate.
    group_cols : list of extra grouping columns; may be empty, in which case
        rows are grouped by `date` only.
    supported_stats : list of aggregation names understood by pandas `agg`
        (e.g. 'mean', 'std').
    quantiles : list of labels of the form 'q<percent>' (e.g. 'q99.9'); the
        number after the leading character is divided by 100 and passed to
        `np.quantile`.

    Returns
    -------
    DataFrame indexed by `date` whose columns are `group_cols` followed by one
    column per (column, statistic) pair named `{col}_{stat}`, with the suffix
    `_wrt_{group cols joined by '_'}` when `group_cols` is non-empty.
    """
    ordered = df.sort_values(by=group_cols + ['date'])

    # One aggregator per quantile label. The label is bound through a default
    # argument so each lambda keeps its own value, and lambdas (rather than
    # named functions) are what the original aggregation list contained.
    quantile_aggs = {
        label: (lambda values, label=label: np.quantile(values, float(label[1:]) / 100))
        for label in quantiles
    }

    grouped = ordered.groupby(['date'] + group_cols, observed=True)[cols_to_roll]
    aggregated = grouped.agg(supported_stats + list(quantile_aggs.values()))
    daily_stats = aggregated.reset_index().set_index('date')

    # Replace the (column, statistic) MultiIndex with flat, descriptive names.
    # The order mirrors the aggregation order: group columns first, then every
    # statistic for each rolled column in turn.
    suffix = f"_wrt_{'_'.join(group_cols)}" if group_cols else ""
    stat_labels = supported_stats + quantiles
    flat_names = group_cols.copy()
    flat_names.extend(
        f"{col}_{stat}{suffix}"
        for col in cols_to_roll
        for stat in stat_labels
    )
    daily_stats.columns = flat_names

    return daily_stats

# Ungrouped run: with no group columns the statistics are computed per date
# over every row of `train`.
cols_to_roll = ['sales', 'log_sales']
group_cols = []
supported_stats = ['mean', 'std', 'min', 'max']
# Quantile labels are 'q<percent>'
quantiles = ['q0.1', 'q1', 'q5', 'q25', 'q50', 'q75', 'q95', 'q99', 'q99.9']

daily_stats = calc_daily_stats(
    df=train,
    cols_to_roll=cols_to_roll,
    group_cols=group_cols,
    supported_stats=supported_stats,
    quantiles=quantiles,
)
print(daily_stats.shape)
daily_stats.head(10)

(1684, 26)


Unnamed: 0_level_0,sales_mean,sales_std,sales_min,sales_max,sales_q0.1,sales_q1,sales_q5,sales_q25,sales_q50,sales_q75,...,log_sales_max,log_sales_q0.1,log_sales_q1,log_sales_q5,log_sales_q25,log_sales_q50,log_sales_q75,log_sales_q95,log_sales_q99,log_sales_q99.9
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,1.409438,26.658379,0.0,810.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.698268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.520495
2013-01-02,278.390807,926.074635,0.0,11422.0,0.0,0.0,0.0,0.0,0.0,117.78275,...,9.343384,0.0,0.0,0.0,0.0,0.0,4.777292,7.235962,8.319721,9.294081
2013-01-03,202.840197,644.7027,0.0,7342.0,0.0,0.0,0.0,0.0,0.0,97.965,...,8.901503,0.0,0.0,0.0,0.0,0.0,4.594766,6.953918,8.100629,8.845804
2013-01-04,198.911154,632.73451,0.0,7527.0,0.0,0.0,0.0,0.0,0.0,94.72825,...,8.926385,0.0,0.0,0.0,0.0,0.0,4.561504,6.96616,8.046446,8.890909
2013-01-05,267.873244,893.800062,0.0,10827.0,0.0,0.0,0.0,0.0,0.0,123.778,...,9.289891,0.0,0.0,0.0,0.0,0.0,4.826536,7.25443,8.371244,9.268304
2013-01-06,291.636028,1024.703786,0.0,13428.0,0.0,0.0,0.0,0.0,0.0,112.0,...,9.505172,0.0,0.0,0.0,0.0,0.0,4.727269,7.277904,8.508677,9.421014
2013-01-07,188.6211,600.481657,0.0,7059.0,0.0,0.0,0.0,0.0,0.0,83.0,...,8.8622,0.0,0.0,0.0,0.0,0.0,4.430817,6.91656,7.95805,8.768851
2013-01-08,178.64634,692.336863,0.0,19849.0,0.0,0.0,0.0,0.0,0.0,80.956501,...,9.895959,0.0,0.0,0.0,0.0,0.0,4.406188,6.849193,7.915916,8.533056
2013-01-09,169.770375,542.906126,0.0,7546.0,0.0,0.0,0.0,0.0,0.0,74.9665,...,8.928905,0.0,0.0,0.0,0.0,0.0,4.330289,6.821442,7.96789,8.653729
2013-01-10,145.332213,439.484645,0.0,5101.0,0.0,0.0,0.0,0.0,0.0,75.75,...,8.537388,0.0,0.0,0.0,0.0,0.0,4.340537,6.667768,7.793551,8.324194


In [10]:
def roll_daily_stats(daily_stats, group_cols, lag=16, window=1):
    """Smooth and lag daily statistics.

    Parameters
    ----------
    daily_stats : DataFrame whose index holds the dates and whose columns are
        `group_cols` followed by the statistic columns.
    group_cols : list of grouping columns; may be empty.
    lag : number of positions each (per-group) series is shifted forward.
    window : number of rows in the rolling-mean window (`min_periods=1`).

    Returns
    -------
    DataFrame with the former index levels turned back into columns and every
    statistic column renamed with a `_lag{lag}_window{window}` suffix. Values
    left undefined by the shift (or NaN in the input) are filled with 0.

    Notes
    -----
    Both `window` and `lag` are integer row counts, so they count observations
    rather than calendar days; the two coincide only when no dates are missing
    within a group.
    """
    ordered = daily_stats.sort_values(by=group_cols).sort_index()

    # Everything that is not a grouping key is a statistic to be rolled
    stat_cols = [name for name in ordered.columns if name not in group_cols]

    if group_cols:
        # Rolling mean inside each group, then shift inside each group so
        # values never leak from one group into the next.
        windowed = (
            ordered
            .groupby(group_cols, group_keys=False)[stat_cols]
            .rolling(window=window, min_periods=1)
            .mean()
        )
        lagged = windowed.groupby(level=group_cols).shift(lag)
    else:
        windowed = ordered[stat_cols].rolling(window=window, min_periods=1).mean()
        lagged = windowed.shift(lag)

    rolling_stats = lagged.fillna(0).reset_index()

    # Tag statistic columns with the lag/window used; keys keep their names
    keep_as_is = ['date'] + group_cols
    tag = f"_lag{lag}_window{window}"
    rolling_stats.columns = [
        name if name in keep_as_is else name + tag
        for name in list(rolling_stats.columns)
    ]
    return rolling_stats


# Roll the daily stats with the function's default lag and window
rolling_stats = roll_daily_stats(daily_stats=daily_stats, group_cols=group_cols)
print(group_cols)
print(rolling_stats.shape)
rolling_stats.head(20)

[]
(1684, 27)


Unnamed: 0,date,sales_mean_lag16_window1,sales_std_lag16_window1,sales_min_lag16_window1,sales_max_lag16_window1,sales_q0.1_lag16_window1,sales_q1_lag16_window1,sales_q5_lag16_window1,sales_q25_lag16_window1,sales_q50_lag16_window1,...,log_sales_max_lag16_window1,log_sales_q0.1_lag16_window1,log_sales_q1_lag16_window1,log_sales_q5_lag16_window1,log_sales_q25_lag16_window1,log_sales_q50_lag16_window1,log_sales_q75_lag16_window1,log_sales_q95_lag16_window1,log_sales_q99_lag16_window1,log_sales_q99.9_lag16_window1
0,2013-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2013-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2013-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2013-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2013-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2013-01-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2013-01-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2013-01-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2013-01-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2013-01-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
def compute_rolling_stats(
        df,
        cols_to_roll,
        group_cols,
        supported_stats,
        quantiles,
        lag=16,
        window=1
    ):
    """Compute daily statistics and then roll/lag them in one call.

    Convenience wrapper: forwards `df`, `cols_to_roll`, `group_cols`,
    `supported_stats` and `quantiles` to `calc_daily_stats`, then passes the
    result together with `group_cols`, `lag` and `window` to
    `roll_daily_stats`, whose output is returned unchanged.
    """
    return roll_daily_stats(
        calc_daily_stats(
            df,
            cols_to_roll,
            group_cols,
            supported_stats,
            quantiles
        ),
        group_cols,
        lag,
        window,
    )

# Same computation as the two-step version, via the combined wrapper
rolling_stats = compute_rolling_stats(
    df=train,
    cols_to_roll=cols_to_roll,
    group_cols=group_cols,
    supported_stats=supported_stats,
    quantiles=quantiles,
    lag=16,
    window=1,
)
print(group_cols)
print(rolling_stats.shape)
rolling_stats.head(20)

[]
(1684, 27)


Unnamed: 0,date,sales_mean_lag16_window1,sales_std_lag16_window1,sales_min_lag16_window1,sales_max_lag16_window1,sales_q0.1_lag16_window1,sales_q1_lag16_window1,sales_q5_lag16_window1,sales_q25_lag16_window1,sales_q50_lag16_window1,...,log_sales_max_lag16_window1,log_sales_q0.1_lag16_window1,log_sales_q1_lag16_window1,log_sales_q5_lag16_window1,log_sales_q25_lag16_window1,log_sales_q50_lag16_window1,log_sales_q75_lag16_window1,log_sales_q95_lag16_window1,log_sales_q99_lag16_window1,log_sales_q99.9_lag16_window1
0,2013-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2013-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2013-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2013-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2013-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2013-01-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2013-01-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2013-01-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2013-01-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2013-01-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Grouped run: statistics per date *and* store, on a three-store subset
cols_to_roll = ['sales', 'log_sales']
group_cols = ['store_nbr']
supported_stats = ['mean', 'std', 'min', 'max']
quantiles = ['q0.1', 'q1', 'q5', 'q25', 'q50', 'q75', 'q95', 'q99', 'q99.9']

daily_stats = calc_daily_stats(
    df=train[train['store_nbr'].isin([1, 2, 3])],
    cols_to_roll=cols_to_roll,
    group_cols=group_cols,
    supported_stats=supported_stats,
    quantiles=quantiles,
)
print(daily_stats.shape)
daily_stats.head(10)

(5052, 27)


Unnamed: 0_level_0,store_nbr,sales_mean_wrt_store_nbr,sales_std_wrt_store_nbr,sales_min_wrt_store_nbr,sales_max_wrt_store_nbr,sales_q0.1_wrt_store_nbr,sales_q1_wrt_store_nbr,sales_q5_wrt_store_nbr,sales_q25_wrt_store_nbr,sales_q50_wrt_store_nbr,...,log_sales_max_wrt_store_nbr,log_sales_q0.1_wrt_store_nbr,log_sales_q1_wrt_store_nbr,log_sales_q5_wrt_store_nbr,log_sales_q25_wrt_store_nbr,log_sales_q50_wrt_store_nbr,log_sales_q75_wrt_store_nbr,log_sales_q95_wrt_store_nbr,log_sales_q99_wrt_store_nbr,log_sales_q99.9_wrt_store_nbr
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-01-01,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-01-01,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-01-02,1,224.762061,517.978644,0.0,2652.0,0.0,0.0,0.0,0.0,10.0,...,7.883446,0.0,0.0,0.0,0.0,2.397895,5.273,6.978487,7.599389,7.855041
2013-01-02,2,311.112696,741.660043,0.0,3870.0,0.0,0.0,0.0,0.0,8.0,...,8.261268,0.0,0.0,0.0,0.0,2.197225,5.7301,7.280974,7.95403,8.230544
2013-01-02,3,729.101455,1690.976013,0.0,8463.0,0.0,0.0,0.0,0.0,15.0,...,9.043577,0.0,0.0,0.0,0.0,2.772589,6.43615,8.187457,8.840493,9.023269
2013-01-03,1,177.977091,415.027208,0.0,2121.0,0.0,0.0,0.0,0.0,3.0,...,7.660114,0.0,0.0,0.0,0.0,1.386294,5.036953,6.767644,7.392677,7.633371
2013-01-03,2,230.68303,533.587524,0.0,2640.0,0.0,0.0,0.0,0.0,5.0,...,7.878913,0.0,0.0,0.0,0.0,1.791759,5.231109,7.052173,7.675585,7.85858
2013-01-03,3,562.749849,1330.220234,0.0,6712.0,0.0,0.0,0.0,0.0,9.0,...,8.811801,0.0,0.0,0.0,0.0,2.302585,6.045467,7.941816,8.591089,8.78973
2013-01-04,1,179.390273,409.12581,0.0,2056.0,0.0,0.0,0.0,0.0,7.0,...,7.629004,0.0,0.0,0.0,0.0,2.079442,4.885911,6.775673,7.383135,7.604417


In [13]:
# Roll the per-store daily stats; view them date-first for readability
rolling_stats = roll_daily_stats(daily_stats=daily_stats, group_cols=group_cols)
print(group_cols)
print(rolling_stats.shape)
rolling_stats.sort_values(by=['date'] + group_cols).head(60)

['store_nbr']
(5052, 28)


Unnamed: 0,store_nbr,date,sales_mean_wrt_store_nbr_lag16_window1,sales_std_wrt_store_nbr_lag16_window1,sales_min_wrt_store_nbr_lag16_window1,sales_max_wrt_store_nbr_lag16_window1,sales_q0.1_wrt_store_nbr_lag16_window1,sales_q1_wrt_store_nbr_lag16_window1,sales_q5_wrt_store_nbr_lag16_window1,sales_q25_wrt_store_nbr_lag16_window1,...,log_sales_max_wrt_store_nbr_lag16_window1,log_sales_q0.1_wrt_store_nbr_lag16_window1,log_sales_q1_wrt_store_nbr_lag16_window1,log_sales_q5_wrt_store_nbr_lag16_window1,log_sales_q25_wrt_store_nbr_lag16_window1,log_sales_q50_wrt_store_nbr_lag16_window1,log_sales_q75_wrt_store_nbr_lag16_window1,log_sales_q95_wrt_store_nbr_lag16_window1,log_sales_q99_wrt_store_nbr_lag16_window1,log_sales_q99.9_wrt_store_nbr_lag16_window1
0,1,2013-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1684,2,2013-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3368,3,2013-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2013-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1685,2,2013-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3369,3,2013-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2013-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1686,2,2013-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3370,3,2013-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,2013-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Per-store rolling stats through the combined wrapper, same three-store subset
rolling_stats = compute_rolling_stats(
    df=train[train['store_nbr'].isin([1, 2, 3])],
    cols_to_roll=cols_to_roll,
    group_cols=group_cols,
    supported_stats=supported_stats,
    quantiles=quantiles,
    lag=16,
    window=1,
)
print(group_cols)
print(rolling_stats.shape)
rolling_stats.sort_values(by=['date'] + group_cols).head(60)

['store_nbr']
(5052, 28)


Unnamed: 0,store_nbr,date,sales_mean_wrt_store_nbr_lag16_window1,sales_std_wrt_store_nbr_lag16_window1,sales_min_wrt_store_nbr_lag16_window1,sales_max_wrt_store_nbr_lag16_window1,sales_q0.1_wrt_store_nbr_lag16_window1,sales_q1_wrt_store_nbr_lag16_window1,sales_q5_wrt_store_nbr_lag16_window1,sales_q25_wrt_store_nbr_lag16_window1,...,log_sales_max_wrt_store_nbr_lag16_window1,log_sales_q0.1_wrt_store_nbr_lag16_window1,log_sales_q1_wrt_store_nbr_lag16_window1,log_sales_q5_wrt_store_nbr_lag16_window1,log_sales_q25_wrt_store_nbr_lag16_window1,log_sales_q50_wrt_store_nbr_lag16_window1,log_sales_q75_wrt_store_nbr_lag16_window1,log_sales_q95_wrt_store_nbr_lag16_window1,log_sales_q99_wrt_store_nbr_lag16_window1,log_sales_q99.9_wrt_store_nbr_lag16_window1
0,1,2013-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1684,2,2013-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3368,3,2013-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2013-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1685,2,2013-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3369,3,2013-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2013-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1686,2,2013-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3370,3,2013-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,2013-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



# Clean/Process Raw Data

In [22]:
if False:
    !python scripts/process_data.py
else:
    !cp -r /content/drive/MyDrive/store_sales_data/clean2 ./data/clean

In [16]:
# Confirm which sub-directories now exist under ./data
!ls data

clean  raw


In [17]:
# Re-check the raw data files
!ls data/raw

holidays_events.csv  sample_submission.csv  test.csv   transactions.csv
oil.csv		     stores.csv		    train.csv


In [23]:
# Inspect the clean data files.
# NOTE(review): the recorded listing contains a `clean2` entry next to the
# parquet files -- it looks like a directory was copied *into* ./data/clean
# rather than over it; verify before relying on this folder.
!ls data/clean

clean2
holidays_events_cat_meta.json
holidays_events.parquet
main_cat_meta.json
main.parquet
manifest.json
oil_cat_meta.json
oil.parquet
rolling_lag16_window1_cat_meta.json
rolling_lag16_window1.parquet
rolling_lag16_window28_cat_meta.json
rolling_lag16_window28.parquet
rolling_lag16_window365_cat_meta.json
rolling_lag16_window365.parquet
rolling_lag16_window7_cat_meta.json
rolling_lag16_window7.parquet
rolling_lag16_window91_cat_meta.json
rolling_lag16_window91.parquet
rolling_wrt_city_lag16_window1_cat_meta.json
rolling_wrt_city_lag16_window1.parquet
rolling_wrt_city_lag16_window28_cat_meta.json
rolling_wrt_city_lag16_window28.parquet
rolling_wrt_city_lag16_window365_cat_meta.json
rolling_wrt_city_lag16_window365.parquet
rolling_wrt_city_lag16_window7_cat_meta.json
rolling_wrt_city_lag16_window7.parquet
rolling_wrt_city_lag16_window91_cat_meta.json
rolling_wrt_city_lag16_window91.parquet
rolling_wrt_cluster_lag16_window1_cat_meta.json
rolling_wrt_cluster_lag16_window1.parquet
rolling

In [19]:
# Write clean data back to drive
# The existing Drive copy is deleted first so the `cp -r` below recreates
# `clean` rather than nesting a second `clean` directory inside it.
# NOTE(review): if the copy fails after the delete, the Drive copy is gone;
# `rm -r` also prints an error when the directory does not exist yet.
!rm -r /content/drive/MyDrive/store_sales_data/clean
!cp -r ./data/clean /content/drive/MyDrive/store_sales_data/

In [20]:
!cp -r ./data/clean /content/drive/MyDrive/store_sales_data/clean2

# Tune Model

In [21]:
!pip install optuna -q
!pip install mlflow -q
!pip install pyngrok -q #pyngrok used for visualizing mlflow

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/400.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.4/26.4 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Restore the saved Optuna studies database from Drive into the project root
!cp -r /content/drive/MyDrive/store_sales_data/optuna_studies.db ./optuna_studies.db
# Restoring the MLflow runs directory is currently disabled:
#!cp -r /content/drive/MyDrive/store_sales_data/mlruns ./mlruns
!ls

config.yaml  environment.yml	 notebooks	    scripts
data	     experiment_configs  optuna_studies.db  src


In [None]:
# Model tuning
# NOTE(review): the recorded output of this cell is a traceback raised while
# reading parquet files from ./data/clean/ -- check that directory's contents
# before re-running.
!python scripts/tune_model.py --sample_frac 0.025 --n_trials 1 --n_backtests 8 --valset_size 15 --n_jobs -1

Loading clean Pandas data from './data/clean/'...
Traceback (most recent call last):
  File "/content/Store-Sales/scripts/tune_model.py", line 110, in <module>
    main(args)
  File "/content/Store-Sales/scripts/tune_model.py", line 40, in main
    clean_dfs = load_clean_data(CLEAN_DATA_PATH)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/Store-Sales/src/io_utils.py", line 79, in load_clean_data
    df = pd.read_parquet(parquet_path, engine="pyarrow")
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pandas/io/parquet.py", line 667, in read_parquet
    return impl.read(
           ^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pandas/io/parquet.py", line 267, in read
    path_or_handle, handles, filesystem = _get_path_or_handle(
                                          ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pandas/io/parquet.py", line 140, in _get_path_or_handle
    hand

In [None]:
# Write the Optuna studies database back to Drive.
# Only optuna_studies.db is copied here; the mlruns directory is NOT written back.
!cp -r ./optuna_studies.db /content/drive/MyDrive/store_sales_data/optuna_studies.db

# Fit Best Model

In [None]:
# Fit the model by running scripts/train_best.py.
# NOTE(review): in the recorded run the fit failed after Dask workers died on
# a read_parquet task -- possibly memory pressure; verify before re-running.
!python scripts/train_best.py

<Client: 'tcp://127.0.0.1:39883' processes=4 threads=4, memory=22.35 GiB>
Loading clean Dask data from './data/clean/'...
Merging all data using Dask...
--- Training using following trial.... --- 

Best trial number: 93
Best value (objective/loss): 0.49744042896454427
Best hyperparameters:
 * n_estimators: 493
 * max_depth: 9
 * learning_rate: 0.0410202535208337
 * subsample: 0.7176046362898401
 * colsample_bytree: 0.9463364779194452
 * reg_lambda: 2.3036586760144404
 * gamma: 0.2579149541256932
 * enable_categorical: True

Fitting model...
2025-07-29 00:25:36,825 - distributed.scheduler - ERROR - Task ('read_parquet-fused-ce8f1b87cf52c8b2767987f4d780cf80', 0) marked as failed because 4 workers died while trying to run it
Dask client closed
Traceback (most recent call last):
  File "/content/Store-Sales/scripts/train_best.py", line 121, in <module>
    main(args)
  File "/content/Store-Sales/scripts/train_best.py", line 105, in main
    model.fit(
  File "/usr/local/lib/python3.11/dist

# Make Submission

In [None]:
# Produce the submission by running scripts/make_submission.py
!python scripts/make_submission.py