### Imports

In [2]:
import os
import pandas as pd

In [3]:
from tqdm.autonotebook import tqdm
from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA, HistoricAverage

  from tqdm.autonotebook import tqdm


### Constants

In [79]:
DATA_FOLDER = "data/"
M5_FILE = "m5-dataset.parquet.gzip"
FOZZY_FILE = "fozzy-dataset.parquet.gzip"

### Functions

In [80]:

def preprocess_m5():
    """
    unique_id:  object
    ds:         datetime64[ns]
    y:          float32
    """
    df = pd.read_parquet('https://m5-benchmarks.s3.amazonaws.com/data/train/target.parquet')
    df = df.rename(columns={
        'item_id': 'unique_id', 
        'timestamp': 'ds', 
        'demand': 'y'
    })
    df['ds'] = pd.to_datetime(df['ds'])
    df['unique_id'] = df['unique_id'].astype(str)
    df = df.reset_index(drop=True)
    print(df.dtypes)
    df.to_parquet(os.path.join(DATA_FOLDER, M5_FILE), compression='gzip') 
    print("The file is saved.")


In [81]:
def preprocess_fozzy():
    # todo: implement function
    pass


# Fozzy Group Data Preprocessing

In [82]:
os.listdir(os.path.join(DATA_FOLDER, "hack4retail/"))

['.DS_Store',
 'sku_final.csv',
 'geo_params.csv',
 'train_final.csv',
 'archive.zip',
 'sample_final.csv',
 'test_data.csv']

In [83]:
df = pd.read_csv(os.path.join(DATA_FOLDER, "hack4retail/train_final.csv"))

In [84]:
df.head()

Unnamed: 0,ID,geoCluster,SKU,date,price,sales
0,RR27956447,21,32485,2021-07-08,39.69,0.9
1,RR27956448,21,32485,2021-07-09,,
2,RR27956449,21,32485,2021-07-10,,
3,RR27956450,21,32485,2021-07-11,,
4,RR27956451,21,32485,2021-07-12,,


In [85]:
df.loc[(df["SKU"] == 32485) & (df["geoCluster"] == 3230)]

Unnamed: 0,ID,geoCluster,SKU,date,price,sales
3362405,RR31318852,3230,32485,2021-06-30,41.99,12.3
3362406,RR31318853,3230,32485,2021-07-01,42.79,27.8
3362407,RR31318854,3230,32485,2021-07-02,41.89,33.0
3362408,RR31318855,3230,32485,2021-07-03,41.99,34.2
3362409,RR31318856,3230,32485,2021-07-04,41.79,40.4
3362410,RR31318857,3230,32485,2021-07-05,41.59,45.8
3362411,RR31318858,3230,32485,2021-07-06,41.99,44.9
3362412,RR31318859,3230,32485,2021-07-07,41.89,35.4
3362413,RR31318860,3230,32485,2021-07-08,42.09,46.1
3362414,RR31318861,3230,32485,2021-07-09,37.79,24.4


In [86]:
df["unique_id"] = df["geoCluster"].astype(str) + "-" + df["SKU"].astype(str)

In [87]:
df.loc[df["unique_id"] == "3230-32485"]

Unnamed: 0,ID,geoCluster,SKU,date,price,sales,unique_id
3362405,RR31318852,3230,32485,2021-06-30,41.99,12.3,3230-32485
3362406,RR31318853,3230,32485,2021-07-01,42.79,27.8,3230-32485
3362407,RR31318854,3230,32485,2021-07-02,41.89,33.0,3230-32485
3362408,RR31318855,3230,32485,2021-07-03,41.99,34.2,3230-32485
3362409,RR31318856,3230,32485,2021-07-04,41.79,40.4,3230-32485
3362410,RR31318857,3230,32485,2021-07-05,41.59,45.8,3230-32485
3362411,RR31318858,3230,32485,2021-07-06,41.99,44.9,3230-32485
3362412,RR31318859,3230,32485,2021-07-07,41.89,35.4,3230-32485
3362413,RR31318860,3230,32485,2021-07-08,42.09,46.1,3230-32485
3362414,RR31318861,3230,32485,2021-07-09,37.79,24.4,3230-32485


In [88]:
data = df[["unique_id", "date", "sales"]]

In [89]:
data = data.rename(columns={"date": "ds", "sales": "y"})
data["y"] = data["y"].fillna(0)
data

Unnamed: 0,unique_id,ds,y
0,21-32485,2021-07-08,0.9
1,21-32485,2021-07-09,0.0
2,21-32485,2021-07-10,0.0
3,21-32485,2021-07-11,0.0
4,21-32485,2021-07-12,0.0
...,...,...,...
27956440,3230-841486,2021-07-15,0.0
27956441,3230-841486,2021-07-16,0.0
27956442,3230-841486,2021-07-17,0.0
27956443,3230-841486,2021-07-18,0.0


In [90]:
StatsForecast.plot(data)

### Fix issues in data

- We need to remove small TS

In [91]:
data.groupby(['unique_id'])['unique_id'].count().describe()

count    115831.000000
mean        241.355466
std         144.581146
min           1.000000
25%         124.000000
50%         226.000000
75%         384.000000
max         566.000000
Name: unique_id, dtype: float64

In [92]:
res = data.groupby(['unique_id'])['unique_id'].count().reset_index(name='counts')
res.sort_values(by='counts', ascending=True)

Unnamed: 0,unique_id,counts
2217,1934-796587,1
112055,2992-441879,1
25310,2043-850552,1
25324,2043-853103,1
67754,2238-763773,1
...,...,...
89478,2407-873803,566
47491,2126-844051,566
47492,2126-844052,566
106150,2795-873801,566


In [93]:
ts = res["unique_id"].loc[res["counts"] >= 100].unique()
data = data.loc[data["unique_id"].isin(ts)]

### Check the data set: simple case

In [94]:
train = data.loc[data["unique_id"] == "2262-736359"]

In [95]:
sf = StatsForecast(
    models = [AutoARIMA(season_length=14)],
    freq = 'D'
)

sf.fit(train)

StatsForecast(models=[AutoARIMA])

In [96]:
forecast_df = sf.predict(h=14, level=[90]) 

In [97]:
sf.plot(train, forecast_df, level=[90])

### Check the data: all TS

In [98]:
sf = StatsForecast(
    models = [HistoricAverage()],
    freq = 'D'
)

sf.fit(data)

StatsForecast(models=[HistoricAverage])

In [99]:
forecast_df = sf.predict(h=14, level=[90])

In [100]:
sf.plot(data, forecast_df, level=[90])

### Save Data

In [101]:
data.head()

Unnamed: 0,unique_id,ds,y
1997,323-24,2021-03-03,0.4
1998,323-24,2021-03-04,0.0
1999,323-24,2021-03-05,0.0
2000,323-24,2021-03-06,0.0
2001,323-24,2021-03-07,0.0


In [102]:
data.shape

(26748942, 3)

In [103]:
data.to_parquet(os.path.join(DATA_FOLDER, FOZZY_FILE), compression='gzip') 

# Data Set: Corporación Favorita Grocery Sales Forecasting

In [1]:
FAV_PATH = "data/favorita-grocery-sales-forecasting/"

In [6]:
for file in os.listdir(FAV_PATH):
    print(file)
    
    if file != ".DS_Store":
        data = pd.read_csv(os.path.join(FAV_PATH, file))

        print(data.head())
        print("\n")

.DS_Store
test.csv
          id        date  store_nbr  item_nbr  onpromotion
0  125497040  2017-08-16          1     96995        False
1  125497041  2017-08-16          1     99197        False
2  125497042  2017-08-16          1    103501        False
3  125497043  2017-08-16          1    103520        False
4  125497044  2017-08-16          1    103665        False


train.csv


  data = pd.read_csv(os.path.join(FAV_PATH, file))


   id        date  store_nbr  item_nbr  unit_sales onpromotion
0   0  2013-01-01         25    103665         7.0         NaN
1   1  2013-01-01         25    105574         1.0         NaN
2   2  2013-01-01         25    105575         2.0         NaN
3   3  2013-01-01         25    108079         1.0         NaN
4   4  2013-01-01         25    108701         1.0         NaN


transactions.csv
         date  store_nbr  transactions
0  2013-01-01         25           770
1  2013-01-02          1          2111
2  2013-01-02          2          2358
3  2013-01-02          3          3487
4  2013-01-02          4          1922


items.csv
   item_nbr        family  class  perishable
0     96995     GROCERY I   1093           0
1     99197     GROCERY I   1067           0
2    103501      CLEANING   3008           0
3    103520     GROCERY I   1028           0
4    103665  BREAD/BAKERY   2712           1


oil.csv
         date  dcoilwtico
0  2013-01-01         NaN
1  2013-01-02       93.14