## Read in raw data

In [None]:
# Read in raw data

import os
import pandas as pd

raw_data_path = "../data/raw"

# Table name -> CSV file inside raw_data_path
raw_files = {
    'holidays_events': "holidays_events.csv",
    'oil': "oil.csv",
    'stores': "stores.csv",
    'train': "train.csv",
    'test': "test.csv",
}

# Load every raw table into a dictionary keyed by table name
raw_dfs = {
    table_name: pd.read_csv(os.path.join(raw_data_path, file_name))
    for table_name, file_name in raw_files.items()
}

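The data is normalized: it is split across several related tables (`holidays_events`, `oil`, `stores`, `train`, `test`) rather than stored in one flat table.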

## Basic inspection of data

In [2]:
# Check out high-level info of our df's
def inspect_df(df_name, df):
    """Print a high-level summary of a DataFrame.

    The report lists, in order: column dtypes, null counts per column,
    the frame's shape, and the number of unique values per column.

    Args:
        df_name: Label used to prefix each section of the report.
        df: The pandas DataFrame to summarize.

    Returns:
        None. The report is written to stdout.
    """
    report_sections = [
        f" * {df_name}.dtypes: \n{df.dtypes}",
        f" * {df_name}.isnull().sum(): \n{df.isnull().sum()}",
        f" * {df_name}.shape: {df.shape}",
        f" * {df_name}.nunique() : \n{df.nunique()}",
    ]
    # One print with newline-joined sections emits the same text as
    # printing each section separately.
    print("\n".join(report_sections))

# Print the summary report for every raw table, with a numbered banner
for position, (df_name, df) in enumerate(raw_dfs.items(), start=1):
    print(f"\n#### {position}/{len(raw_dfs)}: {df_name} ####")
    inspect_df(df_name, df)


#### 1/5: holidays_events ####
 * holidays_events.dtypes: 
date           object
type           object
locale         object
locale_name    object
description    object
transferred      bool
dtype: object
 * holidays_events.isnull().sum(): 
date           0
type           0
locale         0
locale_name    0
description    0
transferred    0
dtype: int64
 * holidays_events.shape: (100, 6)
 * holidays_events.nunique() : 
date           89
type            6
locale          3
locale_name    24
description    52
transferred     2
dtype: int64

#### 2/5: oil ####
 * oil.dtypes: 
date           object
dcoilwtico    float64
dtype: object
 * oil.isnull().sum(): 
date          0
dcoilwtico    4
dtype: int64
 * oil.shape: (100, 2)
 * oil.nunique() : 
date          100
dcoilwtico     89
dtype: int64

#### 3/5: stores ####
 * stores.dtypes: 
store_nbr     int64
city         object
state        object
type         object
cluster       int64
dtype: object
 * stores.isnull().sum(): 
store_nbr    0
ci

What we deduce from the output above:
* Missing values (N/A's) occur only in `oil`
* There are no duplicate rows

In [3]:
# Preview the first rows of each table to see what the data looks like
for position, (df_name, df) in enumerate(raw_dfs.items(), start=1):
    print(f"\n#### {position}/{len(raw_dfs)}: {df_name} ####")
    print(df.head())


#### 1/5: holidays_events ####
         date     type    locale locale_name                    description  \
0  2012-03-02  Holiday     Local       Manta             Fundacion de Manta   
1  2012-04-01  Holiday  Regional    Cotopaxi  Provincializacion de Cotopaxi   
2  2012-04-12  Holiday     Local      Cuenca            Fundacion de Cuenca   
3  2012-04-14  Holiday     Local    Libertad      Cantonizacion de Libertad   
4  2012-04-21  Holiday     Local    Riobamba      Cantonizacion de Riobamba   

   transferred  
0        False  
1        False  
2        False  
3        False  
4        False  

#### 2/5: oil ####
         date  dcoilwtico
0  2013-01-01         NaN
1  2013-01-02       93.14
2  2013-01-03       92.97
3  2013-01-04       93.12
4  2013-01-07       93.20

#### 3/5: stores ####
   store_nbr           city                           state type  cluster
0          1          Quito                       Pichincha    D       13
1          2          Quito                 

## Diagnosis:
* There are no duplicate rows
* N/A's occur only in 'oil' (will deal with later)
* *Mainly want to set types*
    * Convert 'date' to datetime (`pd.to_datetime`)
    * Convert the (other) object columns to 'category'


# For total sales

In [5]:
# Build a typed, date-sorted working copy of the training data.
# `assign` returns a new frame, so the raw table kept in raw_dfs['train'] is
# not modified (plain `main = raw_dfs['train']` would alias it, and writing
# main['date'] would then silently change the raw data as well).
main = (
    raw_dfs['train']
    .assign(date=lambda train: pd.to_datetime(train['date'], format="%Y-%m-%d"))
    .sort_values('date')
)

# Total sales per date: summing collapses all rows that share a date into one,
# leaving a single-column frame ('sales') indexed by a unique 'date'.
daily_sales = main.groupby('date')['sales'].sum().to_frame()

In [6]:
# Summary statistics to compute for each window
stats = ['mean', 'std', 'min', 'max']

# Time-based 7-day rolling window over total daily sales; min_periods=1 lets
# the earliest dates produce values from however many observations exist.
weekly_window = daily_sales[['sales']].rolling(window='7D', min_periods=1)
rolled = weekly_window.agg(stats)
rolled

Unnamed: 0_level_0,sales,sales,sales,sales
Unnamed: 0_level_1,mean,std,min,max
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013-01-01,2511.618999,,2511.618999,2511.618999


In [7]:
# Total sales per store per date, then indexed by 'date' because the
# time-based rolling window below operates on the date index.
daily_sales = (
    main.groupby(['store_nbr', 'date'])['sales']
    .sum()
    .reset_index()
    .set_index('date')
)

# Summary statistics to compute for each window
stats = ['mean', 'std', 'min', 'max']

# 7-day rolling statistics computed separately within each store
per_store_sales = daily_sales.groupby('store_nbr')['sales']
rolled = per_store_sales.rolling(window='7D', min_periods=1).agg(stats)

rolled

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,min,max
store_nbr,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2013-01-01,0.0,,0.0,0.0
2,2013-01-01,0.0,,0.0,0.0
3,2013-01-01,0.0,,0.0,0.0
10,2013-01-01,0.0,,0.0,0.0
11,2013-01-01,0.0,,0.0,0.0
12,2013-01-01,0.0,,0.0,0.0
13,2013-01-01,0.0,,0.0,0.0
14,2013-01-01,0.0,,0.0,0.0
15,2013-01-01,0.0,,0.0,0.0
16,2013-01-01,0.0,,0.0,0.0
