# Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib
import os

In [2]:
# Load the source table, then split it into one Series per column.
df = pd.read_parquet('../Data/vizu.parquet')

# Each Series is renamed to a display label (underscores -> spaces, title case)
# and the dictionary is keyed by that same label.
data = {
    series.name: series
    for series in (
        df[source_name].rename(source_name.replace('_', ' ').title())
        for source_name in df.columns
    )
}
data.keys()

dict_keys(['Auto Public', 'Auto Private', 'Passenger Truck Public', 'Passenger Truck Private', 'Cargo Public', 'Cargo Private', 'Total'])

## Standardize time series data

In [3]:
scaler_dir = '../Outputs/Models/Scalers'
os.makedirs(scaler_dir, exist_ok=True)

# Fit one StandardScaler per series. Each series is reshaped into a
# single-feature column of shape (n_samples, 1) before fitting.
# NOTE(review): every scaler is fitted on the complete series; if a train/test
# split is applied downstream, confirm that this is intended.
scalers = {}
for key in data:
    scalers[key] = StandardScaler().fit(data[key].values.reshape(-1, 1))

# Persist each fitted scaler into scaler_dir with joblib.
for key in scalers:
    scaler_file = 'scaler_{}.save'.format(key.replace(" ", "_").lower())
    joblib.dump(scalers[key], os.path.join(scaler_dir, scaler_file))

# Build one single-column DataFrame per series holding the standardized values.
# The original index is kept and the column is named after the series key.
# transform() already returns an (n_samples, 1) array, so it is used directly.
data_scaled = {}
for key in data:
    data_scaled[key] = pd.DataFrame(
        scalers[key].transform(data[key].values.reshape(-1, 1)),
        index=data[key].index,
        columns=[key],
    )

## Date Time Feature

In [4]:
# Rename the value column to a common name and add calendar features.
# The index must expose .year / .month / .quarter (i.e. be datetime-like).
for key in data_scaled.keys():
    df = (
        data_scaled[key]
        .rename(columns={key: 'Registers_scaled'})
        .assign(
            Year=lambda frame: frame.index.year,
            Month=lambda frame: frame.index.month,
            Quarter=lambda frame: frame.index.quarter,
            # Sine/cosine encoding of the month with a period of 12.
            Month_sin=lambda frame: np.sin(2 * np.pi * frame['Month'] / 12),
            Month_cos=lambda frame: np.cos(2 * np.pi * frame['Month'] / 12),
        )
    )

    print(df.columns)
    data_scaled[key] = df

Index(['Registers_scaled', 'Year', 'Month', 'Quarter', 'Month_sin',
       'Month_cos'],
      dtype='object')
Index(['Registers_scaled', 'Year', 'Month', 'Quarter', 'Month_sin',
       'Month_cos'],
      dtype='object')
Index(['Registers_scaled', 'Year', 'Month', 'Quarter', 'Month_sin',
       'Month_cos'],
      dtype='object')
Index(['Registers_scaled', 'Year', 'Month', 'Quarter', 'Month_sin',
       'Month_cos'],
      dtype='object')
Index(['Registers_scaled', 'Year', 'Month', 'Quarter', 'Month_sin',
       'Month_cos'],
      dtype='object')
Index(['Registers_scaled', 'Year', 'Month', 'Quarter', 'Month_sin',
       'Month_cos'],
      dtype='object')
Index(['Registers_scaled', 'Year', 'Month', 'Quarter', 'Month_sin',
       'Month_cos'],
      dtype='object')


## Lag Feature and Window Feature

In [5]:
# Lags (in rows) of the scaled series to expose as features.
lag_steps = (1, 3, 6, 9, 12)

for key, df in data_scaled.items():
    scaled_values = df['Registers_scaled']

    # Lag_k holds the value observed k rows earlier; the first k rows are NaN.
    for lag in lag_steps:
        df[f'Lag_{lag}'] = scaled_values.shift(lag)

    # df is the same object stored in data_scaled, so no re-assignment is needed.
    print(df.columns)

Index(['Registers_scaled', 'Year', 'Month', 'Quarter', 'Month_sin',
       'Month_cos', 'Lag_1', 'Lag_3', 'Lag_6', 'Lag_9', 'Lag_12'],
      dtype='object')
Index(['Registers_scaled', 'Year', 'Month', 'Quarter', 'Month_sin',
       'Month_cos', 'Lag_1', 'Lag_3', 'Lag_6', 'Lag_9', 'Lag_12'],
      dtype='object')
Index(['Registers_scaled', 'Year', 'Month', 'Quarter', 'Month_sin',
       'Month_cos', 'Lag_1', 'Lag_3', 'Lag_6', 'Lag_9', 'Lag_12'],
      dtype='object')
Index(['Registers_scaled', 'Year', 'Month', 'Quarter', 'Month_sin',
       'Month_cos', 'Lag_1', 'Lag_3', 'Lag_6', 'Lag_9', 'Lag_12'],
      dtype='object')
Index(['Registers_scaled', 'Year', 'Month', 'Quarter', 'Month_sin',
       'Month_cos', 'Lag_1', 'Lag_3', 'Lag_6', 'Lag_9', 'Lag_12'],
      dtype='object')
Index(['Registers_scaled', 'Year', 'Month', 'Quarter', 'Month_sin',
       'Month_cos', 'Lag_1', 'Lag_3', 'Lag_6', 'Lag_9', 'Lag_12'],
      dtype='object')
Index(['Registers_scaled', 'Year', 'Month', 'Quarter', 'Mo

## Rolling Window Statistics

In [6]:
# Window lengths (in rows) for the rolling statistics.
windows = [3, 6, 12]

for key in data_scaled.keys():
    df = data_scaled[key]

    # Shift by one row so each window covers only observations strictly before
    # the current row. A plain rolling window includes the current row, so the
    # features would be computed from the very `Registers_scaled` value they
    # sit next to (target leakage when that value is what is being predicted).
    # This also matches the Lag_* columns, which use past values only.
    past_values = df['Registers_scaled'].shift(1)

    for window in windows:
        # Build the rolling object once and reuse it for all four statistics.
        rolling_window = past_values.rolling(window=window)
        df[f'RollingMean_{window}'] = rolling_window.mean()
        df[f'RollingStd_{window}'] = rolling_window.std()
        df[f'RollingMin_{window}'] = rolling_window.min()
        df[f'RollingMax_{window}'] = rolling_window.max()

    print(df.columns)
    data_scaled[key] = df

Index(['Registers_scaled', 'Year', 'Month', 'Quarter', 'Month_sin',
       'Month_cos', 'Lag_1', 'Lag_3', 'Lag_6', 'Lag_9', 'Lag_12',
       'RollingMean_3', 'RollingStd_3', 'RollingMin_3', 'RollingMax_3',
       'RollingMean_6', 'RollingStd_6', 'RollingMin_6', 'RollingMax_6',
       'RollingMean_12', 'RollingStd_12', 'RollingMin_12', 'RollingMax_12'],
      dtype='object')
Index(['Registers_scaled', 'Year', 'Month', 'Quarter', 'Month_sin',
       'Month_cos', 'Lag_1', 'Lag_3', 'Lag_6', 'Lag_9', 'Lag_12',
       'RollingMean_3', 'RollingStd_3', 'RollingMin_3', 'RollingMax_3',
       'RollingMean_6', 'RollingStd_6', 'RollingMin_6', 'RollingMax_6',
       'RollingMean_12', 'RollingStd_12', 'RollingMin_12', 'RollingMax_12'],
      dtype='object')
Index(['Registers_scaled', 'Year', 'Month', 'Quarter', 'Month_sin',
       'Month_cos', 'Lag_1', 'Lag_3', 'Lag_6', 'Lag_9', 'Lag_12',
       'RollingMean_3', 'RollingStd_3', 'RollingMin_3', 'RollingMax_3',
       'RollingMean_6', 'RollingStd_6', 

## Expanding Window Statistics

In [7]:
for key in data_scaled.keys():
    df = data_scaled[key]

    # Shift by one row so the expanding statistics summarize only observations
    # strictly before the current row. Without the shift, every statistic would
    # include the current `Registers_scaled` value itself (target leakage when
    # that value is what is being predicted).
    past_expanding = df['Registers_scaled'].shift(1).expanding()

    df['ExpandingMean'] = past_expanding.mean()
    df['ExpandingStd'] = past_expanding.std()
    df['ExpandingMin'] = past_expanding.min()
    df['ExpandingMax'] = past_expanding.max()

    data_scaled[key] = df

## Save Preprocessed Data

In [8]:
# Write each processed frame to its own parquet file, named after the series key.
os.makedirs('../Data/Processed', exist_ok=True)
for key, df in data_scaled.items():
    output_path = f'../Data/Processed/processed_{key.replace(" ", "_").lower()}.parquet'
    df.to_parquet(output_path)
    print(f'Processed data for {key} saved to {output_path}')

Processed data for Auto Public saved to ../Data/Processed/processed_auto_public.parquet
Processed data for Auto Private saved to ../Data/Processed/processed_auto_private.parquet
Processed data for Passenger Truck Public saved to ../Data/Processed/processed_passenger_truck_public.parquet
Processed data for Passenger Truck Private saved to ../Data/Processed/processed_passenger_truck_private.parquet
Processed data for Cargo Public saved to ../Data/Processed/processed_cargo_public.parquet
Processed data for Cargo Private saved to ../Data/Processed/processed_cargo_private.parquet
Processed data for Total saved to ../Data/Processed/processed_total.parquet
