# Preprocessing

In [1]:
import pandas as pd
from copy import deepcopy as dc
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib
import os

In [2]:
# Load the raw table; every column is split out into its own named Series.
df = pd.read_parquet('../Data/vizu.parquet')

# Map a display label to each column, e.g. 'auto_public' -> 'Auto Public'.
# The label serves both as the dictionary key and as the Series name.
data = {
    label: df[column].rename(label)
    for column, label in (
        (raw_name, raw_name.replace('_', ' ').title()) for raw_name in df.columns
    )
}
data.keys()

dict_keys(['Auto Public', 'Auto Private', 'Passenger Truck Public', 'Passenger Truck Private', 'Cargo Public', 'Cargo Private', 'Total'])

In [3]:
# Number of lagged copies of each series, X(t-1) ... X(t-N_LAGS), used as features.
N_LAGS = 12

scaler_dir = '../Outputs/Models/Scalers'
os.makedirs(scaler_dir, exist_ok=True)

# Fit one StandardScaler per series; each Series is reshaped into a single
# (n_samples, 1) column before fitting.
# NOTE(review): every scaler is fit on the complete series. If a train/test
# split is made downstream, the test period influences the mean/std used
# here -- confirm that this is intended.
scalers = {
    key: StandardScaler().fit(series.values.reshape(-1, 1))
    for key, series in data.items()
}

# Persist each fitted scaler, e.g. 'Auto Public' -> scaler_auto_public.save.
for key, scaler in scalers.items():
    joblib.dump(scaler, os.path.join(scaler_dir, f'scaler_{key.replace(" ", "_").lower()}.save'))

# Build one table per series: the scaled value 'X' plus its N_LAGS previous
# values. Dedicated local names are used so the raw `df` loaded earlier is
# not overwritten by this cell.
data_scaled = {}
for key, series in data.items():
    scaled_values = scalers[key].transform(series.values.reshape(-1, 1)).ravel()
    lagged = pd.DataFrame({'X': scaled_values}, index=series.index)
    for lag in range(1, N_LAGS + 1):
        lagged[f'X(t-{lag})'] = lagged['X'].shift(lag)
    # Rows containing any NaN are dropped; this removes the first N_LAGS
    # rows, whose lag history is incomplete.
    data_scaled[key] = lagged.dropna()

In [4]:
# Preview the first five rows of every lag-feature table.
for key, scaled_frame in data_scaled.items():
    print(scaled_frame.head(5))

                   X    X(t-1)    X(t-2)    X(t-3)    X(t-4)    X(t-5)  \
FECHA                                                                    
1992-01-31 -1.358812 -1.328176 -1.336889 -1.345374 -1.353834 -1.362273   
1992-02-29 -1.349682 -1.358812 -1.328176 -1.336889 -1.345374 -1.353834   
1992-03-31 -1.340507 -1.349682 -1.358812 -1.328176 -1.336889 -1.345374   
1992-04-30 -1.331273 -1.340507 -1.349682 -1.358812 -1.328176 -1.336889   
1992-05-31 -1.322000 -1.331273 -1.340507 -1.349682 -1.358812 -1.328176   

              X(t-6)    X(t-7)    X(t-8)    X(t-9)   X(t-10)   X(t-11)  \
FECHA                                                                    
1992-01-31 -1.370700 -1.379101 -1.387482 -1.395844 -1.404180 -1.412496   
1992-02-29 -1.362273 -1.370700 -1.379101 -1.387482 -1.395844 -1.404180   
1992-03-31 -1.353834 -1.362273 -1.370700 -1.379101 -1.387482 -1.395844   
1992-04-30 -1.345374 -1.353834 -1.362273 -1.370700 -1.379101 -1.387482   
1992-05-31 -1.336889 -1.345374 -1.353

## Save preprocessed data

In [5]:
# Write one parquet file per series containing its lag-feature table.
processed_dir = '../Data/Processed'
os.makedirs(processed_dir, exist_ok=True)

for key, frame in data_scaled.items():
    # Build the target path once so the file written and the message printed
    # cannot drift apart, e.g. 'Auto Public' -> processed_auto_public.parquet.
    out_path = f'{processed_dir}/processed_{key.replace(" ", "_").lower()}.parquet'
    # Non-mutating NaN guard: the frames stored in `data_scaled` are left
    # untouched, and only NaN-free rows are written.
    frame.dropna().to_parquet(out_path)
    print(f'Processed data for {key} saved to {out_path}')

Processed data for Auto Public saved to ../Data/Processed/processed_auto_public.parquet
Processed data for Auto Private saved to ../Data/Processed/processed_auto_private.parquet
Processed data for Passenger Truck Public saved to ../Data/Processed/processed_passenger_truck_public.parquet
Processed data for Passenger Truck Private saved to ../Data/Processed/processed_passenger_truck_private.parquet
Processed data for Cargo Public saved to ../Data/Processed/processed_cargo_public.parquet
Processed data for Cargo Private saved to ../Data/Processed/processed_cargo_private.parquet
Processed data for Total saved to ../Data/Processed/processed_total.parquet
