# Preprocessing

In [5]:
# Standardize time series data
from sklearn.preprocessing import StandardScaler
import pandas as pd
from math import sqrt
import joblib

In [2]:
# Load the source table and keep every column as an (n_rows, 1) array,
# i.e. the two-dimensional layout that is later handed to StandardScaler.fit
data = pd.read_parquet('../Data/vizu.parquet')

values = {}
for col, series in data.items():
    print(f'Values saved for column: {col.replace("_", " ").title()}')
    # reshape(-1, 1) turns the flat series into a single-feature column vector
    values[col] = series.values.reshape(-1, 1)

Values saved for column: Auto Public
Values saved for column: Auto Private
Values saved for column: Passenger Truck Public
Values saved for column: Passenger Truck Private
Values saved for column: Cargo Public
Values saved for column: Cargo Private
Values saved for column: Total


In [4]:
# Fit an independent StandardScaler on each column and report what it learned
scaler = {}

for col in data.columns:
    print(f'StandardScaler trained for column: {col.replace("_", " ").title()}')
    fitted = StandardScaler()
    fitted.fit(values[col])  # values[col] is the (n, 1) array prepared earlier
    scaler[col] = fitted
    # var_ holds the per-feature variance, so its square root is the standard deviation
    print(f'Mean: {scaler[col].mean_[0]:.2f}, Std: {sqrt(scaler[col].var_[0]):.2f}\n')

StandardScaler trained for column: Auto Public
Mean: 477197.66, Std: 153676.69

StandardScaler trained for column: Auto Private
Mean: 19161572.37, Std: 10394363.29

StandardScaler trained for column: Passenger Truck Public
Mean: 147420.89, Std: 65700.66

StandardScaler trained for column: Passenger Truck Private
Mean: 167539.25, Std: 133138.25

StandardScaler trained for column: Cargo Public
Mean: 122097.01, Std: 31100.21

StandardScaler trained for column: Cargo Private
Mean: 7442361.08, Std: 2960105.36

StandardScaler trained for column: Total
Mean: 27518188.26, Std: 13630623.76



In [12]:
# Transform the data: apply each column's fitted scaler and collect the results
scaled_data = {}

for col in data.columns:
    # Use double quotes inside the single-quoted f-string (as in the cells above);
    # reusing the outer quote type is a SyntaxError before Python 3.12.
    print(f'Transforming data for column: {col.replace("_", " ").title()}')
    # transform() returns an (n, 1) array; flatten it back to 1-D for the DataFrame
    scaled_data[col] = scaler[col].transform(values[col]).flatten()

# NOTE(review): the frame is built from plain arrays, so it gets a default index
# rather than data's index — confirm that is intended.
transformed_data = pd.DataFrame(scaled_data)
transformed_data.sample(5)

Transforming data for column: Auto Public
Transforming data for column: Auto Private
Transforming data for column: Passenger Truck Public
Transforming data for column: Passenger Truck Private
Transforming data for column: Cargo Public
Transforming data for column: Cargo Private
Transforming data for column: Total


Unnamed: 0,auto_public,auto_private,passenger_truck_public,passenger_truck_private,cargo_public,cargo_private,total
408,1.463406,1.808157,4.289715,4.136525,0.210738,1.539774,1.791301
45,-1.144212,-1.172326,-1.154934,-0.982064,-1.181761,-1.332481,-1.214111
42,-1.110413,-1.16076,-1.226668,-1.000263,-1.184783,-1.333785,-1.205723
240,0.230974,0.146429,-0.066512,0.010153,1.341759,0.457765,0.216518
273,0.528007,0.464809,0.134125,0.148227,1.591275,0.66052,0.509572


In [6]:
# Persist every fitted scaler to disk, one file per column
for col in data.columns:
    scaler_path = f'../Outputs/Models/scaler_{col}.save'
    joblib.dump(scaler[col], scaler_path)

In [13]:
# Save the transformed data
transformed_path = '../Data/transformed_data.parquet'
transformed_data.to_parquet(transformed_path)