## NYISO Load Prediction
- Objective: Utilize the NBEATS model to predict NYISO load data for 2023-12-31 using historical data from 2013-01-01 to 2023-12-30.
- Zones: `N.Y.C.`, `NORTH`, `CENTRL`
- Scaling methods: [definition](https://nixtlaverse.nixtla.io/neuralforecast/common.scalers.html)
     - [`identity`](https://nixtlaverse.nixtla.io/neuralforecast/common.scalers.html#identity-statistics)
     - [`standard`](https://nixtlaverse.nixtla.io/neuralforecast/common.scalers.html#std-statistics)
     - [`minmax`](https://nixtlaverse.nixtla.io/neuralforecast/common.scalers.html#minmax-statistics)
     - [`robust`](https://nixtlaverse.nixtla.io/neuralforecast/common.scalers.html#robust-statistics)
     - `revin`: Reversible Instance Normalization (RevIN); learnable normalization parameters are added on top of the usual normalization technique.

In [None]:

import logging
import pickle
import warnings

import numpy as np
import pandas as pd
from neuralforecast import NeuralForecast
from neuralforecast.models import NBEATS
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from ts_scaler.data.data_handler import DataHandler
from ts_scaler.utils.logger import setup_logger

# Silence every Python warning and keep the project logger at ERROR level.
warnings.filterwarnings("ignore")
logger = setup_logger(level=logging.ERROR)

# Load the NYISO records for 20130101-20231231 through the project DataHandler.
data_handler = DataHandler(logger=logger)
df = data_handler.fetch_nyiso_data(local_dir="../data", start_date="20130101", end_date="20231231")

# Cleaning: one row per (time_stamp, zone_name), no missing values,
# and only strictly positive load readings.
df = df.drop_duplicates(subset=['time_stamp', 'zone_name']).dropna()
df = df[df['integrated_load'] > 0]

# Parse the timestamps, then project onto the (ds, unique_id, y) long format
# that the forecasting cells below work with.
df['time_stamp'] = pd.to_datetime(df['time_stamp'])
column_map = {"time_stamp": "ds", "integrated_load": "y", "zone_name": "unique_id"}
ndf = df.rename(columns=column_map)[["ds", "unique_id", "y"]]

# Every distinct zone present in the cleaned data.
zones = ndf["unique_id"].unique()

In [None]:
# Display label for each NeuralForecast ``scaler_type`` compared in this notebook.
# Insertion order is the order in which the models are built.
_SCALER_LABELS = {
    'identity': 'Identity',
    'standard': 'Standard',
    'minmax': 'MinMax',
    'robust': 'Robust',
    'revin': 'Revin',
}


def _fit_and_forecast(df_train, df_test, scaler_types):
    """Fit one NBEATS model per scaler type and return history plus forecasts.

    Args:
        df_train: Long-format frame with ``ds``, ``unique_id`` and ``y`` columns
            used for fitting.
        df_test: Frame with the same columns; its row count defines the forecast
            horizon and its rows are left-joined with the forecasts.
        scaler_types: ``scaler_type`` values (keys of ``_SCALER_LABELS``), one
            NBEATS model is trained per entry.

    Returns:
        ``df_train`` stacked on top of the forecast-augmented ``df_test``,
        indexed by ``ds`` (datetime), without ``unique_id``, and with the model
        columns renamed to ``'NBEATS - <Label>'``.

    Raises:
        ValueError: If ``df_test`` is empty, since the horizon would be zero.
    """
    horizon = len(df_test)
    if horizon == 0:
        raise ValueError("df_test must contain at least one row to define the forecast horizon.")

    # The look-back window is seven times the forecast horizon.
    models = [
        NBEATS(input_size=horizon * 7, h=horizon, max_steps=500, scaler_type=scaler_type)
        for scaler_type in scaler_types
    ]

    nf = NeuralForecast(models=models, freq='H')
    nf.fit(df=df_train)
    forecast_df = nf.predict().reset_index()

    # Keep the observed test values next to the forecasts for the same timestamps.
    forecast_df = df_test.merge(forecast_df, how='left', on=['unique_id', 'ds'])

    # Forecast columns for repeated models are named NBEATS, NBEATS1, NBEATS2, ...
    # in model order; map them to readable per-scaler labels.
    column_labels = {
        ('NBEATS' if position == 0 else f'NBEATS{position}'): f'NBEATS - {_SCALER_LABELS[scaler_type]}'
        for position, scaler_type in enumerate(scaler_types)
    }

    plot_df = (
        pd.concat([df_train, forecast_df])
        .drop("unique_id", axis=1)
        .set_index('ds')
        .rename(columns=column_labels)
    )
    plot_df.index = pd.to_datetime(plot_df.index)
    return plot_df


def train_and_prediction_identity(df_train, df_test, zone):
    """Forecast ``df_test`` with a single NBEATS model using the ``identity`` scaler.

    Args:
        df_train: Training frame with ``ds``, ``unique_id`` and ``y`` columns.
        df_test: Test frame with the same columns; its length is the horizon.
        zone: Unused; kept so existing callers keep working.

    Returns:
        Frame indexed by ``ds`` containing ``y`` and ``'NBEATS - Identity'``.

    Raises:
        ValueError: If ``df_test`` is empty.
    """
    return _fit_and_forecast(df_train, df_test, ['identity'])


def train_and_prediction(df_train, df_test, zone):
    """Forecast ``df_test`` with one NBEATS model per scaler type.

    The scalers compared are identity, standard, minmax, robust and revin.

    Args:
        df_train: Training frame with ``ds``, ``unique_id`` and ``y`` columns.
        df_test: Test frame with the same columns; its length is the horizon.
        zone: Unused; kept so existing callers keep working.

    Returns:
        Frame indexed by ``ds`` containing ``y`` and one ``'NBEATS - <Label>'``
        column per scaler.

    Raises:
        ValueError: If ``df_test`` is empty.
    """
    return _fit_and_forecast(df_train, df_test, list(_SCALER_LABELS))


def process_and_predict(df_train, df_test, zone):
    """Run the plain and the PCA-residual NBEATS forecasts for one zone.

    Two forecasts are produced and combined:

    * every scaler variant trained directly on ``y`` (``train_and_prediction``);
    * an identity-scaled model trained on the PCA ``residuals``, whose forecast
      is added back to the ``reconstructed`` column to give ``'NBEATS - PCA'``.

    Args:
        df_train: Training frame indexed by ``ds`` with ``unique_id``, ``y``,
            ``residuals`` and ``reconstructed`` columns.
        df_test: Test frame with the same layout.
        zone: Zone identifier written to the ``unique_id`` column of the result.

    Returns:
        Frame indexed by ``ds`` with ``unique_id`` first, followed by the columns
        returned by ``train_and_prediction`` and a final ``'NBEATS - PCA'`` column.
    """

    def _normalise(frame):
        # Work on a fresh frame (reset_index copies) with consistent dtypes.
        frame = frame.reset_index()
        # Rows with a missing id (e.g. timestamps inserted while regularising
        # the series) would otherwise become the literal string 'nan' and be
        # treated by the forecaster as a second, bogus series.
        frame['unique_id'] = frame['unique_id'].fillna(zone).astype(str)
        frame['ds'] = pd.to_datetime(frame['ds'])
        frame['y'] = frame['y'].astype(float)
        return frame

    def _residual_view(frame):
        # Present the PCA residuals as the target column 'y'.
        renamed = frame[["ds", "residuals", "unique_id"]].rename(columns={"residuals": "y"})
        return renamed[["ds", "unique_id", "y"]]

    df_train = _normalise(df_train)
    df_test = _normalise(df_test)

    # Forecast of the residual signal (identity scaling only).
    pca_prediction_df = train_and_prediction_identity(
        _residual_view(df_train), _residual_view(df_test), zone
    )

    # Forecasts of the raw load with every scaler.
    all_prediction_df = train_and_prediction(df_train[["ds", "y", "unique_id"]],
                                             df_test[["ds", "y", "unique_id"]],
                                             zone)

    # PCA reconstruction for every timestamp, keyed by 'ds' so the addition
    # below aligns on the index instead of relying on row positions.
    reconstructed = (
        pd.concat([df_train, df_test])
        .drop_duplicates(subset=['ds', 'unique_id'])
        .set_index('ds')['reconstructed']
        .astype(float)
    )

    # Residual forecast + reconstruction = forecast on the original scale.
    # Training timestamps have no forecast, so they stay NaN.
    all_prediction_df['NBEATS - PCA'] = pca_prediction_df['NBEATS - Identity'] + reconstructed

    # Tag the result with its zone; 'unique_id' is kept as the first column.
    all_prediction_df = all_prediction_df.drop(columns='unique_id', errors='ignore')
    all_prediction_df.insert(0, 'unique_id', zone)

    return all_prediction_df.rename_axis('ds')


def add_residuals_and_reconstructed(df, unique_id='N.Y.C.', interpolation_method='linear', n_components=20):
    """Split one zone into train/test and add PCA reconstruction and residual columns.

    The zone's series is put on a regular hourly grid (gaps in ``y`` are
    interpolated). The test set is the Monday of the week that ends on the last
    Sunday in the data; the training set is every whole Monday-to-Sunday week
    before it. Training weeks are standardised, projected onto ``n_components``
    principal components and reconstructed; ``reconstructed`` holds that
    reconstruction in the original scale and ``residuals`` holds
    ``y - reconstructed``.

    NOTE(review): the test-day reconstruction is computed from the test day's
    own observed ``y`` (zero-padded to a full week in standardised space), so
    ``reconstructed`` for the test rows depends on the values being forecast.
    Confirm this is intended before using it in an evaluation.

    Args:
        df: Long-format frame with ``ds``, ``unique_id`` and ``y`` columns.
        unique_id: Zone to process.
        interpolation_method: Method passed to ``Series.interpolate`` for gaps.
        n_components: Number of principal components to keep; reduced when
            there are fewer training weeks than requested components.

    Returns:
        Tuple ``(df_train, df_test)``, both indexed by ``ds`` and carrying the
        extra ``residuals`` and ``reconstructed`` columns.

    Raises:
        KeyError: If ``df`` has no ``ds`` column.
        ValueError: If the data has no Monday or no Sunday, or is too short to
            provide at least one whole training week plus a 24-hour test day.
            (Previously the first case returned a bare DataFrame, which broke
            callers that unpack the two-element result.)
    """
    hours_per_day = 24
    hours_per_week = 7 * hours_per_day

    # Filter the data for the specified unique_id
    df_filtered = df[df['unique_id'] == unique_id].copy()

    if 'ds' not in df_filtered.columns:
        raise KeyError("'ds' column not found in the DataFrame")

    df_filtered['ds'] = pd.to_datetime(df_filtered['ds'])
    df_filtered = df_filtered.set_index('ds')

    # Regular hourly grid; timestamps missing from the input become NaN rows.
    df_filtered = df_filtered.asfreq(pd.offsets.Hour())

    # The inserted rows have no id either; restore it so they are not later
    # mistaken for a separate series.
    df_filtered['unique_id'] = unique_id

    # Interpolate missing values
    df_filtered['y'] = df_filtered['y'].interpolate(method=interpolation_method)

    mondays = df_filtered.index[df_filtered.index.weekday == 0]
    sundays = df_filtered.index[df_filtered.index.weekday == 6]
    if len(mondays) == 0 or len(sundays) == 0:
        raise ValueError("The dataset does not contain the required Monday and Sunday dates for processing.")

    # Align both boundaries to midnight so the training window always spans
    # whole weeks, even if the data starts or ends part-way through a day.
    first_monday = mondays[0]
    if first_monday != first_monday.normalize():
        first_monday = first_monday.normalize() + pd.Timedelta(days=7)
    last_monday = sundays[-1].normalize() - pd.Timedelta(days=6)

    # Test: the 24 hours of the last Monday. Train: everything from the first
    # Monday up to the hour before the test day. Copies, because new columns
    # are added below.
    df_test = df_filtered.loc[last_monday:last_monday + pd.Timedelta(hours=hours_per_day - 1)].copy()
    df_train = df_filtered.loc[first_monday:last_monday - pd.Timedelta(hours=1)].copy()

    if len(df_test) != hours_per_day or len(df_train) == 0 or len(df_train) % hours_per_week != 0:
        raise ValueError("Not enough data to build a whole-week training window and a 24-hour test day.")

    # One row per training week, one column per hour of the week.
    weekly_train = df_train['y'].to_numpy().reshape(-1, hours_per_week)

    # Standardise each hour-of-week column.
    scaler = StandardScaler()
    weekly_scaled = scaler.fit_transform(weekly_train)

    # PCA cannot keep more components than there are weeks or features.
    pca = PCA(n_components=min(n_components, weekly_train.shape[0], weekly_train.shape[1]))
    pca.fit(weekly_scaled)

    # Project onto the retained components and map back to the original scale.
    train_scores = pca.transform(weekly_scaled)
    train_reconstructed = scaler.inverse_transform(pca.inverse_transform(train_scores)).flatten()

    df_train['residuals'] = df_train['y'].to_numpy() - train_reconstructed
    df_train['reconstructed'] = train_reconstructed

    # The test day is a Monday, i.e. the first 24 hour-of-week features.
    day_mean = scaler.mean_[:hours_per_day]
    day_scale = scaler.scale_[:hours_per_day]
    test_values = df_test['y'].to_numpy()

    # Pad the rest of the week with zeros (the training mean in standardised
    # space) so the vector has the 168 features the PCA expects.
    padded_test_scaled = np.zeros((1, hours_per_week))
    padded_test_scaled[0, :hours_per_day] = (test_values - day_mean) / day_scale

    # Reconstruct with the training PCA and keep only the Monday slice.
    test_scores = pca.transform(padded_test_scaled)
    test_reconstructed_scaled = pca.inverse_transform(test_scores)[:, :hours_per_day]
    test_reconstructed = (test_reconstructed_scaled * day_scale + day_mean).flatten()

    df_test['residuals'] = test_values - test_reconstructed
    df_test['reconstructed'] = test_reconstructed

    return df_train, df_test



In [None]:
# Per-zone train/test frames, each carrying the PCA residual/reconstruction columns.
train_dfs, test_dfs = {}, {}

for zone in zones:
    zone_train, zone_test = add_residuals_and_reconstructed(ndf, zone)
    train_dfs[zone] = zone_train
    test_dfs[zone] = zone_test

In [None]:
# Forecast every zone from its prepared train/test frames, keyed by zone.
all_prediction_dfs = {}
for zone in zones:
    zone_train, zone_test = train_dfs[zone], test_dfs[zone]
    all_prediction_dfs[zone] = process_and_predict(zone_train, zone_test, zone)

In [None]:
# Persist the per-zone inputs, the forecasts and the zone list, one pickle
# file per object, written to the current working directory.
artifacts = {
    'train_dfs.pkl': train_dfs,
    'test_dfs.pkl': test_dfs,
    'all_prediction_dfs.pkl': all_prediction_dfs,
    'zones.pkl': zones,
}
for filename, payload in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(payload, f)

print("Data saved to pickle files successfully.")