## NYISO Load Prediction
- Objective: Utilize the NBEATS model to predict NYISO load data for 2023-12-31 using historical data from 2013-01-01 to 2023-12-30.
- Zones: `N.Y.C.`, `NORTH`, `CENTRL`
- Scaling methods: [definition](https://nixtlaverse.nixtla.io/neuralforecast/common.scalers.html)
     - [`identity`](https://nixtlaverse.nixtla.io/neuralforecast/common.scalers.html#std-statistics)
     - [`standard`](https://nixtlaverse.nixtla.io/neuralforecast/common.scalers.html#std-statistics)
     - [`minmax`](https://nixtlaverse.nixtla.io/neuralforecast/common.scalers.html#minmax-statistics)
     - [`robust`](https://nixtlaverse.nixtla.io/neuralforecast/common.scalers.html#robust-statistics)
     - `revin`: reversible instance normalization, in which learnable normalization parameters are added on top of the usual normalization technique.

In [None]:

import logging
import pickle
import warnings
from collections import defaultdict

import numpy as np
import pandas as pd
from neuralforecast import NeuralForecast
from neuralforecast.models import NBEATS
from scipy.ndimage import gaussian_filter1d

from ts_scaler.data.data_handler import DataHandler
from ts_scaler.utils.logger import setup_logger

# Silence every Python warning and only let ERROR-level log records through.
warnings.filterwarnings("ignore")
logger = setup_logger(level=logging.ERROR)

# Fetch the NYISO records for the requested date range via the project handler.
data_handler = DataHandler(logger=logger)
df = data_handler.fetch_nyiso_data(
    local_dir="../data",
    start_date="20130101",
    end_date="20231231",
)

# Clean-up: keep one row per (time_stamp, zone_name), drop rows with missing
# values, and keep strictly positive loads only.
df = df.drop_duplicates(subset=["time_stamp", "zone_name"]).dropna()
df = df[df["integrated_load"] > 0]

# Parse the timestamps, then rename/select the three columns
# (ds, unique_id, y) that the rest of the notebook works with.
df["time_stamp"] = pd.to_datetime(df["time_stamp"])
ndf = df.rename(
    columns={
        "time_stamp": "ds",
        "integrated_load": "y",
        "zone_name": "unique_id",
    }
)
ndf = ndf[["ds", "unique_id", "y"]]

# Distinct zone identifiers, and the Gaussian kernel width (sigma) that is
# later handed to Normalizer.
zones = ndf["unique_id"].unique()
sigma = 448

In [None]:
# Display the rows that belong to the CAPITL zone.
ndf.loc[ndf["unique_id"] == "CAPITL"]

In [None]:
def create_train_test_split(df, train_length, test_length):
    """Draw one random, contiguous train/test window from a single series.

    The frame is sorted by ``ds`` and a test start position is drawn
    uniformly at random so that exactly ``train_length`` rows precede it and
    exactly ``test_length`` rows start at it.

    Args:
        df: DataFrame holding one series; must contain a ``ds`` column.
        train_length: Number of rows in the training window.
        test_length: Number of rows in the test window.

    Returns:
        A ``(df_train, df_test)`` tuple. ``df_train`` holds the
        ``train_length`` rows immediately before the test window and
        ``df_test`` the ``test_length`` rows of the test window. Both are
        independent copies, so callers may add columns to them.

    Raises:
        ValueError: If ``df`` has fewer than ``train_length + test_length``
            rows.
    """
    # Ensure the DataFrame is sorted by date
    df = df.sort_values(by='ds')

    n_rows = len(df)
    if n_rows < train_length + test_length:
        raise ValueError(
            f"Need at least {train_length + test_length} rows to build a split, "
            f"got {n_rows}."
        )

    # Valid start positions are train_length .. n_rows - test_length inclusive.
    # The lower bound guarantees a full training window (a start of
    # train_length - 1 would turn the slice start into -1 and yield an empty
    # frame); the upper bound guarantees a full test window. randint's upper
    # bound is exclusive, hence the + 1.
    test_start_index = np.random.randint(train_length, n_rows - test_length + 1)

    # Training window: the train_length rows right before the test window.
    df_train = df.iloc[test_start_index - train_length:test_start_index].copy()

    # Test window: test_length rows starting at the sampled position.
    df_test = df.iloc[test_start_index:test_start_index + test_length].copy()

    return df_train, df_test


def generate_train_test_pairs(df, unique_ids, train_length, test_length, num_pairs=100):
    """Sample ``num_pairs`` random train/test splits for every requested series.

    Args:
        df: DataFrame with a ``unique_id`` column identifying each series.
        unique_ids: Iterable of series identifiers to sample from.
        train_length: Rows per training window (forwarded to
            ``create_train_test_split``).
        test_length: Rows per test window (forwarded to
            ``create_train_test_split``).
        num_pairs: How many splits to draw per series.

    Returns:
        Dict mapping each identifier to a list of ``(df_train, df_test)``
        tuples, in the order they were sampled.
    """

    def _sample_pairs(series_df):
        # One independent random split per requested pair.
        return [
            create_train_test_split(series_df, train_length, test_length)
            for _ in range(num_pairs)
        ]

    return {
        series_id: _sample_pairs(df[df.unique_id == series_id])
        for series_id in unique_ids
    }


# Window sizes, counted in rows (the original notes describe the data as hourly).
train_length = 24 * 200  # 200 days of history per training window
test_length = 24 * 1  # one day to forecast

# Every series identifier present in `ndf`.
unique_ids = ndf.unique_id.unique()

# Draw 100 random train/test windows for each identifier.
all_train_test_pairs = generate_train_test_pairs(
    df=ndf,
    unique_ids=unique_ids,
    train_length=train_length,
    test_length=test_length,
    num_pairs=100,
)

# Quick example: the first sampled pair of the first identifier.
example_unique_id = unique_ids[0]
example_train, example_test = all_train_test_pairs[example_unique_id][0]

In [None]:
class Normalizer:
    """Normalize a series by a Gaussian-smoothed local level and local scale.

    ``fit`` estimates, for every position of the fitted series, a level (the
    Gaussian-filtered series) and a scale (square root of the
    Gaussian-filtered squared deviations from that level).

    ``transform`` and ``inverse_transform`` align their input with the LAST
    ``len(input)`` fitted positions; an input longer than the fitted series
    is rejected.
    """

    # Replacement used wherever the fitted scale is zero or NaN, so that the
    # division in ``transform`` stays finite. ``inverse_transform`` applies
    # the same replacement so the two methods are exact inverses.
    _MIN_STD = 1e-6

    def __init__(self, sigma=2):
        # Standard deviation of the Gaussian kernel passed to gaussian_filter1d.
        self.sigma = sigma
        # Fitted local level; None until fit() has been called.
        self.smoothed_y = None
        # Fitted local scale (unguarded); None until fit() has been called.
        self.std_y = None

    @staticmethod
    def _as_filled_float(y):
        """Return ``y`` as a float array with NaNs forward- then back-filled."""
        # Cast to float first: gaussian_filter1d returns its input dtype, so
        # integer input would otherwise give a truncated integer level.
        values = np.asarray(y, dtype=float)
        return pd.Series(values).ffill().bfill().values

    def _fitted_tail(self, length, caller):
        """Return (level, guarded scale) for the last ``length`` fitted positions."""
        if self.smoothed_y is None or self.std_y is None:
            raise RuntimeError(f"The normalizer must be fitted before calling {caller}.")

        fitted_length = len(self.smoothed_y)
        if length > fitted_length:
            raise ValueError(
                f"Input has {length} values but the normalizer was fitted on "
                f"only {fitted_length}."
            )

        # Slice from an explicit start index: `[-length:]` would select the
        # whole array when length == 0.
        start = fitted_length - length
        level = self.smoothed_y[start:]
        scale = self.std_y[start:]

        # Handle any NaN or zero scale values to avoid extreme results.
        scale = np.where(np.isnan(scale) | (scale == 0), self._MIN_STD, scale)
        return level, scale

    def fit(self, y):
        """Estimate the local level and scale of ``y``; returns ``self``."""
        y = self._as_filled_float(y)

        # Local level: Gaussian kernel smoothing of the series itself.
        self.smoothed_y = gaussian_filter1d(y, sigma=self.sigma)

        # Local scale: smooth the squared deviations from the level, then
        # take the square root.
        deviations = y - self.smoothed_y
        smoothed_squared_deviations = gaussian_filter1d(deviations ** 2, sigma=self.sigma)
        self.std_y = np.sqrt(smoothed_squared_deviations)

        return self

    def transform(self, y):
        """Normalize ``y`` against the last ``len(y)`` fitted positions.

        Raises:
            RuntimeError: If the normalizer has not been fitted.
            ValueError: If ``y`` is longer than the fitted series.
        """
        y = self._as_filled_float(y)
        level, scale = self._fitted_tail(len(y), "transform")

        normalized_y = (y - level) / scale

        # Forward-fill any remaining NaN values in the normalized output.
        return pd.Series(normalized_y).ffill().values

    def fit_transform(self, y):
        """Fit on ``y`` and return its normalized values."""
        self.fit(y)
        return self.transform(y)

    def inverse_transform(self, normalized_y):
        """Map normalized values back using the last ``len(normalized_y)`` fitted positions.

        Raises:
            RuntimeError: If the normalizer has not been fitted.
            ValueError: If ``normalized_y`` is longer than the fitted series.
        """
        level, scale = self._fitted_tail(len(normalized_y), "inverse_transform")
        return normalized_y * scale + level


In [None]:
# One fitted Normalizer per zone, keyed by unique_id.
normalizers = {}

# Per-zone train / hold-out frames; concatenated after the loop.
train_dfs = []
test_dfs = []

# Number of trailing rows held out per zone (168 = 7 * 24).
holdout_length = 7 * 24

# Loop through each unique_id group
for unique_id, group in ndf.groupby('unique_id'):
    # Sort the group by date so the hold-out is the most recent block of rows.
    group = group.sort_values(by='ds')

    # Take explicit copies: the new column below must land on independent
    # frames rather than on slices of `group` (ambiguous view/copy semantics).
    train_data = group.iloc[:-holdout_length].copy()
    test_data = group.iloc[-holdout_length:].copy()

    # Fit the normalizer on the training data only.
    normalizer = Normalizer(sigma=sigma)
    train_data['normalized_y'] = normalizer.fit_transform(train_data['y'].values)

    # Transform the hold-out data with the statistics fitted on the training data.
    test_data['normalized_y'] = normalizer.transform(test_data['y'].values)

    # Keep the fitted normalizer and the processed frames for this zone.
    normalizers[unique_id] = normalizer
    train_dfs.append(train_data)
    test_dfs.append(test_data)

# Concatenate the processed data back into single DataFrames
train_df = pd.concat(train_dfs)
test_df = pd.concat(test_dfs)

In [None]:
def train_and_prediction_smoothing(df_train, df_test, normalizer):
    """Fit one identity-scaled NBEATS model and de-normalize its forecast.

    Args:
        df_train: Training frame passed to ``NeuralForecast.fit``; must
            contain ``unique_id`` and ``ds`` columns.
        df_test: Frame whose row count sets the forecast horizon and whose
            rows are left-joined with the forecast on ``unique_id``/``ds``.
        normalizer: Object whose ``inverse_transform`` maps the model output
            back from the normalized scale.

    Returns:
        DataFrame indexed by ``ds`` (datetime) holding the training rows
        followed by the test rows, with the de-normalized forecast in the
        ``'NBEATS - Smoothing'`` column.
    """
    horizon = len(df_test)

    # Look-back window of seven horizons, no internal scaling.
    model = NBEATS(input_size=horizon * 7, h=horizon, max_steps=500, scaler_type='identity')

    forecaster = NeuralForecast(models=[model], freq='H')
    forecaster.fit(df=df_train)

    # Forecast, then undo the external normalization on the model column.
    forecast = forecaster.predict().reset_index()
    forecast['NBEATS'] = normalizer.inverse_transform(forecast['NBEATS'].values)

    # Attach the forecast to the test rows and stack them under the training rows.
    merged = df_test.merge(forecast, how='left', on=['unique_id', 'ds'])
    stacked = pd.concat([df_train, merged])
    stacked = stacked.drop("unique_id", axis=1).set_index('ds')
    stacked = stacked.rename(columns={'NBEATS': 'NBEATS - Smoothing'})
    stacked.index = pd.to_datetime(stacked.index)
    return stacked


def train_and_prediction(df_train, df_test):
    """Fit two NBEATS models (identity and revin scaling) and collect their forecasts.

    Args:
        df_train: Training frame passed to ``NeuralForecast.fit``; must
            contain ``unique_id`` and ``ds`` columns.
        df_test: Frame whose row count sets the forecast horizon and whose
            rows are left-joined with the forecasts on ``unique_id``/``ds``.

    Returns:
        DataFrame indexed by ``ds`` (datetime) holding the training rows
        followed by the test rows, with the forecasts in the
        ``'NBEATS - Identity'`` and ``'NBEATS - Revin'`` columns.
    """
    horizon = len(df_test)
    lookback = horizon * 7

    # Same architecture twice; only the built-in scaler differs.
    models = [
        NBEATS(input_size=lookback, h=horizon, max_steps=500, scaler_type=scaler)
        for scaler in ('identity', 'revin')
    ]

    forecaster = NeuralForecast(models=models, freq='H')
    forecaster.fit(df=df_train)
    forecast = forecaster.predict().reset_index()

    # Attach the forecasts to the test rows and stack them under the training rows.
    merged = df_test.merge(forecast, how='left', on=['unique_id', 'ds'])
    stacked = pd.concat([df_train, merged])
    stacked = stacked.drop("unique_id", axis=1).set_index('ds')
    stacked = stacked.rename(columns={
        'NBEATS': 'NBEATS - Identity',
        'NBEATS1': 'NBEATS - Revin',
    })
    stacked.index = pd.to_datetime(stacked.index)
    return stacked


def process_and_predict(df_train, df_test, zone, normalizer):
    """Run the raw-scale and the externally-normalized NBEATS forecasts for one split.

    Args:
        df_train: Training frame with ``ds``, ``unique_id``, ``y`` and
            ``normalized_y`` columns.
        df_test: Test frame with the same columns.
        zone: Identifier written to the ``unique_id`` column of the result.
        normalizer: Fitted object used to de-normalize the forecast trained
            on ``normalized_y``.

    Returns:
        The frame produced by ``train_and_prediction`` (indexed by ``ds``),
        extended with a ``'NBEATS - Smoothing'`` column, a ``ds`` column and
        a ``unique_id`` column set to ``zone``.
    """
    # reset_index() returns new frames, so the coercions below do not touch
    # the caller's objects.
    df_train = df_train.reset_index()
    df_test = df_test.reset_index()
    for frame in (df_train, df_test):
        frame['unique_id'] = frame['unique_id'].astype(str)
        frame['ds'] = pd.to_datetime(frame['ds'])
        frame['y'] = frame['y'].astype(float)

    def _normalized_view(frame):
        # Present normalized_y under the name `y`, which is what the
        # forecasting helper receives as its target column.
        return frame[["ds", "unique_id", "normalized_y"]].rename(columns={"normalized_y": "y"})

    # Forecasts trained on the raw-scale target (identity and revin scalers).
    raw_columns = ["ds", "y", "unique_id"]
    all_prediction_df = train_and_prediction(df_train[raw_columns], df_test[raw_columns])

    # Forecast trained on the externally normalized target, mapped back to
    # the original scale by `normalizer`.
    smoothing_df = train_and_prediction_smoothing(
        _normalized_view(df_train), _normalized_view(df_test), normalizer
    )
    all_prediction_df['NBEATS - Smoothing'] = smoothing_df['NBEATS - Smoothing']

    # Re-expose the timestamps as a regular column and tag the rows with the
    # zone. The inputs are already type-coerced above, so no further
    # conversion is needed here.
    df_combined = pd.concat([df_train, df_test]).drop_duplicates(subset=['ds', 'unique_id'])
    all_prediction_df['ds'] = df_combined['ds'].values
    all_prediction_df["unique_id"] = zone

    return all_prediction_df


In [None]:

# Run the full experiment: for every zone and every sampled train/test pair,
# fit a fresh Normalizer, add the normalized target to both frames, and
# collect the predictions.
zones = ndf.unique_id.unique()
# zone -> list of prediction frames, one per sampled pair
all_prediction_dfs = defaultdict(list)
# zone -> list of fitted normalizers, in the same order as the pairs
normalizers = defaultdict(list)

for zone in zones:
    # NOTE: train_df / test_df are the very frames stored in
    # all_train_test_pairs, so the 'normalized_y' column assigned below is
    # added to those stored frames as well.
    for train_df, test_df in all_train_test_pairs[zone]:
        normalizer = Normalizer(sigma=sigma)
        # Fit the normalizer on the training data only
        train_df['normalized_y'] = normalizer.fit_transform(train_df['y'].values)

        # Transform the test data using the fitted normalizer
        test_df['normalized_y'] = normalizer.transform(test_df['y'].values)

        # Store the normalizer in the dictionary
        normalizers[zone].append(normalizer)

        # Train the models on this pair and keep the resulting prediction frame
        predicted_df = process_and_predict(train_df, test_df, zone, normalizer)
        all_prediction_dfs[zone].append(predicted_df)

In [None]:
# Persist the experiment artefacts as pickle files (paths are relative to the
# current working directory), one file per object.
for pickle_path, payload in (
    ('all_prediction_dfs.pkl', all_prediction_dfs),
    ('normalizers.pkl', normalizers),
    ('all_train_test_pairs.pkl', all_train_test_pairs),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

print("Data saved to pickle files successfully.")