# PJM Hourly Energy Consumption Case

PJM Interconnection LLC (PJM) is a regional transmission organization (RTO) in the United States. It is part of the Eastern Interconnection grid and operates an electric transmission system that serves all or parts of Delaware, Illinois, Indiana, Kentucky, Maryland, Michigan, New Jersey, North Carolina, Ohio, Pennsylvania, Tennessee, Virginia, West Virginia, and the District of Columbia.

The hourly power consumption data come from PJM's website and are expressed in megawatts (MW).

### XGBoost Training Step - By Sabrina Otoni da Silva - 2024/04

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

import xgboost as xgb

In [None]:
# Directory holding the intermediate (d02) datasets, relative to the notebook.
datapath = Path('..') / 'data' / 'd02_intermediate'

In [None]:
# Load the intermediate PJME table, index it by its 'datetime' column
# (converted to real timestamps) and order the rows chronologically.
df = pd.read_csv(f'{datapath}/pjme_n03.csv').set_index('datetime')
df.index = pd.to_datetime(df.index)
df.sort_index(inplace=True)

In [None]:
# Columns of `df` used as model inputs.
feature_columns = [
    'hour',
    'dayofweek',
    'quarter',
    'month',
    'year',
    'dayofyear',
    'day',
    'weekofyear',
    'lag1',
    'lag2',
    'lag3',
]

# x: feature DataFrame, y: target Series -- both keep df's row order and index.
x = df[feature_columns]
y = df['pjme_mw']

In [None]:
# Time-ordered cross-validation splitter: 5 folds, each test fold spanning
# 8,760 rows (one year if rows are hourly -- confirm against the data), with
# 24 rows skipped between the end of each train set and its test set.
tss = TimeSeriesSplit(
    gap=24,
    test_size=365 * 24,
    n_splits=5,
)

In [None]:
# grid_params = {'n_estimators': [int(x) for x in np.linspace(200, 1000, 3)],
#               'max_depth': [int(x) for x in np.linspace(5, 55, 11)],
#               'max_features': ['auto', 'sqrt', 'log2'],
#               'random_state': [42]
#               }
# refit = True  # Refit an estimator using the best found parameters on the whole dataset
# scoring = 'neg_mean_squared_error'  # Strategy to evaluate the performance of the cross-validated model on the test set
# n_jobs = -1  # Number of jobs to run in parallel
# tscv = TimeSeriesSplit(n_splits=5)

# grid_search = GridSearchCV(estimator=model, param_grid=grid_params, refit=refit,
#                            scoring=scoring, cv=tscv, n_jobs=n_jobs).fit(X, y)
# print(f'Model: {model} best params are: {grid_search.best_params_}')

In [None]:
# grid_search = GridSearchCV(estimator=, param_grid=, cv=tss, 
#                            scoring='roc_auc', verbose=2, error_score="raise")
# grid_search.fit(X_train, y_train)

In [None]:
# Time-series cross-validation: fit one XGBoost regressor per fold produced by
# `tss` and collect each fold's test predictions and RMSE.
preds = []   # per-fold prediction arrays, in fold order
scores = []  # per-fold RMSE values, in fold order

for train_idx, test_idx in tss.split(df):
    # Row slices of the full frame for this fold (positional indices).
    train = df.iloc[train_idx]
    test = df.iloc[test_idx]

    # `x` is the feature DataFrame and `y` the target Series, both row-aligned
    # with `df` -- they are data, not column labels. Indexing `train[x]` /
    # `train[y]` therefore does not select columns and fails at runtime, so
    # slice them positionally with the same fold indices instead.
    X_train = x.iloc[train_idx]
    y_train = y.iloc[train_idx]

    X_test = x.iloc[test_idx]
    y_test = y.iloc[test_idx]

    reg = xgb.XGBRegressor(base_score=0.5,
                           booster='gbtree',
                           n_estimators=1_000,
                           early_stopping_rounds=50,
                           objective='reg:squarederror',
                           max_depth=3,
                           learning_rate=0.01)

    # NOTE(review): the test fold is also passed as an evaluation set, so early
    # stopping presumably monitors the same data the fold is scored on; the
    # resulting RMSE may be optimistic -- confirm this is intended.
    reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=100)

    y_pred = reg.predict(X_test)
    preds.append(y_pred)

    # Root-mean-squared error of this fold's predictions.
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)

In [None]:
# Report the mean of the per-fold scores, then the individual fold scores,
# one per output line.
print(
    f'Score across folds: {np.mean(scores):0.4f}',
    f'Fold scores: {scores}',
    sep='\n',
)