# ERCOT Data Exploration

Exploratory data analysis of the ERCOT hourly load dataset used in the benchmark.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from energy_benchmark.data import ERCOTLoader
from energy_benchmark.data.preprocessing import preprocess_series

# Load the 2020-2024 ERCOT data and pass it straight through the project's
# preprocessing step; `loader` and `series` are reused by the cells below.
loader = ERCOTLoader(years=list(range(2020, 2025)))
series = preprocess_series(loader.load())

# Quick sanity check: sample count and the time span covered by the index.
first_ts, last_ts = series.index.min(), series.index.max()
print(f"Total observations: {len(series):,}")
print(f"Date range: {first_ts} — {last_ts}")

# Left as the final expression so the notebook renders the summary statistics.
series.describe()

## Time Series Overview

In [None]:
# Single wide panel with the full series; a thin, slightly transparent line
# keeps the densely sampled curve readable.
fig, ax = plt.subplots(figsize=(14, 4))
series.plot(ax=ax, linewidth=0.3, alpha=0.8)
ax.set(
    ylabel="Load (MW)",
    title="ERCOT Total System Load (2020–2024)",
)
fig.tight_layout()
plt.show()

## Seasonal Patterns

In [None]:
# Aggregate first, draw second: mean load grouped by hour of day and by
# calendar month of the timestamp index.
hourly_avg = series.groupby(series.index.hour).mean()
monthly_avg = series.groupby(series.index.month).mean()

fig, axes = plt.subplots(1, 2, figsize=(14, 4))
daily_ax, monthly_ax = axes

# Left panel: average daily profile as a line with point markers.
daily_ax.plot(hourly_avg.index, hourly_avg.values, "o-")
daily_ax.set(
    xlabel="Hour of day",
    ylabel="Mean load (MW)",
    title="Average Daily Load Profile",
)

# Right panel: one bar per calendar month.
monthly_ax.bar(monthly_avg.index, monthly_avg.values)
monthly_ax.set(
    xlabel="Month",
    ylabel="Mean load (MW)",
    title="Average Monthly Load",
)

fig.tight_layout()
plt.show()

## Distribution

In [None]:
# Long-form frame for the boxplot: the load values plus the calendar year
# taken from the timestamp index.
df_box = series.to_frame("load").assign(year=lambda frame: frame.index.year)

fig, axes = plt.subplots(1, 2, figsize=(14, 4))
hist_ax, box_ax = axes

# Left panel: histogram of all load values.
series.hist(bins=80, ax=hist_ax, edgecolor="white")
hist_ax.set(xlabel="Load (MW)", title="Load Distribution")

# Right panel: one box per year for a year-over-year comparison.
sns.boxplot(data=df_box, x="year", y="load", ax=box_ax)
box_ax.set(title="Load Distribution by Year", ylabel="Load (MW)")

fig.tight_layout()
plt.show()

## Autocorrelation

In [None]:
from pandas.plotting import autocorrelation_plot

# Only the first 8,760 samples (365 days at hourly resolution) are used, to
# keep the autocorrelation computation fast.
first_year = series.iloc[:8760]

fig, ax = plt.subplots(figsize=(14, 3))
autocorrelation_plot(first_year, ax=ax)

# Show lags up to 720 samples, i.e. 30 days of hourly data.
ax.set_xlim(0, 720)
ax.set_title("Autocorrelation (first year, up to 30 days)")

fig.tight_layout()
plt.show()

## Train / Val / Test Split

In [None]:
train, val, test = loader.split(series)

# Report the size and first/last calendar date of each partition returned by
# the loader. The prefixes carry their own padding so the columns line up.
for prefix, part in (("Train: ", train), ("Val:   ", val), ("Test:  ", test)):
    span_start = part.index.min().date()
    span_end = part.index.max().date()
    print(f"{prefix}{len(part):,} hrs ({span_start} — {span_end})")