# Exploratory Data Analysis


This notebook explores the hourly Open Power System Data (OPSD) time-series dataset and summarizes its basic statistics, missingness, and temporal structure.

In [None]:
from pathlib import Path
# Locate the repository root: when the kernel runs from inside notebooks/,
# step up one directory; otherwise the working directory is taken as the root.
cwd = Path.cwd()
if cwd.name == 'notebooks':
    repo_root = cwd.parent
else:
    repo_root = cwd

# Hourly single-index CSV under data/raw/.
RAW_PATH = repo_root.joinpath('data', 'raw', 'time_series_60min_singleindex.csv')
print('Raw path:', RAW_PATH)



In [None]:
import pandas as pd

raw_path = RAW_PATH

# Columns this notebook needs: the timestamp plus the load, wind and solar series.
cols = [
    'utc_timestamp',
    'DE_load_actual_entsoe_transparency',
    'DE_wind_generation_actual',
    'DE_solar_generation_actual',
]

# Pass the list itself rather than a callable: with a list, read_csv raises a
# ValueError naming any expected column that is absent from the file, whereas a
# callable filter silently drops it and the failure only shows up in later cells.
df = pd.read_csv(raw_path, usecols=cols)
print(df.shape)
df.head()




In [None]:
import pandas as pd

# Parse the timestamp column into tz-aware UTC datetimes (values that cannot be
# parsed become NaT because of errors='coerce'), then order rows chronologically.
parsed_ts = pd.to_datetime(df['utc_timestamp'], utc=True, errors='coerce')
df = df.assign(utc_timestamp=parsed_ts).sort_values('utc_timestamp')

# Descriptive statistics over every column, the timestamp included.
summary = df.describe(include='all')
summary


In [None]:
# Missingness & gaps
# Fraction of missing entries per column, listed from most to least missing.
is_missing = df.isna()
missing_frac = is_missing.mean().sort_values(ascending=False)
missing_frac


In [None]:
# Hourly coverage
# Build the full hourly UTC grid between the first and last observed timestamps,
# then count the grid points that never occur in the data.
timestamps = df["utc_timestamp"]
full_idx = pd.date_range(
    start=timestamps.min(),
    end=timestamps.max(),
    freq="h",
    tz="UTC",
)
observed_idx = pd.Index(timestamps)
missing_ts = len(full_idx.difference(observed_idx))
missing_ts


## Visual Exploration

The plots below give a quick, accessible view of the dataset: trends, seasonality, distributions, and correlations.

In [None]:
import matplotlib.pyplot as plt

# Basic time-series view
# Map the long source column names to short, unit-suffixed aliases and index
# the three series by timestamp.
short_names = {
    'DE_load_actual_entsoe_transparency': 'load_mw',
    'DE_wind_generation_actual': 'wind_mw',
    'DE_solar_generation_actual': 'solar_mw',
}
plot_df = df.set_index('utc_timestamp')[list(short_names)].rename(columns=short_names)

# One stacked panel per series, all sharing the time axis.
panels = [
    ('load_mw', '#1f77b4', 'Load (MW) — full series'),
    ('wind_mw', '#2ca02c', 'Wind (MW) — full series'),
    ('solar_mw', '#ff7f0e', 'Solar (MW) — full series'),
]
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(12, 9), sharex=True)
for panel_ax, (column, colour, heading) in zip(ax, panels):
    plot_df[column].plot(ax=panel_ax, color=colour, title=heading)
plt.tight_layout()


In [None]:
# Seasonal profiles
# Tag every observation with its hour of day and calendar month. The index was
# parsed with utc=True, so `hour` is the UTC hour, not a local-time hour.
series_cols = ['load_mw', 'wind_mw', 'solar_mw']
tmp = plot_df.assign(hour=plot_df.index.hour, month=plot_df.index.month)

# Mean of each series within every hour-of-day and every month bucket.
hourly = tmp.groupby('hour')[series_cols].mean()
monthly = tmp.groupby('month')[series_cols].mean()

fig, ax = plt.subplots(1, 2, figsize=(12, 4))
hourly.plot(ax=ax[0], title='Average by Hour of Day')
monthly.plot(ax=ax[1], title='Average by Month')

# Label the axes explicitly so each panel can be read on its own; stating the
# time zone stops the hourly profile from being mistaken for local time.
ax[0].set_xlabel('Hour of day (UTC)')
ax[1].set_xlabel('Month')
for panel_ax in ax:
    panel_ax.set_ylabel('Mean power (MW)')
plt.tight_layout()


In [None]:
# Distributions
# One 50-bin histogram per series, laid out side by side.
hist_titles = {
    'load_mw': 'Load Distribution',
    'wind_mw': 'Wind Distribution',
    'solar_mw': 'Solar Distribution',
}
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 4))
for hist_ax, (column, heading) in zip(ax, hist_titles.items()):
    plot_df[column].plot.hist(bins=50, ax=hist_ax, title=heading)
plt.tight_layout()


In [None]:
# Correlation heatmap (numeric only)
# Pairwise correlation between the three series (DataFrame.corr defaults to Pearson).
corr = plot_df[['load_mw','wind_mw','solar_mw']].corr()
labels = list(corr.columns)
positions = range(len(labels))

fig, ax = plt.subplots(figsize=(4, 3))
# Correlations span [-1, 1], so use a diverging colormap centred on zero: a
# sequential map cannot distinguish the sign of a correlation at a glance.
im = ax.imshow(corr, cmap='RdBu_r', vmin=-1, vmax=1)
# The colorbar gives the colour scale a legend (and puts `im` to use).
fig.colorbar(im, ax=ax, label='Pearson r')
ax.set_xticks(positions, labels, rotation=45, ha='right')
ax.set_yticks(positions, labels)
for i in positions:
    for j in positions:
        value = corr.iloc[i, j]
        # Both ends of the diverging map are dark, so switch to white text there
        # and keep black text on the pale cells near zero.
        text_color = 'white' if abs(value) > 0.6 else 'black'
        ax.text(j, i, f'{value:.2f}', ha='center', va='center', color=text_color)
# Set the title on the heatmap axes directly rather than through pyplot state.
ax.set_title('Feature Correlation')
plt.tight_layout()
