# Exploratory Data Analysis


This notebook explores the OPSD time-series data and summarizes basic statistics, missingness, and temporal structure.

In [1]:
from pathlib import Path
# Locate the repository root: if the working directory is a folder named
# 'notebooks', the root is its parent; otherwise the working directory is used.
cwd = Path.cwd()
if cwd.name == 'notebooks':
    repo_root = cwd.parent
else:
    repo_root = cwd

# Raw hourly CSV, addressed relative to the repository root.
RAW_PATH = repo_root.joinpath('data', 'raw', 'time_series_60min_singleindex.csv')
print('Raw path:', RAW_PATH)



Raw path: /Users/pratik_n/Downloads/gridpulse/data/raw/time_series_60min_singleindex.csv


In [2]:
import pandas as pd

raw_path = RAW_PATH

# Columns to keep from the raw CSV: the timestamp plus the DE load, wind and
# solar columns.
cols = [
    'utc_timestamp',
    'DE_load_actual_entsoe_transparency',
    'DE_wind_generation_actual',
    'DE_solar_generation_actual',
]


def _is_selected(column_name):
    """Return True when `column_name` is one of the entries in `cols`."""
    return column_name in cols


# A callable `usecols` is evaluated against each column name in the file, so
# only the matching columns are loaded.
df = pd.read_csv(raw_path, usecols=_is_selected)
print(df.shape)
df.head()




(50401, 4)


Unnamed: 0,utc_timestamp,DE_load_actual_entsoe_transparency,DE_solar_generation_actual,DE_wind_generation_actual
0,2014-12-31T23:00:00Z,,,
1,2015-01-01T00:00:00Z,41151.0,,8852.0
2,2015-01-01T01:00:00Z,40135.0,,9054.0
3,2015-01-01T02:00:00Z,39106.0,,9070.0
4,2015-01-01T03:00:00Z,38765.0,,9163.0


In [3]:
import pandas as pd

# Parse `utc_timestamp` into tz-aware UTC datetimes (values that cannot be
# parsed become NaT because of errors='coerce'), then order rows by time.
df = (
    df
    .assign(
        utc_timestamp=lambda frame: pd.to_datetime(
            frame['utc_timestamp'], utc=True, errors='coerce'
        )
    )
    .sort_values(by='utc_timestamp')
)

# Descriptive statistics across every column, including the timestamp.
summary = df.describe(include='all')
summary


Unnamed: 0,utc_timestamp,DE_load_actual_entsoe_transparency,DE_solar_generation_actual,DE_wind_generation_actual
count,50401,50400.0,50297.0,50326.0
mean,2017-11-15 23:00:00+00:00,55492.468552,4566.042905,11552.23465
min,2014-12-31 23:00:00+00:00,31307.0,0.0,135.0
25%,2016-06-08 23:00:00+00:00,47106.0,0.0,4506.0
50%,2017-11-15 23:00:00+00:00,55092.0,173.0,9015.0
75%,2019-04-24 23:00:00+00:00,64309.25,7342.0,16113.75
max,2020-09-30 23:00:00+00:00,77549.0,32947.0,46064.0
std,,10015.431042,6940.26759,9076.350769


In [4]:
# Missingness: fraction of missing values per column, largest first
missing_frac = (
    df.isnull()
    .mean()
    .sort_values(ascending=False)
)
missing_frac


DE_solar_generation_actual            0.002063
DE_wind_generation_actual             0.001488
DE_load_actual_entsoe_transparency    0.000020
utc_timestamp                         0.000000
dtype: float64

In [6]:
# Hourly coverage: build the complete hourly UTC index spanning the first to
# the last observed timestamp, then count how many of those hours do not
# appear in the data.
ts_col = df["utc_timestamp"]
full_idx = pd.date_range(start=ts_col.min(), end=ts_col.max(), freq="h", tz="UTC")
observed_idx = pd.Index(ts_col)
missing_ts = len(full_idx.difference(observed_idx))
missing_ts


0