# Feature Pipeline


This notebook runs the core data pipeline: validate the raw Open Power System Data (OPSD) files, build features, and create time splits.

In [None]:
from pathlib import Path

# Expected location of the raw OPSD CSV, relative to the working directory.
raw_dir = Path('data/raw')
opsd = raw_dir.joinpath('time_series_60min_singleindex.csv')

# Purely informational: report whether the file is present, never raise.
if opsd.exists():
    print('Found:', opsd)
else:
    print('Missing expected CSV:', opsd)
    print('If you already downloaded OPSD, copy it into data/raw/')


In [None]:
import subprocess
import sys

# Pipeline stages, in execution order: (module to run with -m, CLI arguments).
# All paths are relative, so the cell must run from the directory that
# contains data/ and reports/.
pipeline_stages = [
    (
        'gridpulse.data_pipeline.validate_schema',
        ['--in', 'data/raw', '--report', 'reports/data_quality_report.md'],
    ),
    (
        'gridpulse.data_pipeline.build_features',
        ['--in', 'data/raw', '--out', 'data/processed'],
    ),
    (
        'gridpulse.data_pipeline.split_time_series',
        ['--in', 'data/processed/features.parquet', '--out', 'data/processed/splits'],
    ),
]

for stage_module, stage_args in pipeline_stages:
    # Use the kernel's own interpreter instead of whatever `python` resolves
    # to on PATH, so each stage runs in the same environment as this notebook.
    # check=True raises CalledProcessError on a non-zero exit, which stops the
    # remaining stages from running after a failure.
    subprocess.run([sys.executable, '-m', stage_module, *stage_args], check=True)


In [None]:
import pandas as pd

# Load the processed feature table (the same file the split step takes as input).
features_path = 'data/processed/features.parquet'
df = pd.read_parquet(features_path)

# Quick sanity check: print the dimensions, then show the leading rows as the
# cell's output (head() must stay the last expression to be displayed).
print(df.shape)
df.head()
