In [1]:
"""
Load forecasting pipeline notebook 
Sections:
 A) LightGBM pipeline for Germany (DE) load forecasting
 B) Multi-country baseline evaluation (Top 3 models: Persistence, LightGBM, SARIMAX)
 C) Probabilistic forecasting (Quantile LightGBM) for DE day-ahead price
 Usage: run in order, top to bottom (restart the kernel and run all cells).
"""
import os
import json
import math
import warnings
warnings.filterwarnings("ignore")
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import lightgbm as lgb
import joblib
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import mlflow
import mlflow.lightgbm

In [2]:
# Fix the seed of NumPy's global random generator so that repeated
# runs of the notebook draw the same pseudo-random numbers.
random_seed = 42
np.random.seed(seed=random_seed)

In [3]:
# Project directories, both relative to the current working directory:
# DATA_DIR is where inputs are read from, OUTPUT_DIR is where results go.
DATA_DIR, OUTPUT_DIR = Path('data'), Path('outputs')
# Create the output folder if it is missing; a no-op when it already exists.
OUTPUT_DIR.mkdir(parents=False, exist_ok=True)

In [4]:
# Load the energy dataset (15-minute intervals) used for load forecasting.
# The CSV is looked up inside DATA_DIR. Keep this a bare file name: joining
# DATA_DIR with an absolute path makes pathlib discard DATA_DIR entirely,
# which ties the notebook to a single machine.
data_path = DATA_DIR / 'time_series_15min_singleindex.csv'
if not data_path.is_file():
    # Fail early with the resolved location instead of a bare relative path.
    raise FileNotFoundError(
        f"Input CSV not found at '{data_path.resolve()}'. "
        "Copy time_series_15min_singleindex.csv into DATA_DIR or point DATA_DIR at its folder."
    )
# Parse 'utc_timestamp' as datetimes and use it as the row index.
df = pd.read_csv(data_path, parse_dates=['utc_timestamp'], index_col='utc_timestamp')
# Re-express the index in German local time ('Europe/Berlin').
# NOTE(review): tz_convert requires a timezone-aware index, so this presumably
# relies on the 'utc_timestamp' strings carrying a UTC designator - confirm against the file.
df = df.tz_convert('Europe/Berlin')

In [5]:
# Pick out the columns of interest by name: every column whose name contains
# 'load_actual', and every column whose name contains 'price_day_ahead'.
# Both results are plain lists that keep the column order of df.
load_columns = list(filter(lambda name: 'load_actual' in name, df.columns))
price_columns = list(filter(lambda name: 'price_day_ahead' in name, df.columns))