In [1]:
import pandas as pd
import numpy as np
import os
import sys

In [2]:
df_wind = pd.read_csv(f'{os.getcwd()}\..\data\wind.csv', index_col=0)

df_wind = df_wind.drop(columns=['SS_Price', 'boa_MWh', 'DA_Price', 'Wind_MWh_credit', 'Solar_MWh_credit', 'Solar_MW', 'Solar_capacity_mwp', 'Solar_installedcapacity_mwp', 'dtm'])
df_wind['reference_time'] = pd.to_datetime(df_wind['reference_time'])
df_wind['valid_time'] = pd.to_datetime(df_wind['valid_time'])

In [3]:
df_wind['WindDirection:100_dwd'].min()

np.float64(0.44444954)

In [15]:
for col in df_wind.columns:
    print(col, df_wind[col].min(), df_wind[col].max())

valid_time 2020-09-20 01:30:00+00:00 2024-05-19 23:30:00+00:00
reference_time 2020-09-20 00:00:00+00:00 2024-05-19 00:00:00+00:00
RelativeHumidity_dwd 48.529324 100.00052
Temperature_dwd -0.3765937 21.303053
WindDirection_dwd 0.74073195 359.36404
WindDirection:100_dwd 0.44444954 359.36792
WindSpeed_dwd 0.18528552 26.82034
WindSpeed:100_dwd 0.15628327 35.885605
RelativeHumidity_ncep 43.966667 100.0
Temperature_ncep -0.5038011 21.28925
WindDirection_ncep 0.59209526 359.38287
WindDirection:100_ncep 0.5916104 359.3482
WindSpeed_ncep 0.15772003 27.78529
WindSpeed:100_ncep 0.20670801 35.430717
MIP -77.29 1983.66
Wind_MW 0.0 1192.744
sin_WindDirection_dwd -0.9999999999978068 0.999999999982805
cos_WindDirection_dwd -0.9999999999862922 0.999938400117741
sin_WindDirection_ncep -0.9999999999950652 0.9999999999950652
cos_WindDirection_ncep -0.9999999999978068 0.9999466045755288
WindSpeed_dwd_lag_1 0.18528552 26.82034
Temperature_dwd_lag_1 -0.3765937 21.303053
RelativeHumidity_dwd_lag_1 48.529324 1

# Feature Engineering

In [4]:
import pandas as pd
import numpy as np

# Cyclical encoding for wind direction
df_wind['sin_WindDirection_dwd'] = np.sin(df_wind['WindDirection_dwd'] * np.pi / 180)
df_wind['cos_WindDirection_dwd'] = np.cos(df_wind['WindDirection_dwd'] * np.pi / 180)
df_wind['sin_WindDirection_ncep'] = np.sin(df_wind['WindDirection_ncep'] * np.pi / 180)
df_wind['cos_WindDirection_ncep'] = np.cos(df_wind['WindDirection_ncep'] * np.pi / 180)

# Lag features (e.g., 1 time step in the past, assuming 30-minute intervals)
df_wind['WindSpeed_dwd_lag_1'] = df_wind['WindSpeed_dwd'].shift(1)
df_wind['Temperature_dwd_lag_1'] = df_wind['Temperature_dwd'].shift(1)
df_wind['RelativeHumidity_dwd_lag_1'] = df_wind['RelativeHumidity_dwd'].shift(1)

# Difference features
df_wind['wind_speed_diff'] = df_wind['WindSpeed_dwd'] - df_wind['WindSpeed_ncep']
df_wind['temperature_diff'] = df_wind['Temperature_dwd'] - df_wind['Temperature_ncep']

# Interaction features
df_wind['wind_temp_interaction'] = df_wind['WindSpeed_dwd'] * df_wind['Temperature_dwd']
df_wind['humidity_wind_interaction'] = df_wind['WindSpeed_dwd'] * df_wind['RelativeHumidity_dwd']

# Rolling window features (e.g., rolling mean over 1 hour, 2 intervals for 30-minute steps)
df_wind['WindSpeed_dwd_rolling_mean_1h'] = df_wind['WindSpeed_dwd'].rolling(window=2).mean()
df_wind['Temperature_dwd_rolling_std_2h'] = df_wind['Temperature_dwd'].rolling(window=4).std()

# Drop rows with NaN values introduced by lagging or rolling (optional)
df_wind = df_wind.dropna()


## V2

In [None]:
# Cyclical encoding for wind direction

# Average WindDirection
df_wind['WindDirection_avg'] = (df_wind['WindDirection_dwd'] + df_wind['WindDirection_ncep']) / 2
df_wind['WindDirection_100_avg'] =(df_wind['WindDirection:100_dwd'] + df_wind['WindDirection:100_ncep']) / 2
df_wind['sin_WindDirection_avg'] = np.sin(df_wind['WindDirection_avg'] * np.pi / 180)
df_wind['cos_WindDirection_avg'] = np.cos(df_wind['WindDirection_avg'] * np.pi / 180)
df_wind = df_wind.drop(columns=['WindDirection_dwd', 'WindDirection_ncep', 'WindDirection:100_dwd', 'WindDirection:100_ncep', 'WindDirection_avg', 'WindDirection_100_avg'])

# Average WindSpeed, Temperature, and RelativeHumidity, Lag features
df_wind['WindSpeed_avg'] = (df_wind['WindSpeed_dwd'] + df_wind['WindSpeed_ncep']) / 2
df_wind['Temperature_avg'] = (df_wind['Temperature_dwd'] + df_wind['Temperature_ncep']) / 2
df_wind['RelativeHumidity_avg'] = (df_wind['RelativeHumidity_dwd'] + df_wind['RelativeHumidity_ncep']) / 2
df_wind['WindSpeed_avg_lag_1'] = df_wind['WindSpeed_avg'].shift(1)
df_wind['WindSpeed_avg_lag_2'] = df_wind['WindSpeed_avg'].shift(2) 
# df_wind['WindSpeed_avg_lag_1_diff'] = df_wind['WindSpeed_avg'].shift(1) - df_wind['WindSpeed_avg']
# df_wind['WindSpeed_avg_lag_2_diff'] = df_wind['WindSpeed_avg'].shift(2) - df_wind['WindSpeed_avg']
df_wind['Temperature_avg_lag_1'] = df_wind['Temperature_avg'].shift(1)
df_wind['Temperature_avg_lag_2'] = df_wind['Temperature_avg'].shift(2)
# df_wind['Temperature_avg_lag_1_diff'] = df_wind['Temperature_avg'].shift(1) - df_wind['Temperature_avg']
# df_wind['Temperature_avg_lag_2_diff'] = df_wind['Temperature_avg'].shift(2) - df_wind['Temperature_avg']
df_wind['RelativeHumidity_avg_lag_1'] = df_wind['RelativeHumidity_avg'].shift(1)
df_wind['RelativeHumidity_avg_lag_2'] = df_wind['RelativeHumidity_avg'].shift(2)
# df_wind['RelativeHumidity_avg_lag_1_diff'] = df_wind['RelativeHumidity_avg'].shift(1) - df_wind['RelativeHumidity_avg']
# df_wind['RelativeHumidity_avg_lag_2_diff'] = df_wind['RelativeHumidity_avg'].shift(2) - df_wind['RelativeHumidity_avg']
df_wind = df_wind.drop(columns=['WindSpeed_dwd', 'WindSpeed_ncep', 'Temperature_dwd', 'Temperature_ncep', 'RelativeHumidity_dwd', 'RelativeHumidity_ncep'])

# Lag features (e.g., 1 time step in the past, assuming 30-minute intervals)
df_wind['WindSpeed_dwd_lag_1'] = df_wind['WindSpeed_dwd'].shift(1)
df_wind['Temperature_dwd_lag_1'] = df_wind['Temperature_dwd'].shift(1)
df_wind['RelativeHumidity_dwd_lag_1'] = df_wind['RelativeHumidity_dwd'].shift(1)

# Difference features
df_wind['wind_speed_diff'] = df_wind['WindSpeed_dwd'] - df_wind['WindSpeed_ncep']
df_wind['temperature_diff'] = df_wind['Temperature_dwd'] - df_wind['Temperature_ncep']

# Interaction features
df_wind['wind_temp_interaction'] = df_wind['WindSpeed_dwd'] * df_wind['Temperature_dwd']
df_wind['humidity_wind_interaction'] = df_wind['WindSpeed_dwd'] * df_wind['RelativeHumidity_dwd']

# Rolling window features (e.g., rolling mean over 1 hour, 2 intervals for 30-minute steps)
df_wind['WindSpeed_dwd_rolling_mean_1h'] = df_wind['WindSpeed_dwd'].rolling(window=2).mean()
df_wind['Temperature_dwd_rolling_std_2h'] = df_wind['Temperature_dwd'].rolling(window=4).std()

# Drop rows with NaN values introduced by lagging or rolling (optional)
df_wind = df_wind.dropna()

### AirMassFlowPerHour --- idea

In [None]:
# Constants
R_d = 287.05  # Specific gas constant for dry air (J/(kg·K))
R_v = 461.5   # Specific gas constant for water vapor (J/(kg·K))
p = 101325    # Standard atmospheric pressure in Pa

# Assuming df_wind is your original DataFrame and contains 'Temperature_dwd', 'RelativeHumidity_dwd', 'WindSpeed_dwd'
# Convert temperature from Celsius to Kelvin
df_wind['Temperature_K'] = df_wind['Temperature_avg'] + 273.15

# Calculate saturation vapor pressure (using temperature in Celsius), Tetens formula
e_s = 0.61078 * np.exp((17.27 * df_wind['Temperature_avg']) / (df_wind['Temperature_avg'] + 237.3))

# in pa
e_s = 1000 * e_s

# Calculate actual vapor pressure
e = df_wind['RelativeHumidity_avg'] / 100 * e_s

# Calculate air density (ρ) in kg/m³
df_wind['AirDensity'] = (p - e) / (R_d * df_wind['Temperature_K']) + (e / (R_v * df_wind['Temperature_K']))

# Calculate Air Mass Flow per hour (assuming a unit cross-sectional area)
df_wind['AirMassFlowPerHour'] = df_wind['AirDensity'] * df_wind['WindSpeed_avg'] * 1000

# Calculate Wind Power Density (W/m²)
df_wind['WindPowerDensity'] = 0.5 * df_wind['AirDensity'] * (df_wind['WindSpeed_avg']/3.6) ** 3

# Training

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

# Load dataset (assuming df_merged is already loaded)
# Select the latest 'reference_time' for each 'valid_time'
df_latest = df_wind.groupby('valid_time').tail(1)

# Define features (X) and target (y)
X = df_latest.drop(columns=['Wind_MW', 'reference_time', 'valid_time'])
y = df_latest['Wind_MW']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optional: Normalize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the quantiles for which models will be trained
quantiles = np.arange(0.1, 1.0, 0.1)

# Train one model for each quantile
models = {}

for quantile in quantiles:
    model = GradientBoostingRegressor(loss='quantile', alpha=quantile, n_estimators=100, max_depth=3, random_state=42)
    model.fit(X_train_scaled, y_train)
    models[quantile] = model

In [13]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming df_wind is the original dataframe with features already prepared
# Define features and target
X = df_latest.drop(columns=['Wind_MW', 'reference_time', 'valid_time'])
y = df_latest['Wind_MW']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.columns = [col.replace(':', '_') for col in X_train.columns]
X_test.columns = [col.replace(':', '_') for col in X_test.columns]

print(X_train.shape, X_test.shape)


# Define quantiles to be predicted
quantiles = np.arange(0.1, 1.0, 0.1)

# Train separate models for each quantile
models_lightgbm = {}
for quantile in quantiles:
    params = {
        'objective': 'quantile',
        'alpha': quantile,
        'metric': 'quantile',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': 0.07,
        'n_estimators': 300,
        'max_depth': 6
    }
    
    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train)
    models_lightgbm[quantile] = model

(50894, 26) (12724, 26)


# Evaluation

In [14]:
from sklearn.metrics import mean_pinball_loss

# Assuming models is a dictionary of quantile regression models
# quantiles is a list of quantiles used in training (e.g., [0.1, 0.2, ..., 0.9])

train_losses = {}
test_losses = {}


for models_trained in [models, models_lightgbm]:
    for quantile in quantiles:
        # Extract the model for this quantile
        model = models_trained[quantile]
        
        # Predict on the training set and compute the pinball loss
        y_train_pred = model.predict(X_train_scaled)
        train_loss = mean_pinball_loss(y_train, y_train_pred, alpha=quantile)
        train_losses[quantile] = train_loss
        
        # Predict on the test set and compute the pinball loss
        y_test_pred = model.predict(X_test_scaled)
        test_loss = mean_pinball_loss(y_test, y_test_pred, alpha=quantile)
        test_losses[quantile] = test_loss

        print(f"Quantile {quantile}: Train Loss = {train_loss:.4f}, Test Loss = {test_loss:.4f}")

Quantile 0.1: Train Loss = 29.1965, Test Loss = 29.9784
Quantile 0.2: Train Loss = 40.3109, Test Loss = 41.3359
Quantile 0.30000000000000004: Train Loss = 46.2676, Test Loss = 47.3926
Quantile 0.4: Train Loss = 49.0065, Test Loss = 50.2207
Quantile 0.5: Train Loss = 48.3344, Test Loss = 49.4861
Quantile 0.6: Train Loss = 45.0970, Test Loss = 45.9745
Quantile 0.7000000000000001: Train Loss = 39.3059, Test Loss = 39.9608
Quantile 0.8: Train Loss = 30.7688, Test Loss = 31.1825
Quantile 0.9: Train Loss = 18.6024, Test Loss = 19.1388
Quantile 0.1: Train Loss = 56.0769, Test Loss = 57.2901
Quantile 0.2: Train Loss = 112.1654, Test Loss = 114.5801
Quantile 0.30000000000000004: Train Loss = 168.1937, Test Loss = 171.8107
Quantile 0.4: Train Loss = 224.2788, Test Loss = 229.1147
Quantile 0.5: Train Loss = 280.2575, Test Loss = 286.3015
Quantile 0.6: Train Loss = 336.2462, Test Loss = 343.4811
Quantile 0.7000000000000001: Train Loss = 391.0823, Test Loss = 399.4529
Quantile 0.8: Train Loss = 441