In [98]:
import pandas as pd
import numpy as np
import os
import sys
from scipy.interpolate import CubicSpline
import warnings

pd.options.mode.chained_assignment = None  # default='warn'
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [136]:
# Read the CSV file
df_wind = pd.read_csv(os.path.join(os.getcwd(), '..', 'data', 'wind2.csv'))

df_wind = df_wind.drop(columns=['SS_Price', 'boa_MWh', 'DA_Price', 'Wind_MWh_credit', 'dtm', 'MIP'])
df_wind['reference_time'] = pd.to_datetime(df_wind['reference_time'])
df_wind['valid_time'] = pd.to_datetime(df_wind['valid_time'])
df_latest = df_wind.groupby('valid_time').tail(1)

In [137]:
# drop everything after Jan 20, 2024
df_latest = df_latest[df_latest['valid_time'] < '2024-01-21']

In [138]:
# df_latest['WindSpeed:100_avg'] = (df_latest['WindSpeed:100_dwd'] + df_latest['WindSpeed:100_ncep']) / 2
df_latest['WindSpeed^3:100_avg'] = (df_latest['WindSpeed^3:100_dwd'] + df_latest['WindSpeed^3:100_ncep']) / 2
df_latest['Temperature_avg'] = (df_latest['Temperature_dwd'] + df_latest['Temperature_ncep']) / 2
df_latest['RelativeHumidity_avg'] = (df_latest['RelativeHumidity_dwd'] + df_latest['RelativeHumidity_ncep']) / 2
df_latest['WindDirection:100_avg'] = (df_latest['WindDirection:100_dwd'] + df_latest['WindDirection:100_ncep']) / 2

df_latest = df_latest.drop(columns=['WindSpeed_dwd', 'WindSpeed_ncep', 'Temperature_dwd', 'Temperature_ncep', 'RelativeHumidity_dwd', 'RelativeHumidity_ncep', 'WindSpeed^3_dwd', 'WindSpeed^3_ncep', 'WindDirection_dwd', 'WindDirection_ncep', 'WindSpeed:100_dwd', 'WindSpeed:100_ncep', 'WindSpeed:100^3_dwd', 'WindSpeed:100^3_ncep'])

In [139]:
# remove ^3 from the column names
df_latest.columns = [col.replace('^3', '') for col in df_latest.columns]

Index(['reference_time', 'valid_time', 'WindDirection:100_dwd',
       'WindSpeed:100_dwd', 'WindDirection:100_ncep', 'WindSpeed:100_ncep',
       'Wind_MW', 'WindSpeed:100_avg', 'Temperature_avg',
       'RelativeHumidity_avg', 'WindDirection:100_avg'],
      dtype='object')

# i. Calculate Power Forecast based on Physics

## 1. Calculate Air Density

In [140]:
# Constants
R_d = 287.05  # Specific gas constant for dry air (J/(kg·K))
R_v = 461.5   # Specific gas constant for water vapor (J/(kg·K))
p = 101325    # Standard atmospheric pressure in Pa

# Assuming df_latest is your original DataFrame and contains 'Temperature_dwd', 'RelativeHumidity_dwd', 'WindSpeed_dwd'
# Convert temperature from Celsius to Kelvin
df_latest['Temperature_K'] = df_latest['Temperature_avg'] + 273.15

# Calculate saturation vapor pressure (using temperature in Celsius), Tetens formula
e_s = 0.61078 * np.exp((17.27 * df_latest['Temperature_avg']) / (df_latest['Temperature_avg'] + 237.3))

# in pa
e_s = 1000 * e_s

# Calculate actual vapor pressure
e = df_latest['RelativeHumidity_avg'] / 100 * e_s

# Calculate air density (ρ) in kg/m³
df_latest['AirDensity'] = (p - e) / (R_d * df_latest['Temperature_K']) + (e / (R_v * df_latest['Temperature_K']))

## 2. Calculate Power based on Density and Wind Speed

In [142]:
# Turbine stats
rotor_diameter = 154  # in meters
approximated_total_efficiency = 0.31  # 31% efficiency
minimum_wind_speed = 3  # in m/s
maximum_wind_speed_for_power_curve = 12.5  # in m/s
maximum_wind_speed_for_operation = 25  # in m/s
rotor_area = np.pi * (rotor_diameter / 2) ** 2  # in m²
# turbine requires 3m/s to start rotating
const_internal_friction_coefficient = 0.5 * 1.240 * np.pi * 77**2 * 3**3 * approximated_total_efficiency * 174 / 1000000
maximum_power_per_turbine = 7 # in MW

# Calculate power output based on wind speed at 100m
df_latest['WindPower:100'] = 0.5 * df_latest['AirDensity'] * rotor_area * df_latest['WindSpeed:100_avg'] ** 3 * 174 / 1000000
df_latest['UsableWindPower:100'] = np.minimum(df_latest['WindPower:100'], maximum_power_per_turbine * 174 / approximated_total_efficiency)
df_latest['PowerOutput:100'] = np.where((df_latest['WindSpeed:100_avg'] >= minimum_wind_speed) & (df_latest['WindSpeed:100_avg'] <= maximum_wind_speed_for_operation), df_latest['UsableWindPower:100'] * approximated_total_efficiency - const_internal_friction_coefficient, 0)

In [143]:
print(max(df_latest['PowerOutput:100']))
print(const_internal_friction_coefficient)
print(max(df_latest['Wind_MW']))

1201.1810949493763
16.818905050623663
1192.744


In [144]:
# Forecast stats
# Absolute error
print('Mean Absolute error:100:', np.mean(np.abs(df_latest['Wind_MW'] - df_latest['PowerOutput:100'])))
# Initialize a list to collect the stats
stats_df = pd.DataFrame({'quantile': [], 'quantile_range': [], 'mean_absolute_error:100': [], 'mean_error:100': []})

# Mean absolute error in 10% quantiles
for q in np.arange(0.1, 1.1, 0.1):
    # Filter the DataFrame for each quantile range
    quantile_df = df_latest[(df_latest['Wind_MW'] < df_latest['Wind_MW'].quantile(q)) & (df_latest['Wind_MW'] >= df_latest['Wind_MW'].quantile(q - 0.1))]
    
    # Calculate the statistics
    quantile_stats = {
        'quantile': q,
        'quantile_range': f'{df_latest["Wind_MW"].quantile(q - 0.1):.1f} - {df_latest["Wind_MW"].quantile(q):.1f}',
        'mean_absolute_error:100': np.mean(np.abs(quantile_df['Wind_MW'] - quantile_df['PowerOutput:100'])),
        'mean_error:100': np.mean(quantile_df['Wind_MW'] - quantile_df['PowerOutput:100']),
    }
    
    # Append to the list
    stats_df.loc[len(stats_df)] = quantile_stats

# Display the DataFrame
stats_df

Mean Absolute error:100: 101.04136737561544


Unnamed: 0,quantile,quantile_range,mean_absolute_error:100,mean_error:100
0,0.1,0.0 - 18.8,39.517385,-37.074092
1,0.2,18.8 - 91.8,39.392657,-9.300244
2,0.3,91.8 - 198.0,70.498968,1.10643
3,0.4,198.0 - 339.9,96.019216,21.026575
4,0.5,339.9 - 518.1,135.888005,40.565237
5,0.6,518.1 - 748.9,174.402589,69.991685
6,0.7,748.9 - 994.1,201.631115,59.18924
7,0.8,994.1 - 1108.8,141.210712,-26.47831
8,0.9,1108.8 - 1154.5,79.268065,-39.587863
9,1.0,1154.5 - 1192.7,32.622062,-21.511389


## 3. Find optimal $\eta$ to update the approximation of the turbine efficiency

In [146]:
errors = {}
for efficiency in np.arange(0.28, 0.35, 0.001):
    for limiter in np.arange(0.9, 1.01, 0.01):
        const_internal_friction_coefficient = 0.5 * 1.240 * np.pi * 77**2 * 3**3 * efficiency * 174 / 1000000
        df_latest['WindPower:100_tmp'] = 0.5 * df_latest['AirDensity'] * rotor_area * df_latest['WindSpeed:100_avg'] ** 3 * 174 / 1000000
        df_latest['UsableWindPower:100_tmp'] = np.minimum(df_latest['WindPower:100_tmp'], maximum_power_per_turbine * 174 * limiter / efficiency)
        df_latest['PowerOutput:100_tmp'] = np.where((df_latest['WindSpeed:100_avg'] >= minimum_wind_speed) & (df_latest['WindSpeed:100_avg'] <= maximum_wind_speed_for_operation), df_latest['UsableWindPower:100_tmp'] * efficiency - const_internal_friction_coefficient, 0)
        mean_abs_error = np.mean(np.abs(df_latest['Wind_MW'] - df_latest['PowerOutput:100_tmp']))
        mean_error = np.mean(df_latest['Wind_MW'] - df_latest['PowerOutput:100_tmp'])
        errors[(efficiency.round(3), limiter.round(2))] = [mean_abs_error, mean_error]

In [147]:
# Find the minimum error
min_error = min(errors, key=lambda x: errors[x])
print(f'Minimum mean absolute error: {errors[min_error][0]:.2f} with efficiency={min_error}')
print(f'Mean error: {errors[min_error][1]:.2f}')

Minimum mean absolute error: 88.20 with efficiency=(0.348, 0.94)
Mean error: -1.87


In [156]:
approximated_total_efficiency = min_error[0]
limiter = min_error[1]
#limiter = 0.96

In [158]:
const_internal_friction_coefficient = 0.5 * 1.240 * np.pi * 77**2 * 3**3 * approximated_total_efficiency * 174 / 1000000
df_latest['WindPower_opt'] = 0.5 * df_latest['AirDensity'] * rotor_area * df_latest['WindSpeed:100_avg'] ** 3 * 174 / 1000000
df_latest['UsableWindPower_opt'] = np.minimum(df_latest['WindPower_opt'], maximum_power_per_turbine * 174 * limiter / approximated_total_efficiency)
df_latest['PowerOutput_opt'] = np.where((df_latest['WindSpeed:100_avg'] >= minimum_wind_speed) & (df_latest['WindSpeed:100_avg'] <= maximum_wind_speed_for_operation), df_latest['UsableWindPower_opt'] * approximated_total_efficiency - const_internal_friction_coefficient, 0)

# Update stats_df using optimal values
for q in np.arange(0.1, 1.1, 0.1):
    quantile_df = df_latest[(df_latest['Wind_MW'] < df_latest['Wind_MW'].quantile(q)) & (df_latest['Wind_MW'] >= df_latest['Wind_MW'].quantile(q - 0.1))]
    quantile_row = {
        'quantile': q,
        'mean_absolute_error:100_opt': np.mean(np.abs(quantile_df['Wind_MW'] - quantile_df['PowerOutput_opt'])),
        'mean_error:100_opt': np.mean(quantile_df['Wind_MW'] - quantile_df['PowerOutput_opt'])
    }
    stats_df.loc[stats_df['quantile'] == q, 'mean_absolute_error:100_opt'] = quantile_row['mean_absolute_error:100_opt']
    stats_df.loc[stats_df['quantile'] == q, 'mean_error:100_opt'] = quantile_row['mean_error:100_opt']

stats_df

Unnamed: 0,quantile,quantile_range,mean_absolute_error:100,mean_error:100,mean_absolute_error:100_opt,mean_error:100_opt,mean_absolute_error:100_opt_new,mean_error:100_opt_new
0,0.1,0.0 - 18.8,39.517385,-37.074092,41.075542,-38.711994,41.075542,-38.711994
1,0.2,18.8 - 91.8,39.392657,-9.300244,43.313708,-16.799664,43.313708,-16.799664
2,0.3,91.8 - 198.0,70.498968,1.10643,73.007629,-14.493744,73.007629,-14.493744
3,0.4,198.0 - 339.9,96.019216,21.026575,97.693936,-7.723784,97.693936,-7.723784
4,0.5,339.9 - 518.1,135.888005,40.565237,133.493847,-2.333356,133.493847,-2.333356
5,0.6,518.1 - 748.9,174.402589,69.991685,164.910003,9.72069,164.910003,9.72069
6,0.7,748.9 - 994.1,201.631115,59.18924,175.675709,-0.382085,175.675709,-0.382085
7,0.8,994.1 - 1108.8,141.210712,-26.47831,81.287565,-14.5623,81.287565,-14.5623
8,0.9,1108.8 - 1154.5,79.268065,-39.587863,22.136928,17.128711,22.136928,17.128711
9,1.0,1154.5 - 1192.7,32.622062,-21.511389,49.406168,49.406168,49.406168,49.406168


In [159]:
df_latest['PowerOutput_opt'].max()/limiter

1197.9142794865577

## 3. Make Forecast more accurate using smoothing of windspeed between datapoints

In [165]:
# Iterate over each reference_time
df_latest['PowerOutput_smoothed'] = 0.0
for reference_time, group in df_latest.groupby('reference_time'):

    # Get the valid_times and corresponding wind speeds for this reference_time
    valid_times = group['valid_time'].values
    wind_speeds = group['WindSpeed:100_avg'].values
    actual_generation = group['Wind_MW'].values
    
    # Convert valid_times to seconds for interpolation
    valid_times_in_seconds = (valid_times - valid_times[0]).astype('timedelta64[m]').astype(int) * 60  # seconds
    
    if len(valid_times) < 2:
        wind_power = 0.5 * group['AirDensity'].iloc[0] * rotor_area * avg_cubed_wind * 174 / 1000000
        usable_wind_power = min(wind_power, 7 * 174 * limiter / approximated_total_efficiency)
        power_output = usable_wind_power * approximated_total_efficiency - const_internal_friction_coefficient if (group['WindSpeed:100_avg'].iloc[0] >= minimum_wind_speed) and (group['WindSpeed:100_avg'].iloc[0] <= maximum_wind_speed_for_operation) else 0

        # Update PowerOutput
        df_latest.loc[(df_latest['reference_time'] == reference_time) & (df_latest['valid_time'] == valid_times[0]), 'PowerOutput_smoothed'] = power_output
        continue
    # Perform cubic spline interpolation for 1-minute intervals
    wind_speed_spline = CubicSpline(valid_times_in_seconds, wind_speeds)
    
    # Generate 1-minute intervals for the valid_time range
    for i in range(len(valid_times) - 1):
        t_start = valid_times_in_seconds[i]
        t_end = valid_times_in_seconds[i + 1]
        
        # Generate time points at 1-minute intervals within this 30-minute window
        times_1min = np.arange(t_start, t_end, 60)
        
        # Interpolate wind speeds at 1-minute intervals
        interpolated_wind_speeds = wind_speed_spline(times_1min)
        
        # Calculate the average wind power using the cubed wind speeds, reduce speeds <3ms to 0
        avg_cubed_wind = np.mean(np.where((interpolated_wind_speeds >= 3) & (interpolated_wind_speeds <= 25), interpolated_wind_speeds, 0) ** 3)
        avg_cubed_wind = np.mean(interpolated_wind_speeds ** 3)
        # get frac with windspeed > 3 and < 25
        frac_generation = np.mean(np.where((interpolated_wind_speeds >= 3) & (interpolated_wind_speeds <= 25), 1, 0))
        
        # Calculate wind power and apply limits for each interval
        wind_power = 0.5 * group['AirDensity'].iloc[i] * rotor_area * avg_cubed_wind * 174 / 1000000
        usable_wind_power = min(wind_power, 7 * 174 * limiter / approximated_total_efficiency)
        
        # Calculate final power output based on cut-in, cut-out wind speeds and efficiency
        power_output = max(0, usable_wind_power * approximated_total_efficiency - const_internal_friction_coefficient * frac_generation)
        
        # Update PowerOutput
        valid_time = pd.to_datetime(valid_times[i]).tz_localize('UTC')
        df_latest.loc[(df_latest['reference_time'] == reference_time) & (df_latest['valid_time'] == valid_time), 'PowerOutput_smoothed'] = power_output

In [166]:
# Absolute error
print('Mean Absolute error:', np.mean(np.abs(df_latest['Wind_MW'] - df_latest['PowerOutput_smoothed'])))
# Mean absolute error in 10% quantiles
for q in np.arange(0.1, 1.1, 0.1):
    quantile_df = df_latest[(df_latest['Wind_MW'] < df_latest['Wind_MW'].quantile(q)) & (df_latest['Wind_MW'] >= df_latest['Wind_MW'].quantile(q - 0.1))]
    quantile_row = {
        'quantile': q,
        'mean_absolute_error:100_new': np.mean(np.abs(quantile_df['Wind_MW'] - quantile_df['PowerOutput_smoothed'])),
        'mean_error:100_new': np.mean(quantile_df['Wind_MW'] - quantile_df['PowerOutput_smoothed'])
    }
    stats_df.loc[stats_df['quantile'] == q, 'mean_absolute_error:100_new'] = quantile_row['mean_absolute_error:100_new']
    stats_df.loc[stats_df['quantile'] == q, 'mean_error:100_new'] = quantile_row['mean_error:100_new']

stats_df


Mean Absolute error: 127.42484874897818


Unnamed: 0,quantile,quantile_range,mean_absolute_error:100,mean_error:100,mean_absolute_error:100_opt,mean_error:100_opt,mean_absolute_error:100_opt_new,mean_error:100_opt_new,mean_absolute_error:100_new,mean_error:100_new
0,0.1,0.0 - 18.8,39.517385,-37.074092,41.075542,-38.711994,41.075542,-38.711994,41.31824,-39.899149
1,0.2,18.8 - 91.8,39.392657,-9.300244,43.313708,-16.799664,43.313708,-16.799664,43.318718,-11.176472
2,0.3,91.8 - 198.0,70.498968,1.10643,73.007629,-14.493744,73.007629,-14.493744,77.977497,-1.949534
3,0.4,198.0 - 339.9,96.019216,21.026575,97.693936,-7.723784,97.693936,-7.723784,110.475175,16.138134
4,0.5,339.9 - 518.1,135.888005,40.565237,133.493847,-2.333356,133.493847,-2.333356,156.070749,33.597233
5,0.6,518.1 - 748.9,174.402589,69.991685,164.910003,9.72069,164.910003,9.72069,200.12773,58.138012
6,0.7,748.9 - 994.1,201.631115,59.18924,175.675709,-0.382085,175.675709,-0.382085,229.574278,64.244249
7,0.8,994.1 - 1108.8,141.210712,-26.47831,81.287565,-14.5623,81.287565,-14.5623,163.020507,75.195378
8,0.9,1108.8 - 1154.5,79.268065,-39.587863,22.136928,17.128711,22.136928,17.128711,113.712251,109.12557
9,1.0,1154.5 - 1192.7,32.622062,-21.511389,49.406168,49.406168,49.406168,49.406168,138.689875,138.689875


In [167]:
df_latest['PowerOutput_smoothed'].max()

1144.9199999999998

## => It got worse????

# ii. Modeling the residium 

## 1. XGBoost

In [169]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Calculate the residuals (difference between actual and forecast)
df_latest['residual'] = df_latest['Wind_MW'] - df_latest['PowerOutput_opt']

# add three lag features for WindSpeed:100_dwd
df_latest['WindSpeed:100_dwd_lag1'] = df_latest['WindSpeed:100_dwd'].shift(1)
df_latest['WindSpeed:100_dwd_lag2'] = df_latest['WindSpeed:100_dwd'].shift(2)
df_latest['WindSpeed:100_dwd_lag3'] = df_latest['WindSpeed:100_dwd'].shift(3)

df_latest = df_latest.dropna()

# Define the features (X) and the target (y)
X = df_latest[['WindSpeed:100_dwd', 'Temperature_avg', 'RelativeHumidity_avg', 'AirDensity', 'UsableWindPower_opt', 'WindSpeed:100_dwd_lag1', 'WindSpeed:100_dwd_lag2', 'WindSpeed:100_dwd_lag3']]  # Add any other relevant features
y = df_latest['residual']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [73]:
# Initialize the XGBoost regressor
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', seed=42)

# Define a parameter grid for hyperparameter tuning
param_grid = {
    'learning_rate': [0.01, 0.1],  # Controls the step size in each boosting step
    'n_estimators': [100, 500, 1000],  # Number of boosting rounds
    'max_depth': [5, 7],  # Maximum depth of the trees
    'subsample': [0.7, 1.0],  # Proportion of training data used for fitting the individual trees
    'colsample_bytree': [0.7, 1.0],  # Proportion of features used for fitting individual trees
    'gamma': [0.1, 0.3],  # Minimum loss reduction required to make a further partition on a leaf node
    'reg_lambda': [1, 10],  # L2 regularization term
    'reg_alpha': [0, 1],  # L1 regularization term
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xg_reg, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_absolute_error')

# Fit the model
grid_search.fit(X_train, y_train)

# Best model from GridSearchCV
best_xgb_model = grid_search.best_estimator_
print("Best parameters found: ", grid_search.best_params_)


Fitting 3 folds for each of 384 candidates, totalling 1152 fits
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=1.0; total time=   0.3s
[CV] END col

In [74]:
# Predict the residuals for the test set
y_pred = best_xgb_model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'Mean Absolute Error (MAE): {mae:.3f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.3f}')


Mean Absolute Error (MAE): 75.430
Root Mean Squared Error (RMSE): 127.758


In [75]:
# compared to always predicting 0
y_pred_baseline = np.zeros_like(y_test)
mae_baseline = mean_absolute_error(y_test, y_pred_baseline)
rmse_baseline = np.sqrt(mean_squared_error(y_test, y_pred_baseline))

print(f'Mean Absolute Error (Baseline): {mae_baseline:.3f}')
print(f'Root Mean Squared Error (Baseline): {rmse_baseline:.3f}')

Mean Absolute Error (Baseline): 95.688
Root Mean Squared Error (Baseline): 160.782


# Training quantile Models

In [172]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error
from sklearn.ensemble import HistGradientBoostingRegressor

# Calculate the residuals (difference between actual and forecast)
df_latest['residual'] = df_latest['Wind_MW'] - df_latest['PowerOutput_opt']

# add three lag features for WindSpeed:100_dwd
df_latest['WindSpeed:100_dwd_lag1'] = df_latest['WindSpeed:100_dwd'].shift(1)
df_latest['WindSpeed:100_dwd_lag2'] = df_latest['WindSpeed:100_dwd'].shift(2)
df_latest['WindSpeed:100_dwd_lag3'] = df_latest['WindSpeed:100_dwd'].shift(3)

df_latest = df_latest.dropna()

# Define the features (X) and the target (y)
X = df_latest[['WindSpeed:100_dwd', 'Temperature_avg', 'RelativeHumidity_avg', 'AirDensity', 'UsableWindPower_opt', 'WindSpeed:100_dwd_lag1', 'WindSpeed:100_dwd_lag2', 'WindSpeed:100_dwd_lag3']]
y = df_latest['residual']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [177]:
# Define the quantiles we want to model
quantiles = np.arange(0.1, 1.0, 0.1)

# Dictionary to hold the best models for each quantile
best_models = {}

# Loop over each quantile and train a model
for quantile in quantiles:
    print(f"Training model for {quantile * 100:.0f}% quantile...")
    
    # Initialize the Gradient Boosting Regressor with the quantile loss
    gbr = HistGradientBoostingRegressor(loss='quantile', quantile=quantile, random_state=42)
    
    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'learning_rate': [0.01, 0.1],  # Controls the step size in each boosting step
        'max_iter': [300, 500],  # Number of boosting rounds
        'max_depth': [3, 5, 7],  # Maximum depth of the trees
        'min_samples_leaf': [1, 5, 10],  # Minimum number of samples required to be at a leaf node
    }

    # Custom scoring function for quantile regression (Pinball loss)
    def pinball_loss(y_true, y_pred):
        delta = y_true - y_pred
        return np.mean(np.maximum(quantile * delta, (quantile - 1) * delta))
    
    pinball_scorer = make_scorer(pinball_loss, greater_is_better=False)
    
    # Initialize GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1, scoring=pinball_scorer)
    
    # Fit the model to the training data
    grid_search.fit(X_train, y_train)
    
    # Store the best model for this quantiles
    best_models[quantile] = grid_search.best_estimator_
    
    print(f"Best parameters for {quantile * 100:.0f}% quantile: {grid_search.best_params_}")

# All best models are now stored in the best_models dictionary.


Training model for 10% quantile...
Fitting 3 folds for each of 36 candidates, totalling 108 fits


Best parameters for 10% quantile: {'learning_rate': 0.1, 'max_depth': 7, 'max_iter': 300, 'min_samples_leaf': 5}
Training model for 20% quantile...
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters for 20% quantile: {'learning_rate': 0.1, 'max_depth': 7, 'max_iter': 300, 'min_samples_leaf': 10}
Training model for 30% quantile...
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters for 30% quantile: {'learning_rate': 0.1, 'max_depth': 7, 'max_iter': 300, 'min_samples_leaf': 10}
Training model for 40% quantile...
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters for 40% quantile: {'learning_rate': 0.1, 'max_depth': 7, 'max_iter': 300, 'min_samples_leaf': 5}
Training model for 50% quantile...
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters for 50% quantile: {'learning_rate': 0.1, 'max_depth': 7, 'max_iter': 300, 'min_samples_leaf': 10}
Training model for 60% quantile...
Fitting 3

In [178]:
# Loop over each quantile to evaluate the models
for quantile, model in best_models.items():
    # Predict the residuals for the test set
    y_pred = model.predict(X_test)
    
    # Calculate the evaluation metrics
    me = np.mean(y_test - y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    pinball = pinball_loss(y_test, y_pred)
    
    print(f"Evaluation for {quantile * 100:.0f}% quantile:")
    print(f"Mean Error (ME): {me:.3f}")
    print(f"Mean Absolute Error (MAE): {mae:.3f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.3f}")
    print(f"Pinball Loss: {pinball:.3f}")
    print("-" * 40)


Evaluation for 10% quantile:
Mean Error (ME): 117.881
Mean Absolute Error (MAE): 136.012
Root Mean Squared Error (RMSE): 191.349
Pinball Loss: 20.854
----------------------------------------
Evaluation for 20% quantile:
Mean Error (ME): 70.482
Mean Absolute Error (MAE): 102.487
Root Mean Squared Error (RMSE): 158.741
Pinball Loss: 30.099
----------------------------------------
Evaluation for 30% quantile:
Mean Error (ME): 39.829
Mean Absolute Error (MAE): 87.369
Root Mean Squared Error (RMSE): 143.935
Pinball Loss: 35.719
----------------------------------------
Evaluation for 40% quantile:
Mean Error (ME): 14.761
Mean Absolute Error (MAE): 80.851
Root Mean Squared Error (RMSE): 138.097
Pinball Loss: 38.949
----------------------------------------
Evaluation for 50% quantile:
Mean Error (ME): -7.149
Mean Absolute Error (MAE): 80.011
Root Mean Squared Error (RMSE): 138.591
Pinball Loss: 40.006
----------------------------------------
Evaluation for 60% quantile:
Mean Error (ME): -29.03

In [179]:
import joblib
for quantile, model in best_models.items():
    joblib.dump(model, os.path.join(os.getcwd(), 'models', f'gbr_quantile_{quantile:.1f}.pkl'))

# XGBoost

In [None]:
# Calculate the residuals (difference between actual and forecast)
df_latest['residual'] = df_latest['Wind_MW'] - df_latest['PowerOutput_opt']

# add three lag features for WindSpeed:100_dwd
df_latest['WindSpeed:100_dwd_lag1'] = df_latest['WindSpeed:100_dwd'].shift(1)
df_latest['WindSpeed:100_dwd_lag2'] = df_latest['WindSpeed:100_dwd'].shift(2)
df_latest['WindSpeed:100_dwd_lag3'] = df_latest['WindSpeed:100_dwd'].shift(3)

df_latest = df_latest.dropna()

# Define the features (X) and the target (y)
X = df_latest[['WindSpeed:100_dwd', 'Temperature_avg', 'RelativeHumidity_avg', 'AirDensity', 'UsableWindPower_opt', 'WindSpeed:100_dwd_lag1', 'WindSpeed:100_dwd_lag2', 'WindSpeed:100_dwd_lag3']]
y = df_latest['residual']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the quantiles we want to model
quantiles = np.arange(0.1, 1.0, 0.1)

# Dictionary to hold the best models for each quantile
best_models = {}

# Loop over each quantile and train a model
for quantile in quantiles:
    print(f"Training model for {quantile * 100:.0f}% quantile...")
    
    # Initialize the XGBoost Regressor with the quantile loss and GPU support
    xg_reg = xgb.XGBRegressor(objective='reg:quantile', alpha=quantile, tree_method='gpu_hist', random_state=42, verbosity=1)
    
    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'learning_rate': [0.01, 0.1],
        'n_estimators': [300, 500],
        'max_depth': [3, 5, 7],
        'subsample': [0.7],
        'colsample_bytree': [0.7, 1.0]
    }

    # Custom scoring function for quantile regression (Pinball loss)
    def pinball_loss(y_true, y_pred):
        delta = y_true - y_pred
        return np.mean(np.maximum(quantile * delta, (quantile - 1) * delta))
    
    pinball_scorer = make_scorer(pinball_loss, greater_is_better=False)
    
    # Initialize GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(estimator=xg_reg, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring=pinball_scorer)
    
    # Fit the model to the training data
    grid_search.fit(X_train, y_train)
    
    # Store the best model for this quantile
    best_models[quantile] = grid_search.best_estimator_
    
    print(f"Best parameters for {quantile * 100:.0f}% quantile: {grid_search.best_params_}")

In [None]:
# Loop over each quantile to evaluate the models
for quantile, model in best_models.items():
    # Predict the residuals for the test set
    y_pred = model.predict(X_test)
    
    # Calculate the evaluation metrics
    me = np.mean(y_test - y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    pinball = pinball_loss(y_test, y_pred)
    
    print(f"Evaluation for {quantile * 100:.0f}% quantile:")
    print(f"Mean Error (ME): {me:.3f}")
    print(f"Mean Absolute Error (MAE): {mae:.3f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.3f}")
    print(f"Pinball Loss: {pinball:.3f}")
    print("-" * 40)

In [None]:
for quantile, model in best_models.items():
    joblib.dump(model, os.path.join(os.getcwd(), 'models', f'xgboost_quantile_{quantile:.1f}.pkl'))

# Exploring the performance for sample dates

In [96]:
import joblib
import os
import plotly.graph_objs as go
import numpy as np

# Define the quantiles
quantiles = np.arange(0.1, 1.0, 0.1)

# Load the models for GBR and XGBoost from the saved files
best_models_gbr = {}
best_models_xgb = {}

# Load GBR models
for quantile in quantiles:
    model_path_gbr = os.path.join(os.getcwd(), 'models', f'gbr_quantile_{quantile:.1f}.pkl')
    best_models_gbr[quantile] = joblib.load(model_path_gbr)

# Load XGBoost models
for quantile in quantiles:
    model_path_xgb = os.path.join(os.getcwd(), 'models', f'xgboost_quantile_{quantile:.1f}.pkl')
    best_models_xgb[quantile] = joblib.load(model_path_xgb)

# Now, make predictions using the loaded models for your sample data
sample_date = pd.to_datetime(np.datetime64('2024-01-01')).tz_localize('UTC')
print(sample_date.date())
sample_data = df_latest[(df_latest['valid_time'].dt.date == sample_date.date())]

# Prepare the feature data for prediction from the sample_data
X_sample = sample_data[['WindSpeed:100_avg', 'WindDirection:100_dwd', 'Temperature_avg', 'RelativeHumidity_avg',
                        'WindDirection:100_ncep', 'AirDensity', 'MIP', 'UsableWindPower']]

# Predict with GradientBoostingRegressor models
gbr_predictions = {}
for quantile in quantiles:
    gbr_predictions[quantile] = best_models_gbr[quantile].predict(X_sample) + sample_data['PowerOutput_opt']

# Predict with XGBRegressor models
xgb_predictions = {}
for quantile in quantiles:
    xgb_predictions[quantile] = best_models_xgb[quantile].predict(X_sample) + sample_data['PowerOutput']

# Real data (Wind_MW)
y_real = sample_data['Wind_MW'].values

# Create a plotly figure
fig1 = go.Figure()
fig2 = go.Figure()

# Plot real data (Wind_MW)
fig1.add_trace(go.Scatter(x=sample_data['valid_time'], y=y_real, mode='lines+markers', name='Real Data (Wind_MW)', line=dict(color='black')))
fig2.add_trace(go.Scatter(x=sample_data['valid_time'], y=y_real, mode='lines+markers', name='Real Data (Wind_MW)', line=dict(color='black')))


# Plot GBR quantile predictions
for quantile in quantiles:
    fig1.add_trace(go.Scatter(x=sample_data['valid_time'], y=gbr_predictions[quantile], mode='lines', name=f'GBR {int(quantile * 100)}th Percentile', line=dict(dash='dash')))

# Plot XGB quantile predictions
for quantile in quantiles:
    fig2.add_trace(go.Scatter(x=sample_data['valid_time'], y=xgb_predictions[quantile], mode='lines', name=f'XGB {int(quantile * 100)}th Percentile'))

# Update the layout of the plot
fig1.update_layout(
    title=f"Quantile Predictions (GradientBoosting and XGBoost) on {sample_date.date()}",
    xaxis_title="Valid Time",
    yaxis_title="Power Output (MW)",
    hovermode="x unified",
    legend_title="Models"
)
fig2.update_layout(
    title=f"Quantile Predictions (GradientBoosting and XGBoost) on {sample_date.date()}",
    xaxis_title="Valid Time",
    yaxis_title="Power Output (MW)",
    hovermode="x unified",
    legend_title="Models"
)


# Show the interactive plot
fig1.show()
fig2.show()


2024-01-01



  Loading from a raw memory buffer (like pickle in Python, RDS in R) on a CPU-only
  machine. Consider using `save_model/load_model` instead. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.  Changing `tree_method` to `hist`.






  Loading from a raw memory buffer (like pickle in Python, RDS in R) on a CPU-only
  machine. Consider using `save_model/load_model` instead. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.  Changing `tree_method` to `hist`.







In [97]:
# save df_latest to csv
df_latest.to_csv(os.path.join(os.getcwd(), '..', 'data', 'wind_training.csv'), index=False)