In [30]:
import pandas as pd
import numpy as np
import os
import sys
from scipy.interpolate import CubicSpline
import warnings

pd.options.mode.chained_assignment = None  # default='warn'
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [31]:
# Read the CSV file
df_wind = pd.read_csv(os.path.join(os.getcwd(), '..', 'data', 'wind2.csv'))

df_wind = df_wind.drop(columns=['SS_Price', 'boa_MWh', 'DA_Price', 'Wind_MWh_credit', 'dtm'])
df_wind['reference_time'] = pd.to_datetime(df_wind['reference_time'])
df_wind['valid_time'] = pd.to_datetime(df_wind['valid_time'])
df_latest = df_wind.groupby('valid_time').tail(1)

In [32]:
# drop everything after Jan 20, 2024
df_latest = df_latest[df_latest['valid_time'] < '2024-01-21']

In [33]:
df_latest['WindSpeed_avg'] = (df_latest['WindSpeed_dwd'] + df_latest['WindSpeed_ncep']) / 2
df_latest['WindSpeed:100_avg'] = (df_latest['WindSpeed:100_dwd'] + df_latest['WindSpeed:100_ncep']) / 2
df_latest['WindSpeed_full_avg'] = (df_latest['WindSpeed_avg'] + df_latest['WindSpeed:100_avg']) / 2
df_latest['WindSpeed^3_avg'] = (df_latest['WindSpeed^3_dwd'] + df_latest['WindSpeed^3_ncep']) / 2
df_latest['WindSpeed^3:100_avg'] = (df_latest['WindSpeed^3:100_dwd'] + df_latest['WindSpeed^3:100_ncep']) / 2
df_latest['WindSpeed^3_full_avg'] = (df_latest['WindSpeed^3_avg'] + df_latest['WindSpeed^3:100_avg']) / 2
df_latest['Temperature_avg'] = (df_latest['Temperature_dwd'] + df_latest['Temperature_ncep']) / 2
df_latest['RelativeHumidity_avg'] = (df_latest['RelativeHumidity_dwd'] + df_latest['RelativeHumidity_ncep']) / 2

df_latest = df_latest.drop(columns=['WindSpeed_dwd', 'WindSpeed_ncep', 'Temperature_dwd', 'Temperature_ncep', 'RelativeHumidity_dwd', 'RelativeHumidity_ncep', 'WindSpeed:100_dwd', 'WindSpeed:100_ncep', 'WindSpeed^3_dwd', 'WindSpeed^3_ncep', 'WindSpeed^3:100_dwd', 'WindSpeed^3:100_ncep'])

In [34]:
# Switch WindSpeed and WindSpeed^3 columns
df_latest['WindSpeed_avg'], df_latest['WindSpeed^3_avg'] = df_latest['WindSpeed^3_avg'], df_latest['WindSpeed_avg']
df_latest['WindSpeed:100_avg'], df_latest['WindSpeed^3:100_avg'] = df_latest['WindSpeed^3:100_avg'], df_latest['WindSpeed:100_avg']
df_latest['WindSpeed_full_avg'], df_latest['WindSpeed^3_full_avg'] = df_latest['WindSpeed^3_full_avg'], df_latest['WindSpeed_full_avg']

# i. Calculate Power Forecast based on Physics

## 1. Calculate Air Density

In [35]:
# Constants
R_d = 287.05  # Specific gas constant for dry air (J/(kg·K))
R_v = 461.5   # Specific gas constant for water vapor (J/(kg·K))
p = 101325    # Standard atmospheric pressure in Pa

# Assuming df_latest is your original DataFrame and contains 'Temperature_dwd', 'RelativeHumidity_dwd', 'WindSpeed_dwd'
# Convert temperature from Celsius to Kelvin
df_latest['Temperature_K'] = df_latest['Temperature_avg'] + 273.15

# Calculate saturation vapor pressure (using temperature in Celsius), Tetens formula
e_s = 0.61078 * np.exp((17.27 * df_latest['Temperature_avg']) / (df_latest['Temperature_avg'] + 237.3))

# in pa
e_s = 1000 * e_s

# Calculate actual vapor pressure
e = df_latest['RelativeHumidity_avg'] / 100 * e_s

# Calculate air density (ρ) in kg/m³
df_latest['AirDensity'] = (p - e) / (R_d * df_latest['Temperature_K']) + (e / (R_v * df_latest['Temperature_K']))

## 2. Calculate Power based on Density and Wind Speed

In [36]:
# Turbine stats
rotor_diameter = 154  # in meters
approximated_total_efficiency = 0.31  # 31% efficiency
minimum_wind_speed = 3  # in m/s
maximum_wind_speed_for_power_curve = 12.5  # in m/s
maximum_wind_speed_for_operation = 25  # in m/s
rotor_area = np.pi * (rotor_diameter / 2) ** 2  # in m²
# turbine requires 3m/s to start rotating
const_internal_friction_coefficient = 0.5 * 1.240 * np.pi * 77**2 * 3**3 * approximated_total_efficiency * 174 / 1000000
maximum_power_per_turbine = 7 # in MW

# Calculating the Generated power
df_latest['WindPower'] = 0.5 * df_latest['AirDensity'] * rotor_area * df_latest['WindSpeed_avg'] ** 3 * 174 / 1000000
df_latest['UsableWindPower'] = np.minimum(df_latest['WindPower'], maximum_power_per_turbine * 174 / approximated_total_efficiency)
# depending on the wind speed, the power output is limited to the maximum power output of the turbine or 0
df_latest['PowerOutput'] = np.where((df_latest['WindSpeed_avg'] >= minimum_wind_speed) & (df_latest['WindSpeed_avg'] <= maximum_wind_speed_for_operation), df_latest['UsableWindPower'] * approximated_total_efficiency - const_internal_friction_coefficient, 0)

# Same for 100m
df_latest['WindPower:100'] = 0.5 * df_latest['AirDensity'] * rotor_area * df_latest['WindSpeed:100_avg'] ** 3 * 174 / 1000000
df_latest['UsableWindPower:100'] = np.minimum(df_latest['WindPower:100'], maximum_power_per_turbine * 174 / approximated_total_efficiency)
df_latest['PowerOutput:100'] = np.where((df_latest['WindSpeed:100_avg'] >= minimum_wind_speed) & (df_latest['WindSpeed:100_avg'] <= maximum_wind_speed_for_operation), df_latest['UsableWindPower:100'] * approximated_total_efficiency - const_internal_friction_coefficient, 0)

# Same for full
df_latest['WindPower_full'] = 0.5 * df_latest['AirDensity'] * rotor_area * df_latest['WindSpeed_full_avg'] ** 3 * 174 / 1000000
# df_latest['UsableWindPower_full'] = np.minimum(df_latest['WindPower_full'], 0.5 * df_latest['AirDensity'] * rotor_area * maximum_wind_speed_for_power_curve ** 3 * 174 / 1000000) - const_internal_friction_coefficient
# df_latest['PowerOutput_full'] = np.where((df_latest['WindSpeed_full_avg'] >= minimum_wind_speed) & (df_latest['WindSpeed_full_avg'] <= maximum_wind_speed_for_operation), df_latest['UsableWindPower_full'] * approximated_total_efficiency, 0)
df_latest['UsableWindPower_full'] = np.minimum(df_latest['WindPower_full'], maximum_power_per_turbine * 174 / approximated_total_efficiency)
df_latest['PowerOutput_full'] = np.where((df_latest['WindSpeed_full_avg'] >= minimum_wind_speed) & (df_latest['WindSpeed_full_avg'] <= maximum_wind_speed_for_operation), df_latest['UsableWindPower_full'] * approximated_total_efficiency - const_internal_friction_coefficient, 0)


In [37]:
print(max(df_latest['PowerOutput_full']))
print(const_internal_friction_coefficient)
print(max(df_latest['Wind_MW']))

1201.1810949493763
16.818905050623663
1192.744


In [38]:
# Forecast stats
# Absolute error
print('Mean Absolute error:', np.mean(np.abs(df_latest['Wind_MW'] - df_latest['PowerOutput'])))
print('Mean Absolute error:100:', np.mean(np.abs(df_latest['Wind_MW'] - df_latest['PowerOutput:100'])))
print('Mean Absolute error:full:', np.mean(np.abs(df_latest['Wind_MW'] - df_latest['PowerOutput_full'])))
# Initialize a list to collect the stats
stats_df = pd.DataFrame({'quantile': [], 'quantile_range': [], 'mean_absolute_error': [], 'mean_error': [], 'mean_absolute_error:100': [], 'mean_error:100': [], 'mean_absolute_error:full': [], 'mean_error:full': []})

# Mean absolute error in 10% quantiles
for q in np.arange(0.1, 1.1, 0.1):
    # Filter the DataFrame for each quantile range
    quantile_df = df_latest[(df_latest['Wind_MW'] < df_latest['Wind_MW'].quantile(q)) & (df_latest['Wind_MW'] >= df_latest['Wind_MW'].quantile(q - 0.1))]
    
    # Calculate the statistics
    quantile_stats = {
        'quantile': q,
        'quantile_range': f'{df_latest["Wind_MW"].quantile(q - 0.1):.1f} - {df_latest["Wind_MW"].quantile(q):.1f}',
        'mean_absolute_error': np.mean(np.abs(quantile_df['Wind_MW'] - quantile_df['PowerOutput'])),
        'mean_error': np.mean(quantile_df['Wind_MW'] - quantile_df['PowerOutput']),
        'mean_absolute_error:100': np.mean(np.abs(quantile_df['Wind_MW'] - quantile_df['PowerOutput:100'])),
        'mean_error:100': np.mean(quantile_df['Wind_MW'] - quantile_df['PowerOutput:100']),
        'mean_absolute_error:full': np.mean(np.abs(quantile_df['Wind_MW'] - quantile_df['PowerOutput_full'])),
        'mean_error:full': np.mean(quantile_df['Wind_MW'] - quantile_df['PowerOutput_full'])
    }
    
    # Append to the list
    stats_df.loc[len(stats_df)] = quantile_stats

# Display the DataFrame
stats_df

Mean Absolute error: 209.1264007953847
Mean Absolute error:100: 101.04136737561544
Mean Absolute error:full: 139.53031236383276


Unnamed: 0,quantile,quantile_range,mean_absolute_error,mean_error,mean_absolute_error:100,mean_error:100,mean_absolute_error:full,mean_error:full
0,0.1,0.0 - 18.8,27.932479,-23.754169,39.517385,-37.074092,32.384407,-29.248243
1,0.2,18.8 - 91.8,36.765569,22.200508,39.392657,-9.300244,34.068893,8.844873
2,0.3,91.8 - 198.0,89.021038,66.889237,70.498968,1.10643,73.402234,38.245138
3,0.4,198.0 - 339.9,148.778419,136.584573,96.019216,21.026575,113.473491,86.621036
4,0.5,339.9 - 518.1,237.238041,213.513098,135.888005,40.565237,177.551494,137.964223
5,0.6,518.1 - 748.9,338.816451,314.96611,174.402589,69.991685,248.500381,206.278986
6,0.7,748.9 - 994.1,427.989567,388.522649,201.631115,59.18924,297.022398,231.71088
7,0.8,994.1 - 1108.8,351.655802,285.785472,141.210712,-26.47831,221.998811,117.7806
8,0.9,1108.8 - 1154.5,270.656903,219.920388,79.268065,-39.587863,135.420867,50.725526
9,1.0,1154.5 - 1192.7,162.487407,132.021472,32.622062,-21.511389,61.533999,16.380763


In [57]:
# average difference between WindSpeed_avg and WindSpeed:100_avg
print(np.mean(df_latest['WindSpeed_avg'] - df_latest['WindSpeed:100_avg']))

-1.8531628967654072


## 3. Find optimal $\eta$ to update the approximation of the turbine efficiency

In [65]:
errors = {}
for efficiency in np.arange(0.28, 0.35, 0.001):
    for limiter in np.arange(0.9, 1.01, 0.01):
        const_internal_friction_coefficient = 0.5 * 1.240 * np.pi * 77**2 * 3**3 * efficiency * 174 / 1000000
        df_latest['WindPower_full'] = 0.5 * df_latest['AirDensity'] * rotor_area * df_latest['WindSpeed:100_avg'] ** 3 * 174 / 1000000
        df_latest['UsableWindPower_full'] = np.minimum(df_latest['WindPower_full'], maximum_power_per_turbine * 174 * limiter / efficiency)
        df_latest['PowerOutput_full'] = np.where((df_latest['WindSpeed:100_avg'] >= minimum_wind_speed) & (df_latest['WindSpeed:100_avg'] <= maximum_wind_speed_for_operation), df_latest['UsableWindPower_full'] * efficiency - const_internal_friction_coefficient, 0)
        mean_abs_error = np.mean(np.abs(df_latest['Wind_MW'] - df_latest['PowerOutput_full']))
        mean_error = np.mean(df_latest['Wind_MW'] - df_latest['PowerOutput_full'])
        errors[(efficiency.round(3), limiter.round(2))] = [mean_abs_error, mean_error]

In [66]:
# Find the minimum error
min_error = min(errors, key=lambda x: errors[x])
print(f'Minimum mean absolute error: {errors[min_error][0]:.2f} with efficiency={min_error}')
print(f'Mean error: {errors[min_error][1]:.2f}')

Minimum mean absolute error: 88.20 with efficiency=(0.348, 0.94)
Mean error: -1.87


In [67]:
approximated_total_efficiency = min_error[0]
limiter = min_error[1]
# maximum_wind_speed_for_power_curve = 12.0

In [68]:
const_internal_friction_coefficient = 0.5 * 1.240 * np.pi * 77**2 * 3**3 * approximated_total_efficiency * 174 / 1000000
df_latest['WindPower_opt'] = 0.5 * df_latest['AirDensity'] * rotor_area * df_latest['WindSpeed:100_avg'] ** 3 * 174 / 1000000
df_latest['UsableWindPower_opt'] = np.minimum(df_latest['WindPower_full'], maximum_power_per_turbine * 174 * limiter / approximated_total_efficiency)
df_latest['PowerOutput_opt'] = np.where((df_latest['WindSpeed:100_avg'] >= minimum_wind_speed) & (df_latest['WindSpeed:100_avg'] <= maximum_wind_speed_for_operation), df_latest['UsableWindPower_full'] * approximated_total_efficiency - const_internal_friction_coefficient, 0)

# Update stats_df using optimal values
for q in np.arange(0.1, 1.1, 0.1):
    quantile_df = df_latest[(df_latest['Wind_MW'] < df_latest['Wind_MW'].quantile(q)) & (df_latest['Wind_MW'] >= df_latest['Wind_MW'].quantile(q - 0.1))]
    quantile_row = {
        'quantile': q,
        'mean_absolute_error:100_opt': np.mean(np.abs(quantile_df['Wind_MW'] - quantile_df['PowerOutput_opt'])),
        'mean_error:100_opt': np.mean(quantile_df['Wind_MW'] - quantile_df['PowerOutput_opt'])
    }
    stats_df.loc[stats_df['quantile'] == q, 'mean_absolute_error:100_opt'] = quantile_row['mean_absolute_error:100_opt']
    stats_df.loc[stats_df['quantile'] == q, 'mean_error:100_opt'] = quantile_row['mean_error:100_opt']

stats_df

Unnamed: 0,quantile,quantile_range,mean_absolute_error,mean_error,mean_absolute_error:100,mean_error:100,mean_absolute_error:full,mean_error:full,mean_absolute_error:100_new,mean_error:100_new,mean_absolute_error:100_opt,mean_error:100_opt
0,0.1,0.0 - 18.8,27.932479,-23.754169,39.517385,-37.074092,32.384407,-29.248243,44.188882,-42.930866,42.121918,-39.75837
1,0.2,18.8 - 91.8,36.765569,22.200508,39.392657,-9.300244,34.068893,8.844873,41.064545,-12.242491,43.392514,-16.878471
2,0.3,91.8 - 198.0,89.021038,66.889237,70.498968,1.10643,73.402234,38.245138,73.398135,-5.023605,73.578106,-15.064221
3,0.4,198.0 - 339.9,148.778419,136.584573,96.019216,21.026575,113.473491,86.621036,100.139931,10.630677,98.144508,-8.174356
4,0.5,339.9 - 518.1,237.238041,213.513098,135.888005,40.565237,177.551494,137.964223,140.769809,22.575658,134.930909,-3.770418
5,0.6,518.1 - 748.9,338.816451,314.96611,174.402589,69.991685,248.500381,206.278986,179.604369,42.222985,168.010333,6.620361
6,0.7,748.9 - 994.1,427.989567,388.522649,201.631115,59.18924,297.022398,231.71088,205.84124,20.530012,191.040161,-15.746537
7,0.8,994.1 - 1108.8,351.655802,285.785472,141.210712,-26.47831,221.998811,117.7806,149.710119,-33.507504,131.507536,-64.78227
8,0.9,1108.8 - 1154.5,270.656903,219.920388,79.268065,-39.587863,135.420867,50.725526,89.643791,-26.026744,68.559372,-47.482362
9,1.0,1154.5 - 1192.7,162.487407,132.021472,32.622062,-21.511389,61.533999,16.380763,37.343797,-12.448353,25.363587,-19.243543


## 3. Make Forecast more accurate using smoothing of windspeed between datapoints

In [43]:
df_latest.dtypes

reference_time                 datetime64[ns]
valid_time                datetime64[ns, UTC]
WindDirection_dwd                     float64
WindDirection:100_dwd                 float64
WindSpeed:100^3_dwd                   float64
WindDirection_ncep                    float64
WindDirection:100_ncep                float64
WindSpeed:100^3_ncep                  float64
MIP                                   float64
Wind_MW                               float64
WindSpeed_avg                         float64
WindSpeed:100_avg                     float64
WindSpeed_full_avg                    float64
WindSpeed^3_avg                       float64
WindSpeed^3:100_avg                   float64
WindSpeed^3_full_avg                  float64
Temperature_avg                       float64
RelativeHumidity_avg                  float64
Temperature_K                         float64
AirDensity                            float64
WindPower                             float64
UsableWindPower                   

In [69]:
# Iterate over each reference_time
for reference_time, group in df_latest.groupby('reference_time'):

    # Get the valid_times and corresponding wind speeds for this reference_time
    valid_times = group['valid_time'].values
    wind_speeds = group['WindSpeed:100_avg'].values
    actual_generation = group['Wind_MW'].values
    
    # Convert valid_times to seconds for interpolation
    valid_times_in_seconds = (valid_times - valid_times[0]).astype('timedelta64[m]').astype(int) * 60  # seconds
    
    if len(valid_times) < 2:
        wind_power = 0.5 * group['AirDensity'].iloc[0] * rotor_area * avg_cubed_wind * 174 / 1000000
        usable_wind_power = min(wind_power, 7 * 174 * limiter / approximated_total_efficiency)
        power_output = usable_wind_power * approximated_total_efficiency - const_internal_friction_coefficient if (group['WindSpeed:100_avg'].iloc[0] >= minimum_wind_speed) and (group['WindSpeed:100_avg'].iloc[0] <= maximum_wind_speed_for_operation) else 0

        # Update PowerOutput
        df_latest.loc[(df_latest['reference_time'] == reference_time) & (df_latest['valid_time'] == valid_times[0]), 'PowerOutput'] = power_output
        continue
    # Perform cubic spline interpolation for 1-minute intervals
    wind_speed_spline = CubicSpline(valid_times_in_seconds, wind_speeds)
    
    # Generate 1-minute intervals for the valid_time range
    for i in range(len(valid_times) - 1):
        t_start = valid_times_in_seconds[i]
        t_end = valid_times_in_seconds[i + 1]
        
        # Generate time points at 1-minute intervals within this 30-minute window
        times_1min = np.arange(t_start, t_end, 60)
        
        # Interpolate wind speeds at 1-minute intervals
        interpolated_wind_speeds = wind_speed_spline(times_1min)
        
        # Calculate the average wind power using the cubed wind speeds, reduce speeds <3ms to 0
        avg_cubed_wind = np.mean(np.where((interpolated_wind_speeds >= 3) & (interpolated_wind_speeds <= 25), interpolated_wind_speeds, 0) ** 3)
        avg_cubed_wind = np.mean(interpolated_wind_speeds ** 3)
        # get frac with windspeed > 3 and < 25
        frac_generation = np.mean(np.where((interpolated_wind_speeds >= 3) & (interpolated_wind_speeds <= 25), 1, 0))
        
        # Calculate wind power and apply limits for each interval
        wind_power = 0.5 * group['AirDensity'].iloc[i] * rotor_area * avg_cubed_wind * 174 / 1000000
        usable_wind_power = min(wind_power, 7 * 174 * limiter / approximated_total_efficiency)
        
        # Calculate final power output based on cut-in, cut-out wind speeds and efficiency
        power_output = max(0, usable_wind_power * approximated_total_efficiency - const_internal_friction_coefficient * frac_generation)
        
        # Update PowerOutput
        valid_time = pd.to_datetime(valid_times[i]).tz_localize('UTC')
        df_latest.loc[(df_latest['reference_time'] == reference_time) & (df_latest['valid_time'] == valid_time), 'PowerOutput'] = power_output

In [70]:
# Absolute error
print('Mean Absolute error:', np.mean(np.abs(df_latest['Wind_MW'] - df_latest['PowerOutput'])))
# Mean absolute error in 10% quantiles
for q in np.arange(0.1, 1.1, 0.1):
    quantile_df = df_latest[(df_latest['Wind_MW'] < df_latest['Wind_MW'].quantile(q)) & (df_latest['Wind_MW'] >= df_latest['Wind_MW'].quantile(q - 0.1))]
    quantile_row = {
        'quantile': q,
        'mean_absolute_error:100_new': np.mean(np.abs(quantile_df['Wind_MW'] - quantile_df['PowerOutput'])),
        'mean_error:100_new': np.mean(quantile_df['Wind_MW'] - quantile_df['PowerOutput'])
    }
    stats_df.loc[stats_df['quantile'] == q, 'mean_absolute_error:100_new'] = quantile_row['mean_absolute_error:100_new']
    stats_df.loc[stats_df['quantile'] == q, 'mean_error:100_new'] = quantile_row['mean_error:100_new']

stats_df


Mean Absolute error: 96.7226837845743


Unnamed: 0,quantile,quantile_range,mean_absolute_error,mean_error,mean_absolute_error:100,mean_error:100,mean_absolute_error:full,mean_error:full,mean_absolute_error:100_new,mean_error:100_new,mean_absolute_error:100_opt,mean_error:100_opt
0,0.1,0.0 - 18.8,27.932479,-23.754169,39.517385,-37.074092,32.384407,-29.248243,43.855849,-42.678786,42.121918,-39.75837
1,0.2,18.8 - 91.8,36.765569,22.200508,39.392657,-9.300244,34.068893,8.844873,41.751918,-14.031171,43.392514,-16.878471
2,0.3,91.8 - 198.0,89.021038,66.889237,70.498968,1.10643,73.402234,38.245138,73.472864,-7.977886,73.578106,-15.064221
3,0.4,198.0 - 339.9,148.778419,136.584573,96.019216,21.026575,113.473491,86.621036,100.515856,4.945644,98.144508,-8.174356
4,0.5,339.9 - 518.1,237.238041,213.513098,135.888005,40.565237,177.551494,137.964223,140.023367,14.914075,134.930909,-3.770418
5,0.6,518.1 - 748.9,338.816451,314.96611,174.402589,69.991685,248.500381,206.278986,176.564823,32.504004,168.010333,6.620361
6,0.7,748.9 - 994.1,427.989567,388.522649,201.631115,59.18924,297.022398,231.71088,190.6146,21.252765,191.040161,-15.746537
7,0.8,994.1 - 1108.8,351.655802,285.785472,141.210712,-26.47831,221.998811,117.7806,101.048893,7.83152,131.507536,-64.78227
8,0.9,1108.8 - 1154.5,270.656903,219.920388,79.268065,-39.587863,135.420867,50.725526,43.307891,34.531455,68.559372,-47.482362
9,1.0,1154.5 - 1192.7,162.487407,132.021472,32.622062,-21.511389,61.533999,16.380763,56.100945,53.528314,25.363587,-19.243543


In [56]:
df_latest['PowerOutput'].max()

1218.0

## => It got worse????

# ii. Modeling the residium 

## 1. XGBoost

In [72]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Calculate the residuals (difference between actual and forecast)
df_latest['residual'] = df_latest['Wind_MW'] - df_latest['PowerOutput_opt']

# Define the features (X) and the target (y)
X = df_latest[['WindSpeed:100_avg', 'WindDirection:100_dwd', 'Temperature_avg', 'RelativeHumidity_avg', 'WindDirection:100_ncep', 'AirDensity', 'MIP', 'UsableWindPower']]  # Add any other relevant features
y = df_latest['residual']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [73]:
# Initialize the XGBoost regressor
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', seed=42)

# Define a parameter grid for hyperparameter tuning
param_grid = {
    'learning_rate': [0.01, 0.1],  # Controls the step size in each boosting step
    'n_estimators': [100, 500, 1000],  # Number of boosting rounds
    'max_depth': [5, 7],  # Maximum depth of the trees
    'subsample': [0.7, 1.0],  # Proportion of training data used for fitting the individual trees
    'colsample_bytree': [0.7, 1.0],  # Proportion of features used for fitting individual trees
    'gamma': [0.1, 0.3],  # Minimum loss reduction required to make a further partition on a leaf node
    'reg_lambda': [1, 10],  # L2 regularization term
    'reg_alpha': [0, 1],  # L1 regularization term
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xg_reg, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_absolute_error')

# Fit the model
grid_search.fit(X_train, y_train)

# Best model from GridSearchCV
best_xgb_model = grid_search.best_estimator_
print("Best parameters found: ", grid_search.best_params_)


Fitting 3 folds for each of 384 candidates, totalling 1152 fits
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=1.0; total time=   0.3s
[CV] END col

In [74]:
# Predict the residuals for the test set
y_pred = best_xgb_model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'Mean Absolute Error (MAE): {mae:.3f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.3f}')


Mean Absolute Error (MAE): 75.430
Root Mean Squared Error (RMSE): 127.758


In [75]:
# compared to always predicting 0
y_pred_baseline = np.zeros_like(y_test)
mae_baseline = mean_absolute_error(y_test, y_pred_baseline)
rmse_baseline = np.sqrt(mean_squared_error(y_test, y_pred_baseline))

print(f'Mean Absolute Error (Baseline): {mae_baseline:.3f}')
print(f'Root Mean Squared Error (Baseline): {rmse_baseline:.3f}')

Mean Absolute Error (Baseline): 95.688
Root Mean Squared Error (Baseline): 160.782


# Training quantile Models

In [76]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

# Calculate the residuals (difference between actual and forecast)
df_latest['residual'] = df_latest['Wind_MW'] - df_latest['PowerOutput_opt']

# Define the features (X) and the target (y)
X = df_latest[['WindSpeed:100_avg', 'WindDirection:100_dwd', 'Temperature_avg', 'RelativeHumidity_avg', 
               'WindDirection:100_ncep', 'AirDensity', 'MIP', 'UsableWindPower']]  # Add any other relevant features
y = df_latest['residual']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [77]:
# Define the quantiles we want to model
quantiles = np.arange(0.1, 1.0, 0.1)

# Dictionary to hold the best models for each quantile
best_models = {}

# Loop over each quantile and train a model
for quantile in quantiles:
    print(f"Training model for {quantile * 100:.0f}% quantile...")
    
    # Initialize the Gradient Boosting Regressor with the quantile loss
    gbr = GradientBoostingRegressor(loss='quantile', alpha=quantile, random_state=42)
    
    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1],  # Controls the step size in each boosting step
        'n_estimators': [100, 300, 500],  # Number of boosting rounds
        'max_depth': [3, 5, 7],  # Maximum depth of the trees
        'subsample': [0.7, 1.0],  # Proportion of training data used for fitting individual trees
        'min_samples_leaf': [1, 5, 10],  # Minimum number of samples required to be at a leaf node
    }

    # Custom scoring function for quantile regression (Pinball loss)
    def pinball_loss(y_true, y_pred):
        delta = y_true - y_pred
        return np.mean(np.maximum(quantile * delta, (quantile - 1) * delta))
    
    pinball_scorer = make_scorer(pinball_loss, greater_is_better=False)
    
    # Initialize GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring=pinball_scorer)
    
    # Fit the model to the training data
    grid_search.fit(X_train, y_train)
    
    # Store the best model for this quantiles
    best_models[quantile] = grid_search.best_estimator_
    
    print(f"Best parameters for {quantile * 100:.0f}% quantile: {grid_search.best_params_}")

# All best models are now stored in the best_models dictionary.


Training model for 10% quantile...
Fitting 3 folds for each of 162 candidates, totalling 486 fits
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=100, subsample=0.7; total time=   9.5s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=100, subsample=0.7; total time=   9.7s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=100, subsample=0.7; total time=   9.8s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=100, subsample=1.0; total time=  13.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=100, subsample=1.0; total time=  13.3s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=100, subsample=1.0; total time=  13.3s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=300, subsample=0.7; total time=  30.9s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, n_estimators=300, subsample=0.7; total tim

In [None]:
# Loop over each quantile to evaluate the models
for quantile, model in best_models.items():
    # Predict the residuals for the test set
    y_pred = model.predict(X_test)
    
    # Calculate the evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    print(f"Evaluation for {quantile * 100:.0f}% quantile:")
    print(f"Mean Absolute Error (MAE): {mae:.3f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.3f}")
    print("-" * 40)
