In [1]:
import pandas as pd
import numpy as np
import os
import sys
from scipy.interpolate import CubicSpline
import warnings

pd.options.mode.chained_assignment = None  # default='warn'
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [34]:
# Read the CSV file
df_wind = pd.read_csv(os.path.join(os.getcwd(), '..', 'data', 'wind.csv'), index_col=0)

df_wind = df_wind.drop(columns=['SS_Price', 'boa_MWh', 'DA_Price', 'Wind_MWh_credit', 'Solar_MWh_credit', 'Solar_MW', 'Solar_capacity_mwp', 'Solar_installedcapacity_mwp', 'dtm'])
df_wind['reference_time'] = pd.to_datetime(df_wind['reference_time'])
df_wind['valid_time'] = pd.to_datetime(df_wind['valid_time'])
df_latest = df_wind.groupby('valid_time').tail(1)

In [35]:
df_latest['WindSpeed_avg'] = (df_latest['WindSpeed_dwd'] + df_latest['WindSpeed_ncep']) / 2
df_latest['WindSpeed:100_avg'] = (df_latest['WindSpeed:100_dwd'] + df_latest['WindSpeed:100_ncep']) / 2
df_latest['WindSpeed_full_avg'] = (df_latest['WindSpeed_avg'] + df_latest['WindSpeed:100_avg']) / 2
df_latest['Temperature_avg'] = (df_latest['Temperature_dwd'] + df_latest['Temperature_ncep']) / 2
df_latest['RelativeHumidity_avg'] = (df_latest['RelativeHumidity_dwd'] + df_latest['RelativeHumidity_ncep']) / 2

df_latest = df_latest.drop(columns=['WindSpeed_dwd', 'WindSpeed_ncep', 'Temperature_dwd', 'Temperature_ncep', 'RelativeHumidity_dwd', 'RelativeHumidity_ncep', 'WindSpeed:100_dwd', 'WindSpeed:100_ncep'])

# i. Calculate Power Forecast based on Physics

## 1. Calculate Air Density

In [36]:
# Constants
R_d = 287.05  # Specific gas constant for dry air (J/(kg·K))
R_v = 461.5   # Specific gas constant for water vapor (J/(kg·K))
p = 101325    # Standard atmospheric pressure in Pa

# Assuming df_latest is your original DataFrame and contains 'Temperature_dwd', 'RelativeHumidity_dwd', 'WindSpeed_dwd'
# Convert temperature from Celsius to Kelvin
df_latest['Temperature_K'] = df_latest['Temperature_avg'] + 273.15

# Calculate saturation vapor pressure (using temperature in Celsius), Tetens formula
e_s = 0.61078 * np.exp((17.27 * df_latest['Temperature_avg']) / (df_latest['Temperature_avg'] + 237.3))

# in pa
e_s = 1000 * e_s

# Calculate actual vapor pressure
e = df_latest['RelativeHumidity_avg'] / 100 * e_s

# Calculate air density (ρ) in kg/m³
df_latest['AirDensity'] = (p - e) / (R_d * df_latest['Temperature_K']) + (e / (R_v * df_latest['Temperature_K']))

## 2. Calculate Power based on Density and Wind Speed

In [74]:
# Turbine stats
rotor_diameter = 154  # in meters
approximated_total_efficiency = 0.31  # 31% efficiency
minimum_wind_speed = 3  # in m/s
maximum_wind_speed_for_power_curve = 12.5  # in m/s
maximum_wind_speed_for_operation = 25  # in m/s
rotor_area = np.pi * (rotor_diameter / 2) ** 2  # in m²
# turbine requires 3m/s to start rotating
const_internal_friction_coefficient = 0.5 * 1.240 * np.pi * 77**2 * 3**3 * approximated_total_efficiency * 174 / 1000000
maximum_power_per_turbine = 7 # in MW

# Calculating the Generated power
df_latest['WindPower'] = 0.5 * df_latest['AirDensity'] * rotor_area * df_latest['WindSpeed_avg'] ** 3 * 174 / 1000000
df_latest['UsableWindPower'] = np.minimum(df_latest['WindPower'], maximum_power_per_turbine * 174 / approximated_total_efficiency)
# depending on the wind speed, the power output is limited to the maximum power output of the turbine or 0
df_latest['PowerOutput'] = np.where((df_latest['WindSpeed_avg'] >= minimum_wind_speed) & (df_latest['WindSpeed_avg'] <= maximum_wind_speed_for_operation), df_latest['UsableWindPower'] * approximated_total_efficiency - const_internal_friction_coefficient, 0)

# Same for 100m
df_latest['WindPower:100'] = 0.5 * df_latest['AirDensity'] * rotor_area * df_latest['WindSpeed:100_avg'] ** 3 * 174 / 1000000
df_latest['UsableWindPower:100'] = np.minimum(df_latest['WindPower:100'], maximum_power_per_turbine * 174 / approximated_total_efficiency)
df_latest['PowerOutput:100'] = np.where((df_latest['WindSpeed:100_avg'] >= minimum_wind_speed) & (df_latest['WindSpeed:100_avg'] <= maximum_wind_speed_for_operation), df_latest['UsableWindPower:100'] * approximated_total_efficiency - const_internal_friction_coefficient, 0)

# Same for full
df_latest['WindPower_full'] = 0.5 * df_latest['AirDensity'] * rotor_area * df_latest['WindSpeed_full_avg'] ** 3 * 174 / 1000000
# df_latest['UsableWindPower_full'] = np.minimum(df_latest['WindPower_full'], 0.5 * df_latest['AirDensity'] * rotor_area * maximum_wind_speed_for_power_curve ** 3 * 174 / 1000000) - const_internal_friction_coefficient
# df_latest['PowerOutput_full'] = np.where((df_latest['WindSpeed_full_avg'] >= minimum_wind_speed) & (df_latest['WindSpeed_full_avg'] <= maximum_wind_speed_for_operation), df_latest['UsableWindPower_full'] * approximated_total_efficiency, 0)
df_latest['UsableWindPower_full'] = np.minimum(df_latest['WindPower_full'], maximum_power_per_turbine * 174 / approximated_total_efficiency)
df_latest['PowerOutput_full'] = np.where((df_latest['WindSpeed_full_avg'] >= minimum_wind_speed) & (df_latest['WindSpeed_full_avg'] <= maximum_wind_speed_for_operation), df_latest['UsableWindPower_full'] * approximated_total_efficiency - const_internal_friction_coefficient, 0)


In [76]:
print(max(df_latest['PowerOutput_full']))
print(const_internal_friction_coefficient)
print(max(df_latest['Wind_MW']))

1201.1810949493763
16.818905050623663
1192.744


In [75]:
# Forecast stats
# Absolute error
print('Mean Absolute error:', np.mean(np.abs(df_latest['Wind_MW'] - df_latest['PowerOutput'])))
print('Mean Absolute error:100:', np.mean(np.abs(df_latest['Wind_MW'] - df_latest['PowerOutput:100'])))
print('Mean Absolute error:full:', np.mean(np.abs(df_latest['Wind_MW'] - df_latest['PowerOutput_full'])))
# Initialize a list to collect the stats
stats_df = pd.DataFrame({'quantile': [], 'quantile_range': [], 'mean_absolute_error': [], 'mean_error': [], 'mean_absolute_error:100': [], 'mean_error:100': [], 'mean_absolute_error:full': [], 'mean_error:full': []})

# Mean absolute error in 10% quantiles
for q in np.arange(0.1, 1.1, 0.1):
    # Filter the DataFrame for each quantile range
    quantile_df = df_latest[(df_latest['Wind_MW'] < df_latest['Wind_MW'].quantile(q)) & (df_latest['Wind_MW'] >= df_latest['Wind_MW'].quantile(q - 0.1))]
    
    # Calculate the statistics
    quantile_stats = {
        'quantile': q,
        'quantile_range': f'{df_latest["Wind_MW"].quantile(q - 0.1):.1f} - {df_latest["Wind_MW"].quantile(q):.1f}',
        'mean_absolute_error': np.mean(np.abs(quantile_df['Wind_MW'] - quantile_df['PowerOutput'])),
        'mean_error': np.mean(quantile_df['Wind_MW'] - quantile_df['PowerOutput']),
        'mean_absolute_error:100': np.mean(np.abs(quantile_df['Wind_MW'] - quantile_df['PowerOutput:100'])),
        'mean_error:100': np.mean(quantile_df['Wind_MW'] - quantile_df['PowerOutput:100']),
        'mean_absolute_error:full': np.mean(np.abs(quantile_df['Wind_MW'] - quantile_df['PowerOutput_full'])),
        'mean_error:full': np.mean(quantile_df['Wind_MW'] - quantile_df['PowerOutput_full'])
    }
    
    # Append to the list
    stats_df.loc[len(stats_df)] = quantile_stats

# Display the DataFrame
stats_df

Mean Absolute error: 213.76366667539088
Mean Absolute error:100: 120.39145629959857
Mean Absolute error:full: 151.13509440535313


Unnamed: 0,quantile,quantile_range,mean_absolute_error,mean_error,mean_absolute_error:100,mean_error:100,mean_absolute_error:full,mean_error:full
0,0.1,0.0 - 19.2,27.988181,-23.508646,39.726322,-37.045866,32.481284,-29.020234
1,0.2,19.2 - 93.8,37.495335,24.749611,39.341684,-6.964937,34.519601,11.380447
2,0.3,93.8 - 204.2,92.470917,71.936749,72.813571,4.022195,75.904112,42.500355
3,0.4,204.2 - 346.5,158.807524,135.638781,109.830357,8.644994,122.963899,80.701916
4,0.5,346.5 - 501.0,261.132218,139.746091,213.193629,-60.238557,226.47837,46.544058
5,0.6,501.0 - 711.0,330.376442,265.189171,202.307231,14.583515,254.67545,150.501232
6,0.7,711.0 - 925.1,387.227571,301.143265,241.989183,-19.862312,296.172303,137.051712
7,0.8,925.1 - 1093.7,385.285736,330.574757,161.686756,-0.327335,252.896683,161.040318
8,0.9,1093.7 - 1151.4,288.726154,236.482082,88.187369,-38.212327,149.647767,61.855259
9,1.0,1151.4 - 1192.7,168.17605,136.186438,34.861427,-22.527649,65.652959,18.493682


In [59]:
# average difference between WindSpeed_avg and WindSpeed:100_avg
print(np.mean(df_latest['WindSpeed_avg'] - df_latest['WindSpeed:100_avg']))

-1.9159133030361044


## 3. Find optimal $\eta$ to update the approximation of the turbine efficiency

In [98]:
errors = {}
for efficiency in np.arange(0.28, 0.35, 0.001):
    const_internal_friction_coefficient = 0.5 * 1.240 * np.pi * 77**2 * 3**3 * efficiency * 174 / 1000000
    df_latest['WindSpeed_balanced'] = df_latest['WindSpeed:100_avg']
    df_latest['WindPower_full'] = 0.5 * df_latest['AirDensity'] * rotor_area * df_latest['WindSpeed_balanced'] ** 3 * 174 / 1000000
    df_latest['UsableWindPower_full'] = np.minimum(df_latest['WindPower_full'], maximum_power_per_turbine * 174 / efficiency)
    df_latest['PowerOutput_full'] = np.where((df_latest['WindSpeed_balanced'] >= minimum_wind_speed) & (df_latest['WindSpeed_balanced'] <= maximum_wind_speed_for_operation), df_latest['UsableWindPower_full'] * efficiency - const_internal_friction_coefficient, 0)
    #df_latest['PowerOutput_full'] = np.minimum(df_latest['PowerOutput_full'], 1170)
    mean_abs_error = np.mean(np.abs(df_latest['Wind_MW'] - df_latest['PowerOutput_full']))
    mean_error = np.mean(df_latest['Wind_MW'] - df_latest['PowerOutput_full'])
    errors[efficiency] = [mean_abs_error, mean_error]

In [99]:
# Find the minimum error
min_error = min(errors, key=lambda x: errors[x])
print(f'Minimum mean absolute error: {errors[min_error][0]:.2f} with efficiency={min_error}')
print(f'Mean error: {errors[min_error][1]:.2f}')

Minimum mean absolute error: 118.94 with efficiency=0.3340000000000001
Mean error: -34.58


## 3. Make Forecast more accurate using smoothing of windspeed between datapoints

In [7]:
df_latest.dtypes

valid_time                datetime64[ns, UTC]
reference_time            datetime64[ns, UTC]
WindDirection_dwd                     float64
WindDirection:100_dwd                 float64
WindSpeed:100_dwd                     float64
WindDirection_ncep                    float64
WindDirection:100_ncep                float64
WindSpeed:100_ncep                    float64
MIP                                   float64
Wind_MW                               float64
WindSpeed_avg                         float64
Temperature_avg                       float64
RelativeHumidity_avg                  float64
Temperature_K                         float64
AirDensity                            float64
WindPower                             float64
UsableWindPower                       float64
PowerOutput                           float64
dtype: object

In [11]:
# Iterate over each reference_time
for reference_time, group in df_latest.groupby('reference_time'):

    # Get the valid_times and corresponding wind speeds for this reference_time
    valid_times = group['valid_time'].values
    wind_speeds = group['WindSpeed_avg'].values
    actual_generation = group['Wind_MW'].values
    
    # Convert valid_times to seconds for interpolation
    valid_times_in_seconds = (valid_times - valid_times[0]).astype('timedelta64[m]').astype(int) * 60  # seconds
    
    if len(valid_times) < 2:
        wind_power = 0.5 * group['AirDensity'].iloc[0] * rotor_area * avg_cubed_wind * 174 / 1000000
        usable_wind_power = min(wind_power, 0.5 * group['AirDensity'].iloc[0] * rotor_area * maximum_wind_speed_for_power_curve ** 3 * 174 / 1000000) - const_internal_friction_coefficient
        power_output = usable_wind_power * approximated_total_efficiency if (group['WindSpeed_avg'].iloc[0] >= minimum_wind_speed) and (group['WindSpeed_avg'].iloc[0] <= maximum_wind_speed_for_operation) else 0

        # Update PowerOutput
        df_latest.loc[(df_latest['reference_time'] == reference_time) & (df_latest['valid_time'] == valid_times[0]), 'PowerOutput'] = power_output
        continue
    # Perform cubic spline interpolation for 1-minute intervals
    wind_speed_spline = CubicSpline(valid_times_in_seconds, wind_speeds)
    
    # Generate 1-minute intervals for the valid_time range
    for i in range(len(valid_times) - 1):
        t_start = valid_times_in_seconds[i]
        t_end = valid_times_in_seconds[i + 1]
        
        # Generate time points at 1-minute intervals within this 30-minute window
        times_1min = np.arange(t_start, t_end, 60)
        
        # Interpolate wind speeds at 1-minute intervals
        interpolated_wind_speeds = wind_speed_spline(times_1min)
        
        # Calculate the average wind power using the cubed wind speeds
        avg_cubed_wind = np.mean(interpolated_wind_speeds ** 3)
        
        # Calculate wind power and apply limits for each interval
        wind_power = 0.5 * group['AirDensity'].iloc[i] * rotor_area * avg_cubed_wind * 174 / 1000000
        usable_wind_power = min(wind_power, 0.5 * group['AirDensity'].iloc[i] * rotor_area * maximum_wind_speed_for_power_curve ** 3 * 174 / 1000000) - const_internal_friction_coefficient
        
        # Calculate final power output based on cut-in, cut-out wind speeds and efficiency
        power_output = usable_wind_power * approximated_total_efficiency if (group['WindSpeed_avg'].iloc[i] >= minimum_wind_speed) and (group['WindSpeed_avg'].iloc[i] <= maximum_wind_speed_for_operation) else 0
        
        # Update PowerOutput
        valid_time = pd.to_datetime(valid_times[i]).tz_localize('UTC')
        df_latest.loc[(df_latest['reference_time'] == reference_time) & (df_latest['valid_time'] == valid_time), 'PowerOutput'] = power_output

In [12]:
# Absolute error
print('Mean Absolute error:', np.mean(np.abs(df_latest['Wind_MW'] - df_latest['PowerOutput'])))
# Mean absolute error in 10% quantiles
stats_df['mean_absolute_error_new'] = 0.0
stats_df['mean_error_new'] = 0.0
for q in np.arange(0.1, 1, 0.1):
    quantile_df = df_latest[(df_latest['PowerOutput'] < df_latest['Wind_MW'].quantile(q)) & (df_latest['PowerOutput'] >= df_latest['Wind_MW'].quantile(q - 0.1))]
    quantile_row = {
        'quantile': q,
        'mean_absolute_error_new': np.mean(np.abs(quantile_df['Wind_MW'] - quantile_df['PowerOutput'])),
        'mean_error_new': np.mean(quantile_df['Wind_MW'] - quantile_df['PowerOutput'])
    }
    stats_df.loc[stats_df['quantile'] == q, 'mean_absolute_error_new'] = quantile_row['mean_absolute_error_new']
    stats_df.loc[stats_df['quantile'] == q, 'mean_error_new'] = quantile_row['mean_error_new']

stats_df


Mean Absolute error: 152.50188071830652


Unnamed: 0,quantile,mean_absolute_error,mean_error,mean_absolute_error_new,mean_error_new
0,0.1,16.404858,15.116471,16.414064,15.214119
1,0.2,75.416475,62.435527,75.975492,63.115843
2,0.3,151.988686,132.412357,150.202767,130.585283
3,0.4,210.995809,180.039842,211.171806,179.947308
4,0.5,236.36567,191.634755,233.561167,189.275033
5,0.6,230.366862,167.321916,229.660376,168.246432
6,0.7,193.894254,89.085521,194.044068,88.392991
7,0.8,149.725902,-26.284631,149.780567,-26.388598
8,0.9,127.409351,-105.401871,129.913928,-105.539917


## => Neglegible improvement, but because its already done, we'll keep it

# ii. Modeling the residium 

In [13]:
df_latest['residual'] = df_latest['Wind_MW'] - df_latest['PowerOutput']

## 1. XGBoost

In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Calculate the residuals (difference between actual and forecast)
df_latest['residual'] = df_latest['Wind_MW'] - df_latest['PowerOutput']

# Define the features (X) and the target (y)
X = df_latest[['WindSpeed:100_dwd', 'WindDirection:100_dwd', 'Temperature_avg', 'RelativeHumidity_avg', 
               'WindSpeed:100_ncep', 'WindDirection:100_ncep', 'AirDensity']]  # Add any other relevant features
y = df_latest['residual']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
