In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import torch
import torch.nn as nn
import torch.optim as optim
import pickle
import os

In [2]:
def resample_and_interpolate(group):
    return group.reset_index(level=[1, 2]).resample('30T').asfreq().interpolate()

Load Weather Data, solar total and wind total

In [3]:
base_dir = os.getcwd()

weather_df = pd.read_csv(os.path.join(base_dir, '..', 'weather_data', 'DWD_ICON-EU.csv'))
solar_total = pd.read_csv(os.path.join(base_dir, '..', 'basic_files', 'solar_total_production.csv'))
wind_total = pd.read_csv(os.path.join(base_dir, '..', 'basic_files', 'wind_total_production.csv'))
solar_total.generation_mw = solar_total.generation_mw * 0.5
wind_total.generation_mw = wind_total.generation_mw * 0.5 - wind_total.boa
weather_df.sort_values(by='ref_datetime', inplace=True)
weather_df = weather_df.groupby(["valid_datetime","latitude","longitude"]).last().reset_index()
weather_df.reset_index(inplace=True)

Interpolate weather data to 30 minutes periodes

In [4]:
weather_df.valid_datetime = pd.to_datetime(weather_df.valid_datetime)
weather_df = weather_df.set_index(["valid_datetime","latitude","longitude"])
df_resampled = weather_df.groupby(['latitude', 'longitude'], group_keys=False).apply(resample_and_interpolate)
df_resampled = df_resampled.reset_index()

  return group.reset_index(level=[1, 2]).resample('30T').asfreq().interpolate()
  return group.reset_index(level=[1, 2]).resample('30T').asfreq().interpolate()
  return group.reset_index(level=[1, 2]).resample('30T').asfreq().interpolate()
  return group.reset_index(level=[1, 2]).resample('30T').asfreq().interpolate()
  return group.reset_index(level=[1, 2]).resample('30T').asfreq().interpolate()
  return group.reset_index(level=[1, 2]).resample('30T').asfreq().interpolate()
  return group.reset_index(level=[1, 2]).resample('30T').asfreq().interpolate()
  return group.reset_index(level=[1, 2]).resample('30T').asfreq().interpolate()
  return group.reset_index(level=[1, 2]).resample('30T').asfreq().interpolate()
  return group.reset_index(level=[1, 2]).resample('30T').asfreq().interpolate()
  return group.reset_index(level=[1, 2]).resample('30T').asfreq().interpolate()
  return group.reset_index(level=[1, 2]).resample('30T').asfreq().interpolate()
  return group.reset_index(level=[1, 2])

merge wind total and create own wind and soalr dfs with their respective longtitude and latitude

In [5]:
solar_total.timestamp_utc = pd.to_datetime(solar_total.timestamp_utc)
wind_total.timestamp_utc = pd.to_datetime(wind_total.timestamp_utc)
df_resampled.drop(columns=['index','ref_datetime'], inplace=True)
df_resampled_merged = pd.merge(df_resampled, solar_total, how='left', left_on='valid_datetime', right_on='timestamp_utc')
df_resampled_merged_solar = df_resampled_merged.loc[~(df_resampled_merged.latitude == 53.935) & ~(df_resampled_merged.longitude == 1.8645)]
df_resampled_merged_solar1 = df_resampled_merged_solar.groupby("valid_datetime").mean().reset_index()
distinct_lat_lon_pairs = df_resampled_merged_solar[['latitude', 'longitude']].drop_duplicates()

In [6]:
df_resampled_merged_wind = pd.merge(df_resampled, wind_total, how='inner', left_on='valid_datetime', right_on='timestamp_utc')
df_resampled_merged_wind = df_resampled_merged_wind.loc[(df_resampled_merged_wind.latitude == 53.935) & (df_resampled_merged_wind.longitude == 1.8645)]
df_resampled_merged_wind.drop_duplicates(inplace=True)

In [59]:
def get_exact_time_lag_fast(df, value_column, timestamp_column='timestamp_utc', lag_hours=168):
    """
    Optimized version of get_exact_time_lag using vectorized operations.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing the time series data
    value_column : str
        Name of the column containing values to be lagged
    timestamp_column : str
        Name of the column containing timestamps
    lag_hours : int
        Number of hours to look back for the lag
    
    Returns:
    --------
    pandas.Series
        Series containing the lagged values
    """
    # Calculate target timestamps
    target_timestamps = df[timestamp_column] - pd.Timedelta(hours=lag_hours)
    
    # Create a merged dataframe to find matches
    reference_df = pd.DataFrame({
        'reference_time': df[timestamp_column],
        'value': df[value_column]
    }).sort_values('reference_time')
    
    # Use searchsorted to find the insertion points
    idx = np.searchsorted(reference_df['reference_time'], target_timestamps, side='right') - 1
    
    # Handle cases where idx is -1 (target time before any reference time)
    idx = np.where(idx < 0, 0, idx)
    
    # Get the matched values
    return reference_df['value'].iloc[idx].values

In [60]:
def set_up_wind_features(df):
    R_d = 287.05  # Specific gas constant for dry air (J/(kg·K))
    R_v = 461.5   # Specific gas constant for water vapor (J/(kg·K))
    p = 101325    # Standard atmospheric pressure in Pa
    # Calculate saturation vapor pressure (using temperature in Celsius), Tetens formula
    df['Temperature_K'] = df['Temperature'] + 273.15
    e_s = 0.61078 * np.exp((17.27 * (df['Temperature'])) / (df['Temperature'] +237.3))
    # in pa
    e_s = 1000 * e_s
    # Calculate actual vapor pressure
    e = df['RelativeHumidity'] / 100 * e_s
    df['AirDensity'] = (p - e) / (R_d * df['Temperature_K']) + (e / (R_v * df['Temperature_K']))
    # Turbine stats
    rotor_diameter = 154  # in meters
    approximated_total_efficiency = 0.337
    limiter = 0.94
    minimum_wind_speed = 3  # in m/s
    maximum_wind_speed_for_power_curve = 12.5  # in m/s
    maximum_wind_speed_for_operation = np.inf  # in m/s
    rotor_area = np.pi * (rotor_diameter / 2) ** 2  # in m²
    # turbine requires 3m/s to start rotating
    const_internal_friction_coefficient = 0.5 * 1.240 * np.pi * 77**2 * 3**3 * approximated_total_efficiency * 174 / 1000000
    maximum_power_per_turbine = 7 # in MW
    # Same for full
    df['WindSpeed_full_avg'] = (df['WindSpeed'] + df['WindSpeed:100']) / 2
    df['WindPower_full'] = 0.5 * df['AirDensity'] * rotor_area * df['WindSpeed:100'] ** 3 * 174 / 1000000
    df['UsableWindPower_full'] = np.minimum(df['WindPower_full'], maximum_power_per_turbine * 174 * limiter / approximated_total_efficiency)
    df['PowerOutput_full'] = np.where((df['WindSpeed:100'] >= minimum_wind_speed) & (df['WindSpeed:100'] <= maximum_wind_speed_for_operation), df['UsableWindPower_full'] * approximated_total_efficiency - const_internal_friction_coefficient, 0)

    # wind_df["Temperature_avg"] = (wind_df["Temperature"] + wind_df["Temperature:100"]) / 2
    # wind_df["RelativeHumidity_avg"] = (wind_df["RelativeHumidity"] + wind_df["RelativeHumidity:100"]) / 2

    lag_configs = {
    "WindSpeed:100_dwd_lag1": ("WindSpeed:100", 0.5),
    "WindSpeed:100_dwd_lag2": ("WindSpeed:100", 1),
    "WindSpeed:100_dwd_lag3": ("WindSpeed:100", 1.5),
    }
    for new_col, (source_col, hours) in lag_configs.items():
        df[new_col] = get_exact_time_lag_fast(
            df, 
            timestamp_column='valid_time',
            value_column=source_col,
            lag_hours=hours
        )
    df["Temperature_avg"] = df["Temperature"]
    df["RelativeHumidity_avg"] = df["RelativeHumidity"]   
    # df["WindSpeed:100_dwd_lag1"] = df["WindSpeed:100"].shift(1)
    # df["WindSpeed:100_dwd_lag2"] = df["WindSpeed:100"].shift(2)
    # df["WindSpeed:100_dwd_lag3"] = df["WindSpeed:100"].shift(3)
    df["UsableWindPower_opt"] = df.UsableWindPower_full
    df["WindSpeed:100_dwd"] = df["WindSpeed:100"]
    return df

In [8]:
df_resampled_merged_wind_2 = set_up_wind_features(df_resampled_merged_wind)
df_resampled_merged_wind_2.dropna(inplace=True)
X_wind = df_resampled_merged_wind_2[['WindSpeed:100_dwd', 'Temperature_avg', 'RelativeHumidity_avg', 'AirDensity', 'WindSpeed:100_dwd_lag1', 'WindSpeed:100_dwd_lag2', 'WindSpeed:100_dwd_lag3','UsableWindPower_opt']]
y_wind = df_resampled_merged_wind_2['generation_mw']

In [9]:
def set_up_solar_features(df):
    df["hour"] = df.valid_datetime.dt.hour
    df["day_of_year"] = df.valid_datetime.dt.dayofyear
    df["cos_day_of_year"] = np.cos(2 * np.pi * df.day_of_year / 365)
    df["cos_hour"] = np.cos(2 * np.pi * df.hour / 24)
    df["Mean_SolarDownwardRadiation"] = df.SolarDownwardRadiation
    df["Mean_Temperature"] = df.Temperature
    df["Std_Temperature"] = df_resampled_merged_solar.groupby("valid_datetime").std().reset_index().Temperature
    df["SolarDownwardRadiation_RW_Mean_30min"] = df.Mean_SolarDownwardRadiation.rolling(window=1, min_periods=1).mean()
    df["SolarDownwardRadiation_RW_Mean_1hour"] = df.Mean_SolarDownwardRadiation.rolling(window=2, min_periods=1).mean()
    df["SolarDownwardRadiation_dwd_Mean_Lag_30min"] = df.Mean_SolarDownwardRadiation.shift(1)
    df["SolarDownwardRadiation_dwd_Mean_Lag_1h"] = df.Mean_SolarDownwardRadiation.shift(2)
    df["SolarDownwardRadiation_dwd_Mean_Lag_24h"] = df.Mean_SolarDownwardRadiation.shift(48)
    for i in range(len(distinct_lat_lon_pairs)):
        lat = distinct_lat_lon_pairs.latitude.iloc[i]
        lon = distinct_lat_lon_pairs.longitude.iloc[i]
        mask = (df_resampled_merged_solar.latitude == lat) & (df_resampled_merged_solar.longitude == lon)
        df[f"Temperature_{i}"] = pd.Series(df_resampled_merged_solar.Temperature[mask].values)[:len(df)]  # Fill gaps with NaN
        df[f"SolarDownwardRadiation_{i}"] = pd.Series(df_resampled_merged_solar.SolarDownwardRadiation[mask].values)[:len(df)]  # Fill gaps with NaN
    return df
df_resampled_merged_solar2 = set_up_solar_features(df_resampled_merged_solar1)

In [10]:
def pv_temperature_efficiency(irradiance, ambient_temp, NOCT=45, wind_speed=1, eta_0=0.18, beta=0.004):
    # Calculate cell temperature using the simplified NOCT model
    Tc = ambient_temp + (NOCT - 20) * (irradiance / 800)
    
    # Calculate the efficiency loss due to increased cell temperature
    efficiency = eta_0 * (1 - beta * (Tc - 25))
    
    return Tc, efficiency

In [11]:
for i in range(20):
    temp_col = f'Temperature_{i}'
    irradiance_col = f'SolarDownwardRadiation_{i}'
    panel_temp_col = f'Panel_Temperature_Point{i}'
    panel_eff_col = f'Panel_Efficiency_Point{i}'
    df_resampled_merged_solar2[panel_temp_col], df_resampled_merged_solar2[panel_eff_col] = pv_temperature_efficiency(df_resampled_merged_solar2[irradiance_col], df_resampled_merged_solar2[temp_col])
df_resampled_merged_solar2["Panel_Temperature_dwd_mean"] = df_resampled_merged_solar2.filter(regex= r"Panel_Temperature.*").mean(axis= 1)
df_resampled_merged_solar2["Panel_Efficiency_dwd_mean"] = df_resampled_merged_solar2.filter(regex= r"Panel_Efficiency.*").mean(axis= 1)
df_resampled_merged_solar2["Panel_Temperature_dwd_std"] = df_resampled_merged_solar2.filter(regex= r"Panel_Temperature.*").std(axis= 1)
df_resampled_merged_solar2["Panel_Efficiency_dwd_std"] = df_resampled_merged_solar2.filter(regex= r"Panel_Efficiency.*").std(axis= 1)
df_resampled_merged_solar2["solar_mw_lag_48h"] = df_resampled_merged_solar2.generation_mw.shift(periods= 96)
df_resampled_merged_solar2["capacity_mwp_lag_48h"] = df_resampled_merged_solar2.capacity_mwp.shift(periods= 96)
df_resampled_merged_solar2["Target_Capacity_MWP%"] = df_resampled_merged_solar2.generation_mw / df_resampled_merged_solar2.capacity_mwp
df_resampled_merged_solar2["Target_Capacity_MWP%_lag_48h"] = df_resampled_merged_solar2["Target_Capacity_MWP%"].shift(periods= 96)


  df_resampled_merged_solar2["Target_Capacity_MWP%_lag_48h"] = df_resampled_merged_solar2["Target_Capacity_MWP%"].shift(periods= 96)


In [12]:
df_resampled_merged_solar3 = df_resampled_merged_solar2[[ 
    "Mean_SolarDownwardRadiation",
    "SolarDownwardRadiation_RW_Mean_1hour",
    "SolarDownwardRadiation_RW_Mean_30min",
    "SolarDownwardRadiation_dwd_Mean_Lag_30min",
    "SolarDownwardRadiation_dwd_Mean_Lag_1h",
    "SolarDownwardRadiation_dwd_Mean_Lag_24h",
    "Panel_Efficiency_dwd_mean",
    "Panel_Efficiency_dwd_std",
    "Panel_Temperature_dwd_mean",
    "Panel_Temperature_dwd_std",
    "Std_Temperature",
    "Mean_Temperature",
    "cos_hour",
    "cos_day_of_year",
    "solar_mw_lag_48h",
    "capacity_mwp_lag_48h",
    "Target_Capacity_MWP%_lag_48h",
    "Target_Capacity_MWP%"
    ]]
df_resampled_merged_solar3.dropna(inplace=True)
Y_solar = df_resampled_merged_solar3["Target_Capacity_MWP%"]
X_solar = df_resampled_merged_solar3.drop(columns=["Target_Capacity_MWP%"])
X_solar.capacity_mwp_lag_48h.mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_resampled_merged_solar3.dropna(inplace=True)


2778.916828188406

In [13]:
def modified_pinball_loss(y_true, y_pred, quantile):
    delta = y_true - y_pred
    return np.mean(np.maximum(quantile * delta, (quantile - 1) * delta))

In [14]:
from sklearn.ensemble import HistGradientBoostingRegressor  # Dies ist nur für die Typisierung notwendig

In [15]:
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.exceptions import NotFittedError

In [16]:
import joblib
def load_pickle1(path):
    return joblib.load(path)

In [17]:
import plotly.express as px
import plotly.graph_objects as go
from sklearn.exceptions import InconsistentVersionWarning
import warnings

# Suppress the version mismatch warning
warnings.filterwarnings("ignore", category=InconsistentVersionWarning)
def modified_pinball_loss(y_true, y_pred, quantile):
    error = y_true - y_pred
    return np.mean(np.maximum(quantile * error, (quantile - 1) * error))

path = os.path.join(base_dir, '..', 'Generation_forecast', 'Wind_forecast', 'models', 'gbr_quantile_0.')
quantiles = [1, 2, 3, 4, 5, 6, 7, 8, 9]
losses = []

for quantile in quantiles:
    with open(f"{path}{quantile}_boa_v4_res-True_calc-False.pkl", "rb") as f:
        model_wind = load_pickle1(f)
    
    if not hasattr(model_wind, '_preprocessor'):
        model_wind._preprocessor = None

    predictions_wind = model_wind.predict(X_wind)
    df_resampled_merged_wind_2[f"generation_mw_quantile_{quantile}"] = predictions_wind + df_resampled_merged_wind_2.PowerOutput_full /2
    loss = modified_pinball_loss(y_wind, df_resampled_merged_wind_2[f"generation_mw_quantile_{quantile}"], quantile/10)
    losses.append(loss)
    print(f"Quantile {quantile} loss: {loss}")
print(f"Mean loss: {np.mean(losses)}")

Quantile 1 loss: 13.024049559057628
Quantile 2 loss: 18.39605462875234
Quantile 3 loss: 22.05432580309928
Quantile 4 loss: 24.275159848006773
Quantile 5 loss: 25.227416029497615
Quantile 6 loss: 24.398875744970855
Quantile 7 loss: 22.055651357215243
Quantile 8 loss: 18.005795621780003
Quantile 9 loss: 10.905511880854142
Mean loss: 19.815871163692652


In [113]:
losses = []
for quantile in quantiles:
    with open(f"{path}{quantile}_boa_v4_res-True_calc-False.pkl", "rb") as f:
        model_wind = load_pickle1(f)
    
    if not hasattr(model_wind, '_preprocessor'):
        model_wind._preprocessor = None

    predictions_wind = model_wind.predict(X_wind)
    df_resampled_merged_wind_2[f"generation_mw_quantile_{quantile}"] = predictions_wind + df_resampled_merged_wind_2.PowerOutput_full /2
    loss = modified_pinball_loss(y_wind, df_resampled_merged_wind_2[f"generation_mw_quantile_{quantile}"], quantile/10)
    losses.append(loss)
    print(f"Quantile {quantile} loss: {loss}")
print(f"Mean loss: {np.mean(losses)}")

Quantile 1 loss: 13.024049559057628
Quantile 2 loss: 18.39605462875234
Quantile 3 loss: 22.05432580309928
Quantile 4 loss: 24.275159848006773
Quantile 5 loss: 25.227416029497615
Quantile 6 loss: 24.398875744970855
Quantile 7 loss: 22.055651357215243
Quantile 8 loss: 18.005795621780003
Quantile 9 loss: 10.905511880854142
Mean loss: 19.815871163692652


In [19]:
import plotly.graph_objects as go

# Create an empty figure
fig = go.Figure()

# Plot actual values
fig.add_trace(go.Scatter(
    x=df_resampled_merged_wind_2.index,  # Assuming the index is time or some sequence
    y=y_wind,  # Actual values
    mode='lines',
    customdata=df_resampled_merged_wind_2['WindSpeed:100'],
    hovertemplate='Wind Speed: %{customdata} m/s<br>Generation: %{y} MW<br>%{x}<extra></extra>',
    name='Actual Generation',
    line=dict(color='black', width=2)  # Style for actual values
))

# Plot predicted values for each quantile
for quantile in quantiles:
    fig.add_trace(go.Scatter(
        x=df_resampled_merged_wind_2.index,  # Assuming same x-axis for predictions
        y=df_resampled_merged_wind_2[f'generation_mw_quantile_{quantile}'],  # Predictions for each quantile
        mode='lines',
        name=f'Quantile {quantile} Prediction',
        line=dict(width=1.5)  # Style for predicted values
    ))

# Update layout for better visualization
fig.update_layout(
    title='Actual vs Predicted Wind Power Generation',
    xaxis_title='Time',
    yaxis_title='Power Generation (MW)',
    legend_title='Legend',
    hovermode='x',
    template='plotly_white'
)

# Show the plot
fig.show()


In [20]:
path = os.path.join(base_dir, '..', 'Generation_forecast', 'Solar_forecast', 'models', 'lgbr_model', 'models', 'i8_models', 'lgbr_q')
quantiles = [1, 2, 3, 4, 5, 6, 7, 8, 9]
losses = []

# Berechnung der Verluste für jedes Quantil
for i in quantiles:
    model_light = pickle.load(open(path + str(i) + ".pkl", 'rb'))
    predictions = model_light.predict(X_solar)
    loss = modified_pinball_loss(Y_solar.values, predictions, i/10)
    losses.append(loss)
    print(f"Quantile {i/10}: {loss*2779.3337282577586}")

Quantile 0.1: 4.318654207773378
Quantile 0.2: 6.459783462946888
Quantile 0.3: 7.494959180716615
Quantile 0.4: 8.174652723121847
Quantile 0.5: 8.560751295988924
Quantile 0.6: 8.369417672434114
Quantile 0.7: 7.3703662318454795
Quantile 0.8: 6.06902783115461
Quantile 0.9: 3.856335771451841


In [21]:
import plotly.express as px
# Erstellen eines DataFrames für Plotly
df = pd.DataFrame({
    'Quantile': [i/10 for i in quantiles],
    'Loss': losses
})

# Plotten der Verluste mit Plotly
fig = px.line(df, x='Quantile', y='Loss', markers=True, title='Modified Pinball Loss for Different Quantiles')
fig.update_layout(
    xaxis_title='Quantile',
    yaxis_title='Modified Pinball Loss',
    template='plotly_white'
)
fig.show()

In [22]:
import pickle
import plotly.express as px
import pandas as pd

# Pfad zu den Modellen
path = "D:/Users/paulh/Desktop/Domäneprojekt2/Energy_production_price_prediction/Generation_forecast/Solar_forecast/models/lgbr_model/models/i5_models/lgbr_q"
quantiles = [1, 2, 3, 4, 5, 6, 7, 8, 9]
predictions_dict = {}

# Berechnung der Vorhersagen für jedes Quantil
for i in quantiles:
    model_light = pickle.load(open(path + str(i) + ".pkl", 'rb'))
    predictions = model_light.predict(X_solar)
    predictions_dict[f'Quantile {i/10}'] = predictions

# Erstellen eines DataFrames für Plotly
df = pd.DataFrame(predictions_dict)
df['Actual'] = Y_solar.values

# Plotten der Vorhersagen und der tatsächlichen Werte mit Plotly
fig = px.line(df, title='Predictions and Actual Values for Different Quantiles')
for quantile in predictions_dict.keys():
    fig.add_scatter(x=df.index, y=df[quantile], mode='lines', name=quantile)
fig.add_scatter(x=df.index, y=df['Actual'], mode='lines', name='Actual', line=dict(color='black', width=2))
fig.update_layout(
    xaxis_title='Index',
    yaxis_title='Value',
    template='plotly_white'
)
fig.show()

### create training data

In [23]:
def get_exact_time_lag_fast(df, value_column, timestamp_column='timestamp_utc', lag_hours=168):
    """
    Optimized version of get_exact_time_lag using vectorized operations.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing the time series data
    value_column : str
        Name of the column containing values to be lagged
    timestamp_column : str
        Name of the column containing timestamps
    lag_hours : int
        Number of hours to look back for the lag
    
    Returns:
    --------
    pandas.Series
        Series containing the lagged values
    """
    # Calculate target timestamps
    target_timestamps = df[timestamp_column] - pd.Timedelta(hours=lag_hours)
    
    # Create a merged dataframe to find matches
    reference_df = pd.DataFrame({
        'reference_time': df[timestamp_column],
        'value': df[value_column]
    }).sort_values('reference_time')
    
    # Use searchsorted to find the insertion points
    idx = np.searchsorted(reference_df['reference_time'], target_timestamps, side='right') - 1
    
    # Handle cases where idx is -1 (target time before any reference time)
    idx = np.where(idx < 0, 0, idx)
    
    # Get the matched values
    return reference_df['value'].iloc[idx].values

In [24]:
def pv_temperature_efficiency(irradiance, ambient_temp, NOCT=45, wind_speed=1, eta_0=0.18, beta=0.004):
    # Calculate cell temperature using the simplified NOCT model
    Tc = ambient_temp + (NOCT - 20) * (irradiance / 800)
    
    # Calculate the efficiency loss due to increased cell temperature
    efficiency = eta_0 * (1 - beta * (Tc - 25))
    
    return Tc, efficiency

In [25]:
# df_solar_history = pd.read_csv('D:/Users/paulh/Desktop/Domäneprojekt2/Energy_production_price_prediction/paul_analyse/train.csv')
df_wind_history = pd.read_csv('D:/Users/paulh/Desktop/Domäneprojekt2/Energy_production_price_prediction/HEFTcom24/data/wind3.csv')
df_wind_history.valid_time = pd.to_datetime(df_wind_history.valid_time)
df_day_ahead = pd.read_csv('D:/Users/paulh/Desktop/Domäneprojekt2/Energy_production_price_prediction/basic_files/day_ahead_price.csv')
df_imbalance = pd.read_csv('D:/Users/paulh/Desktop/Domäneprojekt2/Energy_production_price_prediction/basic_files/imbalance_price.csv')
market_index = pd.read_csv('D:/Users/paulh/Desktop/Domäneprojekt2/Energy_production_price_prediction/basic_files/market_index.csv')

In [26]:
df_solar_new = pd.read_csv("D:/Users/paulh/Desktop/Domäneprojekt2/Energy_production_price_prediction/basic_files/solar_total_production.csv")
df_solar = pd.read_csv("D:/Users/paulh/Desktop/Domäneprojekt2/Energy_production_price_prediction/HEFTcom24/data/solar1.csv")
df_solar.drop(columns=['Unnamed: 0','boa_MWh','Wind_MW','Wind_MWh_credit'], inplace=True)
df_solar.valid_time = pd.to_datetime(df_solar.valid_time) 
df_solar.reference_time = pd.to_datetime(df_solar.reference_time)
df_solar = df_solar.groupby("valid_time").last().reset_index()
df_solar['hour'] = df_solar['valid_time'].dt.hour

# Sine and cosine encoding for hour (for cyclical behavior)
df_solar['sin_hour'] = np.sin(2 * np.pi * df_solar['hour'] / 24)
df_solar['cos_hour'] = np.cos(2 * np.pi * df_solar['hour'] / 24)

# Day of the year (seasonality)
df_solar['day_of_year'] = df_solar['valid_time'].dt.dayofyear

# Sine and cosine encoding for day of the year (for cyclical seasonality)
df_solar['sin_day'] = np.sin(2 * np.pi * df_solar['day_of_year'] / 365)
df_solar['cos_day'] = np.cos(2 * np.pi * df_solar['day_of_year'] / 365)
df_solar['Mean_SolarRadiation_dwd'] = df_solar[[f'SolarDownwardRadiation_Point{i}_dwd' for i in range(20)]].mean(axis=1)
df_solar['Mean_Temperature_dwd'] = df_solar[[f'Temperature_Point{i}_dwd' for i in range(7)]].mean(axis=1)
df_solar['Std_Temperature_dwd'] = df_solar[[f'Temperature_Point{i}_dwd' for i in range(7)]].std(axis=1)
df_solar["SolarDownwardRadiation_RW_dwd_Mean_30min"] = df_solar["Mean_SolarRadiation_dwd"].rolling(window= 1).mean()
df_solar["SolarDownwardRadiation_RW_dwd_Mean_1h"] = df_solar["Mean_SolarRadiation_dwd"].rolling(window= 2).mean()
# df_solar["SolarDownwardRadiation_dwd_Mean_Lag_30min"] = df_solar["Mean_SolarRadiation_dwd"].shift(periods= 1)
# df_solar["SolarDownwardRadiation_dwd_Mean_Lag_1h"] = df_solar["Mean_SolarRadiation_dwd"].shift(periods= 2)
# df_solar["SolarDownwardRadiation_dwd_Mean_Lag_24h"] = df_solar["Mean_SolarRadiation_dwd"].shift(periods= 48)
lag_configs = {
    "SolarDownwardRadiation_dwd_Mean_Lag_30min": ("Mean_SolarRadiation_dwd", 0.5),
    "SolarDownwardRadiation_dwd_Mean_Lag_1h": ("Mean_SolarRadiation_dwd", 1),
    "SolarDownwardRadiation_dwd_Mean_Lag_24h": ("Mean_SolarRadiation_dwd", 24),
}
for new_col, (source_col, hours) in lag_configs.items():
    df_solar[new_col] = get_exact_time_lag_fast(
        df_solar, 
        timestamp_column='valid_time',
        value_column=source_col,
        lag_hours=hours
    )

In [27]:
temperature_columns = [
 'Temperature_Point0_dwd',
 'Temperature_Point1_dwd',
 'Temperature_Point2_dwd',
 'Temperature_Point3_dwd',
 'Temperature_Point4_dwd',
 'Temperature_Point5_dwd',
 'Temperature_Point6_dwd',
 'Temperature_Point7_dwd',
 'Temperature_Point8_dwd',
 'Temperature_Point9_dwd',
 'Temperature_Point10_dwd',
 'Temperature_Point11_dwd',
 'Temperature_Point12_dwd',
 'Temperature_Point13_dwd',
 'Temperature_Point14_dwd',
 'Temperature_Point15_dwd',
 'Temperature_Point16_dwd',
 'Temperature_Point17_dwd',
 'Temperature_Point18_dwd',
 'Temperature_Point19_dwd',
]

irradiance_columns = [
    'SolarDownwardRadiation_Point0_dwd', 'SolarDownwardRadiation_Point1_dwd',
    'SolarDownwardRadiation_Point2_dwd', 'SolarDownwardRadiation_Point3_dwd',
    'SolarDownwardRadiation_Point4_dwd', 'SolarDownwardRadiation_Point5_dwd',
    'SolarDownwardRadiation_Point6_dwd', 'SolarDownwardRadiation_Point7_dwd',
    'SolarDownwardRadiation_Point8_dwd', 'SolarDownwardRadiation_Point9_dwd',
    'SolarDownwardRadiation_Point10_dwd', 'SolarDownwardRadiation_Point11_dwd',
    'SolarDownwardRadiation_Point12_dwd', 'SolarDownwardRadiation_Point13_dwd',
    'SolarDownwardRadiation_Point14_dwd', 'SolarDownwardRadiation_Point15_dwd',
    'SolarDownwardRadiation_Point16_dwd', 'SolarDownwardRadiation_Point17_dwd',
    'SolarDownwardRadiation_Point18_dwd', 'SolarDownwardRadiation_Point19_dwd',
]
for i in range(20):
    for source in ['dwd']:
        temp_col = f'Temperature_Point{i}_{source}'
        irradiance_col = f'SolarDownwardRadiation_Point{i}_{source}'
        panel_temp_col = f'Panel_Temperature_Point{i}_{source}'
        panel_eff_col = f'Panel_Efficiency_Point{i}_{source}'
        
        df_solar[panel_temp_col], df_solar[panel_eff_col] = zip(*df_solar.apply(
            lambda row: pv_temperature_efficiency(row[irradiance_col], row[temp_col]), axis=1))

In [28]:
df_solar["Panel_Temperature_dwd_mean"] = df_solar.filter(regex= r"Panel_Temperature.*_dwd").mean(axis= 1)
df_solar["Panel_Efficiency_dwd_mean"] = df_solar.filter(regex= r"Panel_Efficiency.*_dwd").mean(axis= 1)
df_solar["Panel_Temperature_dwd_std"] = df_solar.filter(regex= r"Panel_Temperature.*_dwd").std(axis= 1)
df_solar["Panel_Efficiency_dwd_std"] = df_solar.filter(regex= r"Panel_Efficiency.*_dwd").std(axis= 1)
df_solar_new.timestamp_utc = pd.to_datetime(df_solar_new.timestamp_utc) 
merged_df = pd.merge(df_solar_new, df_solar, left_on='timestamp_utc',right_on='valid_time', how='inner')

In [30]:
merged_df.Solar_MWh_credit = merged_df.Solar_MWh_credit / merged_df.capacity_mwp
merged_df["Target_Capacity_MWP%"] = merged_df.generation_mw / merged_df.capacity_mwp

lag_configs = {
    "solar_mw_lag_48h": ("Solar_MWh_credit", 48),
    "capacity_mwp_lag_48h": ("capacity_mwp", 48),
    "Target_Capacity_MWP%_lag_48h": ("Target_Capacity_MWP%",48),
}
for new_col, (source_col, hours) in lag_configs.items():
    merged_df[new_col] = get_exact_time_lag_fast(
        merged_df, 
        timestamp_column='valid_time',
        value_column=source_col,
        lag_hours=hours)

In [32]:
merged_df = merged_df[(merged_df['timestamp_utc'] < '2022-11-21') | (merged_df['timestamp_utc'] > '2022-12-08')]

In [34]:
df_solar_history = merged_df[[ "timestamp_utc",
    "Mean_SolarRadiation_dwd",
    "SolarDownwardRadiation_RW_dwd_Mean_30min",
    "SolarDownwardRadiation_RW_dwd_Mean_1h",
    "SolarDownwardRadiation_dwd_Mean_Lag_30min",
    "SolarDownwardRadiation_dwd_Mean_Lag_1h",
    "SolarDownwardRadiation_dwd_Mean_Lag_24h",
    "Panel_Efficiency_dwd_mean",
    "Panel_Efficiency_dwd_std",
    "Panel_Temperature_dwd_mean",
    "Panel_Temperature_dwd_std",
    "Std_Temperature_dwd",
    "Mean_Temperature_dwd",
    "cos_hour",
    "cos_day","solar_mw_lag_48h","capacity_mwp_lag_48h","Target_Capacity_MWP%_lag_48h",
    "Target_Capacity_MWP%","Solar_MWh_credit"]]

In [38]:
df_solar_history['timestamp_utc'] = df_solar_history['timestamp_utc'].dt.tz_localize(None)
df_together = pd.merge(df_solar_history, df_wind_history, how='left', left_on='timestamp_utc', right_on='valid_time')
df_together = pd.merge(df_solar_history, df_wind_history, how='left', left_on='timestamp_utc', right_on='valid_time')
df_together["Target_MW"] = df_together["Wind_MWh_credit"] + df_together["Solar_MWh_credit"]
df_together.timestamp_utc = pd.to_datetime(df_together.timestamp_utc)
df_together.columns



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Index(['timestamp_utc', 'Mean_SolarRadiation_dwd',
       'SolarDownwardRadiation_RW_dwd_Mean_30min',
       'SolarDownwardRadiation_RW_dwd_Mean_1h',
       'SolarDownwardRadiation_dwd_Mean_Lag_30min',
       'SolarDownwardRadiation_dwd_Mean_Lag_1h',
       'SolarDownwardRadiation_dwd_Mean_Lag_24h', 'Panel_Efficiency_dwd_mean',
       'Panel_Efficiency_dwd_std', 'Panel_Temperature_dwd_mean',
       'Panel_Temperature_dwd_std', 'Std_Temperature_dwd',
       'Mean_Temperature_dwd', 'cos_hour', 'cos_day', 'solar_mw_lag_48h',
       'capacity_mwp_lag_48h', 'Target_Capacity_MWP%_lag_48h',
       'Target_Capacity_MWP%', 'Solar_MWh_credit', 'reference_time',
       'valid_time', 'RelativeHumidity_dwd', 'Temperature_dwd',
       'WindDirection_dwd', 'WindDirection:100_dwd', 'WindSpeed^3_dwd',
       'WindSpeed:100^3_dwd', 'WindSpeed_dwd', 'WindSpeed:100_dwd',
       'WindSpeed^3:100_dwd', 'RelativeHumidity_ncep', 'Temperature_ncep',
       'WindDirection_ncep', 'WindDirection:100_ncep', 'WindS

In [39]:
full_time_index = pd.date_range(start=df_wind_history['valid_time'].min(), end=df_wind_history['valid_time'].max(), freq='30T')
missing_intervals = full_time_index.difference(df_wind_history['valid_time'])
print("Fehlende Zeitintervalle:", missing_intervals)

Fehlende Zeitintervalle: DatetimeIndex(['2020-11-05 17:30:00', '2020-11-05 18:00:00',
               '2020-11-05 18:30:00', '2020-11-05 19:00:00',
               '2020-11-05 19:30:00', '2020-11-05 20:00:00',
               '2020-11-05 20:30:00', '2020-11-05 21:00:00',
               '2020-11-05 21:30:00', '2020-11-05 22:00:00',
               ...
               '2023-11-13 06:30:00', '2023-11-13 07:00:00',
               '2023-11-13 07:30:00', '2023-11-13 08:00:00',
               '2024-01-10 04:00:00', '2024-01-10 04:30:00',
               '2024-01-13 13:00:00', '2024-01-14 06:30:00',
               '2024-01-14 16:30:00', '2024-04-13 07:00:00'],
              dtype='datetime64[ns]', length=603, freq=None)


In [52]:
df_together.dropna(inplace=True)

In [53]:
df_together_solar = df_together[[ 
    "Mean_SolarRadiation_dwd",
    "SolarDownwardRadiation_RW_dwd_Mean_1h",
    "SolarDownwardRadiation_RW_dwd_Mean_30min",
    "SolarDownwardRadiation_dwd_Mean_Lag_30min",
    "SolarDownwardRadiation_dwd_Mean_Lag_1h",
    "SolarDownwardRadiation_dwd_Mean_Lag_24h",
    "Panel_Efficiency_dwd_mean",
    "Panel_Efficiency_dwd_std",
    "Panel_Temperature_dwd_mean",
    "Panel_Temperature_dwd_std",
    "Std_Temperature_dwd",
    "Mean_Temperature_dwd",
    "cos_hour",
    "cos_day",
    "solar_mw_lag_48h",
    "capacity_mwp_lag_48h",
    "Target_Capacity_MWP%_lag_48h",
    ]]
df_together_solar.dropna(inplace=True)
mean_to_multiply = df_together_solar["capacity_mwp_lag_48h"].mean()

path = "D:/Users/paulh/Desktop/Domäneprojekt2/Energy_production_price_prediction/Generation_forecast/Solar_forecast/models/lgbr_model/models/i8_models/lgbr_q"
quantiles = [1, 2, 3, 4, 5, 6, 7, 8, 9]
losses = []

# Berechnung der Verluste für jedes Quantil
for i in quantiles:
    model_light = pickle.load(open(path + str(i) + ".pkl", 'rb'))
    predictions = model_light.predict(df_together_solar)
    predictions = predictions * mean_to_multiply
    df_together[f"{i}"] = predictions



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [54]:
df_together["Temperature"] = df_together["Temperature_dwd"]
df_together["RelativeHumidity"] = df_together["RelativeHumidity_dwd"]
df_together["WindSpeed"] = df_together["WindSpeed_dwd"]
df_together["WindSpeed:100"] = df_together["WindSpeed:100_dwd"]

In [61]:
df_together_wind = set_up_wind_features(df_together)

In [62]:
df_together_wind1 = df_together_wind[[
    'WindSpeed:100_dwd', 'Temperature_avg', 'RelativeHumidity_avg', 'AirDensity', 'WindSpeed:100_dwd_lag1', 'WindSpeed:100_dwd_lag2', 'WindSpeed:100_dwd_lag3','UsableWindPower_opt'
]]
df_together_wind1.dropna(inplace=True)
value_to_add = df_together_wind["PowerOutput_full"] / 2



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [63]:
path = 'D:/Users/paulh/Desktop/Domäneprojekt2/Energy_production_price_prediction/Generation_forecast/Wind_forecast/models/gbr_quantile_0.'
quantiles = [1, 2, 3, 4, 5, 6, 7, 8, 9]
losses = []

for quantile in quantiles:
    with open(f"{path}{quantile}_boa_v4_res-True_calc-False.pkl", "rb") as f:
        model_wind = load_pickle1(f)
    
    if not hasattr(model_wind, '_preprocessor'):
        model_wind._preprocessor = None

    predictions_wind = model_wind.predict(df_together_wind1)
    predictions_wind = predictions_wind + value_to_add
    df_together[f"{quantile}"] = df_together[f"{quantile}"] + predictions_wind

In [64]:
df_together

Unnamed: 0,timestamp_utc,Mean_SolarRadiation_dwd,SolarDownwardRadiation_RW_dwd_Mean_30min,SolarDownwardRadiation_RW_dwd_Mean_1h,SolarDownwardRadiation_dwd_Mean_Lag_30min,SolarDownwardRadiation_dwd_Mean_Lag_1h,SolarDownwardRadiation_dwd_Mean_Lag_24h,Panel_Efficiency_dwd_mean,Panel_Efficiency_dwd_std,Panel_Temperature_dwd_mean,...,WindSpeed_full_avg,WindPower_full,UsableWindPower_full,PowerOutput_full,Temperature_avg,RelativeHumidity_avg,WindSpeed:100_dwd_lag1,WindSpeed:100_dwd_lag2,WindSpeed:100_dwd_lag3,UsableWindPower_opt
48,2020-09-21 00:00:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.189078,0.000501,12.391174,...,5.145089,302.423694,302.423694,83.633007,14.764197,81.447110,5.348481,5.348481,5.348481,302.423694
49,2020-09-21 00:30:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.189162,0.000578,12.274750,...,4.803001,247.520468,247.520468,65.130620,14.848169,81.016785,5.348481,5.348481,5.348481,247.520468
50,2020-09-21 01:00:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.189246,0.000665,12.158325,...,4.460912,199.708615,199.708615,49.018026,14.932142,80.586460,5.003469,5.348481,5.348481,199.708615
51,2020-09-21 01:30:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.189333,0.000723,12.038016,...,3.338270,82.292566,82.292566,9.448817,14.879360,80.494950,4.658456,5.003469,5.348481,82.292566
52,2020-09-21 02:00:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.189419,0.000789,11.917706,...,2.215629,23.243914,23.243914,0.000000,14.826579,80.403440,3.466305,4.658456,5.003469,23.243914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50122,2023-08-25 23:30:00,0.009818,0.009818,0.004048,-0.001721,-0.003729,0.004615,0.187994,0.000419,13.896639,...,7.017116,782.949746,782.949746,245.570287,15.695299,85.048904,7.598906,7.534676,7.470446,782.949746
50123,2023-08-26 00:00:00,0.021356,0.021356,0.015587,0.009818,-0.001721,0.017088,0.188246,0.000398,13.547243,...,6.349924,565.464357,565.464357,172.277711,15.491116,84.143050,7.353338,7.598906,7.534676,565.464357
50124,2023-08-26 00:30:00,0.012875,0.012875,0.017116,0.021356,0.009818,-0.000814,0.188469,0.000386,13.237374,...,6.224685,531.210155,531.210155,160.734045,15.401097,84.790620,6.595584,7.353338,7.598906,531.210155
50125,2023-08-26 01:00:00,0.004395,0.004395,0.008635,0.012875,0.021356,-0.018716,0.188692,0.000381,12.927505,...,6.099447,498.361886,498.361886,149.664178,15.311079,85.438180,6.458971,6.595584,7.353338,498.361886


In [65]:
df_together.isna().sum()

timestamp_utc                                0
Mean_SolarRadiation_dwd                      0
SolarDownwardRadiation_RW_dwd_Mean_30min     0
SolarDownwardRadiation_RW_dwd_Mean_1h        0
SolarDownwardRadiation_dwd_Mean_Lag_30min    0
                                            ..
RelativeHumidity_avg                         0
WindSpeed:100_dwd_lag1                       0
WindSpeed:100_dwd_lag2                       0
WindSpeed:100_dwd_lag3                       0
UsableWindPower_opt                          0
Length: 74, dtype: int64

In [70]:
df_together.timestamp_utc = pd.to_datetime(df_together.timestamp_utc)
df_day_ahead.timestamp_utc = pd.to_datetime(df_day_ahead.timestamp_utc)
df_day_ahead.timestamp_utc = df_day_ahead.timestamp_utc.dt.tz_localize(None)
df_imbalance.timestamp_utc = pd.to_datetime(df_imbalance.timestamp_utc)
df_imbalance.timestamp_utc = df_imbalance.timestamp_utc.dt.tz_localize(None)
market_index.timestamp_utc = pd.to_datetime(market_index.timestamp_utc)
market_index.timestamp_utc = market_index.timestamp_utc.dt.tz_localize(None)

In [67]:
df_day_ahead.dtypes

timestamp_utc        datetime64[ns, UTC]
settlement_date                   object
settlement_period                  int64
price                            float64
dtype: object

In [76]:
demand_data_2020 = pd.read_csv('demanddata_2020.csv')
demand_data_2021 = pd.read_csv('demanddata_2021.csv')
demand_data_2022 = pd.read_csv('demanddata_2022.csv')
demand_data_2023 = pd.read_csv('demanddata.csv')

In [77]:
demand_data_overall = pd.concat([demand_data_2020, demand_data_2021, demand_data_2022, demand_data_2023])
demand_data_overall

Unnamed: 0,SETTLEMENT_DATE,SETTLEMENT_PERIOD,ND,TSD,ENGLAND_WALES_DEMAND,EMBEDDED_WIND_GENERATION,EMBEDDED_WIND_CAPACITY,EMBEDDED_SOLAR_GENERATION,EMBEDDED_SOLAR_CAPACITY,NON_BM_STOR,...,IFA_FLOW,IFA2_FLOW,BRITNED_FLOW,MOYLE_FLOW,EAST_WEST_FLOW,NEMO_FLOW,NSL_FLOW,ELECLINK_FLOW,SCOTTISH_TRANSFER,VIKING_FLOW
0,01-JAN-2020,1,26340,27153,23821,1244,6465,0,13080,0,...,1703,0,852,-151,-47,854,0,0,,
1,01-JAN-2020,2,26921,27684,24393,1188,6465,0,13080,0,...,1703,0,853,-146,0,854,0,0,,
2,01-JAN-2020,3,26569,27240,24085,1156,6465,0,13080,0,...,1703,0,852,-53,0,854,0,0,,
3,01-JAN-2020,4,25754,26435,23350,1125,6465,0,13080,0,...,1703,0,852,-66,0,854,0,0,,
4,01-JAN-2020,5,25075,25824,22788,1106,6465,0,13080,0,...,1704,0,853,-74,-60,854,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17515,2023-12-31,44,25269,25921,22546,2707,6488,0,15905,0,...,1879,-4,907,23,0,999,1334,997,-535.0,601.0
17516,2023-12-31,45,24254,24977,21608,2734,6488,0,15905,0,...,1154,-4,1002,111,0,999,960,997,-336.0,601.0
17517,2023-12-31,46,23455,24134,20889,2761,6488,0,15905,0,...,1085,-5,1003,130,0,999,884,998,-256.0,599.0
17518,2023-12-31,47,22533,23714,20081,2767,6488,0,15905,0,...,475,-4,801,71,-63,999,512,998,-30.0,512.0


In [79]:
demand_data_overall.SETTLEMENT_DATE = pd.to_datetime(demand_data_overall.SETTLEMENT_DATE)


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



In [83]:
demand_data_overall= demand_data_overall[["SETTLEMENT_DATE", "SETTLEMENT_PERIOD", "ND","TSD"]]

In [85]:
demand_data_overall.dtypes

SETTLEMENT_DATE      datetime64[ns]
SETTLEMENT_PERIOD             int64
ND                            int64
TSD                           int64
dtype: object

In [86]:
df_together3.dtypes

timestamp_utc                                datetime64[ns]
Mean_SolarRadiation_dwd                             float64
SolarDownwardRadiation_RW_dwd_Mean_30min            float64
SolarDownwardRadiation_RW_dwd_Mean_1h               float64
SolarDownwardRadiation_dwd_Mean_Lag_30min           float64
                                                  ...      
settlement_period                                     int64
data_provider                                        object
price_y                                             float64
volume                                              float64
date                                                 object
Length: 86, dtype: object

In [89]:
df_together1 = pd.merge(df_together, df_day_ahead, how='inner', left_on='timestamp_utc', right_on='timestamp_utc')
df_together2 = pd.merge(df_together1, df_imbalance, how='inner', left_on='timestamp_utc', right_on='timestamp_utc')
df_together3 = pd.merge(df_together2, market_index, how='inner', left_on='timestamp_utc', right_on='timestamp_utc')
df_together3["date"] = df_together3["timestamp_utc"].dt.date
df_together3["date"] = pd.to_datetime(df_together3["date"])

df_together4 = pd.merge(df_together3, demand_data_overall, how='inner', left_on=['date','settlement_period'], right_on=['SETTLEMENT_DATE','SETTLEMENT_PERIOD'])

In [90]:
df_together4.to_csv("bidding_training.csv", index=False)