In [1]:
import os

import numpy as np
import pandas as pd
import xarray as xr

## Load & Concat Energy data

In [2]:
# Raw energy records are delivered as two CSV exports covering consecutive date ranges.
old_energy_csv = "HEFTcom24/data/energy_data/Energy_Data_20200920_20240118.csv"
new_energy_csv = "HEFTcom24/data/energy_data/energy_data_20240119_20240519.csv"

energy_data_20200920_20240118 = pd.read_csv(old_energy_csv)
energy_data_20240119_20240519 = pd.read_csv(new_energy_csv)

In [3]:
def _with_credit_columns(raw_energy):
    """Return a copy of ``raw_energy`` with a parsed ``dtm`` and the two credit columns added.

    ``Wind_MWh_credit`` is ``0.5 * Wind_MW - boa_MWh`` and ``Solar_MWh_credit`` is
    ``0.5 * Solar_MW``. The input frame is left untouched.
    """
    frame = raw_energy.copy()
    frame["dtm"] = pd.to_datetime(frame["dtm"])
    # NOTE(review): the 0.5 factor presumably turns MW over a 30-minute period into MWh — confirm.
    frame["Wind_MWh_credit"] = 0.5 * frame["Wind_MW"] - frame["boa_MWh"]
    frame["Solar_MWh_credit"] = 0.5 * frame["Solar_MW"]
    return frame


energy_data_old = _with_credit_columns(energy_data_20200920_20240118)
energy_data_new = _with_credit_columns(energy_data_20240119_20240519)

# Stack both periods into one frame with a fresh 0..n-1 index.
energy_data_merged = pd.concat([energy_data_old, energy_data_new], ignore_index=True)

In [4]:
# Work on an independent (deep) copy so the merged frame stays available unmodified.
energy_data = energy_data_merged.copy(deep=True)

In [5]:
# Preview the first five rows to check the merged columns.
energy_data.head(5)

Unnamed: 0,dtm,MIP,Solar_MW,Solar_capacity_mwp,Solar_installedcapacity_mwp,Wind_MW,SS_Price,boa_MWh,DA_Price,Wind_MWh_credit,Solar_MWh_credit
0,2020-09-20 00:00:00+00:00,20.06,0.0,2130.537493,2228.208777,996.284,2.5,0.0,32.17,498.142,0.0
1,2020-09-20 00:30:00+00:00,19.77,0.0,2130.537493,2228.208777,957.576,15.0,0.0,32.17,478.788,0.0
2,2020-09-20 01:00:00+00:00,28.68,0.0,2130.537493,2228.208777,941.044,47.95,0.0,32.0,470.522,0.0
3,2020-09-20 01:30:00+00:00,28.97,0.0,2130.537493,2228.208777,964.366,29.13,0.0,32.0,482.183,0.0
4,2020-09-20 02:00:00+00:00,28.19,0.0,2130.537493,2228.208777,918.432,28.95,0.0,31.99,459.216,0.0


### Remove NaN rows

In [6]:
# Print the number of missing values per column, one count per line, in this order.
energy_nan_check_cols = [
    "MIP",
    "Solar_MW",
    "Wind_MW",
    "boa_MWh",
    "Wind_MWh_credit",
    "Solar_MWh_credit",
    "Solar_capacity_mwp",
    "Solar_installedcapacity_mwp",
    "SS_Price",
    "DA_Price",
]
for column_name in energy_nan_check_cols:
    print(energy_data[column_name].isna().sum())

425
75
103
103
103
75
0
0
0
0


In [7]:
# Discard every row that has a missing value in any of the columns used downstream.
required_energy_cols = [
    "MIP",
    "Solar_MW",
    "Wind_MW",
    "boa_MWh",
    "Wind_MWh_credit",
    "Solar_MWh_credit",
    "Solar_capacity_mwp",
    "Solar_installedcapacity_mwp",
    "SS_Price",
    "DA_Price",
]
energy_data = energy_data.dropna(subset=required_energy_cols)

## Load weather forecasts

In [8]:
def process_and_concat_files(directory):
    """
    Processes and concatenates all netCDF files in a given directory.

    Every ``*.nc`` file directly inside ``directory`` is opened with xarray,
    flattened to a DataFrame and normalised so that:

    * the time columns are named ``reference_time`` and ``valid_time``
      (``ref_datetime`` / ``valid_datetime`` are renamed when both are present);
    * ``valid_time`` is the absolute time ``reference_time + valid_time`` where the
      stored ``valid_time`` is interpreted as an offset in hours;
    * both columns are returned as strings formatted ``"%Y-%m-%d %H:%M:%S"``.

    Files are processed in sorted name order so the row order of the result is
    reproducible across runs and machines.

    Parameters
    ----------
    directory : str
        Path of the folder containing the netCDF files.

    Returns
    -------
    pandas.DataFrame
        All files stacked row-wise with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the directory contains no ``.nc`` file, or a file has neither naming
        variant of the two time columns.
    """
    time_format = "%Y-%m-%d %H:%M:%S"

    nc_files = sorted(name for name in os.listdir(directory) if name.endswith(".nc"))
    if not nc_files:
        # Fail with an actionable message instead of pandas' "No objects to concatenate".
        raise ValueError(f"No netCDF (.nc) files found in '{directory}'.")

    dfs = []
    for file in nc_files:
        file_path = os.path.join(directory, file)
        # The context manager releases the file handle as soon as the data has been
        # materialised into a DataFrame.
        with xr.open_dataset(file_path) as ds:
            df = ds.to_dataframe().reset_index()

        if "ref_datetime" in df.columns and "valid_datetime" in df.columns:
            df = df.rename(columns={"ref_datetime": "reference_time", "valid_datetime": "valid_time"})
        elif "reference_time" not in df.columns or "valid_time" not in df.columns:
            raise ValueError("Neither 'ref_datetime' and 'valid_datetime' nor 'reference_time' and 'valid_time' found in the dataset.")

        reference_time = df["reference_time"].dt.tz_localize("UTC")
        # The stored valid_time is a lead time; convert it to an absolute timestamp.
        valid_time = reference_time + pd.to_timedelta(df["valid_time"], unit="hours")

        # Formatting once per file is sufficient: re-parsing and re-formatting the
        # concatenated strings with the same format would reproduce them unchanged.
        df["reference_time"] = reference_time.dt.strftime(time_format)
        df["valid_time"] = valid_time.dt.strftime(time_format)
        dfs.append(df)

    return pd.concat(dfs, ignore_index=True)

### Create forecast df

In [9]:
base_dir = "HEFTcom24/data"

# One sub-folder per (weather provider, site); the frames are loaded in the listed order.
forecast_folders = ("dwd_pes10", "dwd_hornsea_1", "ncep_pes10", "ncep_hornsea_1")

dwd_pes10_df, dwd_hornsea_1_df, ncep_pes10_df, ncep_hornsea_1_df = (
    process_and_concat_files(os.path.join(base_dir, folder_name))
    for folder_name in forecast_folders
)

### Hornsea 1: Remove NaN rows, aggregate features (mean over all stations) and combine forecasts in one df

In [32]:
# Preview the first five rows of the DWD Hornsea 1 forecast.
dwd_hornsea_1_df.head(5)

Unnamed: 0,reference_time,valid_time,latitude,longitude,RelativeHumidity,Temperature,WindDirection,WindDirection:100,WindSpeed,WindSpeed:100
0,2020-09-20 00:00:00,2020-09-20 00:00:00,53.77,1.702,86.991142,15.276886,59.552521,60.032074,10.064903,11.884751
1,2020-09-20 00:00:00,2020-09-20 00:00:00,53.77,1.767,87.532158,15.290558,59.626404,60.064697,10.121807,11.880851
2,2020-09-20 00:00:00,2020-09-20 00:00:00,53.77,1.832,87.661064,15.313019,59.638,60.069794,10.165877,11.868978
3,2020-09-20 00:00:00,2020-09-20 00:00:00,53.77,1.897,87.342705,15.343292,59.621643,59.918152,10.187956,11.820547
4,2020-09-20 00:00:00,2020-09-20 00:00:00,53.77,1.962,87.473564,15.373566,59.688599,60.0896,10.216055,11.864362


In [33]:
# Missing-value count per weather feature: all DWD columns first, then all NCEP columns.
hornsea_nan_check_cols = [
    "RelativeHumidity",
    "Temperature",
    "WindDirection",
    "WindDirection:100",
    "WindSpeed",
    "WindSpeed:100",
]
for provider_df in (dwd_hornsea_1_df, ncep_hornsea_1_df):
    for column_name in hornsea_nan_check_cols:
        print(provider_df[column_name].isna().sum())

13356
13392
13356
12204
13356
12204
3456
3627
5733
5733
5733
5733


In [51]:
# Remove, in place, every forecast row with a missing value in any weather feature.
hornsea_required_cols = [
    "RelativeHumidity",
    "Temperature",
    "WindDirection",
    "WindDirection:100",
    "WindSpeed",
    "WindSpeed:100",
]
for provider_df in (dwd_hornsea_1_df, ncep_hornsea_1_df):
    provider_df.dropna(subset=hornsea_required_cols, inplace=True)

In [52]:
def _aggregate_hornsea_forecast(forecast_df):
    """Average one provider's Hornsea 1 forecast over all rows sharing a forecast run and valid time.

    Rows are grouped by (``reference_time``, ``valid_time``) and each feature is
    averaged. Wind speeds are additionally cubed before averaging and the cube
    root of that mean is stored, giving a cubic-mean wind speed next to the plain
    arithmetic mean. ``forecast_df`` itself is not modified.

    Returns a frame with the two key columns followed by the aggregated features.
    """
    feature_cols = [
        "RelativeHumidity",
        "Temperature",
        "WindDirection",
        "WindDirection:100",
        "WindSpeed^3",
        "WindSpeed:100^3",
        "WindSpeed",
        "WindSpeed:100",
    ]

    # assign() builds the cubed columns on a new frame, so re-running this cell
    # never depends on (or alters) the state of the loaded forecast frame.
    with_cubes = forecast_df.assign(
        **{
            "WindSpeed^3": forecast_df["WindSpeed"] ** 3,
            "WindSpeed:100^3": forecast_df["WindSpeed:100"] ** 3,
        }
    )

    # NOTE(review): WindDirection is averaged arithmetically; if it is an angle in
    # degrees, values on either side of 0/360 would average incorrectly — confirm.
    mean_df = with_cubes.groupby(["reference_time", "valid_time"])[feature_cols].mean().reset_index()

    mean_df["WindSpeed^3"] = np.cbrt(mean_df["WindSpeed^3"])
    # NOTE(review): the 100 m cube root is written to a NEW column "WindSpeed^3:100",
    # while "WindSpeed:100^3" keeps the un-rooted mean of cubes. This asymmetry with
    # "WindSpeed^3" looks unintended, but the column names are part of the exported
    # CSV schema, so they are preserved here — confirm before renaming.
    mean_df["WindSpeed^3:100"] = np.cbrt(mean_df["WindSpeed:100^3"])
    return mean_df


dwd_h1df_mean = _aggregate_hornsea_forecast(dwd_hornsea_1_df)
ncep_h1df_mean = _aggregate_hornsea_forecast(ncep_hornsea_1_df)

In [53]:
# Outer-join both providers on the forecast run and valid time; feature columns that
# exist in both frames receive the "_dwd" / "_ncep" suffix.
merged_h1_df = pd.merge(
    dwd_h1df_mean,
    ncep_h1df_mean,
    how="outer",
    on=["reference_time", "valid_time"],
    suffixes=("_dwd", "_ncep"),
)

# The key columns arrive as formatted strings; parse them back into timestamps.
for time_col in ("reference_time", "valid_time"):
    merged_h1_df[time_col] = pd.to_datetime(merged_h1_df[time_col], format="%Y-%m-%d %H:%M:%S")

### Hornsea 1: Merge with energy data

In [54]:
# Within each forecast run, put valid_time on a 30-minute grid and fill the newly
# created rows by linear interpolation.
h1_runs = merged_h1_df.set_index("valid_time").groupby("reference_time")
merged_h1_df = h1_runs.resample("30min").interpolate("linear")

In [55]:
# After the grouped resample, reference_time lives in the index. Some pandas versions
# also keep it as a regular column, which must be removed before reset_index() can
# move the index level back; errors="ignore" keeps this working on versions where
# the grouping column is no longer part of the result.
merged_h1_df = merged_h1_df.drop(columns=["reference_time"], errors="ignore").reset_index()

In [56]:
# Make valid_time timezone-aware (UTC) before it is joined to the energy data on "dtm".
h1_valid_time_utc = pd.to_datetime(merged_h1_df["valid_time"], format="%Y-%m-%d %H:%M:%S", utc=True)
merged_h1_df["valid_time"] = h1_valid_time_utc

In [57]:
# Keep only the timestamps present in both the forecasts and the energy records.
merged_h1_energy_data = pd.merge(
    merged_h1_df,
    energy_data,
    how="inner",
    left_on="valid_time",
    right_on="dtm",
)

In [58]:
# Keep a single row per valid_time: tail(1) takes the last row of each group in the
# frame's current row order.
# NOTE(review): that row is the most recent forecast run only if the rows are still
# ordered by reference_time at this point — confirm, or sort explicitly beforehand.
# reset_index() without drop=True stores the previous row labels in a new "index" column.
merged_h1_energy_data = merged_h1_energy_data.groupby("valid_time").tail(1).reset_index()
# Re-parse valid_time as a UTC timestamp.
merged_h1_energy_data["valid_time"] = pd.to_datetime(merged_h1_energy_data["valid_time"], format="%Y-%m-%d %H:%M:%S", utc= True)

In [60]:
# Drop the leftover "index" column and the solar-only columns from the wind data set.
wind_unused_cols = [
    "index",
    "Solar_MW",
    "Solar_MWh_credit",
    "Solar_capacity_mwp",
    "Solar_installedcapacity_mwp",
]
merged_h1_energy_data = merged_h1_energy_data.drop(columns=wind_unused_cols)

In [61]:
# Persist the wind training table without the row index.
wind_csv_path = "HEFTcom24/data/wind2.csv"
merged_h1_energy_data.to_csv(wind_csv_path, index=False)

### Pes10: Remove NaN rows, pivot features to one column per point and combine forecasts in one df

In [10]:
# Preview the first five rows of the DWD Pes10 forecast.
dwd_pes10_df.head(5)

Unnamed: 0,reference_time,valid_time,point,CloudCover,SolarDownwardRadiation,Temperature,latitude,longitude
0,2024-01-08 00:00:00,2024-01-08 00:00:00,0,0.409668,0.0,0.31604,52.487256,0.401245
1,2024-01-08 00:00:00,2024-01-08 00:00:00,1,0.508945,0.0,1.136353,52.877668,0.790653
2,2024-01-08 00:00:00,2024-01-08 00:00:00,2,0.546328,0.0,0.637329,52.135428,-0.264034
3,2024-01-08 00:00:00,2024-01-08 00:00:00,3,0.30252,0.0,0.057251,52.48805,-0.126705
4,2024-01-08 00:00:00,2024-01-08 00:00:00,4,0.621309,0.0,1.15979,51.95637,0.658817


In [11]:
# Missing-value count per solar feature: all DWD columns first, then all NCEP columns.
pes10_nan_check_cols = ["SolarDownwardRadiation", "Temperature", "CloudCover"]
for provider_df in (dwd_pes10_df, ncep_pes10_df):
    for column_name in pes10_nan_check_cols:
        print(provider_df[column_name].isna().sum())

126920
7440
7440
104180
10173
15313


In [12]:
# Keep only forecast rows where all three solar features are present.
pes10_required_cols = ["Temperature", "SolarDownwardRadiation", "CloudCover"]

dwd_pes10_df = dwd_pes10_df.dropna(subset=pes10_required_cols)
ncep_pes10_df = ncep_pes10_df.dropna(subset=pes10_required_cols)

In [13]:
def _pivot_points_to_columns(long_df):
    """Reshape a long Pes10 forecast so every (feature, point) pair becomes its own column.

    Rows are keyed by (``valid_time``, ``reference_time``); the resulting columns are
    named ``<feature>_Point<point>``. Returns the pivoted frame with the two keys
    restored as regular columns.
    """
    wide_df = long_df.pivot_table(
        index=["valid_time", "reference_time"],
        columns="point",
        values=["Temperature", "CloudCover", "SolarDownwardRadiation"],
    )
    # Flatten the two-level (feature, point) header into single strings.
    wide_df.columns = [
        "_Point".join(str(level) for level in column_key).strip()
        for column_key in wide_df.columns.values
    ]
    return wide_df.reset_index()


pivot_dwd_pes10_df = _pivot_points_to_columns(dwd_pes10_df)
pivot_ncep_pes10_df = _pivot_points_to_columns(ncep_pes10_df)


In [14]:
# Parse the string-formatted key columns of both pivoted frames into timestamps.
for pivoted_df in (pivot_dwd_pes10_df, pivot_ncep_pes10_df):
    for time_col in ("reference_time", "valid_time"):
        pivoted_df[time_col] = pd.to_datetime(pivoted_df[time_col], format="%Y-%m-%d %H:%M:%S")

In [21]:
# Outer-join both providers on the forecast run and valid time; columns that exist in
# both frames receive the "_dwd" / "_ncep" suffix.
merged_pes10_df = pd.merge(
    pivot_dwd_pes10_df,
    pivot_ncep_pes10_df,
    how="outer",
    on=["reference_time", "valid_time"],
    suffixes=("_dwd", "_ncep"),
)

### Pes10: Merge with Energy data

In [22]:
# Within each forecast run, put valid_time on a 30-minute grid and fill the newly
# created rows by linear interpolation.
merged_pes10_df = (
    merged_pes10_df
    .set_index("valid_time")
    .groupby("reference_time")
    .resample("30min")
    .interpolate("linear")
)

# After the grouped resample, reference_time lives in the index. Some pandas versions
# also keep it as a regular column, which must be removed before reset_index() can
# move the index level back; errors="ignore" keeps this working on versions where
# the grouping column is no longer part of the result.
merged_pes10_df = merged_pes10_df.drop(columns=["reference_time"], errors="ignore").reset_index()

# Make valid_time timezone-aware (UTC) before it is joined to the energy data on "dtm".
merged_pes10_df["valid_time"] = pd.to_datetime(merged_pes10_df["valid_time"], format="%Y-%m-%d %H:%M:%S", utc=True)

In [36]:
# Keep only the timestamps present in both the forecasts and the energy records.
solar = pd.merge(
    merged_pes10_df,
    energy_data,
    how="inner",
    left_on="valid_time",
    right_on="dtm",
)

In [37]:
# Ensure both time columns are timezone-aware UTC timestamps.
for time_col in ("reference_time", "valid_time"):
    solar[time_col] = pd.to_datetime(solar[time_col], utc=True)

In [39]:
# Drop the join key, the price columns and the wind-only columns from the solar data set.
solar_unused_cols = [
    "dtm",
    "MIP",
    "Solar_MW",
    "SS_Price",
    "DA_Price",
    "Wind_MW",
    "Wind_MWh_credit",
    "boa_MWh",
]
solar = solar.drop(columns=solar_unused_cols)

# "date_diff" is not created by any cell in this notebook (it was presumably added by
# a cell that has since been removed), so dropping it unconditionally raises KeyError
# on a fresh Restart & Run All. Remove it only when it is actually present.
solar = solar.drop(columns=["date_diff"], errors="ignore")

In [41]:
# Persist the solar training table.
# NOTE(review): unlike the wind export, the row index is written as the first
# (unnamed) CSV column here — confirm whether consumers rely on it.
solar_csv_path = "HEFTcom24/data/solar2.csv"
solar.to_csv(solar_csv_path)