In [2]:
import pandas as pd 
import xarray as xr

## Energy data

In [3]:
energy_data_20200920_20240118 = pd.read_csv("HEFTcom24/data/energy_data/Energy_Data_20200920_20240118.csv")
energy_data_20240119_20240519 = pd.read_csv("HEFTcom24/data/energy_data/energy_data_20240119_20240519.csv")

In [4]:
energy_data_old = energy_data_20200920_20240118.copy()
energy_data_new = energy_data_20240119_20240519.copy()

energy_data_old["dtm"] = pd.to_datetime(energy_data_old["dtm"])
energy_data_old["Wind_MWh_credit"] = 0.5 * energy_data_old["Wind_MW"] - energy_data_old["boa_MWh"]
energy_data_old["Solar_MWh_credit"] = 0.5 * energy_data_old["Solar_MW"]

energy_data_new["dtm"] = pd.to_datetime(energy_data_new["dtm"])
energy_data_new["Wind_MWh_credit"] = 0.5 * energy_data_new["Wind_MW"] - energy_data_new["boa_MWh"]
energy_data_new["Solar_MWh_credit"] = 0.5 * energy_data_new["Solar_MW"]

energy_data_merged = pd.concat([energy_data_old, energy_data_new], ignore_index=True)

In [5]:
energy_data = energy_data_merged.copy()

In [6]:
energy_data.head()

Unnamed: 0,dtm,MIP,Solar_MW,Solar_capacity_mwp,Solar_installedcapacity_mwp,Wind_MW,SS_Price,boa_MWh,DA_Price,Wind_MWh_credit,Solar_MWh_credit
0,2020-09-20 00:00:00+00:00,20.06,0.0,2130.537493,2228.208777,996.284,2.5,0.0,32.17,498.142,0.0
1,2020-09-20 00:30:00+00:00,19.77,0.0,2130.537493,2228.208777,957.576,15.0,0.0,32.17,478.788,0.0
2,2020-09-20 01:00:00+00:00,28.68,0.0,2130.537493,2228.208777,941.044,47.95,0.0,32.0,470.522,0.0
3,2020-09-20 01:30:00+00:00,28.97,0.0,2130.537493,2228.208777,964.366,29.13,0.0,32.0,482.183,0.0
4,2020-09-20 02:00:00+00:00,28.19,0.0,2130.537493,2228.208777,918.432,28.95,0.0,31.99,459.216,0.0


In [7]:
print(energy_data["MIP"].isna().sum())
print(energy_data["Solar_MW"].isna().sum())
print(energy_data["Wind_MW"].isna().sum())
print(energy_data["boa_MWh"].isna().sum())
print(energy_data["Wind_MWh_credit"].isna().sum())
print(energy_data["Solar_MWh_credit"].isna().sum())
print(energy_data["Solar_capacity_mwp"].isna().sum())
print(energy_data["Solar_installedcapacity_mwp"].isna().sum())
print(energy_data["SS_Price"].isna().sum())
print(energy_data["DA_Price"].isna().sum())

425
75
103
103
103
75
0
0
0
0


In [8]:
energy_data = energy_data.dropna(subset= ["MIP", "Solar_MW", "Wind_MW", "boa_MWh", "Wind_MWh_credit", "Solar_MWh_credit", "Solar_capacity_mwp", "Solar_installedcapacity_mwp", "SS_Price", "DA_Price"])

## Weather forecasts

In [9]:
import os
import pandas as pd
import xarray as xr

def process_and_concat_files(directory):
    dfs = []

    for file in os.listdir(directory):
        if file.endswith(".nc"):
            file_path = os.path.join(directory, file)
            ds = xr.open_dataset(file_path)
            df = ds.to_dataframe().reset_index()
            
            if "ref_datetime" in df.columns and "valid_datetime" in df.columns:
                df.rename(columns={"ref_datetime": "reference_time", "valid_datetime": "valid_time"}, inplace=True)
            elif "reference_time" not in df.columns or "valid_time" not in df.columns:
                raise ValueError("Neither 'ref_datetime' and 'valid_datetime' nor 'reference_time' and 'valid_time' found in the dataset.")
            
            df["reference_time"] = df["reference_time"].dt.tz_localize("UTC")
            df["valid_time"] = df["reference_time"] + pd.to_timedelta(df["valid_time"], unit="hours")
            df["reference_time"] = df["reference_time"].dt.strftime("%Y-%m-%d %H:%M:%S")
            df["valid_time"] = df["valid_time"].dt.strftime("%Y-%m-%d %H:%M:%S")
            dfs.append(df)
    
    concatenated_df = pd.concat(dfs, ignore_index=True)
    concatenated_df["reference_time"] = pd.to_datetime(concatenated_df["reference_time"])
    concatenated_df["valid_time"] = pd.to_datetime(concatenated_df["valid_time"])
    concatenated_df["reference_time"] = concatenated_df["reference_time"].dt.strftime("%Y-%m-%d %H:%M:%S")
    concatenated_df["valid_time"] = concatenated_df["valid_time"].dt.strftime("%Y-%m-%d %H:%M:%S")
    
    return concatenated_df

In [10]:
base_dir = "HEFTcom24/data"

dwd_demand_df = process_and_concat_files(os.path.join(base_dir, "dwd_demand"))
dwd_pes10_df = process_and_concat_files(os.path.join(base_dir, "dwd_pes10"))
dwd_hornsea_1_df = process_and_concat_files(os.path.join(base_dir, "dwd_hornsea_1"))

ncep_demand_df = process_and_concat_files(os.path.join(base_dir, "ncep_demand"))
ncep_pes10_df = process_and_concat_files(os.path.join(base_dir, "ncep_pes10"))
ncep_hornsea_1_df = process_and_concat_files(os.path.join(base_dir, "ncep_hornsea_1"))

## Mean Percentage Diff

In [11]:
dwd_test = dwd_hornsea_1_df.groupby(["reference_time", "valid_time", "longitude", "latitude"]).mean().reset_index()

In [12]:
dwd_test1 = dwd_test.copy()

In [13]:
for i in range(1, 36):
    dwd_test1[f"perc_diff_{i}"] = (dwd_test1["WindSpeed:100"] - dwd_test1["WindSpeed:100"].shift(-i))# / dwd_test1["WindSpeed:100"] * 100

In [14]:
dwd_test1[dwd_test1["perc_diff_2"] > 6] 

Unnamed: 0,reference_time,valid_time,longitude,latitude,RelativeHumidity,Temperature,WindDirection,WindDirection:100,WindSpeed,WindSpeed:100,...,perc_diff_26,perc_diff_27,perc_diff_28,perc_diff_29,perc_diff_30,perc_diff_31,perc_diff_32,perc_diff_33,perc_diff_34,perc_diff_35
3130,2020-09-20 00:00:00,2020-09-24 06:00:00,2.027,54.03,78.325760,12.230774,249.352356,249.625259,14.446177,16.669464,...,8.454851,7.892223,7.569984,7.915836,7.915836,7.090984,8.352722,7.939036,7.541823,7.662777
3131,2020-09-20 00:00:00,2020-09-24 06:00:00,2.027,54.10,73.458572,12.554016,255.263550,255.727737,14.931273,17.539986,...,8.762745,8.440506,8.786358,8.786358,7.961506,9.223244,8.809558,8.412345,8.533298,8.533298
9331,2020-09-20 12:00:00,2020-09-23 13:00:00,1.767,53.84,94.978195,15.205566,209.058807,211.456253,6.136590,8.575730,...,2.217823,2.217823,5.753573,3.099650,5.806690,6.348791,4.942386,4.942386,5.131303,2.162550
9421,2020-09-20 12:00:00,2020-09-23 15:00:00,1.962,53.84,94.489174,15.007751,225.962173,225.839127,6.280030,7.684906,...,4.476149,4.476149,4.042365,3.770056,5.173798,6.085272,5.039148,5.039148,3.810954,2.627956
12425,2020-09-20 18:00:00,2020-09-23 12:00:00,1.702,54.10,92.196167,13.860901,21.759888,22.890015,7.447649,8.157602,...,2.516127,5.046050,4.683619,4.683619,2.936748,7.211439,5.480062,2.347166,-1.161321,-1.161321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17640610,2024-05-15 12:00:00,2024-05-20 12:00:00,2.027,54.03,77.687225,10.420715,20.580292,21.015106,7.731364,8.948258,...,8.373289,8.031327,7.739954,7.368941,7.368941,7.285115,8.171923,7.853416,7.590491,7.266784
17640611,2024-05-15 12:00:00,2024-05-20 12:00:00,2.027,54.10,74.380585,10.451965,20.786011,20.982086,8.161761,9.196252,...,8.279321,7.987947,7.616935,7.616935,7.533108,8.419916,8.101409,7.838485,7.514778,7.514778
17674090,2024-05-18 00:00:00,2024-05-23 00:00:00,2.027,54.03,96.540459,12.282776,306.477051,312.259918,9.496290,13.164660,...,7.687050,7.385567,7.127760,6.354115,6.354115,6.063227,7.658544,7.366518,7.017605,6.242725
17674091,2024-05-18 00:00:00,2024-05-23 00:00:00,2.027,54.10,96.270927,12.409729,309.585541,314.219299,10.293956,13.601790,...,7.822697,7.564890,6.791245,6.791245,6.500357,8.095674,7.803648,7.454735,6.679855,6.679855


## HORNSEA_1

In [15]:
dwd_hornsea_1_df.head()

Unnamed: 0,reference_time,valid_time,latitude,longitude,RelativeHumidity,Temperature,WindDirection,WindDirection:100,WindSpeed,WindSpeed:100
0,2024-01-08 00:00:00,2024-01-08 00:00:00,53.77,1.702,59.105434,4.916626,54.52597,54.250305,6.547671,6.854635
1,2024-01-08 00:00:00,2024-01-08 00:00:00,53.77,1.767,58.628872,4.860962,54.28894,54.106964,6.524992,6.817597
2,2024-01-08 00:00:00,2024-01-08 00:00:00,53.77,1.832,58.3652,4.823853,53.520721,53.44812,6.493145,6.795032
3,2024-01-08 00:00:00,2024-01-08 00:00:00,53.77,1.897,58.441372,4.814087,52.447235,52.500397,6.495604,6.822487
4,2024-01-08 00:00:00,2024-01-08 00:00:00,53.77,1.962,58.501919,4.804321,51.579865,51.684021,6.521748,6.85629


In [16]:
print(dwd_hornsea_1_df["RelativeHumidity"].isna().sum())
print(dwd_hornsea_1_df["Temperature"].isna().sum())
print(dwd_hornsea_1_df["WindDirection"].isna().sum())
print(dwd_hornsea_1_df["WindDirection:100"].isna().sum())
print(dwd_hornsea_1_df["WindSpeed"].isna().sum())
print(dwd_hornsea_1_df["WindSpeed:100"].isna().sum())

print(ncep_hornsea_1_df["RelativeHumidity"].isna().sum())
print(ncep_hornsea_1_df["Temperature"].isna().sum())
print(ncep_hornsea_1_df["WindDirection"].isna().sum())
print(ncep_hornsea_1_df["WindDirection:100"].isna().sum())
print(ncep_hornsea_1_df["WindSpeed"].isna().sum())
print(ncep_hornsea_1_df["WindSpeed:100"].isna().sum())

13356
13392
13356
12204
13356
12204
3456
3627
5733
5733
5733
5733


In [17]:
dwd_hornsea_1_df.dropna(subset=["RelativeHumidity", "Temperature", "WindDirection", "WindDirection:100", "WindSpeed", "WindSpeed:100"], inplace=True)
ncep_hornsea_1_df.dropna(subset=["RelativeHumidity", "Temperature", "WindDirection", "WindDirection:100", "WindSpeed", "WindSpeed:100"], inplace=True)

In [18]:
dwd_h1df_mean = dwd_hornsea_1_df.groupby(["reference_time", "valid_time"])[["RelativeHumidity", "Temperature", "WindDirection", "WindDirection:100", "WindSpeed", "WindSpeed:100"]].mean().reset_index()
ncep_h1df_mean = ncep_hornsea_1_df.groupby(["reference_time", "valid_time"])[["RelativeHumidity", "Temperature", "WindDirection", "WindDirection:100", "WindSpeed", "WindSpeed:100"]].mean().reset_index()

In [19]:
merged_h1_df = dwd_h1df_mean.merge(ncep_h1df_mean, on=["reference_time", "valid_time"], suffixes=("_dwd", "_ncep"), how= "outer")
# merged_h1_df["RelativeHumidity"] = merged_h1_df[["RelativeHumidity_dwd", "RelativeHumidity_ncep"]].mean(axis=1)
# merged_h1_df["Temperature"] = merged_h1_df[["Temperature_dwd", "Temperature_ncep"]].mean(axis=1)
# merged_h1_df["WindDirection"] = merged_h1_df[["WindDirection_dwd", "WindDirection_ncep"]].mean(axis=1)
# merged_h1_df["WindDirection:100"] = merged_h1_df[["WindDirection:100_dwd", "WindDirection:100_ncep"]].mean(axis=1)
# merged_h1_df["WindSpeed"] = merged_h1_df[["WindSpeed_dwd", "WindSpeed_ncep"]].mean(axis=1)
# merged_h1_df["WindSpeed:100"] = merged_h1_df[["WindSpeed:100_dwd", "WindSpeed:100_ncep"]].mean(axis=1)
merged_h1_df["reference_time"] = pd.to_datetime(merged_h1_df["reference_time"], format="%Y-%m-%d %H:%M:%S")
merged_h1_df["valid_time"] = pd.to_datetime(merged_h1_df["valid_time"], format="%Y-%m-%d %H:%M:%S")
#merged_h1_df = merged_h1_df[["reference_time", "valid_time", "RelativeHumidity", "Temperature", "WindDirection", "WindDirection:100", "WindSpeed", "WindSpeed:100"]]

In [20]:
merged_h1_df

Unnamed: 0,reference_time,valid_time,RelativeHumidity_dwd,Temperature_dwd,WindDirection_dwd,WindDirection:100_dwd,WindSpeed_dwd,WindSpeed:100_dwd,RelativeHumidity_ncep,Temperature_ncep,WindDirection_ncep,WindDirection:100_ncep,WindSpeed_ncep,WindSpeed:100_ncep
0,2020-09-20,2020-09-20 00:00:00,85.213745,15.416670,61.588081,62.085178,10.043627,11.802604,84.066673,15.450012,56.139633,58.721077,9.116803,11.338992
1,2020-09-20,2020-09-20 01:00:00,84.810768,15.408349,60.819256,61.368774,9.767447,11.495033,84.800003,15.344140,56.170006,58.268215,9.513059,11.693331
2,2020-09-20,2020-09-20 02:00:00,83.904999,15.494086,60.202801,60.853306,9.494630,11.213223,85.066666,15.267812,59.059029,60.833061,9.662121,11.740044
3,2020-09-20,2020-09-20 03:00:00,82.977676,15.459883,56.277557,57.054367,9.455199,11.135883,85.177780,15.199863,59.058838,60.966099,9.540911,11.492398
4,2020-09-20,2020-09-20 04:00:00,82.238251,15.506005,54.625362,55.247120,9.515403,11.254492,85.588890,15.135688,58.351414,60.182644,9.456993,11.297719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1113641,2024-05-19,2024-06-03 12:00:00,,,,,,,89.977776,13.626984,75.030380,83.595985,3.930665,5.688190
1113642,2024-05-19,2024-06-03 15:00:00,,,,,,,88.566666,14.016028,66.449463,72.859047,5.794817,8.217826
1113643,2024-05-19,2024-06-03 18:00:00,,,,,,,89.833336,13.756535,66.883980,73.883461,7.639992,11.537951
1113644,2024-05-19,2024-06-03 21:00:00,,,,,,,94.166664,13.350006,92.814087,95.731903,7.890777,11.280699


## PES10

In [21]:
dwd_pes10_df.head()

Unnamed: 0,reference_time,valid_time,point,CloudCover,SolarDownwardRadiation,Temperature,latitude,longitude
0,2024-01-08 00:00:00,2024-01-08 00:00:00,0,0.409668,0.0,0.31604,52.487256,0.401245
1,2024-01-08 00:00:00,2024-01-08 00:00:00,1,0.508945,0.0,1.136353,52.877668,0.790653
2,2024-01-08 00:00:00,2024-01-08 00:00:00,2,0.546328,0.0,0.637329,52.135428,-0.264034
3,2024-01-08 00:00:00,2024-01-08 00:00:00,3,0.30252,0.0,0.057251,52.48805,-0.126705
4,2024-01-08 00:00:00,2024-01-08 00:00:00,4,0.621309,0.0,1.15979,51.95637,0.658817


In [22]:
print(dwd_pes10_df["SolarDownwardRadiation"].isna().sum())
print(dwd_pes10_df["Temperature"].isna().sum())
print(dwd_pes10_df["CloudCover"].isna().sum())

print(ncep_pes10_df["SolarDownwardRadiation"].isna().sum())
print(ncep_pes10_df["Temperature"].isna().sum())
print(ncep_pes10_df["CloudCover"].isna().sum())

126920
7440
7440
104180
10173
15313


In [23]:
dwd_pes10_df = dwd_pes10_df.dropna(subset= ["Temperature", "SolarDownwardRadiation", "CloudCover"])
ncep_pes10_df = ncep_pes10_df.dropna(subset= ["Temperature", "SolarDownwardRadiation", "CloudCover"])

In [24]:
pivot_dwd_pes10_df = dwd_pes10_df.pivot_table(
    index=["valid_time", "reference_time"],
    columns="point",
    values=["Temperature", "CloudCover", "SolarDownwardRadiation"]
)
pivot_dwd_pes10_df.columns = ['_Point'.join(map(str, col)).strip() for col in pivot_dwd_pes10_df.columns.values]
pivot_dwd_pes10_df.reset_index(inplace=True)

pivot_ncep_pes10_df = ncep_pes10_df.pivot_table(
    index=["valid_time", "reference_time"],
    columns="point",
    values=["Temperature", "CloudCover", "SolarDownwardRadiation"]
)
pivot_ncep_pes10_df.columns = ['_Point'.join(map(str, col)).strip() for col in pivot_ncep_pes10_df.columns.values]
pivot_ncep_pes10_df.reset_index(inplace=True)


In [25]:
# pivot_dwd_pes10_df["Mean_CloudCover"] = pivot_dwd_pes10_df.filter(like="CloudCover").mean(axis=1)
# pivot_dwd_pes10_df["Mean_Temperature"] = pivot_dwd_pes10_df.filter(like="Temperature").mean(axis=1)
# pivot_dwd_pes10_df["Mean_SolarDownwardRadiation"] = pivot_dwd_pes10_df.filter(like="SolarDownwardRadiation").mean(axis=1)

# pivot_dwd_pes10_df["Std_CloudCover"] = pivot_dwd_pes10_df.filter(like="CloudCover").std(axis=1)
# pivot_dwd_pes10_df["Std_Temperature"] = pivot_dwd_pes10_df.filter(like="Temperature").std(axis=1)
# pivot_dwd_pes10_df["Std_SolarDownwardRadiation"] = pivot_dwd_pes10_df.filter(like="SolarDownwardRadiation").std(axis=1)

pivot_dwd_pes10_df["reference_time"] = pd.to_datetime(pivot_dwd_pes10_df["reference_time"], format="%Y-%m-%d %H:%M:%S")
pivot_dwd_pes10_df["valid_time"] = pd.to_datetime(pivot_dwd_pes10_df["valid_time"], format="%Y-%m-%d %H:%M:%S")


# pivot_ncep_pes10_df["Mean_CloudCover"] = pivot_ncep_pes10_df.filter(like="CloudCover").mean(axis=1)
# pivot_ncep_pes10_df["Mean_Temperature"] = pivot_ncep_pes10_df.filter(like="Temperature").mean(axis=1)
# pivot_ncep_pes10_df["Mean_SolarDownwardRadiation"] = pivot_ncep_pes10_df.filter(like="SolarDownwardRadiation").mean(axis=1)

# pivot_ncep_pes10_df["Std_CloudCover"] = pivot_ncep_pes10_df.filter(like="CloudCover").std(axis=1)
# pivot_ncep_pes10_df["Std_Temperature"] = pivot_ncep_pes10_df.filter(like="Temperature").std(axis=1)
# pivot_ncep_pes10_df["Std_SolarDownwardRadiation"] = pivot_ncep_pes10_df.filter(like="SolarDownwardRadiation").std(axis=1)

pivot_ncep_pes10_df["reference_time"] = pd.to_datetime(pivot_ncep_pes10_df["reference_time"], format="%Y-%m-%d %H:%M:%S")
pivot_ncep_pes10_df["valid_time"] = pd.to_datetime(pivot_ncep_pes10_df["valid_time"], format="%Y-%m-%d %H:%M:%S")

In [26]:
merged_pes10_df = pivot_dwd_pes10_df.merge(pivot_ncep_pes10_df, on=["reference_time", "valid_time"], suffixes=("_dwd", "_ncep"), how= "outer")

In [27]:
merged_pes10_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1109494 entries, 0 to 1109493
Columns: 122 entries, valid_time to Temperature_Point19_ncep
dtypes: datetime64[ns](2), float32(80), float64(40)
memory usage: 694.1 MB


## Demand

In [28]:
ncep_demand_df.head()

Unnamed: 0,reference_time,valid_time,point,RelativeHumidity,Temperature,TotalPrecipitation,WindDirection,WindSpeed,latitude,longitude
0,2020-09-20 00:00:00,2020-09-20 00:00:00,0,74.300003,15.350006,0.0,38.035126,4.788564,51.479,-0.451
1,2020-09-20 00:00:00,2020-09-20 00:00:00,1,83.800003,12.75,0.0,35.796936,5.266498,51.453,-2.6
2,2020-09-20 00:00:00,2020-09-20 00:00:00,2,88.599998,11.25,0.0,40.649506,4.298924,52.449,-1.926
3,2020-09-20 00:00:00,2020-09-20 00:00:00,3,84.599998,10.350006,0.0,78.319733,1.144141,53.175,-2.986
4,2020-09-20 00:00:00,2020-09-20 00:00:00,4,87.0,9.050018,0.0,65.584869,2.350657,55.86,-4.264


In [29]:
print(ncep_demand_df["RelativeHumidity"].isna().sum())
print(ncep_demand_df["Temperature"].isna().sum())
print(ncep_demand_df["WindDirection"].isna().sum())
print(ncep_demand_df["WindSpeed"].isna().sum())
print(ncep_demand_df["WindDirection"].isna().sum())
print(ncep_demand_df["TotalPrecipitation"].isna().sum())


print(dwd_demand_df["Temperature"].isna().sum())
print(dwd_demand_df["RelativeHumidity"].isna().sum())
print(dwd_demand_df["WindDirection"].isna().sum())
print(dwd_demand_df["WindSpeed"].isna().sum())
print(dwd_demand_df["WindDirection"].isna().sum())
print(dwd_demand_df["TotalPrecipitation"].isna().sum())

4987
3657
5922
5922
5922
1956
2604
2597
2597
2597
2597
76615


In [30]:
ncep_demand_df = ncep_demand_df.dropna(subset= ["Temperature", "RelativeHumidity", "WindDirection", "WindSpeed", "TotalPrecipitation"])
dwd_demand_df = dwd_demand_df.dropna(subset= ["Temperature", "RelativeHumidity", "WindSpeed", "WindDirection", "TotalPrecipitation"])

In [31]:
pivot_ncep_demand_df = ncep_demand_df.pivot_table(
    index=["valid_time", "reference_time"],
    columns="point",
    values=["Temperature", "RelativeHumidity", "WindDirection", "WindSpeed", "TotalPrecipitation"]
)
pivot_ncep_demand_df.columns = ['_Point'.join(map(str, col)).strip() for col in pivot_ncep_demand_df.columns.values]
pivot_ncep_demand_df.reset_index(inplace=True)


pivot_dwd_demand_df = dwd_demand_df.pivot_table(
    index=["valid_time", "reference_time"],
    columns="point",
    values=["Temperature", "RelativeHumidity", "WindDirection", "WindSpeed", "TotalPrecipitation"]
)
pivot_dwd_demand_df.columns = ['_Point'.join(map(str, col)).strip() for col in pivot_dwd_demand_df.columns.values]
pivot_dwd_demand_df.reset_index(inplace=True)

In [32]:
pivot_dwd_demand_df["reference_time"] = pd.to_datetime(pivot_dwd_demand_df["reference_time"], format="%Y-%m-%d %H:%M:%S")
pivot_dwd_demand_df["valid_time"] = pd.to_datetime(pivot_dwd_demand_df["valid_time"], format="%Y-%m-%d %H:%M:%S")

pivot_ncep_demand_df["reference_time"] = pd.to_datetime(pivot_ncep_demand_df["reference_time"], format="%Y-%m-%d %H:%M:%S")
pivot_ncep_demand_df["valid_time"] = pd.to_datetime(pivot_ncep_demand_df["valid_time"], format="%Y-%m-%d %H:%M:%S")

In [33]:
merged_demand_df = pivot_dwd_demand_df.merge(pivot_ncep_demand_df, on=["valid_time", "reference_time"], suffixes=("_dwd", "_ncep"), how="outer")

## Merger

In [34]:
merged_weather_dwd_ncep_forecasts = merged_pes10_df.merge(merged_demand_df, on=["valid_time", "reference_time"], how="outer", suffixes=("_pes10", "_demand"))
merged_weather_dwd_ncep_forecasts = merged_weather_dwd_ncep_forecasts.merge(merged_h1_df, on=["valid_time", "reference_time"], how="outer")

In [40]:
merged_weather_dwd_ncep_forecasts_res = merged_weather_dwd_ncep_forecasts.set_index("valid_time").groupby("reference_time").resample("30min").interpolate("linear")

In [41]:
merged_weather_dwd_ncep_forecasts_res = merged_weather_dwd_ncep_forecasts_res.drop(columns= "reference_time",axis=1).reset_index()

In [51]:
merged_weather_dwd_ncep_forecasts_res["valid_time"] = pd.to_datetime(merged_weather_dwd_ncep_forecasts_res["valid_time"], format="%Y-%m-%d %H:%M:%S", utc= True)

In [52]:
import pickle 

merged_weather_dwd_ncep_forecasts_res.to_pickle("merged_weather_dwd_ncep_forecasts_res_30min.pkl")

In [73]:
merged_dataset = merged_weather_dwd_ncep_forecasts_res.merge(energy_data, how= "inner", left_on= "valid_time", right_on="dtm")

In [75]:
import numpy as np

merged_dataset["valid_time"] = pd.to_datetime(merged_dataset["valid_time"], format= "%Y-%m-%d %H:%M:%S", utc=True)
merged_dataset["reference_time"] = pd.to_datetime(merged_dataset["reference_time"], format= "%Y-%m-%d %H:%M:%S", utc=True)

merged_dataset = merged_dataset[merged_dataset["valid_time"] - merged_dataset["reference_time"] < np.timedelta64(50, "h")]

In [77]:
merged_dataset.to_pickle("merged_dataset_prep_utc.pkl")

In [78]:
merged_dataset.to_csv("merged_dataset_prep_utc.csv")