In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from datetime import datetime, timedelta, date, time
import calendar

import os
from datetime import datetime
import holidays

In [2]:
# Energy data: the first CSV column is used as the (datetime) index.
df_energy = pd.read_csv("../energy_data/energy_data.csv", index_col=0, parse_dates=True)
# Parse the index as UTC, then express it in German local time.
df_energy.index = pd.to_datetime(df_energy.index, utc=True).tz_convert('Europe/Berlin')

# Weather data: indexed by its "Datetime" column and localised the same way.
df_weather = pd.read_csv("../germany_weather/germany_weather.csv", parse_dates=["Datetime"], index_col="Datetime")
df_weather.index = pd.to_datetime(df_weather.index, utc=True).tz_convert('Europe/Berlin')

In [3]:
# Merge the weather and energy frames on their timestamp index. With
# combine_first, values from df_weather take priority; df_energy only fills
# entries (rows/columns/NaNs) that are missing in df_weather.
df = df_weather.combine_first(df_energy)

# Add a 0/1 column marking every timestamp that the `holidays` calendar for
# Germany reports as a public holiday.
de_feiertage = holidays.Germany(years=range(2012, 2026))
df['public_holiday'] = (
    df.index.to_series()
    .map(lambda ts: ts in de_feiertage)
    .astype(int)
)

In [4]:
# Keep only the rows inside the study window (label-based slicing, so both
# bounds are inclusive).
# .copy() detaches the result from the parent frame: the later column
# assignments on `df` (interpolation) then operate on an independent frame
# instead of a slice, which avoids SettingWithCopyWarning and ambiguous
# view-vs-copy behaviour.
df = df.loc["2016-01-01":"2025-02-28"].copy()
df

Unnamed: 0_level_0,ghi,rain,target,temperature,wind_speed_100m,wind_speed_10m,public_holiday
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-01 00:00:00+01:00,0.000000e+00,0.000029,43.73175,2.24603,4.417082,2.170952,1
2016-01-01 01:00:00+01:00,0.000000e+00,0.000032,41.96200,2.16920,4.422775,2.168508,1
2016-01-01 02:00:00+01:00,0.000000e+00,0.000031,40.22250,2.12734,4.435587,2.186163,1
2016-01-01 03:00:00+01:00,0.000000e+00,0.000029,39.24300,2.10092,4.398698,2.169982,1
2016-01-01 04:00:00+01:00,0.000000e+00,0.000028,38.74950,2.06100,4.459150,2.197282,1
...,...,...,...,...,...,...,...
2025-02-28 19:00:00+01:00,4.594054e-02,0.000042,64.60625,3.01013,4.626363,2.381010,0
2025-02-28 20:00:00+01:00,-6.467518e-14,0.000055,61.87725,2.65682,4.588312,2.408167,0
2025-02-28 21:00:00+01:00,-6.467518e-14,0.000044,58.74500,2.37823,4.511706,2.366980,0
2025-02-28 22:00:00+01:00,-6.467518e-14,0.000042,55.37925,2.16564,4.400239,2.306548,0


In [5]:
# Determine the first and last timestamp present in the DataFrame index
start_date = df.index.min()
end_date = df.index.max()

# Build every hourly timestamp between start and end.
# 'h' is the current alias for hourly frequency; the upper-case 'H' used
# previously is deprecated since pandas 2.2.
all_hours = pd.date_range(start=start_date, end=end_date, freq='h')

# Hours that should exist but are absent from the index
missing_hours = all_hours.difference(df.index)

# Number of missing hours
missing_count = len(missing_hours)

print(f"Es fehlen {missing_count} Stunden im Index.")
print("Liste der fehlenden Stunden:")
print(missing_hours)

# Report the NaN count of each column. The counts are computed in a single
# vectorised pass and printed in the same order as before.
nan_check_columns = [
    'target',
    'ghi',
    'rain',
    'temperature',
    'wind_speed_100m',
    'wind_speed_10m',
    'public_holiday',
]
nan_counts = df[nan_check_columns].isna().sum()
for column_name, column_nan_count in nan_counts.items():
    print(f"Anzahl der NaN-Werte in der {column_name}-Spalte: {column_nan_count}")

Es fehlen 0 Stunden im Index.
Liste der fehlenden Stunden:
DatetimeIndex([], dtype='datetime64[ns, Europe/Berlin]', freq=None)
Anzahl der NaN-Werte in der target-Spalte: 0
Anzahl der NaN-Werte in der ghi-Spalte: 0
Anzahl der NaN-Werte in der rain-Spalte: 0
Anzahl der NaN-Werte in der temperature-Spalte: 2
Anzahl der NaN-Werte in der wind_speed_100m-Spalte: 2
Anzahl der NaN-Werte in der wind_speed_10m-Spalte: 2
Anzahl der NaN-Werte in der public_holiday-Spalte: 0


In [6]:
# Fill the missing values in the continuous weather columns by interpolating
# between neighbouring rows (pandas' default interpolation settings).
for column in ('temperature', 'wind_speed_100m', 'wind_speed_10m'):
    df[column] = df[column].interpolate()

In [7]:
# Persist the combined frame next to the notebook; the datetime index is
# written out as the first CSV column.
df.to_csv(path_or_buf="combined_energy_data.csv", index=True)