In [1]:
import pandas as pd
import calendar
import holidays

In [2]:
def load_berlin_indexed_csv(path):
    """Read a CSV and return it indexed by its ``Datetime`` column in Europe/Berlin time.

    The ``Datetime`` column is used as the index, re-parsed with ``utc=True``
    and then converted to the ``Europe/Berlin`` time zone, so every frame
    loaded through this helper shares the same tz-aware index convention.

    Parameters
    ----------
    path : str
        Location of the CSV file; it must contain a ``Datetime`` column.

    Returns
    -------
    pandas.DataFrame
        The file contents with a tz-aware (Europe/Berlin) index.
    """
    frame = pd.read_csv(path, parse_dates=["Datetime"], index_col="Datetime")
    # Go through UTC first so the index ends up with one uniform tz-aware dtype
    # before it is expressed in local time.
    frame.index = pd.to_datetime(frame.index, utc=True)
    frame.index = frame.index.tz_convert('Europe/Berlin')
    return frame


# read csv files
df_bike = load_berlin_indexed_csv("../bike_data/bike_data.csv")

# NOTE: the file name below is spelled "karslruhe" (sic); it is kept verbatim.
df_weather = load_berlin_indexed_csv("../karlsruhe_weather/historical_weather_karslruhe.csv")

df_weather_forecast = load_berlin_indexed_csv("../karlsruhe_weather/weather_forecast_karlsruhe.csv")

In [3]:
# Display the historical weather frame as a quick visual check of the load.
df_weather

Unnamed: 0_level_0,temperature_2m,relative_humidity_2m,precipitation,weather_code,cloud_cover,wind_speed_10m,shortwave_radiation
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-01-01 01:00:00+01:00,9.762501,96.372925,0.0,3.0,100.0,16.434305,0.0
2012-01-01 02:00:00+01:00,9.812500,96.699100,0.0,3.0,100.0,15.575981,0.0
2012-01-01 03:00:00+01:00,9.912500,96.053635,0.0,3.0,100.0,15.307410,0.0
2012-01-01 04:00:00+01:00,9.962501,96.055170,0.0,3.0,100.0,16.299694,0.0
2012-01-01 05:00:00+01:00,9.962501,95.732710,0.0,3.0,100.0,15.530151,0.0
...,...,...,...,...,...,...,...
2025-03-01 20:00:00+01:00,5.228000,72.985780,0.0,3.0,100.0,12.632101,0.0
2025-03-01 21:00:00+01:00,5.178000,73.769310,0.0,3.0,97.0,14.233664,0.0
2025-03-01 22:00:00+01:00,4.128000,76.291830,0.0,2.0,58.0,14.813683,0.0
2025-03-01 23:00:00+01:00,2.978000,77.780180,0.0,1.0,30.0,13.780290,0.0


In [4]:
# Merge bike counts and weather on their shared index: values from df_bike take
# precedence, gaps (and columns df_bike does not have) are filled from df_weather.
df = df_bike.combine_first(df_weather)

# Add Baden-Württemberg public-holiday information to the dataframe.
# The calendar is built for exactly the years present in the index, so the
# flag stays correct when the input data is extended to further years.
index_years = df.index.year.unique().tolist()
bw_feiertage = holidays.Germany(years=index_years, subdiv='BW')
holiday_dates = set(bw_feiertage.keys())

# Compare local calendar dates in one vectorised membership test instead of
# calling the holiday lookup once per timestamp.
local_dates = pd.Series(df.index.date, index=df.index)
df['public_holiday'] = local_dates.isin(holiday_dates).astype(int)

In [5]:
# Restrict the data to the analysis window. With date-only string bounds the
# slice is inclusive and keeps every timestamp of the end date.
# An explicit copy is taken because a later cell assigns a column into this
# frame; that write should target an independent object, not a slice of the
# combined frame.
df = df.loc["2013-01-01":"2025-03-01"].copy()

In [6]:
# Determine the first and last timestamp in the DataFrame index
start_date = df.index.min()
end_date = df.index.max()

# Generate every hourly timestamp between start and end.
# The lowercase 'h' alias is used because newer pandas versions deprecate 'H'.
all_hours = pd.date_range(start=start_date, end=end_date, freq='h')

# Find the hours that are expected but absent from the index
missing_hours = all_hours.difference(df.index)

# Number of missing hours
missing_count = len(missing_hours)

print(f"Es fehlen {missing_count} Stunden im Index.")
print("Liste der fehlenden Stunden:")
print(missing_hours)

# Count the missing values in the target column
nan_count = df['target'].isna().sum()
print(f"Anzahl der NaN-Werte in der target-Spalte: {nan_count}")

Es fehlen 0 Stunden im Index.
Liste der fehlenden Stunden:
DatetimeIndex([], dtype='datetime64[ns, Europe/Berlin]', freq='H')
Anzahl der NaN-Werte in der target-Spalte: 0


In [7]:
# Force the target column to a numeric dtype; entries that cannot be parsed
# as numbers are turned into NaN instead of raising.
target_numeric = pd.to_numeric(df['target'], errors='coerce')
df['target'] = target_numeric

In [8]:
# Write the combined frame to disk, including its index.
output_csv = "combined_bike_data.csv"
df.to_csv(output_csv, index=True)

In [9]:
# Show the final combined frame for visual inspection.
df

Unnamed: 0_level_0,cloud_cover,precipitation,relative_humidity_2m,shortwave_radiation,target,temperature_2m,weather_code,wind_speed_10m,public_holiday
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2013-01-01 00:00:00+01:00,95.0,0.0,55.362625,0.0,30.0,6.3625,3.0,13.551500,1
2013-01-01 01:00:00+01:00,99.0,0.0,54.982826,0.0,19.0,6.4625,3.0,13.104197,1
2013-01-01 02:00:00+01:00,100.0,0.0,54.834316,0.0,33.0,6.6625,3.0,14.332340,1
2013-01-01 03:00:00+01:00,99.0,0.0,56.056946,0.0,31.0,6.6625,3.0,15.778518,1
2013-01-01 04:00:00+01:00,100.0,0.0,57.355602,0.0,40.0,6.8625,3.0,17.102840,1
...,...,...,...,...,...,...,...,...,...
2025-03-01 19:00:00+01:00,99.0,0.0,69.709120,0.0,224.0,5.5780,3.0,10.990322,0
2025-03-01 20:00:00+01:00,100.0,0.0,72.985780,0.0,159.0,5.2280,3.0,12.632101,0
2025-03-01 21:00:00+01:00,97.0,0.0,73.769310,0.0,94.0,5.1780,3.0,14.233664,0
2025-03-01 22:00:00+01:00,58.0,0.0,76.291830,0.0,57.0,4.1280,2.0,14.813683,0
