In [13]:
import pandas as pd
import numpy as np

In [14]:
df = pd.read_csv('Realisierter_Stromverbrauch_201811010000_202411010000_Stunde.csv', 
                 sep= ';',
                 index_col= [0],
                 parse_dates= [0])

df = df.iloc[:,[1,2]]


# Replace '.' with '' (remove thousand separator) and ',' with '.' (convert to decimal point)
df.iloc[:, 0] = df.iloc[:, 0].str.replace('.', '', regex=False).str.replace(',', '.', regex=False)

# Convert the column to float
df.iloc[:, 0] = df.iloc[:, 0].astype(float)

df_load = df['Gesamt (Netzlast) [MWh] Berechnete Auflösungen']

df_load


Datum von
01.11.2018 00:00     48879.5
01.11.2018 01:00     47312.5
01.11.2018 02:00    46093.75
01.11.2018 03:00     46710.5
01.11.2018 04:00    47624.25
                      ...   
31.10.2024 19:00     59605.0
31.10.2024 20:00    56859.25
31.10.2024 21:00     54040.5
31.10.2024 22:00     51405.0
31.10.2024 23:00    48110.75
Name: Gesamt (Netzlast) [MWh] Berechnete Auflösungen, Length: 52608, dtype: object

In [15]:
df = pd.read_csv('merged_data.csv', 
                 sep= ',',
                 index_col= [0],
                 parse_dates= [0])



df.columns

Index(['Datum bis_x', 'Gesamt (Netzlast) [MWh] Berechnete Auflösungen',
       'Residuallast [MWh] Berechnete Auflösungen',
       'Pumpspeicher [MWh] Berechnete Auflösungen_x', 'Datum bis_y',
       'Biomasse [MWh] Berechnete Auflösungen',
       'Wasserkraft [MWh] Berechnete Auflösungen',
       'Wind Offshore [MWh] Berechnete Auflösungen',
       'Wind Onshore [MWh] Berechnete Auflösungen',
       'Photovoltaik [MWh] Berechnete Auflösungen',
       'Sonstige Erneuerbare [MWh] Berechnete Auflösungen',
       'Kernenergie [MWh] Berechnete Auflösungen',
       'Braunkohle [MWh] Berechnete Auflösungen',
       'Steinkohle [MWh] Berechnete Auflösungen',
       'Erdgas [MWh] Berechnete Auflösungen',
       'Pumpspeicher [MWh] Berechnete Auflösungen_y',
       'Sonstige Konventionelle [MWh] Berechnete Auflösungen', 'Datum bis',
       'Deutschland/Luxemburg [€/MWh] Originalauflösungen',
       '∅ Anrainer DE/LU [€/MWh] Originalauflösungen',
       'Belgien [€/MWh] Originalauflösungen',
   

In [16]:
df_lo = df[['Gesamt (Netzlast) [MWh] Berechnete Auflösungen', 
         'Deutschland/Luxemburg [€/MWh] Originalauflösungen']]

df_po = df.iloc[:, 5:17]

df = pd.concat([df_lo, df_po], axis=1)

In [17]:
# Replace '-' with NaN
df = df.replace('-', float('nan'))

# Forward-fill to replace NaNs with the last valid value
df = df.ffill()

In [18]:
# Replace '.' with '' (remove thousand separator) and ',' with '.' (convert to decimal point) for all columns
df = df.apply(lambda x: x.astype(str).str.replace('.', '', regex=False).str.replace(',', '.', regex=False))

# Convert all columns back to float
df = df.astype(float)



In [19]:
def create_time_features(df, label=None):
    """
    Creates time series features from datetime index with sinusoidal encoding for cyclical features
    and adds a binary variable for weekends vs. workdays.
    """
    df = df.copy()
    
    # Ensure the index is a datetime object
    df['date'] = df.index
    
    # Create time-based features
    df['hour'] = df['date'].dt.hour
    df['dayofweek'] = df['date'].dt.dayofweek
    df['day_name'] = df['date'].dt.day_name()
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['date_offset'] = (df['date'].dt.month * 100 + df['date'].dt.day - 320) % 1300
    
    # Encode cyclical features with sin and cos
    # Hour encoding
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    
    # Day of week encoding
    df['dayofweek_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
    df['dayofweek_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)
    
    # Month encoding
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    
    # Day of year encoding
    df['dayofyear_sin'] = np.sin(2 * np.pi * df['dayofyear'] / 365)
    df['dayofyear_cos'] = np.cos(2 * np.pi * df['dayofyear'] / 365)
    
    # Add binary weekend/workday variable
    df['is_workday'] = df['dayofweek'].apply(lambda x: 1 if x < 5 else 0)
    
    # Optionally add label column
    if label is not None:
        df['label'] = df[label]
    
    return df


In [20]:
from workalendar.europe import Germany

# Initialize the German holiday calendar
cal = Germany()

# Ensure your 'Date' index is a datetime object
df.index = pd.to_datetime(df.index)

# Create a new column to hold holiday names or "Not a Holiday"
df['Holiday'] = df.index.map(lambda date: cal.get_holiday_label(date) or "Not a Holiday")

# Display the first few rows to verify
print(df['Holiday'].unique())


['Not a Holiday' 'Christmas Day' 'Second Christmas Day' 'New year'
 'Good Friday' 'Easter Monday' 'Labour Day' 'Ascension Thursday'
 'Whit Monday' 'Day of German Unity']


In [21]:
df_features = create_time_features(df)

In [22]:
df_features.to_csv("merged_cleaned.csv")

In [23]:
df_features

Unnamed: 0_level_0,Gesamt (Netzlast) [MWh] Berechnete Auflösungen,Deutschland/Luxemburg [€/MWh] Originalauflösungen,Biomasse [MWh] Berechnete Auflösungen,Wasserkraft [MWh] Berechnete Auflösungen,Wind Offshore [MWh] Berechnete Auflösungen,Wind Onshore [MWh] Berechnete Auflösungen,Photovoltaik [MWh] Berechnete Auflösungen,Sonstige Erneuerbare [MWh] Berechnete Auflösungen,Kernenergie [MWh] Berechnete Auflösungen,Braunkohle [MWh] Berechnete Auflösungen,...,date_offset,hour_sin,hour_cos,dayofweek_sin,dayofweek_cos,month_sin,month_cos,dayofyear_sin,dayofyear_cos,is_workday
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-11-01 00:00:00,48879.50,40.86,4754.75,1583.00,2721.00,11628.50,0.0,153.00,7933.0,16108.75,...,781,0.000000,1.000000,0.433884,-0.900969,-0.500000,0.866025,-0.858764,0.512371,1
2018-11-01 01:00:00,47312.50,41.10,4741.25,1587.50,2678.00,11029.00,0.0,153.00,7933.5,16119.00,...,781,0.258819,0.965926,0.433884,-0.900969,-0.500000,0.866025,-0.858764,0.512371,1
2018-11-01 02:00:00,46093.75,40.59,4733.25,1567.50,2836.75,11025.00,0.0,153.00,7927.0,16111.00,...,781,0.500000,0.866025,0.433884,-0.900969,-0.500000,0.866025,-0.858764,0.512371,1
2018-11-01 03:00:00,46710.50,40.09,4725.25,1550.00,2968.50,10480.25,0.0,152.50,7930.5,16134.75,...,781,0.707107,0.707107,0.433884,-0.900969,-0.500000,0.866025,-0.858764,0.512371,1
2018-11-01 04:00:00,47624.25,39.11,4715.00,1535.75,2765.25,10871.25,0.0,152.75,7935.5,16120.50,...,781,0.866025,0.500000,0.433884,-0.900969,-0.500000,0.866025,-0.858764,0.512371,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-30 19:00:00,61139.50,159.44,4542.75,1999.75,1552.25,4700.50,0.0,86.25,0.0,12781.50,...,710,-0.965926,0.258819,0.974928,-0.222521,-0.866025,0.500000,-0.867456,0.497513,1
2024-10-30 20:00:00,58366.50,133.56,4538.50,1755.25,1501.25,4608.50,0.0,85.00,0.0,12389.75,...,710,-0.866025,0.500000,0.974928,-0.222521,-0.866025,0.500000,-0.867456,0.497513,1
2024-10-30 21:00:00,54690.25,122.67,4528.75,1683.75,1453.25,4438.00,0.0,85.00,0.0,12266.00,...,710,-0.707107,0.707107,0.974928,-0.222521,-0.866025,0.500000,-0.867456,0.497513,1
2024-10-30 22:00:00,51107.75,115.34,4476.75,1699.50,1606.50,4357.00,0.0,85.00,0.0,11943.75,...,710,-0.500000,0.866025,0.974928,-0.222521,-0.866025,0.500000,-0.867456,0.497513,1


In [24]:
from sklearn.preprocessing import MinMaxScaler

# Exclude non-numeric and pre-encoded columns
exclude_columns = ['date', 'date_offset', 'day_name', 'hour_sin', 'hour_cos', 
                   'dayofweek_sin', 'dayofweek_cos', 'month_sin', 'month_cos', 
                   'dayofyear_sin', 'dayofyear_cos', 'quarter', 'month', 'year', 
                   'dayofyear', 'dayofmonth', 'hour', 'dayofweek']

# Select numeric columns excluding the ones to exclude
columns_to_normalize = [col for col in df.columns if col not in exclude_columns and pd.api.types.is_numeric_dtype(df[col])]

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Normalize the selected columns
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

# upload normalized df 
df.to_csv("merged_cleaned_normalized.csv")

In [25]:
df

Unnamed: 0_level_0,Gesamt (Netzlast) [MWh] Berechnete Auflösungen,Deutschland/Luxemburg [€/MWh] Originalauflösungen,Biomasse [MWh] Berechnete Auflösungen,Wasserkraft [MWh] Berechnete Auflösungen,Wind Offshore [MWh] Berechnete Auflösungen,Wind Onshore [MWh] Berechnete Auflösungen,Photovoltaik [MWh] Berechnete Auflösungen,Sonstige Erneuerbare [MWh] Berechnete Auflösungen,Kernenergie [MWh] Berechnete Auflösungen,Braunkohle [MWh] Berechnete Auflösungen,Steinkohle [MWh] Berechnete Auflösungen,Erdgas [MWh] Berechnete Auflösungen,Pumpspeicher [MWh] Berechnete Auflösungen_y,Sonstige Konventionelle [MWh] Berechnete Auflösungen,Holiday
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-11-01 00:00:00,0.346145,0.394500,0.757719,0.322566,0.356467,0.239550,0.0,0.540079,0.832948,0.906579,0.316880,0.177891,0.004524,0.671370,Not a Holiday
2018-11-01 01:00:00,0.315318,0.394675,0.750209,0.324544,0.350834,0.227087,0.0,0.540079,0.833001,0.907261,0.317376,0.178543,0.006347,0.669254,Not a Holiday
2018-11-01 02:00:00,0.291341,0.394303,0.745758,0.315755,0.371631,0.227003,0.0,0.540079,0.832318,0.906729,0.295756,0.173178,0.000157,0.669254,Not a Holiday
2018-11-01 03:00:00,0.303475,0.393939,0.741307,0.308064,0.388891,0.215678,0.0,0.537451,0.832686,0.908310,0.304886,0.169643,0.010525,0.663186,Not a Holiday
2018-11-01 04:00:00,0.321451,0.393224,0.735605,0.301802,0.362264,0.223807,0.0,0.538765,0.833211,0.907361,0.286962,0.193628,0.010085,0.665726,Not a Holiday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-30 19:00:00,0.587336,0.480992,0.639777,0.505713,0.203354,0.095516,0.0,0.189225,0.000000,0.685028,0.403579,0.599752,0.414038,0.389869,Not a Holiday
2024-10-30 20:00:00,0.532782,0.462115,0.637413,0.398264,0.196672,0.093603,0.0,0.182654,0.000000,0.658943,0.395997,0.556248,0.200452,0.390574,Not a Holiday
2024-10-30 21:00:00,0.460460,0.454172,0.631989,0.366842,0.190384,0.090058,0.0,0.182654,0.000000,0.650702,0.385100,0.497178,0.054575,0.388881,Not a Holiday
2024-10-30 22:00:00,0.389982,0.448826,0.603060,0.373764,0.210461,0.088374,0.0,0.182654,0.000000,0.629245,0.374786,0.454512,0.032016,0.387329,Not a Holiday
