# Bibliotheken/Einlesen

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.impute import SimpleImputer

In [None]:
df_test1 = pd.read_parquet("../Daten/energy_test1.parquet")
df_test2 = pd.read_parquet("../Daten/energy_test2.parquet")
df_train = pd.read_parquet("../Daten/energy_train.parquet")
df_forecasts = pd.read_parquet("../Daten/forecasts.parquet")

# Beobachtung der Daten

In [None]:
df_test1

In [None]:
df_test1.columns

In [None]:
df_test1.dtypes

In [None]:
df_test1.info()

In [None]:
df_test2

In [None]:
df_test2.columns

In [None]:
df_test2.info()

In [None]:
df_train

In [None]:
df_train.columns

In [None]:
df_train.info()

In [None]:
# TODO Zeilen anpassen (durchschnitt, vorheriger Wert) oder löschen
df_train[df_train.Solar_MWh.isna()]

In [None]:
len(df_train[df_train.Solar_MWh.isna()])

In [None]:
df_forecasts

In [69]:
df_forecasts.columns

Index(['ref_datetime', 'valid_time', 'SolarDownwardRadiation', 'CloudCover',
       'Temperature', 'Weather Model', 'valid_datetime'],
      dtype='object')

In [None]:
df_forecasts.info()

In [None]:
df_forecasts[df_forecasts.SolarDownwardRadiation.isna()]

# Umgang mit NaN-Werten

__Umgang mit NaN-Werten in df_train__
- Betroffene Zeilen 4 von 19968 (ca.0,02%)
- Mein Ansatz: Zeilen, wo bei `Solar_MWh` NaN auftaucht, löschen

In [None]:
df_train = df_train.dropna()
df_test1 = df_test1.dropna()
df_test2 = df_test2.dropna()

**Umgang mit NaN-Werten in df_forecasts**
- Betroffene Zeilen: max. 1226 von 606797 (ca. 0,2%)
- Mein Ansatz: Daten behalten und Auffüllen

In [None]:
# # Spalte SolarDownwardRadiation
# df_forecasts["SolarDownwardRadiation"] = df_forecasts["SolarDownwardRadiation"].interpolate(method="linear")

# # Spalte CloudCover
# df_forecasts["CloudCover"] = df_forecasts["CloudCover"].fillna(df_forecasts["CloudCover"].median())

# # Spalte Temperature
# df_forecasts["Temperature"] = df_forecasts["Temperature"].interpolate(method="linear")

In [None]:
df_forecasts = df_forecasts.dropna()

In [None]:
df_forecasts

**Validierung nach der Bereinigung**

In [None]:
print("NaN-Werte in df_train:", df_train.isna().sum())
print("NaN-Werte in df_forecasts:", df_forecasts.isna().sum())
print("NaN-Werte in df_train:", df_test1.isna().sum())
print("NaN-Werte in df_train:", df_test2.isna().sum())

# Umgang mit Negativen Werten `SolarDownwardRadiation` (Optional)

#### Negative SolarDownwardRadiation anpassen

In [None]:
df_forecasts["SolarDownwardRadiation"] = df_forecasts["SolarDownwardRadiation"].clip(lower=0)

# Aufgabe 1 Untersuchung der Energiedaten

In [None]:
# 1. Wie viele Datenpunkte, die vorhergesagt werden sollen, gibt es in den Trainings- bzw. Testdaten?
train_points = len(df_train)

# Anzahl der Datenpunkte in den beiden Testsets
test1_points = len(df_test1)
test2_points = len(df_test2)

print(f"Trainingsdatenpunkte: {train_points}")
print(f"Testdatenpunkte - Test1: {test1_points}, Test2: {test2_points}")

In [None]:
# 2. Tagesverlauf visualisieren
# (Zufälige)Geburtstage auswählen, bzw. Frühlings-, Sommer- und Winterzeit
birthdates = ["2022-05-21", "2022-07-28", "2022-12-16"]
birthdates = pd.to_datetime(birthdates) # Strings in datetime umwandeln

df_train["date"] = pd.to_datetime(df_train["dtm"]).dt.date  # Extrahiere das Datum

# Filter für die ausgewählten Tage
filtered_data = df_train[df_train["date"].isin(birthdates.date)]

# Plot
plt.figure(figsize=(10, 6))
for date in birthdates:
    daily_data = filtered_data[filtered_data["date"] == date.date()]
    plt.plot(
        pd.to_datetime(daily_data["dtm"]).dt.hour,
        daily_data["Solar_MWh"],
        label=str(date.date())
    )
plt.xlabel("Stunde des Tages")
plt.ylabel("Stromerzeugung (Solar_MWh)")
plt.title("Tagesverlauf der Stromerzeugung")
plt.legend()
plt.grid()
plt.show()

2022-05-21  Frühling <br>
2022-07-28 -> Sommer <br>
2022-12-16 -> Winter <br>
Man erkennt deutlich, dass logischerweise Nachts kein Strom produziert wird und der Tag der wichtige Teil für die Stromerzeugung. Auch erkennbar ist, dass der Winter weniger Strom produziert, was sehr stark an der Wolkenbedeckung liegen kann und die Tageszeit kürzer anhält als Frühling und Sommer.
Einen kleinen Ausreißer erkennt man beim Frühling, was bei ca. 10-12Uhr kurz wenig Strom produziert. Das könnte an einem Regenschauer liegen oder anderen technischen Fehlern

In [None]:
# 3. Gesamtverlauf visualisieren
plt.figure(figsize=(14, 7))
plt.plot(pd.to_datetime(df_train["dtm"]), df_train["Solar_MWh"], color="blue")
plt.xlabel("Zeit")
plt.ylabel("Stromerzeugung (Solar_MWh)")
plt.title("Gesamtverlauf der Stromerzeugung (Trainingsdaten)")
plt.grid()
plt.show()

Im Sommer steigt die Stromerzeugung, da es länger hell bleibt und wenig Wolkenbedeckung hat. <br>
Im Winter sinkt die Stromerzeugung, da es schneller dunkel wird und weniger Sonnenschein tagsüber hat.

# Aufgabe 2 Merge und Untersuchung von Zusammenhänge

## Merge mit Trainingsdaten

In [None]:
# Konvertiere `ref_datetime` und `dtm` zu datetime-Objekten
df_train["ref_datetime"] = pd.to_datetime(df_train["ref_datetime"])
df_test1["ref_datetime"] = pd.to_datetime(df_test1["ref_datetime"])
df_test2["ref_datetime"] = pd.to_datetime(df_test2["ref_datetime"])
df_train["dtm"] = pd.to_datetime(df_train["dtm"])
df_test1["dtm"] = pd.to_datetime(df_test1["dtm"])
df_test2["dtm"] = pd.to_datetime(df_test2["dtm"])

# Gültigen Zeitpunkt für Wettervorhersagen berechnen
df_forecasts["valid_datetime"] = df_forecasts["ref_datetime"] + pd.to_timedelta(df_forecasts["valid_time"], unit="h")

In [None]:
df_merged = pd.merge(
    df_train,
    df_forecasts,
    left_on=["dtm", "ref_datetime"],
    right_on=["valid_datetime", "ref_datetime"],
    how="inner"
)
df_merged_test1 = pd.merge(
    df_test1,
    df_forecasts,
    left_on=["dtm", "ref_datetime"],
    right_on=["valid_datetime", "ref_datetime"],
    how="inner"
)
df_merged_test2 = pd.merge(
    df_test2,
    df_forecasts,
    left_on=["dtm", "ref_datetime"],
    right_on=["valid_datetime", "ref_datetime"],
    how="inner"
)

#### Train Merge

In [None]:
df_merged

In [None]:
df_merged.dtypes

#### Test1 Merge

In [None]:
df_merged_test1

In [None]:
df_merged_test1.dtypes

#### Test2 Merge

In [None]:
df_merged_test2

#### Feature Engineered Merge

In [None]:
df_merged

### Plot erstellung

In [None]:
# 1.Plot erstellen
# Scatterplots für jede Wettervariable vs. Solar_MWh
weather_attributes = ["SolarDownwardRadiation", "CloudCover", "Temperature"]

plt.figure(figsize=(18, 6))
for i, attr in enumerate(weather_attributes):
    plt.subplot(1, 3, i+1)
    sns.scatterplot(data=df_merged, x=attr, y="Solar_MWh", alpha=0.5)
    plt.title(f"{attr} vs Solar_MWh")
    plt.xlabel(attr)
    plt.ylabel("Solar_MWh")
plt.tight_layout()
plt.show()

In [None]:
# 2. Erkennung von Zusammenhängen
# Korrelationen zwischen Wetterattributen und Solar_MWh berechnen
correlations = df_merged[weather_attributes + ["Solar_MWh"]].corr()["Solar_MWh"].sort_values(ascending=False)
print("Korrelationen mit Solar_MWh:")
print(correlations)

Die Wolkenbedeckung weißt auf einen schwachen Zusammenhang zur Stromerzeugung, während die Sonneneinstrahlung wichtiger ist.

# Aufgabe 3 Vorverarbeitung

#### Behandlung von Ausreißern

In [None]:
# Verteilung von Solar_MWh
sns.histplot(df_train["Solar_MWh"], kde=True, bins=30)
plt.title("Train - Verteilung von Solar_MWh")
plt.show()

# Boxplot zur Erkennung von Ausreißern
sns.boxplot(x=df_train["Solar_MWh"])
plt.title("Train - Boxplot von Solar_MWh")
plt.show()

**Interpretation**: Es wäre für mich eine logische Entscheidung die Ausreißer 0 mitzunehmen ins Modell, weil diese möglicherweise echte Werte sind. Das liegt daran, das Nachts keine Stromproduktion stattfindet, sowie im Winter die Nacht länger andauert. <br>
Zusätzlich ist es tagsüber deutlich inkonsistenter, da verschiedene Feature (Sonnenstrahlung, Wolkenbedeckung, Temperatur) Einfluss auf die die Stromproduktion nehmen.

Zusammengefasst entscheide ich mich die Ausreißer nicht rauszunehmen

#### Vorverarbeitung der Daten

In [None]:
df_merged

In [None]:
y = df_merged.pop("Solar_MWh")
# Split in Trainings- und Testset
X_train, X_test, y_train, y_test = train_test_split(df_merged, y, test_size=0.2, random_state=42)

In [None]:
vorverarbeitung = ColumnTransformer([
    ("O-H-Encoding", OneHotEncoder(handle_unknown="ignore"),["Weather Model"]),
    ("nanTransform", SimpleImputer(missing_values=np.nan, strategy="mean"), ["SolarDownwardRadiation", "CloudCover", "Temperature" ]),
    ("Skalieren", StandardScaler(), ["Solar_capacity_mwp", "SolarDownwardRadiation", "CloudCover", "Temperature"]),
    ("Entfernen von Spalten", "drop", ["dtm", "ref_datetime", "valid_time", "valid_datetime"])
], remainder="passthrough")

In [None]:
vorverarbeitung

In [None]:
pd.DataFrame(vorverarbeitung.fit_transform(X_train))

# Aufgabe 4 Generierung von neuen Features

### Zeitbasierte Features

In [None]:
# Tageszeit
df_merged["hour"] = df_merged["dtm"].dt.hour
# Monat oder Saison
df_merged["month"] = df_merged["dtm"].dt.month
df_merged["season"] = df_merged["month"].apply(lambda x: (x % 12 + 3) // 3)

### Lag-Features

In [None]:
df_merged["lag_1"] = df_merged["Solar_MWh"].shift(1)
df_merged["lag_24"] = df_merged["Solar_MWh"].shift(24)

### Rolling Features (Gleitender Durchschnitt):

In [None]:
df_merged["rolling_mean_24"] = df_merged["Solar_MWh"].rolling(window=24).mean()

### Interaktion von Wetter und Zeit
- temp_radiation_interaction: Produkt von Temperatur und Sonneneinstrahlung
- cloud_hour_interaction: Produkt von CloudCover und Stunde des Tages

In [None]:
df_merged["temp_radiation_interaction"] = (
    df_merged["Temperature"] * df_merged["SolarDownwardRadiation"]
)
df_merged["cloud_hour_interaction"] = (
    df_merged["CloudCover"] * df_merged["hour"]
)

### Relative Merkmale
Verhältnis von erzeugtem Strom zur verfügbaren Kapazität

In [None]:
df_merged["solar_efficiency"] = df_merged["Solar_MWh"] / df_merged["Solar_capacity_mwp"]

### Transformation von Wetterattributen:
- scaled_temperature: Skaliere Temperatur auf den Bereich [0, 1].
- adjusted_radiation: Negative Werte auf 0 setzen (falls noch nicht gemacht).

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_merged["scaled_temperature"] = scaler.fit_transform(
    df_merged[["Temperature"]]
)
df_merged["adjusted_radiation"] = df_merged["SolarDownwardRadiation"].clip(lower=0)

In [None]:
df_merged

# Aufgabe 5 Modell trainieren

### Baseline-Modell

In [None]:
X_test

In [None]:
pipe_linear = Pipeline([("Vorverarbeitung", vorverarbeitung), ("Linear-Model Training", LinearRegression())])
pipe_linear.fit(X_train, y_train)

y_pred = pipe_linear.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print("RMSE:", rmse)

In [None]:
pipe_linear

In [None]:
y_pred = pipe_linear.predict(df_merged_test1)
df_merged_test1["Solar_MWh_pred"] = y_pred

In [None]:
df_merged_test1

### Optimierung von Ridge und Lasso

In [None]:
# Hyperparameter für Ridge und Lasso
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}

ridge = GridSearchCV(Ridge(), param_grid, cv=5)
ridge.fit(X_train, y_train)

print("Best alpha (Ridge):", ridge.best_params_["alpha"])
print("Best RMSE (Ridge):", -ridge.best_score_)

# Lasso Regression
lasso = GridSearchCV(Lasso(max_iter=10000), param_grid, scoring="neg_root_mean_squared_error", cv=5)
lasso.fit(X_train, y_train)
print("Best alpha (Lasso):", lasso.best_params_["alpha"])
print("Best RMSE (Lasso):", -lasso.best_score_)

### Modell 2: Entscheidungsbaum

In [None]:
# Hyperparameter für den Entscheidungsbaum
param_grid = {
    'max_depth': [3, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

tree = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=5)
tree.fit(X_train, y_train)
print("Best params (Decision Tree):", tree.best_params_)
print("Best RMSE (Decision Tree):", -tree.best_score_)

### Modell 3: Ensemble-Modell (Random Forest)

In [None]:
# Hyperparameter für Random Forest
param_grid = {
    'n_estimators': [15, 20, 25],
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

forest = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5)
forest.fit(X_train, y_train)
print("Best params (Random Forest):", forest.best_params_)
print("Best RMSE (Random Forest):", -forest.best_score_)

### Wichtigste Features bestimmen

In [None]:
# Feature-Wichtigkeiten für Linear
linear_coefficients = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": pipe_linear.coef_
}).sort_values(by="Importance", ascending=False)
print("Ridge Feature Importance:\n", linear_coefficients)

In [None]:
# Feature-Wichtigkeiten für Ridge
ridge_coefficients = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": ridge.best_estimator_.coef_
}).sort_values(by="Importance", ascending=False)
print("Ridge Feature Importance:\n", ridge_coefficients)

In [None]:
# Feature-Wichtigkeiten für Random Forest
forest_importance = pd.DataFrame({
    "Feature": X_train.columns,
    #"Importance": forest.best_estimator_.feature_importances_
})#.sort_values(by="Importance", ascending=False)
print("Random Forest Feature Importance:\n", forest_importance)

In [None]:
df_merged_test1.to_pickle('test1.pkl')
df_merged_test2.to_pickle('test2.pkl')

# Zusammenfassung | Wenig Zellen

## Aufgabe 5 | 6

#### Useful testing Codes

In [156]:
df_train[df_train.Solar_MWh.isna()]

Unnamed: 0,dtm,ref_datetime,Solar_capacity_mwp,Solar_MWh


### Basic Start

In [29]:
# Zusammengefassetes Modelltraining ohne zusätzliche Zellen
# ? Kommentare sind gehighlighted mit der "Better Comments" Extension
# * Bibliotheken laden und Daten einlesen

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.impute import SimpleImputer

df_test1 = pd.read_parquet("../Daten/energy_test1.parquet")
df_test2 = pd.read_parquet("../Daten/energy_test2.parquet")
df_train = pd.read_parquet("../Daten/energy_train.parquet")
df_forecasts = pd.read_parquet("../Daten/forecasts.parquet")
#-----------------------------------------------------------------------------------------------
# ! Umgang von NaN-Werten 
# TODO Statt .dropna() eine Imputation einsetzen
# ? Beim Forecast kann es ignoriert werden, da es im Transformer verwendet wird (aber erst im nach dem Merge)
df_train = df_train.dropna()
df_test1 = df_test1.dropna()
df_test2 = df_test2.fillna(0)
# df_forecasts = df_forecasts.dropna()
# Spalte SolarDownwardRadiation
df_forecasts["SolarDownwardRadiation"] = df_forecasts["SolarDownwardRadiation"].interpolate(method="linear")
# Spalte CloudCover
df_forecasts["CloudCover"] = df_forecasts["CloudCover"].fillna(df_forecasts["CloudCover"].median())
# Spalte Temperature
df_forecasts["Temperature"] = df_forecasts["Temperature"].interpolate(method="linear")
#-----------------------------------------------------------------------------------------------
# ! Umgang von Negativen Werten
# TODO Kann auch ignoriert werden
# df_forecasts["SolarDownwardRadiation"] = df_forecasts["SolarDownwardRadiation"].clip(lower=0)
#-----------------------------------------------------------------------------------------------
# ! Merge von Test-/Trainingsdaten mit Forecast-Dataset
# TODO Konvertiere `ref_datetime` und `dtm` zu datetime-Objekten
df_train["ref_datetime"] = pd.to_datetime(df_train["ref_datetime"])
df_test1["ref_datetime"] = pd.to_datetime(df_test1["ref_datetime"])
df_test2["ref_datetime"] = pd.to_datetime(df_test2["ref_datetime"])
df_train["dtm"] = pd.to_datetime(df_train["dtm"])
df_test1["dtm"] = pd.to_datetime(df_test1["dtm"])
df_test2["dtm"] = pd.to_datetime(df_test2["dtm"])

# Gültigen Zeitpunkt für Wettervorhersagen berechnen
df_forecasts["valid_datetime"] = df_forecasts["ref_datetime"] + pd.to_timedelta(df_forecasts["valid_time"], unit="h")

df_merged = pd.merge(
    df_train,
    df_forecasts,
    left_on=["dtm", "ref_datetime"],
    right_on=["valid_datetime", "ref_datetime"],
    how="inner"
)
df_merged_test1 = pd.merge(
    df_test1,
    df_forecasts,
    left_on=["dtm", "ref_datetime"],
    right_on=["valid_datetime", "ref_datetime"],
    how="inner"
)
df_merged_test2 = pd.merge(
    df_test2,
    df_forecasts,
    left_on=["dtm", "ref_datetime"],
    right_on=["valid_datetime", "ref_datetime"],
    how="inner"
)

In [161]:
# Vom fehlerhaften Merge
# // df_merged = df_merged.rename(columns={"ref_datetime_x": "date_train", "ref_datetime_y": "date_weather"})
# // df_merged_test1 = df_merged_test1.rename(columns={"ref_datetime_x": "date_train", "ref_datetime_y": "date_weather"})
# // df_merged_test2 = df_merged_test2.rename(columns={"ref_datetime_x": "date_train", "ref_datetime_y": "date_weather"})

### Feature Engineering und Vorverarbeitung

#### Zeitbasiert Feature

In [15]:
# Funktion für zyklische Transformation | Abruf im ColumnTransformer
def cyclical_features(X):
    X = X.copy()
    X["hour_sin"] = np.sin(2 * np.pi * X["hour"] / 24)
    X["hour_cos"] = np.cos(2 * np.pi * X["hour"] / 24)
    return X[["hour_sin", "hour_cos"]]

In [16]:
# Methode zur Bestimmung des Zeitraums
def assign_time_period(hour):
    if 5 <= hour < 9:
        return 'Morgen'
    elif 9 <= hour < 12:
        return 'Vormittag'
    elif 12 <= hour < 15:
        return 'Mittag'
    elif 15 <= hour < 18:
        return 'Nachmittag'
    elif 18 <= hour < 22:
        return 'Abend'
    else:
        return 'Nacht'

In [17]:
#
# ! Feature Engineering
merged_list = [df_merged, df_merged_test1, df_merged_test2]

for add_time in merged_list:  
    # Tageszeit
    add_time["hour"] = add_time["dtm"].dt.hour
    add_time["time_period"] = add_time["hour"].apply(assign_time_period)
    # add_time["day_of_week"] = add_time["dtm"].dt.day_of_week
    # Monat oder Saison
    add_time["month"] = add_time["dtm"].dt.month
    add_time["season"] = add_time["month"].apply(lambda x: (x % 12 + 3) // 3)

#### Wetterbasierte Feature

In [18]:
def assign_temperature_level(temperature):
    if temperature < 0:
        return 'Sehr kalt'
    elif 0 <= temperature < 10:
        return 'Kalt'
    elif 10 <= temperature < 15:
        return 'Kühl'
    elif 15 <= temperature < 20:
        return 'Mild'
    elif 20 <= temperature < 25:
        return 'Warm'
    elif 25 <= temperature < 30:
        return 'Sehr warm'
    elif 30 <= temperature < 35:
        return 'Heiß'
    else:
        return 'Sehr heiß'

In [19]:
#
# ! Feature Engineering
merged_list = [df_merged, df_merged_test1, df_merged_test2]

for add_weather in merged_list:  
    # Interaktion zwischen Sonneneinstrahlung und Temperatur: Hohe Temperaturen können die Effizienz von Solaranlagen reduzieren, trotz hoher Sonneneinstrahlung
    add_weather["Sun_CloudCover"] = add_weather["Temperature"] * add_weather["SolarDownwardRadiation"]
    # Bewölkerungsdynamik
    add_weather["CloudCover_change"] = add_weather["CloudCover"].diff()
    add_weather["Temperature_Level"] = add_weather["Temperature"].apply(assign_temperature_level)
    
df_merged = df_merged.fillna(0)
df_merged_test1 = df_merged_test1.fillna(0)
df_merged_test2 = df_merged_test2.fillna(0)
    

#### Historische Feature

In [20]:
#
# ! Feature Engineering
merged_list = [df_merged, df_merged_test1, df_merged_test2]

for add_weather in merged_list:  
    # Rolling Average für Sonneneinstrahlung (z. B. über die letzten 3 Stunden)
    # add_weather['AvgSolarRadiation_last_3h'] = add_weather['SolarDownwardRadiation'].rolling(window=3).mean()
    # # Lag-Feature: Sonneneinstrahlung der letzten Stunde
    add_weather['SolarRadiation_lag_1h'] = add_weather['SolarDownwardRadiation'].shift(1)
    # # Lag-Feature: Temperatur der letzten Stunde
    # add_weather['Temperature_lag_1h'] = add_weather['Temperature'].shift(1)
    
df_merged = df_merged.fillna(0)
df_merged_test1 = df_merged_test1.fillna(0)
df_merged_test2 = df_merged_test2.fillna(0)

---
## Vorverarbeitung

In [21]:
#
# * Daten aufteilen
y = df_merged.pop("Solar_MWh")
# * Split in Trainings- und Testset
X_train, X_test, y_train, y_test = train_test_split(df_merged, y, test_size=0.2, random_state=42)

In [22]:
#
# ! Vorverarbeitung: Transformer
vorverarbeitung = ColumnTransformer([
    ("O-H-Encoding", OneHotEncoder(handle_unknown="ignore"),["Weather Model", "time_period", "season", "Temperature_Level"]),
    ("Zyklisch_hour", FunctionTransformer(cyclical_features), ["hour"]),
    #("nanTransform", SimpleImputer(missing_values=np.nan, strategy="mean"), ["SolarDownwardRadiation", "CloudCover", "Temperature"]),
    ("Skalieren", StandardScaler(), [
        "SolarDownwardRadiation", "Sun_CloudCover", "CloudCover_change", "SolarRadiation_lag_1h"
        ]),
    ("drop_columns", 'drop', ["dtm", "valid_time", "valid_datetime", "Solar_capacity_mwp", "ref_datetime", "hour", "month", "CloudCover", "Temperature"])
])


In [169]:
print("NaN-Werte:", X_train.isna().sum())

NaN-Werte: dtm                          0
ref_datetime                 0
Solar_capacity_mwp           0
valid_time                   0
SolarDownwardRadiation       0
CloudCover                   0
Temperature                  0
Weather Model                0
valid_datetime               0
hour                         0
time_period                  0
month                        0
season                       0
Sun_CloudCover               0
CloudCover_change            0
AvgSolarRadiation_last_3h    0
SolarRadiation_lag_1h        0
dtype: int64


### Pipeline

#### Lineares Modell

In [23]:
#
# !Pipeline: Lineares Modell
# * Hyperparametersuche
param_grid = {"fit_intercept": [True, False]}

pipe_linear = Pipeline([("Vorverarbeitung", vorverarbeitung),
                        ("gs_linear", GridSearchCV(LinearRegression(), param_grid, cv=5))
                         ])

# Trainiere Lineares Modell
pipe_linear.fit(X_train, y_train)

# Extrahiere das beste Modell und die besten Parameter
best_params = pipe_linear.named_steps["gs_linear"].best_params_
print("Beste Parameter:", best_params)

# RMSE
y_pred = pipe_linear.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print("RMSE:", rmse)

# Vorhersage für Testdaten 1 und 2
y_pred = pipe_linear.predict(df_merged_test1)
df_merged_test1["Solar_MWh_pred"] = y_pred
y_pred = pipe_linear.predict(df_merged_test2)
df_merged_test2["Solar_MWh_pred"] = y_pred

Beste Parameter: {'fit_intercept': False}
RMSE: 57.38255073061118


In [12]:
old_rmse = rmse
old_rmse

np.float64(57.38255073061118)

In [13]:

# Extrahiere den besten Linearen Regressor aus GridSearchCV
best_linear_model = pipe_linear.named_steps["gs_linear"].best_estimator_

# Abrufe der Feature-Namen nach Vorverarbeitung
feature_names = pipe_linear.named_steps["Vorverarbeitung"].get_feature_names_out()

# Erstelle einen DataFrame mit den Koeffizienten
coefficients = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": best_linear_model.coef_
})

coefficients["Feature_Importance"] = coefficients["Coefficient"].abs().round(1)
coefficients = coefficients.sort_values(by="Feature_Importance", ascending=False)

# Ausgabe der wichtigsten Features
coefficients

AttributeError: Transformer Zyklisch_hour (type FunctionTransformer) does not provide get_feature_names_out.

#### Ridge und Lasso

In [173]:
# Hyperparameter für Ridge und Lasso
param_grid = {'alpha': [0.01, 0.05, 0.1, 0.5, 1, 10, 20, 40, 80, 100, 125, 200, 500]}

In [174]:
#
# !Pipeline: Ridge Modell
pipe_ridge = Pipeline([("Vorverarbeitung", vorverarbeitung),
                       ("gs_ridge", GridSearchCV(Ridge(), param_grid, cv=5))
                       ])
pipe_ridge.fit(X_train, y_train)

# Extrahiere das beste Modell und die besten Parameter
best_params = pipe_ridge.named_steps["gs_ridge"].best_params_
print("Beste Parameter:", best_params)

y_pred = pipe_ridge.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print("RMSE:", rmse)

y_pred = pipe_ridge.predict(df_merged_test1)
df_merged_test1["Solar_MWh_pred"] = y_pred
y_pred = pipe_ridge.predict(df_merged_test2)
df_merged_test2["Solar_MWh_pred"] = y_pred

Beste Parameter: {'alpha': 1}
RMSE: 58.4996850092601


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged_test1["Solar_MWh_pred"] = y_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged_test2["Solar_MWh_pred"] = y_pred


In [175]:
old_rmse = rmse
old_rmse

np.float64(58.4996850092601)

In [150]:
# Extrahiere den besten Linearen Regressor aus GridSearchCV
best_ridge_model = pipe_ridge.named_steps["gs_ridge"].best_estimator_

# Abrufe der Feature-Namen nach Vorverarbeitung
feature_names = pipe_ridge.named_steps["Vorverarbeitung"].get_feature_names_out()

# Erstelle einen DataFrame mit den Koeffizienten
coefficients = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": best_ridge_model.coef_
})

coefficients["Feature_Importance"] = coefficients["Coefficient"].abs()
coefficients = coefficients.sort_values(by="Feature_Importance", ascending=False)

# Ausgabe der wichtigsten Features
coefficients

Unnamed: 0,Feature,Coefficient,Feature_Importance
12,Skalieren__SolarDownwardRadiation,103.291188,103.291188
15,Skalieren__SolarRadiation_lag_1h,65.101986,65.101986
7,O-H-Encoding__time_period_Vormittag,46.575643,46.575643
5,O-H-Encoding__time_period_Nachmittag,-31.374148,31.374148
3,O-H-Encoding__time_period_Mittag,22.352317,22.352317
2,O-H-Encoding__time_period_Abend,-16.905081,16.905081
6,O-H-Encoding__time_period_Nacht,-10.621326,10.621326
4,O-H-Encoding__time_period_Morgen,-10.027404,10.027404
10,O-H-Encoding__season_3,-9.878982,9.878982
11,O-H-Encoding__season_4,5.189995,5.189995


In [176]:
#
# !Pipeline: Lasso Modell
pipe_lasso = Pipeline([("Vorverarbeitung", vorverarbeitung),
                       ("gs_lasso", GridSearchCV(Lasso(), param_grid, cv=5))
                       ])
pipe_lasso.fit(X_train, y_train)

# Extrahiere das beste Modell und die besten Parameter
best_params = pipe_lasso.named_steps["gs_lasso"].best_params_
print("Beste Parameter:", best_params)

y_pred = pipe_lasso.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print("RMSE:", rmse)

y_pred = pipe_lasso.predict(df_merged_test1)
df_merged_test1["Solar_MWh_pred"] = y_pred
y_pred = pipe_lasso.predict(df_merged_test2)
df_merged_test2["Solar_MWh_pred"] = y_pred

Beste Parameter: {'alpha': 0.01}
RMSE: 58.50693605461164


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged_test1["Solar_MWh_pred"] = y_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged_test2["Solar_MWh_pred"] = y_pred


In [177]:
old_rmse = rmse
old_rmse

np.float64(58.50693605461164)

In [178]:
# Extrahiere den besten Linearen Regressor aus GridSearchCV
best_lasso_model = pipe_lasso.named_steps["gs_lasso"].best_estimator_

# Abrufe der Feature-Namen nach Vorverarbeitung
feature_names = pipe_lasso.named_steps["Vorverarbeitung"].get_feature_names_out()

# Erstelle einen DataFrame mit den Koeffizienten
coefficients = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": best_lasso_model.coef_
})

coefficients["Feature_Importance"] = coefficients["Coefficient"].abs()
coefficients = coefficients.sort_values(by="Feature_Importance", ascending=False)

# Ausgabe der wichtigsten Features
coefficients

Unnamed: 0,Feature,Coefficient,Feature_Importance
12,Skalieren__SolarDownwardRadiation,120.8729,120.8729
15,Skalieren__SolarRadiation_lag_1h,78.25981,78.25981
7,O-H-Encoding__time_period_Vormittag,55.39818,55.39818
3,O-H-Encoding__time_period_Mittag,35.9093,35.9093
16,Skalieren__AvgSolarRadiation_last_3h,-31.26565,31.26565
5,O-H-Encoding__time_period_Nachmittag,-16.68769,16.68769
10,O-H-Encoding__season_3,-11.70344,11.70344
2,O-H-Encoding__time_period_Abend,-5.448076,5.448076
0,O-H-Encoding__Weather Model_DWD ICON,3.087895,3.087895
11,O-H-Encoding__season_4,2.67754,2.67754


#### DecisionTree

In [24]:
# Hyperparameter für den Entscheidungsbaum
param_grid = {
    'max_depth': [3, 5, 10, 15],
    'min_samples_split': [2, 4, 5, 8, 10],
    'min_samples_leaf': [1, 2, 5]
}

In [25]:
#
# !Pipeline: DecisionTree Modell
pipe_tree = Pipeline([("Vorverarbeitung", vorverarbeitung),
                       ("gs_tree", GridSearchCV(DecisionTreeRegressor(), param_grid, cv=5))
                       ])
pipe_tree.fit(X_train, y_train)

# Extrahiere das beste Modell und die besten Parameter
best_params = pipe_tree.named_steps["gs_tree"].best_params_
print("Beste Parameter:", best_params)

y_pred = pipe_tree.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print("RMSE:", rmse)

y_pred = pipe_tree.predict(df_merged_test1)
df_merged_test1["Solar_MWh_pred"] = y_pred
y_pred = pipe_tree.predict(df_merged_test2)
df_merged_test2["Solar_MWh_pred"] = y_pred

Beste Parameter: {'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 4}
RMSE: 54.89852266685542


In [26]:
old_rmse = rmse
old_rmse

np.float64(54.89852266685542)

In [182]:
# Extrahiere den besten DecisionTreeRegressor aus GridSearchCV
best_tree_model = pipe_tree.named_steps["gs_tree"].best_estimator_

# Abrufen der Feature-Namen nach Vorverarbeitung
feature_names = pipe_tree.named_steps["Vorverarbeitung"].get_feature_names_out()

# Erstelle einen DataFrame mit den Feature Importances
feature_importances = pd.DataFrame({
    "Feature": feature_names,
    "Importance": best_tree_model.feature_importances_
})

# Sortiere die Features nach ihrer Importance (absolut, um die wichtigsten Features zuerst zu sehen)
feature_importances = feature_importances.sort_values(by="Importance", ascending=False)

# Ausgabe der Feature Importances
feature_importances

Unnamed: 0,Feature,Importance
12,Skalieren__SolarDownwardRadiation,0.819577
16,Skalieren__AvgSolarRadiation_last_3h,0.117158
15,Skalieren__SolarRadiation_lag_1h,0.033221
5,O-H-Encoding__time_period_Nachmittag,0.006381
7,O-H-Encoding__time_period_Vormittag,0.005624
13,Skalieren__Sun_CloudCover,0.004533
14,Skalieren__CloudCover_change,0.00379
1,O-H-Encoding__Weather Model_NCEP GFS,0.003241
3,O-H-Encoding__time_period_Mittag,0.001616
8,O-H-Encoding__season_1,0.001535


#### Ensemble-Modell

##### Random Forrest

In [27]:
# Hyperparameter für Random Forest
param_grid = {
    'n_estimators': [15, 20, 25],
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}
# !Pipeline: Random Forest Modell
pipe_rand_frst = Pipeline([("Vorverarbeitung", vorverarbeitung),
                       ("gs_rand_frst", GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5))
                       ])
pipe_rand_frst.fit(X_train, y_train)

# Extrahiere das beste Modell und die besten Parameter
best_params = pipe_rand_frst.named_steps["gs_rand_frst"].best_params_
print("Beste Parameter:", best_params)

y_pred = pipe_rand_frst.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print("RMSE:", rmse)

y_pred = pipe_rand_frst.predict(df_merged_test1)
df_merged_test1["Solar_MWh_pred"] = y_pred
y_pred = pipe_rand_frst.predict(df_merged_test2)
df_merged_test2["Solar_MWh_pred"] = y_pred

Beste Parameter: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 25}
RMSE: 50.465222223505016


In [215]:
old_rmse = rmse
old_rmse

np.float64(50.496053918857356)

In [187]:
# Extrahiere den besten DecisionTreeRegressor aus GridSearchCV
best_randtree_model = pipe_rand_frst.named_steps["gs_rand_frst"].best_estimator_

# Abrufen der Feature-Namen nach Vorverarbeitung
feature_names = pipe_rand_frst.named_steps["Vorverarbeitung"].get_feature_names_out()

# Erstelle einen DataFrame mit den Feature Importances
feature_importances = pd.DataFrame({
    "Feature": feature_names,
    "Importance": best_randtree_model.feature_importances_
})

# Sortiere die Features nach ihrer Importance (absolut, um die wichtigsten Features zuerst zu sehen)
feature_importances = feature_importances.sort_values(by="Importance", ascending=False)

# Ausgabe der Feature Importances
feature_importances

Unnamed: 0,Feature,Importance
12,Skalieren__SolarDownwardRadiation,0.812659
16,Skalieren__AvgSolarRadiation_last_3h,0.120524
15,Skalieren__SolarRadiation_lag_1h,0.034452
5,O-H-Encoding__time_period_Nachmittag,0.006365
7,O-H-Encoding__time_period_Vormittag,0.006199
13,Skalieren__Sun_CloudCover,0.005338
14,Skalieren__CloudCover_change,0.004979
3,O-H-Encoding__time_period_Mittag,0.00191
1,O-H-Encoding__Weather Model_NCEP GFS,0.001623
10,O-H-Encoding__season_3,0.0015


##### Gradient Boosting

In [217]:
# Hyperparameter für Gradient Boosting
param_grid = {
    "learning_rate": [0.01, 0.1, 0.2],
    "n_estimators": [50, 100, 200],
    "max_depth": [3, 5, 7],
    "max_features": [None, "sqrt", "log2"]
}
# !Pipeline: Gradient Boosting Modell
pipe_grad_boost = Pipeline([("Vorverarbeitung", vorverarbeitung),
                       ("gs_grad_boost", GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid))
                       ])
pipe_grad_boost.fit(X_train, y_train)

# Extrahiere das beste Modell und die besten Parameter
best_params = pipe_grad_boost.named_steps["gs_grad_boost"].best_params_
print("Beste Parameter:", best_params)

y_pred = pipe_grad_boost.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print("RMSE:", rmse)

y_pred = pipe_grad_boost.predict(df_merged_test1)
df_merged_test1["Solar_MWh_pred"] = y_pred
y_pred = pipe_grad_boost.predict(df_merged_test2)
df_merged_test2["Solar_MWh_pred"] = y_pred

Beste Parameter: {'learning_rate': 0.1, 'max_depth': 7, 'max_features': 'sqrt', 'n_estimators': 100}
RMSE: 50.74023095252428


In [219]:
old_rmse = rmse
old_rmse

np.float64(50.74023095252428)

In [28]:
df_merged_test1.to_pickle('test1.pkl')
df_merged_test2.to_pickle('test2.pkl')