In [None]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error

In [None]:
# Load the training data set from CSV (path is relative to the notebook location).
df_train = pd.read_csv("../Daten/train_data.csv")

In [None]:
# Parse the timestamp column and promote it to the index.
# The guard makes the cell safe to re-run: after the first run the column has
# already become the index, and parsing it again would raise a KeyError.
# errors='coerce' turns unparseable timestamps into NaT instead of raising.
if 'Date and time' in df_train.columns:
    df_train['Date and time'] = pd.to_datetime(
        df_train['Date and time'], format='mixed', dayfirst=True, errors='coerce'
    )
    df_train = df_train.set_index('Date and time')

In [None]:
# Summary statistics of the training frame (rendered as the cell output).
df_train.describe()

In [None]:
# Load the validation data set from CSV (path is relative to the notebook location).
df_val = pd.read_csv("../Daten/validation_data.csv")

In [None]:
# Parse the timestamp column and promote it to the index.
# The guard makes the cell safe to re-run: after the first run the column has
# already become the index, and parsing it again would raise a KeyError.
# errors='coerce' turns unparseable timestamps into NaT instead of raising.
if 'Date and time' in df_val.columns:
    df_val['Date and time'] = pd.to_datetime(
        df_val['Date and time'], format='mixed', dayfirst=True, errors='coerce'
    )
    df_val = df_val.set_index('Date and time')

# Random Forest

In [None]:
# Single-feature setup: the estimators expect a 2-D feature matrix, so the
# wind-speed column is selected as a one-column frame and converted to an
# (n_samples, 1) array. The targets stay pandas Series.
X_train = df_train[["Density adjusted wind speed (m/s)"]].to_numpy()
y_train = df_train["Power (kW)"]

X_val = df_val[["Density adjusted wind speed (m/s)"]].to_numpy()
y_val = df_val["Power (kW)"]

In [None]:
# Sanity check: shapes of the feature matrix and the target before fitting.
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

rf = RandomForestRegressor(random_state=42)

# Hyper-parameter grid for the exhaustive search.
# Every entry must be a flat sequence of valid candidate values:
#   * n_estimators / min_samples_leaf must be integers,
#   * min_samples_split must be an integer >= 2 (floats are read as fractions
#     and have to lie in (0, 1]),
#   * max_features no longer accepts 'auto'; 1.0 (use all features) is what
#     'auto' used to mean for regressors.
# 3 * 3 * 3 * 3 * 2 = 162 candidates, each fitted cv=3 times.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [1.0, 'sqrt'],
}

# scoring is negated MSE because GridSearchCV maximises the score.
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

print(grid_search_rf)

In [None]:
# Run the search. fit() returns the fitted search object itself, so rf_fit is
# simply another name for grid_search_rf (kept because a later cell pickles it).
grid_search_rf.fit(X_train, y_train)
rf_fit = grid_search_rf

# Predict the validation set with the search object.
y_pred = rf_fit.predict(X_val)

In [None]:
import pickle

# Persist the fitted grid-search object to disk (written to the current working directory).
with open('rf_model.pkl', 'wb') as file:
    pickle.dump(rf_fit, file)

In [None]:
# Error metrics are computed on the first 144 validation samples only
# (the same window the plots use); the prediction print still shows all of y_pred.
actual_values = df_val["Power (kW)"].head(144)
predicted_head = y_pred[:144]

mae = mean_absolute_error(actual_values, predicted_head)
rmse = root_mean_squared_error(actual_values, predicted_head)

print("Vorhersagen:", y_pred)
print("Tatsächliche Werte:", actual_values.values)
print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)

In [None]:
# Compare actual and predicted power over the first `horizon` validation samples.
horizon = 144
time_axis = df_val.index[:horizon]

plt.figure(figsize=(10, 6))
plt.plot(time_axis, df_val["Power (kW)"].iloc[:horizon], label="Tatsächliche Werte (Validierung)")
plt.plot(time_axis, y_pred[:horizon], label="Vorhersagen")
plt.title("Random Forest Vorhersage mit Winddaten")
plt.xlabel("Datum")
plt.ylabel("Power (kW)")
plt.legend()
plt.show()

# Gradient Boosting

In [None]:
# Rebuild the single-feature (wind speed) matrices as (n_samples, 1) arrays;
# the targets stay pandas Series.
X_train = df_train[["Density adjusted wind speed (m/s)"]].to_numpy()
y_train = df_train["Power (kW)"]

X_val = df_val[["Density adjusted wind speed (m/s)"]].to_numpy()
y_val = df_val["Power (kW)"]

In [None]:
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

gbr = GradientBoostingRegressor(random_state=42)

# Candidate values; the integer-valued parameters are generated with dtype=int.
# 5 * 5 * 3 * 4 * 2 = 600 candidates, each fitted on cv=2 folds.
param_grid = dict(
    n_estimators=np.linspace(100, 200, num=5, dtype=int),
    learning_rate=np.logspace(-3, -1, num=5),
    max_depth=np.linspace(3, 5, num=3, dtype=int),
    min_samples_split=np.linspace(2, 5, num=4, dtype=int),
    min_samples_leaf=np.linspace(1, 2, num=2, dtype=int),
)

grid_search = GridSearchCV(
    gbr,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=2,
    verbose=2,
    n_jobs=-1,
)

# gb_fit is the value returned by fit(); a later cell pickles it.
gb_fit = grid_search.fit(X_train, y_train)

print("Beste Parameter:", grid_search.best_params_)
print("Beste Score (neg MSE):", grid_search.best_score_)

# Score the refitted best estimator on the validation set
# (the regressor's default score is R^2, as the printed label states).
best_model = grid_search.best_estimator_
val_score = best_model.score(X_val, y_val)
print("Test Score (R^2):", val_score)

In [None]:
import pickle

# Persist the fitted gradient-boosting grid-search object to disk.
with open('gb_model.pkl', 'wb') as file:
    pickle.dump(gb_fit, file)

In [None]:
# Validation predictions from the grid search.
# NOTE(review): the next cell reassigns y_pred from a hand-configured model,
# so these predictions are not the ones evaluated below.
y_pred = grid_search.predict(X_val)

In [None]:
# Gradient boosting model with hand-picked hyper-parameters; its validation
# predictions replace y_pred.
gb_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,
    random_state=42,
)

# fit() returns the fitted estimator, so fit and predict can be chained.
y_pred = gb_model.fit(X_train, y_train).predict(X_val)

In [None]:
# Error metrics are computed on the first 144 validation samples only
# (the same window the plots use); the prediction print still shows all of y_pred.
actual_values = df_val["Power (kW)"].head(144)
predicted_head = y_pred[:144]

mae = mean_absolute_error(actual_values, predicted_head)
rmse = root_mean_squared_error(actual_values, predicted_head)

print("Vorhersagen:", y_pred)
print("Tatsächliche Werte:", actual_values.values)
print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)

In [None]:
# Compare actual and predicted power over the first `horizon` validation samples.
horizon = 144
time_axis = df_val.index[:horizon]

plt.figure(figsize=(10, 6))
plt.plot(time_axis, df_val["Power (kW)"].iloc[:horizon], label="Tatsächliche Werte (Validierung)")
plt.plot(time_axis, y_pred[:horizon], label="Vorhersagen")
plt.title("Gradient Boosting Vorhersage mit Winddaten")
plt.xlabel("Datum")
plt.ylabel("Power (kW)")
plt.legend()
plt.show()

# XGBoost

In [None]:
import xgboost as xgb
from xgboost import XGBRegressor

In [None]:
# Train an XGBoost regressor with hand-picked hyper-parameters.
from xgboost import XGBRegressor

xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=10,
    gamma=0.5,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=46.41588833612773,
    reg_lambda=2.154434690031882,
    random_state=42,
)
model = XGBRegressor(**xgb_params)

# The validation set is passed as eval_set; verbose=False suppresses the per-round log.
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

In [None]:
from sklearn.model_selection import GridSearchCV

# Parameter grid for the search.
#   * alpha / lambda: 10 log-spaced values from 1e-3 to 1e3. np.logspace needs
#     both start and stop; the previous single-argument call raised a TypeError.
#   * max_depth must be an integer, so explicit integer candidates are listed
#     instead of a float linspace.
# 10 * 10 * 3 * 3 = 900 candidates, each fitted on cv=5 folds.
param_grid = {
    'alpha': np.logspace(-3, 3, 10),
    'lambda': np.logspace(-3, 3, 10),
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}

# Base model. Deliberately not named `xgb`, which would shadow the imported
# xgboost module of the same name.
xgb_estimator = XGBRegressor(n_estimators=100)

# Grid search (scoring is negated MSE because GridSearchCV maximises the score).
grid_search = GridSearchCV(estimator=xgb_estimator, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
xgb_fit = grid_search.fit(X_train, y_train)

print("Beste Parameter:", grid_search.best_params_)
# Sign flipped back so the printed value is a plain MSE.
print("Bestes Ergebnis:", -grid_search.best_score_)

In [None]:
import pickle

# Persist the fitted XGBoost grid-search object to disk.
with open('xgb_model.pkl', 'wb') as file:
    pickle.dump(xgb_fit, file)

In [None]:
# Validation predictions from the grid search.
# NOTE(review): y_pred is reassigned by later cells before it is evaluated.
y_pred = grid_search.predict(X_val)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Candidate values for the randomized search.
# max_depth must be an integer, so explicit integer candidates are listed
# instead of a float linspace.
param_dist = {
    'alpha': np.logspace(-3, 3, 10),
    'lambda': np.logspace(-3, 3, 10),
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}

# Base model. Deliberately not named `xgb`, which would shadow the imported
# xgboost module of the same name.
xgb_estimator = XGBRegressor(n_estimators=100)

# Randomized search: n_iter=50 sampled candidates, each fitted on cv=5 folds;
# random_state makes the sampling reproducible.
random_search = RandomizedSearchCV(estimator=xgb_estimator, param_distributions=param_dist, n_iter=50, scoring='neg_mean_squared_error', cv=5, random_state=42)
random_search.fit(X_train, y_train)

print("Beste Parameter:", random_search.best_params_)
# Sign flipped back so the printed value is a plain MSE.
print("Bestes Ergebnis:", -random_search.best_score_)

In [None]:
# Validation predictions from the randomized search.
# NOTE(review): the next cell reassigns y_pred from the hand-configured `model`.
y_pred = random_search.predict(X_val)

In [None]:
# Validation predictions from the hand-configured XGBoost `model`;
# these are the values scored and plotted in the following cells.
y_pred = model.predict(X_val)

In [None]:
# Error metrics are computed on the first 144 validation samples only
# (the same window the plots use); the prediction print still shows all of y_pred.
actual_values = df_val["Power (kW)"].head(144)
predicted_head = y_pred[:144]

mae = mean_absolute_error(actual_values, predicted_head)
rmse = root_mean_squared_error(actual_values, predicted_head)

print("Vorhersagen:", y_pred)
print("Tatsächliche Werte:", actual_values.values)
print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)

In [None]:
# Compare actual and predicted power over the first `horizon` validation samples.
horizon = 144
time_axis = df_val.index[:horizon]

plt.figure(figsize=(10, 6))
plt.plot(time_axis, df_val["Power (kW)"].iloc[:horizon], label="Tatsächliche Werte (Validierung)")
plt.plot(time_axis, y_pred[:horizon], label="Vorhersagen")
plt.title("XGBoost Vorhersage mit Winddaten")
plt.xlabel("Datum")
plt.ylabel("Power (kW)")
plt.legend()
plt.show()

## Forecast

In [None]:
def create_features(df):
    """Return a copy of ``df`` with calendar features derived from its index.

    The index must expose the datetime accessors used here (``hour``,
    ``dayofweek``, ``month``, ``year``, ``dayofyear``), i.e. be a
    ``DatetimeIndex``. The input frame is not modified.

    Returns:
        A new DataFrame holding the original columns followed by the columns
        ``hour``, ``dayofweek``, ``month``, ``year`` and ``dayofyear``.
    """
    stamps = df.index
    # assign() works on a copy and adds the columns in keyword order.
    return df.assign(
        hour=stamps.hour,
        dayofweek=stamps.dayofweek,
        month=stamps.month,
        year=stamps.year,
        dayofyear=stamps.dayofyear,
    )

In [None]:
def add_lags(df, lags=(6, 12, 18), target="Power (kW)"):
    """Return a copy of ``df`` with lagged versions of the target column.

    For every ``n`` in ``lags`` a column ``lag_<n>`` is added that holds the
    target shifted down by ``n`` rows, so the first ``n`` rows of that column
    are NaN. The shift is row-based and does not look at the index values.

    Unlike the previous version, the input frame is left untouched (a copy is
    modified), which matches ``create_features``.

    Args:
        df: Frame containing the ``target`` column.
        lags: Row offsets to create lag columns for.
        target: Name of the column to lag.

    Returns:
        A new DataFrame with the additional ``lag_<n>`` columns.

    Raises:
        ValueError: If ``target`` is not a column of ``df``.
    """
    # Make sure the target column exists before doing any work.
    if target not in df.columns:
        raise ValueError(f"Die Spalte '{target}' muss im DataFrame enthalten sein.")

    df = df.copy()

    # One uniquely named column per requested lag.
    for lag_steps in lags:
        df[f"lag_{lag_steps}"] = df[target].shift(lag_steps)

    return df

In [None]:
# Add the calendar features to both data sets (create_features returns copies).
train = create_features(df_train)
val = create_features(df_val)

In [None]:

# Stack training and validation rows into one frame; the lag features are
# computed on this combined frame in a later cell.
trainval = pd.concat([train, val])

In [None]:
# Inspect the available columns (displayed as the cell output).
train.columns

In [None]:
# Add the lag_6 / lag_12 / lag_18 columns to the combined frame.
trainval = add_lags(trainval)

In [None]:
# Inspect the 6-step lag column (displayed as the cell output).
trainval["lag_6"]

In [None]:
# Row counts of the two parts; the validation length determines the split below.
print(len(train))
print(len(val))

In [None]:
# Split the lag-augmented frame back into its training and validation parts.
# The split point comes from the validation set's length rather than a
# hard-coded row count, so it stays correct if the data files change.
n_val = len(df_val)
split_at = len(trainval) - n_val

# Explicit copies: later cells add a 'prediction' column to `val`, which should
# not be done on a view of `trainval`.
train = trainval.iloc[:split_at].copy()
val = trainval.iloc[split_at:].copy()

In [None]:
# Remove the measured wind columns so the forecast frames hold only the target,
# calendar and lag features. drop() returns a new frame, so the result has to
# be assigned back; the previous version discarded it and changed nothing.
# errors='ignore' keeps the cell re-runnable once the columns are gone.
wind_columns = ['Density adjusted wind speed (m/s)', 'Wind direction (°)',
       'Nacelle position (°)']
train = train.drop(columns=wind_columns, errors='ignore')
val = val.drop(columns=wind_columns, errors='ignore')

val.head()

In [None]:
# Model inputs: calendar features plus the three lag columns.
FEATURES = ['hour', 'dayofweek', 'month',
       'year', 'dayofyear', 'lag_6', 'lag_12', 'lag_18']
# TARGET is a list, so selecting with it yields a one-column DataFrame, not a Series.
TARGET = ['Power (kW)']

In [None]:
# Feature/target frames for the forecast models.
# NOTE: this rebinds X_train / y_train / X_val / y_val, which earlier sections
# used for the wind-speed-only models.
X_train = train[FEATURES]
y_train = train[TARGET]

X_val = val[FEATURES]
y_val = val[TARGET]

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Parameter grid for the forecast model.
#   * alpha: 5 evenly spaced values in [0, 10]. np.linspace needs both start
#     and stop; the previous nested call raised a TypeError.
#   * max_depth must be an integer, so explicit integer candidates are listed
#     instead of a float linspace.
# 5 * 5 * 5 * 3 = 375 candidates, each fitted on cv=5 folds.
param_grid = {
    'alpha': np.linspace(0, 10, 5),
    'lambda': np.linspace(0, 100, 5),
    'learning_rate': np.linspace(0.03, 0.07, 5),
    'max_depth': [3, 5, 7],
}

# Base model. Deliberately not named `xgb`, which would shadow the imported
# xgboost module of the same name.
xgb_estimator = XGBRegressor(n_estimators=100)

grid_search = GridSearchCV(estimator=xgb_estimator, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

print("Beste Parameter:", grid_search.best_params_)
# Sign flipped back so the printed value is a plain MSE.
print("Bestes Ergebnis:", -grid_search.best_score_)

In [None]:
# NOTE(review): this runs the complete grid search a second time. The extra
# keyword arguments (eval_set, verbose) are not consumed by GridSearchCV itself;
# per the scikit-learn docs they are forwarded to the estimator's fit().
grid_search.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=True)

In [None]:
# Forecast model with hand-picked hyper-parameters (the same values as the
# wind-speed `model` in the XGBoost section), trained on the calendar/lag features.
reg = XGBRegressor(
    objective='reg:squarederror',
    max_depth=3,
    learning_rate=0.05,
    n_estimators=100,
    gamma=0.5,
    min_child_weight=10,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=46.41588833612773,
    reg_lambda=2.154434690031882,
    random_state=42
)
# Both the training and the validation set are passed as eval_set;
# verbose=True prints the evaluation log during training.
reg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=True)

## Feature Importance

In [None]:
# Feature importances of the fitted forecast model, indexed by feature name
# and drawn as a horizontal bar chart sorted ascending.
fi = pd.DataFrame(data=reg.feature_importances_, index=reg.feature_names_in_,columns=['importance'])
fi.sort_values('importance').plot(kind='barh', title='Feature Importance')

## Forecast on Validation Set

In [None]:
# Store the forecast next to the validation rows.
val['prediction'] = reg.predict(X_val)
# The merge result is not assigned: it is only displayed as the cell output,
# df_val itself stays unchanged.
df_val.merge(val[['prediction']], how='left', left_index=True, right_index=True)

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

# Evaluate the lag/calendar forecast on the first 144 validation samples.
# The predictions are taken from val['prediction'] (written by the forecast
# cell above). The previous version scored `y_pred`, which still held the
# output of the wind-speed model from the XGBoost section.
actual_values = df_val["Power (kW)"].iloc[:144]
predicted_values = val['prediction'].iloc[:144].to_numpy()

mae = mean_absolute_error(actual_values, predicted_values)
rmse = np.sqrt(np.mean((actual_values - predicted_values) ** 2))

# "Accuracy" here is 100 % minus the MAE relative to the mean actual power.
accuracy = 100 - (mae / actual_values.mean() * 100)

# Signed mean error: positive means the model over-predicts on average.
mean_deviation = np.mean(predicted_values - actual_values)

r2 = r2_score(actual_values, predicted_values)

print("Vorhersagen:", predicted_values)
print("Tatsächliche Werte:", actual_values.values)
print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)
print("Genauigkeit in Prozent:", accuracy, "%")
print("Mean Deviation (MD):", mean_deviation)
print("R2-Score:", r2)

In [None]:
# Compare actual power and the stored forecast over the first `horizon` validation samples.
horizon = 144
time_axis = df_val.index[:horizon]

plt.figure(figsize=(10, 6))
plt.plot(time_axis, df_val["Power (kW)"].iloc[:horizon], label="Tatsächliche Werte (Validierung)")
plt.plot(time_axis, val['prediction'].iloc[:horizon], label="Vorhersagen")
plt.title("XGBoost Vorhersage")
plt.xlabel("Datum")
plt.ylabel("Power (kW)")
plt.legend()
plt.show()

In [None]:
# Whole validation period: actual power as a line, predictions as dots on the same axes.
ax = val["Power (kW)"].plot(figsize=(15,5))
val['prediction'].plot(ax=ax, style='.')

## Random Forest Forecast

In [None]:
# Re-run the Random Forest grid search from the "Random Forest" section.
# Assuming the cells run top to bottom, X_train / y_train now hold the
# calendar and lag features rather than the wind speed.
grid_search_rf.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the search. Fitted attributes carry a
# trailing underscore; `best_params` does not exist and raised an AttributeError.
grid_search_rf.best_params_

In [None]:
# GridSearchCV does not expose feature_importances_; it lives on the refitted
# best estimator, so both the importances and the feature names are read from there.
best_rf = grid_search_rf.best_estimator_
fi = pd.DataFrame(data=best_rf.feature_importances_, index=best_rf.feature_names_in_, columns=['importance'])
fi.sort_values('importance').plot(kind='barh', title='Feature Importance')

In [None]:
# Overwrite the stored forecast with the Random Forest predictions.
val['prediction'] = grid_search_rf.predict(X_val)
# The merge result is not assigned: it is only displayed as the cell output,
# df_val itself stays unchanged.
df_val.merge(val[['prediction']], how='left', left_index=True, right_index=True)

In [None]:
# Compare actual power and the Random Forest forecast over the first `horizon`
# validation samples.
horizon = 144

plt.figure(figsize=(10, 6))
plt.title("Random Forest Vorhersage")
plt.plot(df_val.index[:horizon], df_val["Power (kW)"].iloc[:horizon], label="Tatsächliche Werte (Validierung)")
# The prediction is sliced to the same window as the x axis; plotting the full
# column against `horizon` timestamps raised a shape-mismatch error.
plt.plot(df_val.index[:horizon], val['prediction'].iloc[:horizon], label="Vorhersagen")
plt.ylabel("Power (kW)")
plt.xlabel("Datum")
plt.legend()
plt.show()

# S-Kurve

In [None]:
# Load the SCADA export used to fit the power curve (path is relative to the notebook).
df = pd.read_csv("../Daten/kelmarsh_scada_2018/kelmarsh_turbine1.csv")

In [None]:
from scipy.optimize import curve_fit

wind_speed = df["Wind speed (m/s)"]
# Energy per record converted to power by the factor 6
# (presumably 10-minute records, i.e. six per hour — confirm against the data source).
power = df["Energy Theoretical (kWh)"] * 6

# Keep only rows where both quantities are finite (drops NaN and +/-inf).
valid_mask = np.isfinite(wind_speed) & np.isfinite(power)
wind_speed = wind_speed[valid_mask]
power = power[valid_mask]

# Upper asymptote of the curve; held fixed and not fitted.
P_max_fixed = 2050

def s_curve(v, k, v_0):
    """Logistic power curve with fixed plateau ``P_max_fixed``.

    Args:
        v: Wind speed (scalar or array-like).
        k: Steepness of the curve.
        v_0: Wind speed at which the curve reaches half of ``P_max_fixed``.

    Returns:
        ``P_max_fixed / (1 + exp(-k * (v - v_0)))`` evaluated element-wise.
    """
    return P_max_fixed / (1 + np.exp(-k * (v - v_0)))

# Weight is close to 1 for samples near the plateau and shrinks with distance from it.
weights = 1 / (1 + np.abs(power - P_max_fixed))
# curve_fit's `sigma` is an uncertainty: a smaller sigma means a larger influence
# on the fit. Passing the weights directly inverted the intended weighting, so
# the reciprocal is passed instead.
# NOTE(review): this changes the fitted parameters compared with the previous
# version — confirm that emphasising the plateau is what was intended.
sigma = 1 / weights
popt, _ = curve_fit(s_curve, wind_speed, power, p0=[0.5, 10], sigma=sigma)

# Evaluate the fitted curve on a regular grid for plotting.
v_model = np.linspace(0, 25, 100)
p_model = s_curve(v_model, *popt)

plt.figure()
plt.scatter(wind_speed, power, label="Daten", color="red")
plt.plot(v_model, p_model, label="S-Kurve", color="blue")
plt.xlabel("Windgeschwindigkeit (m/s)")
plt.ylabel("Leistung (kW)")
plt.title("S-Kurve: Leistung vs. Windgeschwindigkeit")
plt.legend()
plt.grid()
plt.show()

In [None]:
# Persist the curve function together with its fitted parameters.
# NOTE(review): pickle stores a function by reference (module and name), not its
# code, so loading this file requires `s_curve` to be defined under the same
# name in the loading process — confirm consumers handle this.
with open("s_curve_model.pkl", "wb") as file:
    pickle.dump((s_curve, popt), file)

In [None]:
# Apply the fitted curve to the validation wind speeds.
new_wind_speeds = df_val["Density adjusted wind speed (m/s)"]

predicted_power = s_curve(new_wind_speeds, *popt)

# Print a short preview only; one line per validation row would flood the output.
# The loop variables are named so they do not overwrite the global `power`
# series used by the curve-fit cell.
preview_rows = 10
for wind_value, power_value in zip(new_wind_speeds.iloc[:preview_rows], predicted_power.iloc[:preview_rows]):
    print(f"Windgeschwindigkeit: {wind_value:.1f} m/s -> Vorhergesagte Leistung: {power_value:.2f} kW")

In [None]:
# Error metrics are computed on the first 144 validation samples only
# (the same window the plots use); the prediction print still shows all of predicted_power.
actual_values = df_val["Power (kW)"].head(144)
predicted_head = predicted_power[:144]

mae = mean_absolute_error(actual_values, predicted_head)
rmse = root_mean_squared_error(actual_values, predicted_head)

print("Vorhersagen:", predicted_power)
print("Tatsächliche Werte:", actual_values.values)
print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)

In [None]:
# Compare actual power and the S-curve prediction over the first `horizon` validation samples.
horizon = 144
time_axis = df_val.index[:horizon]

plt.figure(figsize=(10, 6))
plt.plot(time_axis, df_val["Power (kW)"].iloc[:horizon], label="Tatsächliche Werte (Validierung)")
plt.plot(time_axis, predicted_power[:horizon], label="Vorhersagen")
plt.title("S-Kurve Vorhersage")
plt.xlabel("Datum")
plt.ylabel("Power (kW)")
plt.legend()
plt.show()