In [None]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

## Training data

In [None]:
# Read the turbine-1 SCADA export of each training year into its own DataFrame.
df_2016, df_2017, df_2018, df_2019 = (
    pd.read_csv(f"../Daten/Penmanshiel_SCADA_{year}/penmanshiel_turbine1.csv")
    for year in (2016, 2017, 2018, 2019)
)

In [None]:
# Stack the yearly frames on top of each other and renumber the rows from 0.
dfs = [
    df_2016,
    df_2017,
    df_2018,
    df_2019,
]
df_combined = pd.concat(dfs, axis=0, ignore_index=True)

In [None]:
# Training window: every record up to and including this timestamp.
cutoff_date = "2019-06-30 23:50:00"

# NOTE(review): "Date and time" is still a string column at this point, so this is a
# lexicographic comparison. It only agrees with chronological order if the timestamps are
# zero-padded "YYYY-MM-DD HH:MM:SS" strings — confirm against the CSV files.
# .copy() makes df_train an independent frame, so the column assignments made on it in the
# following cells are not writes to a slice of df_combined.
df_train = df_combined[df_combined["Date and time"] <= cutoff_date].copy()

In [None]:
# Display the "Wind speed, Maximum (m/s)" column of the training frame as the cell output.
df_train["Wind speed, Maximum (m/s)"]

In [None]:
# Convert the timestamp column to datetimes. Each value is parsed individually
# (format='mixed', day-first where ambiguous); unparseable values become NaT (errors='coerce').
parsed_timestamps = pd.to_datetime(
    df_train['Date and time'],
    format='mixed',
    dayfirst=True,
    errors='coerce',
)
df_train['Date and time'] = parsed_timestamps

In [None]:
# Set the font size before plotting: rcParams are read when the title/label text objects
# are created, so updating them after plotting mainly affects the next figure instead of
# this one. (pyplot is already imported in the first cell, so it is not re-imported here.)
plt.rcParams.update({'font.size': 14})

# Month number of every record, used as the grouping key of the box plot.
df_train['Month'] = df_train['Date and time'].dt.month

# One box per month showing the distribution of the measured wind speed, outliers included.
df_train.boxplot(column='Wind speed (m/s)', by='Month', grid=False, showfliers=True, figsize=(10,6))

plt.title("Monatliche Verteilung der Windgeschwindigkeit")
plt.suptitle("")  # blank out the figure-level super-title so only the title above is shown
plt.xlabel("Monat")
plt.ylabel("Windgeschwindigkeit (m/s)")
plt.xticks(rotation=0)
plt.show()

In [None]:
# Set the font size before plotting: rcParams are read when the title/label text objects
# are created, so updating them after plotting mainly affects the next figure instead of
# this one. (pyplot is already imported in the first cell, so it is not re-imported here.)
plt.rcParams.update({'font.size': 14})

# Hour of day of every record, used as the grouping key of the box plot.
df_train['Hour'] = df_train['Date and time'].dt.hour

# One box per hour showing the distribution of the measured wind speed, outliers included.
df_train.boxplot(column='Wind speed (m/s)', by='Hour', grid=False, showfliers=True, figsize=(10,6))

plt.title("Stündliche Verteilung der Windgeschwindigkeit")
plt.suptitle("")  # blank out the figure-level super-title so only the title above is shown
plt.xlabel("Stunde")
plt.ylabel("Windgeschwindigkeit (m/s)")
plt.xticks(rotation=0)
plt.show()

In [None]:
import numpy as np

# Set the font size before the figure is built: rcParams are read when the title/label text
# objects are created, so updating them just before plt.show() mainly affects the next figure.
plt.rcParams.update({'font.size': 19})

filtered_df = df_train['Wind speed (m/s)']

# Bin edges in steps of 1, starting at 0 and reaching past the largest observed wind speed.
bin_edges = np.arange(0, filtered_df.max() + 1, 1)

plt.figure(figsize=(10, 6))
# density=False: bar heights are absolute counts per bin.
plt.hist(filtered_df, bins=bin_edges, density=False, alpha=0.6, color='skyblue', edgecolor='black')
# Spelling of the two labels below corrected ("Histogramm", "Windgeschwindigkeit").
plt.title("Histogramm der Windgeschwindigkeiten (Penmanshiel)")
plt.xlabel("Windgeschwindigkeit (m/s)")
plt.ylabel("Frequenz")
plt.grid(True)
plt.show()

In [None]:
# Set the font size before the figure is built: rcParams are read when the title/label text
# objects are created, so updating them just before plt.show() mainly affects the next figure.
plt.rcParams.update({'font.size': 18})

# Power output against density-adjusted wind speed for the unfiltered training data.
plt.figure(figsize=(10, 6))
plt.scatter(df_train["Density adjusted wind speed (m/s)"], df_train['Power (kW)'])
plt.xlabel('Wind Speed (m/s)')
plt.ylabel('Power (kW)')
plt.title('Power Kurve vor Bereinigung der Daten')
plt.grid(True)
plt.show()

In [None]:

# Pairwise correlations between all float64/int64 columns of the training frame.
numerische_spalten = df_train.select_dtypes(include=['float64', 'int64'])
korrelationsmatrix = numerische_spalten.corr()

# Keep only the correlations with "Power (kW)" and drop entries that are NaN.
korrelationen = korrelationsmatrix['Power (kW)'].dropna()


In [None]:
# Order the correlations from most negative to most positive and print them.
korrelationen = korrelationen.sort_values(ascending=True)
print(korrelationen)

In [None]:
threshold = 0

# Flag every row whose "Lost Production Total (kWh)" is strictly above the threshold.
lost_production = df_train["Lost Production Total (kWh)"]
condition = lost_production > threshold

# Keep only the rows that were not flagged; rows with a missing lost-production value
# compare as False above and are therefore kept.
df_train = df_train.loc[~condition]

print(df_train)

In [None]:

threshold = 0

# Rows are removed when the lost production is >= threshold or <= -20 while the wind speed
# is at least 2.5 m/s.
# NOTE(review): once the preceding cell has removed every value > 0, ">= threshold" can only
# match rows that are exactly 0 — confirm that dropping those rows is intended.
lost_production = df_train["Lost Production Total (kWh)"]
outside_band = (lost_production >= threshold) | (lost_production <= -20)
wind_at_least_2_5 = df_train["Wind speed (m/s)"] >= 2.5
condition = outside_band & wind_at_least_2_5

df_train = df_train.loc[~condition]

In [None]:

# Flag rows where the yaw bearing angle is not equal to the wind direction, the wind speed
# is above 13 m/s and the power is still below 1800 kW.
yaw_differs = df_train["Yaw bearing angle (°)"] != df_train["Wind direction (°)"]
strong_wind = df_train["Wind speed (m/s)"] > 13
low_power = df_train["Power (kW)"] < 1800
condition = yaw_differs & strong_wind & low_power

# Print power and wind speed of the flagged rows before they are removed.
cond = df_train[condition]
print(cond[["Power (kW)", "Wind speed (m/s)"]])

df_train = df_train.loc[~condition]

In [None]:
# Set the font size before the figure is built: rcParams are read when the title/label text
# objects are created, so updating them just before plt.show() mainly affects the next figure.
plt.rcParams.update({'font.size': 18})

# Power output against density-adjusted wind speed for the filtered training data.
plt.figure(figsize=(10, 6))
plt.scatter(df_train["Density adjusted wind speed (m/s)"], df_train['Power (kW)'])
plt.xlabel('Wind Speed (m/s)')
plt.ylabel('Power (kW)')
plt.title('Power Kurve nach Bereinigung der Daten')
plt.grid(True)
plt.show()

In [None]:
# Columns kept for the exported training set: the timestamp plus five measurement columns.
relevant_columns = [
    "Date and time",
    "Density adjusted wind speed (m/s)",
    "Wind direction (°)",
    "Nacelle position (°)",
    "Power (kW)",
    "Wind speed (m/s)",
]
df_train = df_train.loc[:, relevant_columns]

In [None]:
# Summary statistics of the reduced training frame, shown before negative power values are set to 0.
df_train.describe()

In [None]:
# Set negative power readings to 0. The previous form assigned through chained indexing
# (df["col"][mask] = 0), which is not guaranteed to write back to the frame and has no
# effect under pandas Copy-on-Write; clipping the column and assigning it back always works.
# Missing values stay missing.
df_train["Power (kW)"] = df_train["Power (kW)"].clip(lower=0)

In [None]:
# Summary statistics again, after the negative power values were set to 0.
df_train.describe()

In [None]:
# Number of missing values per column, shown before the gaps are filled.
df_train.isna().sum()

In [None]:
# Fill gaps in the float64/int64 columns: interpolate first, then back-fill and forward-fill
# whatever is still missing. .bfill()/.ffill() replace fillna(method=...), which is deprecated
# in current pandas releases.
df_numeric = df_train.select_dtypes(include=["float64", "int64"])
df_train[df_numeric.columns] = df_numeric.interpolate().bfill().ffill()

In [None]:
# Number of missing values per column again, to check the result of the gap filling.
df_train.isna().sum()

In [None]:
# Write the training set to CSV without the row index and report the target path.
output_file = "../Daten/train_data_penman.csv"
df_train.to_csv(path_or_buf=output_file, index=False)
print(f"Gefilterte Daten wurden als '{output_file}' gespeichert.")

In [None]:
# The same column names as relevant_columns, without "Date and time".
COLUMNS = [
    "Density adjusted wind speed (m/s)",
    "Wind direction (°)",
    "Nacelle position (°)",
    "Power (kW)",
    "Wind speed (m/s)",
]

## Validation data

In [None]:
# Load the 2020 SCADA export of turbine 1.
df_2020 = pd.read_csv("../Daten/Penmanshiel_SCADA_2020/penmanshiel_turbine1.csv")

In [None]:
# Stack the 2019 and 2020 frames and renumber the rows from 0.
dfs = [
    df_2019,
    df_2020,
]
df_combined_val = pd.concat(dfs, axis=0, ignore_index=True)

In [None]:
# Reduce the combined 2019/2020 frame to the timestamp plus five measurement columns.
relevant_columns = [
    "Date and time",
    "Density adjusted wind speed (m/s)",
    "Wind direction (°)",
    "Nacelle position (°)",
    "Power (kW)",
    "Wind speed (m/s)",
]
df_combined_val = df_combined_val.loc[:, relevant_columns]

In [None]:
# Validation window, both bounds inclusive.
cutoff_date = "2020-06-30 23:50:00"
start_date = "2019-07-01 00:00:00"

# NOTE(review): "Date and time" holds strings here, so both comparisons are lexicographic;
# they only agree with chronological order for zero-padded "YYYY-MM-DD HH:MM:SS" strings —
# confirm against the CSV files.
timestamps = df_combined_val["Date and time"]
in_validation_window = (timestamps >= start_date) & (timestamps <= cutoff_date)

# .copy() gives df_val its own data, so the assignments made on it in the following cells
# are not writes to a slice of df_combined_val.
df_val = df_combined_val[in_validation_window].copy()

In [None]:
# Show the last rows of the validation set to check where the window ends.
# (`df_val.last` without a call only displayed the bound method, not any data.)
df_val.tail()

In [None]:
# Set negative power readings to 0. The previous form assigned through chained indexing
# (df["col"][mask] = 0), which is not guaranteed to write back to the frame and has no
# effect under pandas Copy-on-Write; clipping the column and assigning it back always works.
# Missing values stay missing.
df_val["Power (kW)"] = df_val["Power (kW)"].clip(lower=0)

In [None]:
# Fill gaps in the float64/int64 columns: interpolate first, then back-fill and forward-fill
# whatever is still missing. .bfill()/.ffill() replace fillna(method=...), which is deprecated
# in current pandas releases.
df_numeric = df_val.select_dtypes(include=["float64", "int64"])
df_val[df_numeric.columns] = df_numeric.interpolate().bfill().ffill()

In [None]:
# Write the validation set to CSV without the row index and report the target path.
output_file = "../Daten/validation_data_penman.csv"
df_val.to_csv(path_or_buf=output_file, index=False)
print(f"Gefilterte Daten wurden als '{output_file}' gespeichert.")

## Test data

In [None]:
# Load the 2021 SCADA export of turbine 1.
# NOTE(review): this path spells the site "penmanshield" and uses lower case, unlike the
# "Penmanshiel_SCADA_<year>/penmanshiel_turbine1.csv" paths of the other years — confirm
# that it matches the folder and file names on disk.
df_2021 = pd.read_csv("../Daten/penmanshield_scada_2021/penmanshield_turbine1.csv")

In [None]:
# Stack the 2020 and 2021 frames and renumber the rows from 0.
dfs = [
    df_2020,
    df_2021,
]
df_combined_test = pd.concat(dfs, axis=0, ignore_index=True)

In [None]:
# Lost production over time for the first 10000 rows of the combined 2020/2021 frame.
first_rows = df_combined_test.iloc[:10000]

# The timestamp column holds strings here; plotted as-is, matplotlib would treat every
# distinct string as its own category on the x-axis. Parse them (same options as used for
# the training data) so that the axis is a real time axis.
timestamps = pd.to_datetime(first_rows['Date and time'], format='mixed', dayfirst=True, errors='coerce')

plt.figure(figsize=(14, 6))
plt.scatter(timestamps, first_rows["Lost Production Total (kWh)"])
plt.xlabel('Date')
plt.ylabel('Lost Production (kWh)')
plt.title('Production Loss')
plt.grid(True)
plt.show()

In [None]:
relevant_columns = [
    "Date and time",
    "Density adjusted wind speed (m/s)", "Wind direction (°)", "Nacelle position (°)",
    "Power (kW)", "Wind speed (m/s)"
]
df_combined_val = df_combined_val[relevant_columns]

In [None]:
start_date = "2020-07-01 00:00:00"
df_test = df_combined_val[df_combined_val["Date and time"] >= start_date]

In [None]:
# Show the last rows of the test set to check where the data ends.
# (`df_test.last` without a call only displayed the bound method, not any data.)
df_test.tail()

In [None]:
# Set negative power readings to 0. The previous form assigned through chained indexing
# (df["col"][mask] = 0), which is not guaranteed to write back to the frame and has no
# effect under pandas Copy-on-Write; clipping the column and assigning it back always works.
# Missing values stay missing.
df_test["Power (kW)"] = df_test["Power (kW)"].clip(lower=0)

In [None]:
# Keep only the first occurrence of every column name.
is_duplicate_column = df_test.columns.duplicated()
df_test = df_test.loc[:, ~is_duplicate_column]

In [None]:
# Fill gaps in the float64/int64 columns: interpolate first, then back-fill and forward-fill
# whatever is still missing. .bfill()/.ffill() replace fillna(method=...), which is deprecated
# in current pandas releases.
df_numeric = df_test.select_dtypes(include=["float64", "int64"])
df_test[df_numeric.columns] = df_numeric.interpolate().bfill().ffill()

In [None]:
# Write the test set to CSV without the row index and report the target path.
output_file = "../Daten/test_data_penman.csv"
df_test.to_csv(path_or_buf=output_file, index=False)
print(f"Gefilterte Daten wurden als '{output_file}' gespeichert.")