# Workshop Outline: Predicting Turbine Power with Neural Networks

This workshop will guide you through the steps of using a neural network (MLP) to predict turbine power from SCADA data.

---

## Steps

1. **Load SCADA Data**  
   Load 10-minute statistics of preprocessed SCADA turbine data.

2. **Explore and Visualize Parameters**  
   Visualize key SCADA parameters such as wind speed, pitch, and power.

3. **Train a Neural Network (MLP)**  
   Train a simple predefined neural network architecture to predict turbine power.

4. **Plot and Interpret Performance**  
   Evaluate the network with training and validation error plots, and visualize predictions.

5. **Experiment with Model Complexity**  
   Incrementally increase model complexity and add features to observe changes in performance.

---

**Goal:** Understand the workflow of machine learning for turbine power prediction, from data exploration to model evaluation and experimentation.


This workshop uses open-source data from [Aventa AV-7 (6kW) IET-OST Research Wind Turbine SCADA](https://zenodo.org/records/17362783). The original dataset is sampled at 1 Hz; for this workshop it has been preprocessed into 10-minute SCADA statistics.

# **A simple Neural Network**

![Simple Neural Network](https://raw.githubusercontent.com/OWI-Lab/AI_for_SHM-Workshop/main/Figures/sample_NN.png)


![Forward Propagation](https://raw.githubusercontent.com/OWI-Lab/AI_for_SHM-Workshop/main/Figures/forward_prop.png)


Examples of activation functions

![Activation Function Diagram](https://raw.githubusercontent.com/OWI-Lab/AI_for_SHM-Workshop/main/Figures/activation_func.png)


# *Loading Necessary Packages*

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Packages for handling and processing load data
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go



# Loading Data

In [None]:
# Download the preprocessed 10-minute SCADA statistics from the workshop
# repository and keep only the rows that have no missing values.
url = (
    "https://raw.githubusercontent.com/OWI-Lab/AI_for_SHM-Workshop/"
    "main/Notebooks/AV-7-10min-Data.parquet"
)
df = pd.read_parquet(url).dropna()

# Preview the first rows (displayed as the cell output).
df.head()

In [None]:
## Inspect the data frame using .head() and .tail()
## Check the time span covered by the data frame
## Check which parameters are in the data frame, along with the sampling frequency

# Plotting Power Curve

In [None]:
# Exercise: draw the turbine power curve (wind speed on x, power on y).
plt.figure(figsize=(10,4))

# Plot the power curve by setting x and y to the wind-speed and power columns,
# then uncomment the scatter call below.
#plt.scatter(x , y, s=2)
# The axis labels and the title are empty strings here; fill them in.
plt.xlabel('')
plt.ylabel('')
plt.title('')
plt.grid()
plt.show()

In [None]:
## Plot other parameters here (for example, wind speed vs. pitch).
## What do we see in the data?

# Prepare Data and Split by Datetime

In [None]:
## Filter first 2 months of data
start = df.index[0]
end = '2025-02-28'
df_2m = df.loc[start:end].copy()

In [None]:
# Preview the first rows of the filtered frame.
df_2m.head()

In [None]:
# Every column name except the mean and std of the power output
# (presumably the candidate input features, since power is the target).
# NOTE(review): built from df rather than df_2m and not referenced again in
# the visible cells; confirm it is still needed.
cols = [c for c in df.columns if c not in ['power_output-mean', 'power_output-std']]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## First understand the architecture and its input/output parameters.
## We will return here to improve the network performance.

# Model inputs and the quantity to predict
input_features = ['wind_speed-mean', 'blade_pitch_deg-mean']
target_feature = ['power_output-mean']

X_all = df_2m[input_features]
y_all = df_2m[target_feature]

# Split the index labels (rather than the rows) at random, so that we know
# exactly which labels ended up in the training set and which in the test set.
idx_all = df_2m.index.values
idx_train, idx_test = train_test_split(idx_all, test_size=0.2, random_state=42)

# Boolean row selectors derived from the two label sets
train_mask = df_2m.index.isin(idx_train)
test_mask = df_2m.index.isin(idx_test)

# Train/test arrays
X_train, y_train = X_all[train_mask].values, y_all[train_mask].values
X_test, y_test = X_all[test_mask].values, y_all[test_mask].values


## Train the model with and without feature scaling.
## What do we notice?

# Standardise the inputs; the scaler is fitted on the training data only.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Artificial Neural Network to predict Power

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

# Understand this MLPRegressor architecture and the parameters that affect performance!
# What can we do to improve performance?
#
# Options to experiment with:
#   solver:     'sgd' (stochastic gradient descent) or 'adam'
#   activation: 'relu', 'logistic' or 'tanh'

model = MLPRegressor(
    # Architecture
    hidden_layer_sizes=(2,),     # a single hidden layer with 2 neurons
    activation='relu',
    # Optimisation
    solver='sgd',                # SGD optimizer
    learning_rate_init=0.001,
    max_iter=200,
    # Early stopping, monitored on a validation split taken from the training data
    early_stopping=True,
    validation_fraction=0.4,
    n_iter_no_change=25,
    # Reproducibility
    random_state=42,
)

In [None]:
# Display the estimator and its parameters as the cell output.
model

In [None]:
# Fit the network on the (unscaled) training inputs.
# y_train is a single-column 2-D array; it is flattened to 1-D, the target
# shape a single-output regressor expects. Passing the column vector only
# worked through an implicit conversion whose warning is hidden by the global
# warnings filter.
model.fit(X_train, y_train.ravel())

# Predictions on the training set and on the held-out test set
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

# Mean squared error on each set
train_err = mean_squared_error(y_train, train_pred)
test_err = mean_squared_error(y_test, test_pred)

print('Train Error:', train_err)
# NOTE: val_err is not computed anywhere in the visible cells, so the line
# below stays disabled until a validation error is defined.
#print('Validation Error:', val_err)
print('Test Error:', test_err)

In [None]:
# Coefficient of determination R^2: test set first, then training set
for features, target in ((X_test, y_test), (X_train, y_train)):
    print(model.score(features, target))


In [None]:
# Training loss recorded for every epoch
train_loss = model.loss_curve_

# Validation scores (R²) per epoch, recorded because early stopping is enabled
val_scores = model.validation_scores_

# Turn the validation R² into a loss-like quantity (1 - R²) so that lower is
# better for both curves. The two curves are different metrics, so compare
# their trends rather than their absolute values.
val_loss = [1 - r2 for r2 in val_scores]

loss_fig, loss_ax = plt.subplots(figsize=(10, 6))

loss_ax.plot(train_loss, label="Training Loss")
loss_ax.plot(val_loss, label="Validation Loss (1 - R²)")

loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("Training vs Validation Loss")
loss_ax.legend()
loss_ax.grid(True)
plt.show()


In [None]:
# Best validation score recorded during training (displayed as the cell output).
model.best_validation_score_

In [None]:
# Predict over the complete two-month frame (training and test rows together).
pred_full = model.predict(X_all.values)

In [None]:
# R^2 over the complete two-month frame (training and test rows together).
# Plain arrays are passed, matching how the model was fitted; passing the
# DataFrames triggers a feature-name mismatch warning that the global
# warnings filter hides.
print(model.score(X_all.values, y_all.values))


In [None]:
# Measured vs. predicted power over the two-month frame; points lying on the
# dashed diagonal are perfect predictions.
scatter_fig, scatter_ax = plt.subplots(figsize=(5, 5))
scatter_ax.scatter(y_all, pred_full, s=5, color='blue', alpha=0.6)

# Diagonal reference line spanning the range of the measured power
diagonal = [y_all.min(), y_all.max()]
scatter_ax.plot(diagonal, diagonal, 'k--', linewidth=1)

scatter_ax.set_xlabel('Actual Power')
scatter_ax.set_ylabel('Predicted Power')
scatter_ax.set_title('True vs Predicted')
scatter_ax.grid(True)
plt.show()

In [None]:
# Interactive time series of measured and predicted power, with the randomly
# selected training and test samples overlaid as markers.
time_axis = df_2m.index

traces = [
    # Full timeseries: Actual
    go.Scatter(x=time_axis, y=y_all.values.ravel(), mode='lines', name='Actual'),
    # Full timeseries: Predicted
    go.Scatter(x=time_axis, y=pred_full, mode='lines', name='Predicted'),
    # Training samples (randomly scattered)
    go.Scatter(
        x=time_axis[train_mask],
        y=y_all[train_mask].values.ravel(),
        mode='markers',
        marker=dict(color='green', size=6, opacity=0.5),
        name='Training Samples',
    ),
    # Test samples (randomly scattered)
    go.Scatter(
        x=time_axis[test_mask],
        y=y_all[test_mask].values.ravel(),
        mode='markers',
        marker=dict(color='red', size=6, opacity=0.5),
        name='Test Samples',
    ),
]

fig = go.Figure()
for trace in traces:
    fig.add_trace(trace)

# Layout
fig.update_layout(
    title='Timeseries Prediction (Random Train/Test Split)',
    xaxis_title='Time',
    yaxis_title='Power',
    hovermode='x unified',
    width=1100,
    height=500,
)

fig.show()

In [None]:
# Iterate with different model configurations to improve model performance
# (specifically the R^2 score reported by print(model.score(X_all, y_all))).


In [None]:
# Evaluation period: the final month of the complete data set, counted back
# from the latest index entry.
last_month_start = df.index.max() - pd.DateOffset(months=1)
df_last_month = df.loc[last_month_start:]

# Same input and target columns as used for training
X_last_month = df_last_month[input_features].to_numpy()
y_last_month = df_last_month[target_feature].to_numpy()

# Reuse the scaler fitted on the training data.
# NOTE(review): the visible prediction cells pass the unscaled X_last_month to
# the model, so this scaled array is only needed once the model is trained on
# scaled inputs.
X_last_month_scaled = scaler.transform(X_last_month)

In [None]:
# Predict the power for the last month from the unscaled inputs.
pred_last_month = model.predict(X_last_month)

In [None]:
# R^2 of the model on the last month of data.
print(model.score(X_last_month, y_last_month))


In [None]:
# Measured vs. predicted power for the last month; points lying on the dashed
# diagonal are perfect predictions.
month_fig, month_ax = plt.subplots(figsize=(5, 5))
month_ax.scatter(y_last_month, pred_last_month, s=5, color='blue', alpha=0.6)

# Diagonal reference line spanning the range of the measured power
power_range = [y_last_month.min(), y_last_month.max()]
month_ax.plot(power_range, power_range, 'k--', linewidth=1)

month_ax.set_xlabel('Actual Power')
month_ax.set_ylabel('Predicted Power')
month_ax.set_title('True vs Predicted')
month_ax.grid(True)
plt.show()

In [None]:
# Interactive comparison of measured and predicted power over the last month.
fig = go.Figure()

month_times = df_last_month.index
curves = (
    (y_last_month.ravel(), 'Actual Power'),   # measured power
    (pred_last_month, 'Predicted Power'),     # model output
)
for values, label in curves:
    fig.add_trace(go.Scatter(x=month_times, y=values, mode='lines', name=label))

fig.update_layout(
    title='Actual vs Predicted Power (Last Month)',
    xaxis_title='Time',
    yaxis_title='Power',
    hovermode='x unified',
    width=1100,
    height=500,
)

fig.show()

# Curtailment

In [None]:
# Work on a copy so that the last-month frame itself stays untouched
df_n_data = df_last_month.copy()

# Rows whose mean wind speed exceeds 4
ws_mask = df_n_data['wind_speed-mean'] > 4

# Draw one random power value between 2.5 and 3.5 per selected row
# (fixed seed, so the draw is reproducible)
np.random.seed(42)
n_selected = ws_mask.sum()
random_shift = np.random.uniform(low=2.5, high=3.5, size=n_selected)

# Replace the measured power on the selected rows with the random values
df_n_data.loc[ws_mask, 'power_output-mean'] = random_shift

In [None]:
import matplotlib.pyplot as plt

# Power curve of the complete data set with the modified (curtailed)
# last-month data drawn on top of it.
curve_fig, curve_ax = plt.subplots(figsize=(10, 4))
curve_ax.scatter(df['wind_speed-mean'], df['power_output-mean'], label='Old data', s=2)
curve_ax.scatter(
    df_n_data['wind_speed-mean'],
    df_n_data['power_output-mean'],
    label='Curtailed_Power',
    s=2,
    alpha=0.7,
)
curve_ax.set_xlabel('Wind Speed (m/s)')
curve_ax.set_ylabel('Power Output (kW)')
curve_ax.set_title('Power Curve: Power Output vs Wind Speed')
curve_ax.legend()
curve_ax.grid()
plt.show()

In [None]:
# Inputs and (modified) target taken from the curtailed frame
X_n_data = df_n_data[input_features].to_numpy()
y_n_data = df_n_data[target_feature].to_numpy()

# Reuse the scaler fitted on the training data
X_n_data_scaled = scaler.transform(X_n_data)

In [None]:
# Predict the power for the curtailed data from the unscaled inputs.
pred_n_data = model.predict(X_n_data)

In [None]:
# R^2 of the model against the curtailed power values.
print(model.score(X_n_data, y_n_data))


In [None]:
# Curtailed (modified) power vs. the model prediction; points lying on the
# dashed diagonal are perfect predictions.
curtail_fig, curtail_ax = plt.subplots(figsize=(5, 5))
curtail_ax.scatter(y_n_data, pred_n_data, s=5, color='blue', alpha=0.6)

# NOTE(review): the diagonal spans the range of y_last_month (the unmodified
# last-month power), not of y_n_data which is plotted here; confirm this is
# intended.
reference = [y_last_month.min(), y_last_month.max()]
curtail_ax.plot(reference, reference, 'k--', linewidth=1)

curtail_ax.set_xlabel('Actual Power')
curtail_ax.set_ylabel('Predicted Power')
curtail_ax.set_title('True vs Predicted')
curtail_ax.grid(True)
plt.show()