# Wind Turbine Failure Prediction - Improved Pipeline (v2)

## Overview
This is an improved version of the original pipeline. The critical change is the **prevention of data leakage**.

## Key Improvements
- **Correct Split Strategy**: Data is split into Train/Val/Test **before** any scaling or dimensionality reduction.
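- **No Look-Ahead Bias**: The Scaler and PCA are fitted ONLY on the Training set and then applied to the Validation and Test sets. (Caveat: the initial median imputation of missing sensor values is still computed over the full dataset, before the split.)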
- **Robust Evaluation**: Metrics reflect true generalization capability.

## Pipeline Structure
1. Configuration & Data Loading
2. Data Cleaning & Labeling
3. **Temporal Split (Raw Data)**
4. **Feature Engineering (Fit on Train, Transform All)**
5. Sequence Creation
6. Model Training & Evaluation

In [None]:
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, average_precision_score, matthews_corrcoef
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import pickle

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [None]:
# Random seed for reproducibility: the same value seeds Python's `random`,
# NumPy, and PyTorch (CPU generator and all CUDA devices).
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)
# Request deterministic cuDNN kernels and turn off the cuDNN auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Hyperparameters
REMOVE_OUTLIERS = False  # NOTE(review): not referenced in the visible part of this notebook
WINDOW_HOURS = 12  # passed to label_data(): hours before each failure that get Label = 1
USE_PCA = False  # switches the PCA step in the feature-engineering cell on/off
PCA_VARIANCE = 0.95  # passed as PCA(n_components=...) when USE_PCA is True
SEQ_LENGTH = 50  # passed to create_sequences(): rows per input window
BATCH_SIZE = 64  # batch size of the train/val/test DataLoaders
NUM_EPOCHS = 50  # upper bound on training epochs
EARLY_STOPPING_PATIENCE = 5  # passed to train_model() as `patience`

# Split Ratios (applied per turbine by temporal_split_raw_data)
TRAIN_RATIO = 0.6
VAL_RATIO = 0.12
# Test is remainder

# Model Params
FOCAL_ALPHA = 0.75  # `alpha` of FocalLoss
FOCAL_GAMMA = 2.0  # `gamma` of FocalLoss
LR_CNN_LSTM = 0.0001  # Adam learning rate for the CNN_LSTM model
LR_TCN = 0.0001  # NOTE(review): not referenced in the visible part of this notebook
LR_TRANSFORMER = 0.0001  # NOTE(review): not referenced in the visible part of this notebook
WEIGHT_DECAY = 1e-5  # Adam weight decay
DROPOUT = 0.45  # read by CNN_LSTM for its dropout layers and the LSTM's inter-layer dropout

# Class-imbalance switches: weighted sampling of the training set and
# per-class weights in the loss, respectively.
USE_WEIGHTED_SAMPLER = True
USE_CLASS_WEIGHTS = True

## 1. Data Loading and Preprocessing

In [None]:
# Load Data: SCADA sensor readings and the failure log, both read from CSV
# paths relative to the current working directory.
scada = pd.read_csv("Datasets/SCADA/EDP-SCADA-2017.csv")
failure = pd.read_csv("Datasets/SCADA/EDP-Failure-2017.csv")

# Timestamp Conversion (string -> datetime) so rows can be ordered and compared in time
scada['Timestamp'] = pd.to_datetime(scada['Timestamp'])
failure['Timestamp'] = pd.to_datetime(failure['Timestamp'])

# Sort by turbine, then chronologically; the per-turbine positional split
# performed later depends on this row order.
scada = scada.sort_values(['Turbine_ID', 'Timestamp']).reset_index(drop=True)
failure = failure.sort_values(['Turbine_ID', 'Timestamp']).reset_index(drop=True)

# Clean numeric columns: every column except the two keys is treated as a
# sensor feature; values that cannot be parsed as numbers become NaN.
scada_cols = [col for col in scada.columns if col not in ['Timestamp', 'Turbine_ID']]
for col in scada_cols:
    scada[col] = pd.to_numeric(scada[col], errors='coerce')

# Simple Imputation (Median of whole dataset is acceptable for initial cleaning of sensor errors,
# though ideally should be done per split. Keeping simple here as NaNs are rare)
# NOTE(review): these medians are computed over train+val+test rows, i.e.
# before the temporal split below.
scada[scada_cols] = scada[scada_cols].fillna(scada[scada_cols].median())

print(f"Data Loaded: {scada.shape}")

In [None]:
def label_data(scada_df, failure_df, window_hours=48):
    """Return a copy of ``scada_df`` with a binary ``Label`` column.

    A row gets ``Label = 1`` when it belongs to a turbine listed in
    ``failure_df`` and its ``Timestamp`` lies in the closed interval
    ``[failure_time - window_hours, failure_time]``; every other row gets 0.
    The input frames are not modified.
    """
    labeled = scada_df.copy()
    labeled['Label'] = 0

    lookback = pd.Timedelta(hours=window_hours)
    events = zip(failure_df['Turbine_ID'], failure_df['Timestamp'])
    for turbine_id, failed_at in events:
        same_turbine = labeled['Turbine_ID'] == turbine_id
        not_after_failure = labeled['Timestamp'] <= failed_at
        within_lookback = labeled['Timestamp'] >= failed_at - lookback
        labeled.loc[same_turbine & not_after_failure & within_lookback, 'Label'] = 1

    return labeled

# Label every SCADA row using the WINDOW_HOURS look-back before each failure,
# then report how many rows ended up positive.
scada_labeled = label_data(scada, failure, WINDOW_HOURS)
print(f"Failure Samples: {scada_labeled['Label'].sum()}")

## 2. Temporal Split (The Fix)
Here we split the data **before** scaling or PCA, so that no validation/test statistics leak into the fitted Scaler or PCA. The split is chronological within each turbine.

In [None]:
def temporal_split_raw_data(df, train_ratio=0.6, val_ratio=0.12):
    """Split ``df`` chronologically into train/val/test, turbine by turbine.

    For each distinct ``Turbine_ID`` the first ``train_ratio`` share of its
    rows goes to train, the next ``val_ratio`` share to validation and the
    remainder to test. The three per-split frames are concatenated across
    turbines and keep the original index labels.

    Args:
        df: Frame with a ``Turbine_ID`` column and, normally, a ``Timestamp``
            column.
        train_ratio: Fraction of each turbine's rows assigned to train.
        val_ratio: Fraction of each turbine's rows assigned to validation.

    Returns:
        Tuple ``(train_df, val_df, test_df)``. For an empty ``df`` three
        empty frames with the same columns are returned.
    """
    train_dfs = []
    val_dfs = []
    test_dfs = []

    has_timestamp = 'Timestamp' in df.columns

    # Split per turbine to maintain chronology
    for turbine in df['Turbine_ID'].unique():
        turbine_df = df[df['Turbine_ID'] == turbine].copy()
        if has_timestamp:
            # The split below is positional, so enforce time order here
            # instead of trusting the caller. A stable sort leaves already
            # sorted input (and ties) in their original order.
            turbine_df = turbine_df.sort_values('Timestamp', kind='stable')
        n = len(turbine_df)

        train_end = int(n * train_ratio)
        val_end = int(n * (train_ratio + val_ratio))

        train_dfs.append(turbine_df.iloc[:train_end])
        val_dfs.append(turbine_df.iloc[train_end:val_end])
        test_dfs.append(turbine_df.iloc[val_end:])

    if not train_dfs:
        # pd.concat raises on an empty list; return empty frames instead.
        empty = df.iloc[0:0]
        return empty.copy(), empty.copy(), empty.copy()

    return pd.concat(train_dfs), pd.concat(val_dfs), pd.concat(test_dfs)

# Split the labeled (still unscaled) rows per turbine using the configured
# ratios; whatever is left after train and validation becomes the test set.
train_df, val_df, test_df = temporal_split_raw_data(scada_labeled, TRAIN_RATIO, VAL_RATIO)

print(f"Train shape: {train_df.shape}")
print(f"Val shape:   {val_df.shape}")
print(f"Test shape:  {test_df.shape}")

## 3. Feature Engineering (Fit on Train, Transform All)
We fit the Scaler and PCA only on `train_df`.

In [None]:
# Feature columns are the numeric sensor columns collected at load time.
# That list was built before labeling, so 'Label' is not among them.
feature_cols = [col for col in scada_cols]

# 1. Scaling
scaler = StandardScaler()
# FIT only on Train
X_train_scaled = scaler.fit_transform(train_df[feature_cols])
# TRANSFORM Val and Test (reusing the statistics learned from Train)
X_val_scaled = scaler.transform(val_df[feature_cols])
X_test_scaled = scaler.transform(test_df[feature_cols])

# 2. PCA (Conditional)
if USE_PCA:
    # PCA_VARIANCE is handed to PCA as n_components.
    pca = PCA(n_components=PCA_VARIANCE, random_state=RANDOM_SEED)
    # FIT only on Train
    X_train_final = pca.fit_transform(X_train_scaled)
    # TRANSFORM Val and Test
    X_val_final = pca.transform(X_val_scaled)
    X_test_final = pca.transform(X_test_scaled)
    print(f"PCA Enabled: {X_train_scaled.shape[1]} features -> {pca.n_components_} components")
else:
    # Without PCA the scaled features are used unchanged.
    X_train_final = X_train_scaled
    X_val_final = X_val_scaled
    X_test_final = X_test_scaled
    print(f"PCA Disabled: Using all {X_train_scaled.shape[1]} features")

# Get labels and turbine IDs for sequence generation
# (row-aligned with the X_*_final arrays, since both come from the same frames)
y_train = train_df['Label'].values
y_val = val_df['Label'].values
y_test = test_df['Label'].values

turbines_train = train_df['Turbine_ID'].values
turbines_val = val_df['Turbine_ID'].values
turbines_test = test_df['Turbine_ID'].values

In [None]:
def create_sequences(X, y, turbine_ids, seq_length=50):
    """Build sliding windows of ``seq_length`` rows, separately per turbine.

    Windows advance one row at a time and never mix rows of different
    turbines. Each window is labeled with the ``y`` value of its last row.
    Turbines with fewer than ``seq_length`` rows contribute nothing.

    Returns:
        ``(sequences, labels)`` as NumPy arrays built from the collected
        windows and their labels.
    """
    windows = []
    targets = []

    for turbine in np.unique(turbine_ids):
        rows = turbine_ids == turbine
        feats = X[rows]
        labs = y[rows]

        n_windows = len(feats) - seq_length + 1
        if len(feats) < seq_length:
            continue

        for start in range(n_windows):
            stop = start + seq_length
            windows.append(feats[start:stop])
            # label of the final row inside the window
            targets.append(labs[stop - 1])

    return np.array(windows), np.array(targets)

# Build the windows for each split independently, so no window spans a
# split boundary.
print("Creating sequences...")
X_train_seq, y_train_seq = create_sequences(X_train_final, y_train, turbines_train, SEQ_LENGTH)
X_val_seq, y_val_seq = create_sequences(X_val_final, y_val, turbines_val, SEQ_LENGTH)
X_test_seq, y_test_seq = create_sequences(X_test_final, y_test, turbines_test, SEQ_LENGTH)

print(f"Train Sequences: {X_train_seq.shape}")
print(f"Val Sequences:   {X_val_seq.shape}")
print(f"Test Sequences:  {X_test_seq.shape}")

In [None]:
class TimeSeriesDataset(Dataset):
    """In-memory dataset pairing each sequence with its label.

    ``X`` is stored as a float tensor and ``y`` as a long tensor; indexing
    returns the ``(sequence, label)`` pair at that position.
    """

    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        self.y = torch.LongTensor(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

# Wrap each split's sequences in a Dataset.
train_dataset = TimeSeriesDataset(X_train_seq, y_train_seq)
val_dataset = TimeSeriesDataset(X_val_seq, y_val_seq)
test_dataset = TimeSeriesDataset(X_test_seq, y_test_seq)

# Weighted Sampler for Train, honoring the USE_WEIGHTED_SAMPLER switch
# (previously the sampler was used regardless of the flag).
if USE_WEIGHTED_SAMPLER:
    # Each sample is drawn with probability proportional to the inverse
    # frequency of its class, with replacement.
    class_counts = np.bincount(y_train_seq)
    class_weights = 1. / class_counts
    sample_weights = class_weights[y_train_seq]
    sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=sampler)
else:
    sampler = None
    # Without a sampler, fall back to ordinary shuffling of the training set.
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Validation and test are iterated in order, without resampling.
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class CNN_LSTM(nn.Module):
    """Two Conv1d blocks followed by a 2-layer bidirectional LSTM and an MLP head.

    Args:
        input_dim: Number of features per time step (Conv1d input channels).
        seq_length: Accepted for interface compatibility; not used by the
            layers built here.
        num_classes: Size of the output logit vector.
        dropout: Dropout probability for both Dropout layers and the LSTM's
            inter-layer dropout. ``None`` (default) falls back to the
            module-level ``DROPOUT`` constant, as before.
    """

    def __init__(self, input_dim, seq_length, num_classes=2, dropout=None):
        super().__init__()
        if dropout is None:
            # Backward-compatible default: the notebook-wide setting.
            dropout = DROPOUT

        self.conv1 = nn.Conv1d(in_channels=input_dim, out_channels=64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.dropout1 = nn.Dropout(dropout)

        self.lstm = nn.LSTM(input_size=128, hidden_size=128, num_layers=2,
                            batch_first=True, dropout=dropout, bidirectional=True)

        # 256 = 2 directions x 128 hidden units
        self.fc1 = nn.Linear(256, 64)
        self.dropout2 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(64, num_classes)

    def forward(self, x):
        """Map a batch shaped (batch, time, features) to (batch, num_classes) logits."""
        # Conv1d expects channels before time: (batch, features, time).
        x = x.transpose(1, 2)
        x = torch.relu(self.bn1(self.conv1(x)))
        x = torch.relu(self.bn2(self.conv2(x)))
        # kernel_size=2 pooling halves the time dimension.
        x = self.pool(x)
        x = self.dropout1(x)
        # Back to (batch, time, channels) for the batch_first LSTM.
        x = x.transpose(1, 2)
        lstm_out, _ = self.lstm(x)
        # Use the output at the last time step.
        # NOTE(review): for the reverse direction this position has only
        # processed the final step; confirm that is intended.
        x = lstm_out[:, -1, :]
        x = torch.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        return x

class FocalLoss(nn.Module):
    """Focal loss over class logits with optional per-class weights.

    Per sample: ``alpha * (1 - pt) ** gamma * ce * w[target]`` where ``ce`` is
    the unweighted cross-entropy, ``pt = exp(-ce)`` is the predicted
    probability of the true class and ``w`` is ``class_weights`` (omitted when
    ``None``). The result is the plain mean over the batch.

    Args:
        alpha: Scalar multiplier applied to every sample.
        gamma: Focusing exponent; 0 reduces to (weighted) cross-entropy.
        class_weights: Optional 1-D tensor with one weight per class.
    """

    def __init__(self, alpha=0.75, gamma=2.0, class_weights=None):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.class_weights = class_weights

    def forward(self, inputs, targets):
        # pt must come from the UNWEIGHTED cross-entropy: exp(-w * ce) would
        # be p ** w, not the true-class probability, and would distort the
        # focusing term whenever class weights are supplied.
        ce_loss = nn.functional.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        if self.class_weights is not None:
            weights = self.class_weights.to(device=inputs.device, dtype=focal_loss.dtype)
            focal_loss = focal_loss * weights[targets]
        return focal_loss.mean()

In [None]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=50, device='cuda', patience=10):
    """Train ``model`` with early stopping on validation F1 and restore the best weights.

    Each epoch runs one pass over ``train_loader``, then scores argmax
    predictions on ``val_loader`` with binary F1. A snapshot of the weights is
    kept whenever validation F1 strictly improves; training stops after
    ``patience`` consecutive epochs without improvement.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_loader: Iterable of ``(X_batch, y_batch)`` training batches.
        val_loader: Iterable of ``(X_batch, y_batch)`` validation batches.
        criterion: Loss called as ``criterion(outputs, y_batch)``.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Maximum number of epochs.
        device: Device the model and batches are moved to.
        patience: Non-improving epochs tolerated before stopping.

    Returns:
        The same ``model`` object, loaded with the best snapshot if validation
        F1 ever exceeded 0; otherwise left at its final weights.
    """
    import copy  # local import: only needed for the weight snapshot below

    model.to(device)
    best_val_f1 = 0.0
    best_model_state = None
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        val_preds, val_labels = [], []
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                preds = torch.argmax(outputs, dim=1).cpu().numpy()
                val_preds.extend(preds)
                val_labels.extend(y_batch.cpu().numpy())

        val_f1 = f1_score(val_labels, val_preds, average='binary', zero_division=0)
        # Guard against an empty training loader when averaging the loss.
        avg_train_loss = train_loss / max(len(train_loader), 1)
        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val F1: {val_f1:.4f}")

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            # state_dict() hands back references to the live tensors, and a
            # shallow .copy() keeps them, so later optimizer steps would
            # overwrite the "best" weights. Deep-copy to freeze the snapshot.
            best_model_state = copy.deepcopy(model.state_dict())
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print("Early stopping")
            break

    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model

In [None]:
# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Number of features per time step after scaling / optional PCA.
input_dim = X_train_final.shape[1]

# Class Weights for Loss, honoring the USE_CLASS_WEIGHTS switch
# (previously the weights were applied regardless of the flag).
if USE_CLASS_WEIGHTS:
    # "Balanced" weighting: n_samples / (n_classes * count_per_class).
    class_counts_train = np.bincount(y_train_seq)
    class_weights_values = len(y_train_seq) / (len(class_counts_train) * class_counts_train)
    class_weights_tensor = torch.FloatTensor(class_weights_values).to(device)
else:
    class_weights_tensor = None

model = CNN_LSTM(input_dim=input_dim, seq_length=SEQ_LENGTH)
criterion = FocalLoss(alpha=FOCAL_ALPHA, gamma=FOCAL_GAMMA, class_weights=class_weights_tensor)
optimizer = optim.Adam(model.parameters(), lr=LR_CNN_LSTM, weight_decay=WEIGHT_DECAY)

print("Starting Training...")
model = train_model(model, train_loader, val_loader, criterion, optimizer,
                    num_epochs=NUM_EPOCHS, device=device, patience=EARLY_STOPPING_PATIENCE)

In [None]:
def evaluate(model, loader, device):
    """Print the confusion matrix, F1, precision and recall of ``model`` on ``loader``.

    Predictions are the argmax over the model's logits. Labels are read from
    the batches as yielded by ``loader`` and converted with ``.numpy()``.

    Args:
        model: Module mapping a batch of inputs to class logits.
        loader: Iterable of ``(X_batch, y_batch)`` batches.
        device: Device the inputs are moved to before the forward pass.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch = X_batch.to(device)
            outputs = model(X_batch)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(y_batch.numpy())

    print("Confusion Matrix:")
    # Fix the label set so the matrix stays 2x2 even if one class is absent
    # from both the labels and the predictions.
    print(confusion_matrix(all_labels, all_preds, labels=[0, 1]))
    # zero_division=0 matches train_model's validation F1 and reports 0
    # instead of warning when a metric is undefined.
    print(f"F1 Score: {f1_score(all_labels, all_preds, zero_division=0):.4f}")
    print(f"Precision: {precision_score(all_labels, all_preds, zero_division=0):.4f}")
    print(f"Recall: {recall_score(all_labels, all_preds, zero_division=0):.4f}")

# Final report on the held-out test split, using the model returned by train_model.
print("Test Set Evaluation:")
evaluate(model, test_loader, device)