In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# ==========================
# 1. Custom Transformers
# ==========================

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless feature engineering for wind turbine SCADA data.

    Adds cyclic encodings of the wind direction and squared/cubed wind speed,
    and caps ``time_diff``. When the measured-power column is present it also
    clips negative power readings to zero and derives ``Power_to_Theoretical``.

    The measured-power column is the modelling target in this notebook, so it
    is normally absent from the feature matrix; the power-dependent steps are
    therefore skipped instead of raising ``KeyError``.

    NOTE(review): ``Power_to_Theoretical`` is computed from the measured power.
    If that column is the prediction target, passing it in ``X`` leaks the
    target into the features -- confirm before using it for modelling.

    Parameters
    ----------
    max_time_diff : float, default=600
        Upper cap applied to the ``time_diff`` column.
        NOTE(review): the original comment assumed ~10-minute sampling; confirm
        this cap is expressed in the same unit as ``time_diff``.
    """

    # Column names this transformer reads from the input frame.
    POWER_COL = 'LV ActivePower (kW)'
    THEORETICAL_COL = 'Theoretical_Power_Curve (KWh)'
    WIND_SPEED_COL = 'Wind Speed (m/s)'
    WIND_DIR_COL = 'Wind Direction (°)'
    TIME_DIFF_COL = 'time_diff'

    def __init__(self, max_time_diff=600):
        self.max_time_diff = max_time_diff

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` for pipeline use."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns added.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the wind speed, wind direction and ``time_diff``
            columns. The measured-power column is optional.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X``; the caller's frame is not modified.
        """
        X = X.copy()
        has_power = self.POWER_COL in X.columns

        # Negative power readings are treated as invalid and floored at zero.
        if has_power:
            X[self.POWER_COL] = X[self.POWER_COL].clip(lower=0)

        # Cap large time gaps (sensor issues) so they do not dominate the feature.
        X[self.TIME_DIFF_COL] = X[self.TIME_DIFF_COL].clip(upper=self.max_time_diff)

        # Cyclic encoding keeps 0 degrees and 360 degrees adjacent.
        direction = X[self.WIND_DIR_COL]
        X['WindDir_sin'] = np.sin(2 * np.pi * direction / 360)
        X['WindDir_cos'] = np.cos(2 * np.pi * direction / 360)

        # Nonlinear wind-speed terms.
        speed = X[self.WIND_SPEED_COL]
        X['WindSpeed_sq'] = speed ** 2
        X['WindSpeed_cu'] = speed ** 3

        # Ratio of measured to theoretical power; the small offset avoids
        # division by zero where the theoretical curve is 0.
        if has_power:
            X['Power_to_Theoretical'] = (
                X[self.POWER_COL] / (X[self.THEORETICAL_COL] + 1e-3)
            )

        return X


class OutlierTrimmer(BaseEstimator, TransformerMixin):
    """Clip extreme values to IQR-based bounds learned during ``fit``.

    Values are clipped (winsorized) to ``[Q1 - factor*IQR, Q3 + factor*IQR]``;
    no rows are dropped.

    Parameters
    ----------
    cols : list of column labels, str, or None, default=None
        Columns to trim. ``None`` selects every numeric column seen in
        ``fit``; a single string is treated as one column name.
    factor : float, default=1.5
        Multiplier applied to the IQR when computing the bounds.

    Attributes
    ----------
    bounds_ : dict
        Maps each trimmed column to its ``(lower, upper)`` bound.
    """

    def __init__(self, cols=None, factor=1.5):
        self.cols = cols
        self.factor = factor

    def _resolve_cols(self, X):
        """Return the list of columns to trim without mutating ``self.cols``."""
        if self.cols is None:
            # Default: every numeric column present in the fitting frame.
            return list(X.select_dtypes(include="number").columns)
        if isinstance(self.cols, str):
            # A bare string would otherwise be iterated character by character.
            return [self.cols]
        return list(self.cols)

    def fit(self, X, y=None):
        """Learn per-column clipping bounds from the DataFrame ``X``."""
        self.bounds_ = {}
        for col in self._resolve_cols(X):
            q1 = X[col].quantile(0.25)
            q3 = X[col].quantile(0.75)
            iqr = q3 - q1
            self.bounds_[col] = (q1 - self.factor * iqr, q3 + self.factor * iqr)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns clipped to their bounds."""
        X = X.copy()
        for col, (low, high) in self.bounds_.items():
            X[col] = X[col].clip(lower=low, upper=high)
        return X

In [None]:
# ==========================
# 2. Load and Prepare Data
# ==========================

# Load the raw strings first so the on-disk timestamp format can be inspected
df = pd.read_csv("dataset/T1.csv")
print("Original Date/Time sample:", df['Date/Time'].head().tolist())

# Timestamps look like '01 01 2018 00:00' (day month year hour:minute).
# An explicit format avoids ambiguous day/month inference and fails loudly on
# any row that does not match.
df['Date/Time'] = pd.to_datetime(df['Date/Time'], format="%d %m %Y %H:%M")

# Sort chronologically (rebinding instead of inplace=True keeps the cell idempotent)
df = df.sort_values("Date/Time")

# Minutes elapsed since the previous reading; the first row has no predecessor,
# so its NaN is replaced with 0. Assigning the result back (rather than calling
# fillna(inplace=True) on the selected column) works under pandas Copy-on-Write.
df['time_diff'] = (
    df['Date/Time'].diff().dt.total_seconds().div(60).fillna(0)
)

# Define features and target
features = ['Wind Speed (m/s)', 'Theoretical_Power_Curve (KWh)', 
            'Wind Direction (°)', 'time_diff']
target = 'LV ActivePower (kW)'

X = df[features]
y = df[target]

print(f"Data shape: {df.shape}")
print(f"Features: {features}")
print(f"Target: {target}")

Original Date/Time sample: ['01 01 2018 00:00', '01 01 2018 00:10', '01 01 2018 00:20', '01 01 2018 00:30', '01 01 2018 00:40']
Data shape: (50530, 6)
Features: ['Wind Speed (m/s)', 'Theoretical_Power_Curve (KWh)', 'Wind Direction (°)', 'time_diff']
Target: LV ActivePower (kW)


In [6]:
# ==========================
# 3. Build Preprocessing Pipeline
# ==========================

# Numeric pipeline: trimming, imputation, scaling.
# OutlierTrimmer indexes by column name and uses pandas methods, so it must run
# while the data is still a DataFrame. SimpleImputer returns a NumPy array by
# default, so it is placed after the trimmer. Missing values are ignored when
# the quantiles are computed and left untouched by clipping, so the imputer
# still fills them afterwards.
numeric_features = features
numeric_pipeline = Pipeline([
    ("outlier_trim", OutlierTrimmer(cols=numeric_features)),
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Combine all transformations: engineer features on the full frame, then run
# the numeric pipeline on the raw numeric columns. Engineered columns are not
# listed in numeric_features, so remainder="passthrough" forwards them unscaled.
preprocessor = Pipeline([
    ("feature_eng", FeatureEngineer()),
    ("preprocess", ColumnTransformer([
        ("num", numeric_pipeline, numeric_features)
    ], remainder="passthrough"))
])

In [10]:
# ==========================
# 4. Train / Val / Test Split (Time-based)
# ==========================

print("DataFrame columns:", df.columns.tolist())
print("DataFrame shape:", df.shape)

# Define features and target (make sure target is not in features)
features = ['Wind Speed (m/s)', 'Theoretical_Power_Curve (KWh)', 
            'Wind Direction (°)', 'time_diff']
target = 'LV ActivePower (kW)'

print("Features:", features)
print("Target:", target)

# Keep the target out of the feature list without mutating the list in place
features = [col for col in features if col != target]

# A time-based split is only meaningful when rows are in chronological order
assert df['Date/Time'].is_monotonic_increasing, \
    "df must be sorted by Date/Time before the time-based split"

# Split the data chronologically: first 70% train, next 15% val, last 15% test
n_rows = len(df)
split1 = int(0.7 * n_rows)
split2 = int(0.85 * n_rows)

train = df.iloc[:split1]
val = df.iloc[split1:split2]
test = df.iloc[split2:]

# The three slices must partition the frame exactly
assert len(train) + len(val) + len(test) == n_rows

X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]
X_test, y_test = test[features], test[target]

print("\nSplit completed:")
print(f"Train: {len(X_train)} samples")
print(f"Val: {len(X_val)} samples") 
print(f"Test: {len(X_test)} samples")

# Show which preprocessor object is in scope before it is replaced below
print(f"\nPreprocessor type: {type(preprocessor)}")

# NOTE: this rebinds `preprocessor` to a plain scaler over the feature columns.
# The previously defined preprocessor (and any imputation, trimming or feature
# engineering it contained) is no longer used from this point on.
# StandardScaler and ColumnTransformer are already imported at the top of the
# notebook, so they are not re-imported here.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), features)
    ]
)

print("Preprocessor transformers:", preprocessor.transformers)

# Fit on the training split only, then apply the same scaling to val/test
# so no statistics from later data leak into training
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc = preprocessor.transform(X_val)
X_test_proc = preprocessor.transform(X_test)

print("\nPreprocessed shapes:")
print("Train shape:", X_train_proc.shape)
print("Validation shape:", X_val_proc.shape)
print("Test shape:", X_test_proc.shape)

print("\nFeature names after preprocessing:")
print(preprocessor.get_feature_names_out())

DataFrame columns: ['Date/Time', 'LV ActivePower (kW)', 'Wind Speed (m/s)', 'Theoretical_Power_Curve (KWh)', 'Wind Direction (°)', 'time_diff']
DataFrame shape: (50530, 6)
Features: ['Wind Speed (m/s)', 'Theoretical_Power_Curve (KWh)', 'Wind Direction (°)', 'time_diff']
Target: LV ActivePower (kW)

Split completed:
Train: 35371 samples
Val: 7579 samples
Test: 7580 samples

Preprocessor type: <class 'sklearn.pipeline.Pipeline'>
Preprocessor transformers: [('num', StandardScaler(), ['Wind Speed (m/s)', 'Theoretical_Power_Curve (KWh)', 'Wind Direction (°)', 'time_diff'])]

Preprocessed shapes:
Train shape: (35371, 4)
Validation shape: (7579, 4)
Test shape: (7580, 4)

Feature names after preprocessing:
['num__Wind Speed (m/s)' 'num__Theoretical_Power_Curve (KWh)'
 'num__Wind Direction (°)' 'num__time_diff']
