In [2]:
# Feature Engineering Pipeline
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib

In [3]:
# Read the cleaned, combined dataset; the 'time' column is parsed as datetimes
df = pd.read_csv(
    '/workspace/COMP3610-Renewable-Energy-Prediction/data/processed/final_combined_dataset.csv',
    parse_dates=['time'],
)

In [4]:
def create_features(df):
    """Build temporal, weather, lag and rolling features for the energy data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-like ``time`` column plus the columns
        ``temp``, ``rhum``, ``wspd``, ``country``, ``Solar`` and
        ``Wind Onshore``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus the engineered ones,
        with ``country`` and ``season`` one-hot encoded (first level dropped).
        The frame passed in is not modified.

    Notes
    -----
    Lags and rolling windows are counted in *rows* within each country. They
    only correspond to hours if each country's rows are time-ordered and
    hourly; this function neither sorts nor checks that.
    """
    # Work on a copy so the caller's frame stays untouched and the notebook
    # cell that calls this function can be re-run safely.
    df = df.copy()

    # Temporal features
    df['hour'] = df['time'].dt.hour
    df['day_of_week'] = df['time'].dt.dayofweek
    df['day_of_year'] = df['time'].dt.dayofyear
    df['month'] = df['time'].dt.month
    df['quarter'] = df['time'].dt.quarter

    # Cyclical encoding: map a periodic value onto the unit circle so that the
    # end of a cycle sits next to its start (e.g. hour 23 next to hour 0).
    def cyclical_encode(df, col, max_val):
        df[col + '_sin'] = np.sin(2 * np.pi * df[col] / max_val)
        df[col + '_cos'] = np.cos(2 * np.pi * df[col] / max_val)
        return df

    df = cyclical_encode(df, 'hour', 24)
    df = cyclical_encode(df, 'day_of_week', 7)
    df = cyclical_encode(df, 'month', 12)

    # Seasonality: Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4
    df['season'] = df['month'] % 12 // 3 + 1

    # Weather features
    df['temp_rhum_interaction'] = df['temp'] * df['rhum']  # Heat index proxy
    df['wind_power_potential'] = df['wspd'] ** 3  # Wind power is proportional to cube of wind speed

    # Lag features (previous rows of the same country)
    target_cols = ['Solar', 'Wind Onshore']
    for col in target_cols:
        for lag in [1, 2, 3, 24, 168]:  # 1h, 2h, 3h, 24h, 1 week lags
            df[f'{col}_lag_{lag}'] = df.groupby('country')[col].shift(lag)

    # Rolling statistics over the last 24 rows of the same country.
    # transform() hands the result back in the frame's own row order. Taking
    # `.values` from groupby(...).rolling(...) instead yields group-sorted
    # values, which end up on the wrong rows whenever the frame is not already
    # sorted contiguously by country.
    # NOTE(review): the window includes the current row's own target value; if
    # these columns are fed to a model predicting that target this may leak
    # it -- confirm, and consider shifting by one row.
    for col in target_cols:
        per_country = df.groupby('country')[col]
        rolling_mean = per_country.transform(lambda s: s.rolling(24).mean())
        rolling_std = per_country.transform(lambda s: s.rolling(24).std())
        df[f'{col}_rolling_24h_mean'] = rolling_mean
        df[f'{col}_rolling_24h_std'] = rolling_std

    # Country-specific features
    df = pd.get_dummies(df, columns=['country', 'season'], drop_first=True)

    return df


In [5]:
# Run the feature-engineering function over the loaded dataset
df_featured = df.pipe(create_features)

In [7]:
# Write the engineered frame to the processed-data directory (row index omitted)
df_featured.to_csv(
    '/workspace/COMP3610-Renewable-Energy-Prediction/data/processed/feature_engineered_data.csv',
    index=False,
)

In [8]:
# Columns treated as prediction targets (excluded from the feature list below)
targets = [
    'Solar',
    'Wind Onshore',
]

In [9]:
# Persist the feature/target column names so later steps can reload them.
# 'features' is every column of the engineered frame except the targets.
joblib.dump(
    dict(
        features=df_featured.drop(columns=targets).columns.tolist(),
        target_cols=targets,
    ),
    '/workspace/COMP3610-Renewable-Energy-Prediction/models/data_config.pkl',
)

print("Feature engineering complete. Data saved to feature_engineered_data")

Feature engineering complete. Data saved to feature_engineered_data
