# Feature Engineering and Preprocessing

### import libraries

In [None]:
import sys
import os

# Ensure the common module can be imported
# Adjust the path to point to the common directory
# Get the parent directory of the project (one level above week-2/)
sys.path.append(os.path.abspath(".."))

from common.load_data import load_data
from common.time_based_features import add_time_based_features
from common.lag_and_rolling_statistics import engineer_lag_and_rolling
from common.scale_features import scale_features
from common.chronological_split import chronological_train_test_split
from common.linear_regression import baseline_model_performance
from common.persistence_model import persistence_baseline_performance

### Pipeline Overview:
- Load dataset from CSV and parse datetime index (ensuring chronological order)
- Add time-based features (month, weekday, hour, is_weekend) to help capture temporal patterns
- Engineer lagged and rolling statistical features per zone to provide historical context
- split the dataset chronologically into train and test sets to prevent data leakage and preserve time sequence
- finally, normalize numerical features (including engineered features) using StandardScaler for stable training
Note: All preprocessing is done BEFORE the train/test split to ensure consistent feature transformations.
The chronological split maintains the temporal integrity essential for time series modeling.


In [2]:
zones = [
    'Zone 1 Power Consumption',
    'Zone 2 Power Consumption',
    'Zone 3 Power Consumption'
]

# Load and preprocess data
dataset = (
    load_data(file_path='..\Data\Tetuan City power consumption.csv')
    .pipe(add_time_based_features)
)

dataset_full = dataset.copy()
for zone in zones:
    dataset_full = engineer_lag_and_rolling(dataset_full, zone, lags=[1], rolling_windows=[3])

# Split chronologically AFTER feature engineering
dataset_train, dataset_test = chronological_train_test_split(dataset_full, train_ratio=0.8, val_ratio=None)

# Split BEFORE creating lags/rolling
dataset_train, dataset_test = chronological_train_test_split(dataset, train_ratio=0.8, val_ratio=None)

for zone in zones:
    # Feature engineering ONLY for this zone
    train_zone = engineer_lag_and_rolling(dataset_train, zone, lags=[1], rolling_windows=[3])
    test_zone = engineer_lag_and_rolling(dataset_test, zone, lags=[1], rolling_windows=[3])

    # Scale features
    train_scaled, test_scaled, scaler, feature_cols = scale_features(
        train_zone, test_zone, target_col=zone
    )

    # Train & evaluate
    performance = baseline_model_performance(
        train_dataset=train_scaled,
        test_dataset=test_scaled,
        target_col=zone,
        feature_cols=feature_cols
    )

    print(f"\nPerformance metrics for {zone}:")
    print(performance['metrics'])


[INFO] Using 'DateTime' as datetime index column

Performance metrics for Zone 1 Power Consumption:
  Metric       Value
0   RMSE  454.367351
1    MAE  339.725092
2     R2    0.994571

Performance metrics for Zone 2 Power Consumption:
  Metric       Value
0   RMSE  379.034880
1    MAE  285.118287
2     R2    0.995222

Performance metrics for Zone 3 Power Consumption:
  Metric       Value
0   RMSE  291.706884
1    MAE  211.576803
2     R2    0.992183


#### Interpretation per zone
**RMSE**: Root Mean Squared Error  
**MAE**: Mean Absolute Error  
**R2**: Coefficient of Determination - a statistical measure that tells how much of the variance in the target variable.


- **Zone 1**
  - RMSE: ~454.37
  - MAE: ~339.73
  - R²: 0.9946 → The model explains 99.46% of the variance, which is huge (near-perfect).

- **Zone 2**
  - RMSE: ~379.03
  - MAE: ~285.12
  - R²: 0.9952 → Best-performing zone, suggesting Zone 2’s patterns are the most predictable from its own history.

- **Zone 3**
  - RMSE: ~291.71
  - MAE: ~211.58.70
  - R²: 0.9921 → Slightly more error than Zone 2 but still very high predictive power.

In time series, high R² is due to strong autocorrelation (yesterday’s consumption a very good predictor for today).

Adding Persistence model = baseline of “no model” — just using previous value as prediction - to compare how the linear regression baseline model is compared to baseline persistence model - using yesterday's value.


In [3]:
for zone in zones:
    # Feature engineering ONLY for this zone
    test_zone = engineer_lag_and_rolling(dataset_test, zone, lags=[1, 3], rolling_windows=[3, 7])

    metrics = persistence_baseline_performance(
        test_zone, target_col=zone, lag=1)
    print(f"Persistence baseline metrics for {zone}:")
    print(metrics)
    print()

Persistence baseline metrics for Zone 1 Power Consumption:
  Metric       Value
0   RMSE  549.974681
1    MAE  369.605858
2     R2    0.992046

Persistence baseline metrics for Zone 2 Power Consumption:
  Metric       Value
0   RMSE  459.747095
1    MAE  307.673258
2     R2    0.992971

Persistence baseline metrics for Zone 3 Power Consumption:
  Metric       Value
0   RMSE  338.884857
1    MAE  201.406095
2     R2    0.989449



### Analysis of Linear Regression vs. Persistence Model Performance

From the above results - `Linear Regression (lag-1)` model significantly out performs the `Persistence (naïve)` model across all zones.


| Zone   | Model              | RMSE     | MAE      | R²       | Improvement vs. Persistence               |
|--------|--------------------|----------|----------|----------|------------------------------------------|
| Zone 1 | Linear Regression  | 454.37   | 339.73   | 0.9946   | RMSE ↓17.4%, MAE ↓8.1%, R² ↑0.25%       |
| Zone 1 | Persistence        | 549.97   | 369.61   | 0.9920   | (Baseline)                               |
| Zone 2 | Linear Regression  | 379.03   | 285.12   | 0.9952   | RMSE ↓17.6%, MAE ↓7.3%, R² ↑0.23%       |
| Zone 2 | Persistence        | 459.75   | 307.67   | 0.9930   | (Baseline)                               |
| Zone 3 | Linear Regression  | 291.71   | 211.58   | 0.9922   | RMSE ↓13.9%, MAE ↑5.1%, R² ↑0.28%       |
| Zone 3 | Persistence        | 338.88   | 201.41   | 0.9894   | (Baseline)                               |

**Persistence model** (minimal benchmark) is used to verify that LinearRegression baseline model adds value (which it does).  
**Data** has structure (trends/scaling effects) that Persistence misses.  
**Data Characteristics**
- Strong autocorrelation: The next value heavily depends on the current one.



In [4]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

zones = ['Zone 1 Power Consumption', 'Zone 2 Power Consumption', 'Zone 3 Power Consumption']
exog_features = ['Temperature', 'Humidity', 'Wind Speed', 'general diffuse flows', 'diffuse flows']

for zone in zones:
    # Feature engineering ONLY for this zone
    train_zone = engineer_lag_and_rolling(dataset_train, zone, lags=[1], rolling_windows=[3])
    test_zone = engineer_lag_and_rolling(dataset_test, zone, lags=[1], rolling_windows=[3])

    # Scale features
    train_scaled, test_scaled, scaler, feature_cols = scale_features(
        train_zone, test_zone, target_col=zone
    )

    # Prepare SARIMAX inputs
    endog_train = train_scaled[zone]  # raw target for training
    endog_test = test_scaled[zone]    # raw target for testing

    # SARIMAX exog = scaled features
    exog_train = train_scaled[feature_cols]
    exog_test = test_scaled[feature_cols]

    results, forecast, metrics = sarimax_fast(
        endog_train=endog_train,
        endog_test=endog_test,
        exog_train=exog_train,
        exog_test=exog_test
    )

    print(results)
    print(forecast)
    print(metrics)
    


NameError: name 'sarimax_fast' is not defined