In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
# =======================
# Load & prepare data
# =======================
df = pd.read_csv("dataset/T1.csv", parse_dates=['Date/Time'])

# Now convert to datetime with dayfirst=True (since your format is day month year)
df['Date/Time'] = pd.to_datetime(df['Date/Time'], dayfirst=True)
df = df.sort_values('Date/Time')

# Calculate time_diff first
df['time_diff'] = df['Date/Time'].diff().dt.total_seconds().div(60)
df['time_diff'].fillna(0, inplace=True)  # Handle first row NaN

features = ['Wind Speed (m/s)', 'Wind Direction (°)', 'Theoretical_Power_Curve (KWh)', 'time_diff']
target = 'LV ActivePower (kW)'

X = df[features]
y = df[target]

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Features: {features}")

X shape: (50530, 4)
y shape: (50530,)
Features: ['Wind Speed (m/s)', 'Wind Direction (°)', 'Theoretical_Power_Curve (KWh)', 'time_diff']


In [5]:
# =======================
# Define models
# =======================
models = {
    "XGBoost": XGBRegressor(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=42,
        early_stopping_rounds=20,
        objective='reg:squarederror'
    ),
    "LightGBM": LGBMRegressor(
        n_estimators=300,
        num_leaves=63,
        learning_rate=0.05,
        feature_fraction=0.9,
        bagging_fraction=0.9,
        random_state=42
    ),
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=200,
        max_depth=4,
        learning_rate=0.1,
        subsample=0.9,
        random_state=42
    ),
    "MLP": Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', MLPRegressor(
            hidden_layer_sizes=(128, 64),
            activation='relu',
            learning_rate_init=0.001,
            max_iter=500,
            alpha=0.001,
            random_state=42
        ))
    ])
}

In [12]:
# =======================
# Evaluation
# =======================
tscv = TimeSeriesSplit(n_splits=5)
results = []

for name, model in models.items():
    mae_list, rmse_list, r2_list = [], [], []
    
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        # Handle different model parameters
        if name == "XGBoost":
            model.fit(X_train, y_train,
                     eval_set=[(X_test, y_test)],
                     verbose=False)
        elif name == "LightGBM":
            model.fit(X_train, y_train,
                     eval_set=[(X_test, y_test)])
            # LightGBM doesn't have 'verbose' in fit(), use early_stopping_rounds instead if needed
        else:
            model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        
        mae_list.append(mae)
        rmse_list.append(rmse)
        r2_list.append(r2)
    
    results.append({
        "Model": name,
        "MAE": np.mean(mae_list),
        "RMSE": np.mean(rmse_list),
        "R²": np.mean(r2_list)
    })

results_df = pd.DataFrame(results).sort_values("RMSE")
print(results_df)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000548 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 8425, number of used features: 3
[LightGBM] [Info] Start training from score 1482.974409
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000657 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 16846, number of used features: 3
[LightGBM] [Info] Start training from score 1388.049664
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 25267, number of used features: 3
[LightGBM] [Info] Start trai



              Model         MAE        RMSE        R²
3               MLP  188.090026  362.285659  0.913074
0           XGBoost  202.168336  374.800178  0.907761
2  GradientBoosting  185.027040  401.994660  0.893363
1          LightGBM  189.698656  424.305020  0.881188
