In [32]:
# Import libraries 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [33]:
# Load dataset 
data = pd.read_csv("/workspace/COMP3610-Renewable-Energy-Prediction/data/processed/feature_engineered_data.csv")


In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177210 entries, 0 to 177209
Data columns (total 44 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Area                           177210 non-null  object 
 1   YEAR                           177210 non-null  int64  
 2   Solar                          177210 non-null  float64
 3   Wind Onshore                   177210 non-null  float64
 4   Country                        177210 non-null  object 
 5   time                           177210 non-null  object 
 6   temp                           177210 non-null  float64
 7   dwpt                           177210 non-null  float64
 8   rhum                           177210 non-null  float64
 9   prcp                           177210 non-null  float64
 10  wspd                           177210 non-null  float64
 11  pres                           177210 non-null  float64
 12  hour                          

In [35]:
# Define target columns (from your feature engineering code)
target_col = ['Solar', 'Wind Onshore']

# Prepare features and target
X = data.drop(columns=target_col)  # Drop both target columns to get features
y = data[target_col]            # Select specific target column

In [36]:
# Identify column types
categorical_cols = ['Area', 'Country']  # Explicitly list categorical columns
bool_cols = [col for col in X.columns if X[col].dtype == bool]  # Auto-detect boolean columns
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_cols = [col for col in numeric_cols if col not in categorical_cols + bool_cols]

In [37]:
# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('bool', 'passthrough', bool_cols)
    ],
    remainder='drop'  # drops any unhandled columns
)

In [38]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
# Function to train and evaluate models
def train_evaluate(model, model_name):
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"\n{model_name} Performance ({target_col}):")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R²: {r2:.4f}")
    
    # Cross-validation
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
    print(f"CV RMSE: {np.sqrt(-cv_scores.mean()):.4f}")
    
    return pipeline


In [40]:
# Initialize models
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
   # "Support Vector Machine": SVR(kernel='rbf')
}

In [41]:
# Train and evaluate each model
trained_models = {}
for name, model in models.items():
    trained_models[name] = train_evaluate(model, name)

# Plot feature importance for tree-based models
for name in ['Random Forest', 'Gradient Boosting']:
    try:
        # Get feature names after one-hot encoding
        cat_encoder = trained_models[name].named_steps['preprocessor'].named_transformers_['cat']
        cat_features = cat_encoder.get_feature_names_out(categorical_cols)
        all_features = numeric_cols + list(cat_features) + bool_cols
        
        # Get importance scores
        importances = trained_models[name].named_steps['regressor'].feature_importances_
        
        # Create and plot importance dataframe
        importance_df = pd.DataFrame({'Feature': all_features, 'Importance': importances})
        importance_df = importance_df.sort_values('Importance', ascending=False).head(20)
        
        plt.figure(figsize=(10, 6))
        plt.title(f'Feature Importance - {name}')
        plt.barh(importance_df['Feature'], importance_df['Importance'])
        plt.gca().invert_yaxis()
        plt.show()
    except AttributeError:
        continue


Random Forest Performance (['Solar', 'Wind Onshore']):
RMSE: 256.4148
MAE: 154.2558
R²: 0.9920


KeyboardInterrupt: 