In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# --- Data Collection and Preparation ---
df = pd.read_csv('../power_predict/data/merged_dataset2023-11-29 16:33:32.960189.csv') 
df.head(5)

Unnamed: 0.1,Unnamed: 0,Month_year,Country,Balance,Combustible_Renewables,Hydro,Other_Renewables,Solar,Total_Renewables__Hydro__Geo__Solar__Wind__Other_,Wind,value_CDD_18,value_CDD_21,value_Global_Horizontal_Irrandiance,value_HDD_16,value_HDD_18,value_Heat_index,value_Relative_Humidty,value_Temperature,value_Total_Precipitation,total_sol_wind_hyd
0,0,2010-04-01,Australia,Net Electricity Production,216.287,1044.406,0.0,26.811,1638.098,350.511,48.13,12.62,720994.0,17.47,37.08,18.41,57.37,23.21,0.05483,1421.728
1,1,2010-04-01,Austria,Net Electricity Production,350.383,2504.13,0.0,9.965,2995.696,131.107,0.05449,0.0,736161.0,228.9,287.8,8.41,69.85,6.748,0.1051,2645.202
2,2,2010-04-01,Belgium,Net Electricity Production,383.177,141.898,0.0,80.419,702.509,97.015,0.9916,0.0,723195.0,195.8,253.7,9.577,67.9,9.322,0.03999,319.332
3,3,2010-04-01,Canada,Net Electricity Production,712.988,28243.738,2.44,24.184,29709.301,725.951,0.06883,2.3e-05,679927.0,246.7,306.2,7.797,70.85,-1.708,0.05456,28993.873
4,4,2010-04-01,Chile,Net Electricity Production,152.826,1740.416,0.0,0.0,1913.601,20.359,1.411,0.04019,578355.0,113.0,168.3,12.44,66.58,9.672,0.1578,1760.775


In [3]:

# --- Data Preprocessing ---

# Assuming 'df' is your DataFrame

# Setting Country + Month year as Index
df['Country_Month'] = df['Country'] + '_' + df['Month_year'].astype(str)
df = df.set_index('Country_Month')

# Separating features and target variables
X = df.drop(['Unnamed: 0', 'Month_year', 'Balance',
             'Combustible_Renewables', 'Hydro', 'Other_Renewables', 'Solar',
             'Total_Renewables__Hydro__Geo__Solar__Wind__Other_', 'Wind',
             'total_sol_wind_hyd'], axis=1)

# Applying logistic (log) transformation to the target variables
y = np.log1p(df[['Hydro', 'Solar', 'Wind', 'total_sol_wind_hyd']])

In [5]:
# Init list of numerical columns
num_features = X.select_dtypes(include=[np.number]).columns.tolist()

# Preprocessing pipeline
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('scaler', MinMaxScaler())
        ]), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Country'])
    ])

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# --- Create Model Training ---
# KNeighborsRegressor wrapped in MultiOutputRegressor
multi_knn_regressor = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=20, p=2, weights='uniform'))

# Pipeline including preprocessing and multi-output KNN regressor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessing_pipeline),
    ('multi_knn_regressor', multi_knn_regressor)
])


In [7]:
# --- Model Training ---
# Train the model
pipeline.fit(X_train, y_train)


In [8]:

# --- Model Evaluation ---
# Evaluate the model
y_pred = pipeline.predict(X_test)  # X_test will be automatically preprocessed by the pipeline

# Inverse log transformation of the predictions
y_pred = np.expm1(y_pred)

# Evaluation metrics for each target, in the original scale
for i, target in enumerate(['Hydro', 'Solar', 'Wind', 'total_sol_wind_hyd']):
    mse = mean_squared_error(np.expm1(y_test.iloc[:, i]), y_pred[:, i])
    mae = mean_absolute_error(np.expm1(y_test.iloc[:, i]), y_pred[:, i])
    r2 = r2_score(np.expm1(y_test.iloc[:, i]), y_pred[:, i])
    rmse = np.sqrt(mse)  # Calculating RMSE
    print(f"--- {target} ---")
    print(f"Mean Absolute Error: {mae}")
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {rmse}\n")
    print(f"R-squared: {r2}\n")



--- Hydro ---
Mean Absolute Error: 490.98777384586634
Mean Squared Error: 3231325.4753257115
Root Mean Squared Error: 1797.5887948376046

R-squared: 0.9823606678565688

--- Solar ---
Mean Absolute Error: 477.1939726251077
Mean Squared Error: 3268458.6665028804
Root Mean Squared Error: 1807.88790208433

R-squared: 0.5737276986102838

--- Wind ---
Mean Absolute Error: 696.7775297018535
Mean Squared Error: 6942822.146349584
Root Mean Squared Error: 2634.9235560732277

R-squared: 0.7564302790215275

--- total_sol_wind_hyd ---
Mean Absolute Error: 1334.8280666142257
Mean Squared Error: 23552767.02449715
Root Mean Squared Error: 4853.1193087021

R-squared: 0.940057553431901

