In [7]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
# from power_predict.logic.registry import save_model, save_performance

In [8]:
# --- Load the merged dataset ---
# NOTE(review): this is an absolute path on one developer's machine, so the
# cell will not run elsewhere as-is; consider a configurable data directory.
DATA_PATH = '/Users/FernandoSandoval/code/VonRiecken/Power-Predict/power_predict/data/merged_dataset2023-12-01 18:08:35.062618.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first five rows to sanity-check the columns.
df.head()

Unnamed: 0.1,Unnamed: 0,Month_year,Country,Balance,Combustible_Renewables,Hydro,Other_Renewables,Solar,Total_Renewables__Hydro__Geo__Solar__Wind__Other_,Wind,value_CDD_18,value_CDD_21,value_Global_Horizontal_Irrandiance,value_HDD_16,value_HDD_18,value_Heat_index,value_Relative_Humidty,value_Temperature,value_Total_Precipitation,total_sol_wind_hyd
0,0,2010-04-01,Australia,Net Electricity Production,216.287,1044.406,0.0,26.811,1638.098,350.511,0.111464,0.03712857,0.598917,0.019489,0.038689,0.639927,0.399736,0.867825,0.068503,1421.728
1,1,2010-04-01,Austria,Net Electricity Production,350.383,2504.13,0.0,9.965,2995.696,131.107,0.000126,0.0,0.611575,0.255355,0.300292,0.435803,0.60588,0.556986,0.131309,2645.202
2,2,2010-04-01,Belgium,Net Electricity Production,383.177,141.898,0.0,80.419,702.509,97.015,0.002296,0.0,0.600754,0.218429,0.264712,0.459624,0.57367,0.605589,0.049963,319.332
3,3,2010-04-01,Canada,Net Electricity Production,712.988,28243.738,2.44,24.184,29709.301,725.951,0.000159,6.825537e-08,0.564642,0.275212,0.319491,0.42329,0.622398,0.397319,0.068166,28993.873
4,4,2010-04-01,Chile,Net Electricity Production,152.826,1740.416,0.0,0.0,1913.601,20.359,0.003268,0.0001182407,0.47987,0.12606,0.175605,0.518065,0.551867,0.612198,0.197151,1760.775


In [9]:
# --- Data Preprocessing ---

# Index every row by a "<Country>_<Month_year>" label. `Country` itself stays
# a regular column, so it is still available as a feature.
country_month = df['Country'] + '_' + df['Month_year'].astype(str)
df = df.assign(Country_Month=country_month).set_index('Country_Month')

# Columns that must not reach the feature matrix, grouped by reason.
metadata_columns = ['Unnamed: 0', 'Month_year', 'Balance']
production_columns = [
    'Combustible_Renewables', 'Hydro', 'Other_Renewables', 'Solar',
    'Total_Renewables__Hydro__Geo__Solar__Wind__Other_', 'Wind',
    'total_sol_wind_hyd',
]
excluded_value_columns = [
    'value_CDD_18', 'value_CDD_21',
    'value_HDD_16', 'value_HDD_18', 'value_Heat_index',
]

# Feature matrix: whatever is left after removing the columns above.
X = df.drop(columns=metadata_columns + production_columns + excluded_value_columns)

In [10]:
# Log-transform the target variables with log1p, i.e. log(1 + x), so that a
# production value of 0 maps to 0 rather than -inf.
target_columns = ['Hydro', 'Solar', 'Wind', 'total_sol_wind_hyd']
y = np.log1p(df.loc[:, target_columns])
y

Unnamed: 0_level_0,Hydro,Solar,Wind,total_sol_wind_hyd
Country_Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Australia_2010-04-01,6.952161,3.325432,5.862241,7.260331
Austria_2010-04-01,7.826096,2.394708,4.883612,7.880881
Belgium_2010-04-01,4.962131,4.399609,4.585121,5.769358
Canada_2010-04-01,10.248662,3.226209,6.588859,10.274874
Chile_2010-04-01,7.462454,0.000000,3.061473,7.474077
...,...,...,...,...
Spain_2022-09-01,7.286618,8.075533,8.342723,9.090765
Sweden_2022-09-01,8.698386,4.628640,7.594843,8.997336
Switzerland_2022-09-01,7.920937,5.825151,2.494824,8.040184
United Kingdom_2022-09-01,5.909365,7.234739,8.616540,8.892267


In [11]:
# Names of every column in X that has a numeric dtype.
num_features = list(X.select_dtypes(include='number').columns)

# Column-wise preprocessing: min-max scale the numeric features and one-hot
# encode the country. With handle_unknown='ignore', a country that was not
# seen during fit is encoded as all zeros instead of raising.
numeric_steps = Pipeline(steps=[('scaler', MinMaxScaler())])
country_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ('num', numeric_steps, num_features),
        ('cat', country_encoder, ['Country']),
    ]
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [13]:
# --- Model definition ---
# MultiOutputRegressor fits one LinearRegression per target column.
multi_linear_regressor = MultiOutputRegressor(estimator=LinearRegression())

# End-to-end pipeline: preprocessing first, then the multi-output regressor.
pipeline = Pipeline([
    ('preprocessor', preprocessing_pipeline),
    ('multi_linear_regressor', multi_linear_regressor),
])

In [14]:
# --- 5-fold cross-validation on the training split ---
# No `scoring` argument is passed, so each fold is scored with the
# pipeline's own `score` method, on the log1p-transformed targets.
cv_scores = cross_val_score(estimator=pipeline, X=X_train, y=y_train, cv=5)
mean_cv_score = cv_scores.mean()
print(f"Cross-validated scores for 5 folds on the training data: {cv_scores}")
print(f"Mean CV Score: {mean_cv_score}")

Cross-validated scores for 5 folds on the training data: [0.91302623 0.91815152 0.91166115 0.91303916 0.91926171]
Mean CV Score: 0.9150279521765425


In [15]:
# --- Model Training ---
# Fit the preprocessing steps and the regressors on the training split only;
# the fitted pipeline is the cell's displayed value.
pipeline.fit(X=X_train, y=y_train)

In [19]:
# --- Model Evaluation ---
# Predict on the held-out split; the pipeline applies its fitted
# preprocessing to X_test before the regressors run.
y_pred = pipeline.predict(X_test)

# The model is trained on log1p-transformed targets, so map both the
# predictions and the ground truth back to the original scale with expm1.
# The ground truth is inverted once here rather than three times per target.
y_pred = np.expm1(y_pred)
y_true = np.expm1(y_test)

# --- Save Model ---
# Save the fitted pipeline model as 'knn_log'.
# NOTE(review): the name says "knn" but this pipeline wraps LinearRegression;
# confirm the intended registry name before re-enabling the call.
# save_model(pipeline, 'knn_log')

# --- Save Params and Metrics ---
# Parameters of the multi-output regressor step, collected into `params`.
params = pipeline.named_steps['multi_linear_regressor'].get_params()

# Per-target performance metrics, computed on the original (inverse
# transformed) scale. Target names are taken from y_test itself so they
# cannot drift from the column order used for training; column i of y_pred
# corresponds to column i of y_test.
metrics = {}
for i, target in enumerate(y_test.columns):
    actual = y_true.iloc[:, i]
    predicted = y_pred[:, i]

    mse = mean_squared_error(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    rmse = np.sqrt(mse)

    metrics[target] = {
        # A single cross-validation score for the whole model (computed on
        # the log scale), repeated under every target.
        'Mean CV Score': mean_cv_score,
        'Mean Absolute Error': mae,
        'Mean Squared Error': mse,
        'Root Mean Squared Error': rmse,
        'R-squared': r2
    }

# Call save_performance in registry.py to save both dicts with a time stamp.
# save_performance('knn_log', params, metrics)

In [20]:
# Print a readable report: one section per target, each metric to 4 decimals.
for target in metrics:
    print(f"Metrics for {target}:")
    for metric_name, metric_value in metrics[target].items():
        print(f"    {metric_name}: {metric_value:.4f}")
    print("\n")


Metrics for Hydro:
    Mean CV Score: 0.9150
    Mean Absolute Error: 843.2208
    Mean Squared Error: 15057538.1146
    Root Mean Squared Error: 3880.4044
    R-squared: 0.9178


Metrics for Solar:
    Mean CV Score: 0.9150
    Mean Absolute Error: 483.6467
    Mean Squared Error: 3455982.7579
    Root Mean Squared Error: 1859.0274
    R-squared: 0.5493


Metrics for Wind:
    Mean CV Score: 0.9150
    Mean Absolute Error: 701.0055
    Mean Squared Error: 7279061.7592
    Root Mean Squared Error: 2697.9736
    R-squared: 0.7446


Metrics for total_sol_wind_hyd:
    Mean CV Score: 0.9150
    Mean Absolute Error: 1480.0363
    Mean Squared Error: 35382740.2384
    Root Mean Squared Error: 5948.3393
    R-squared: 0.9099


