In [12]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
# from power_predict.logic.registry import save_model, save_performance

# --- Fetching Data ---
# Absolute, user-specific location of the merged dataset CSV.
DATA_PATH = '/Users/FernandoSandoval/code/VonRiecken/Power-Predict/power_predict/data/merged_dataset2023-12-04 23:03:28.614514.csv'

# Read the whole CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Preview of the first five rows; the result is not stored anywhere.
df.head(n=5)

# --- Data Preprocessing ---

# Columns removed from the feature matrix (this includes every target column).
EXCLUDED_COLUMNS = [
    'Unnamed: 0', 'Month_year', 'Balance',
    'Combustible_Renewables', 'Hydro', 'Other_Renewables', 'Solar',
    'Total_Renewables__Hydro__Geo__Solar__Wind__Other_', 'Wind',
    'total_sol_wind_hyd', 'value_CDD_18', 'value_CDD_21',
    'value_HDD_16', 'value_HDD_18', 'value_Heat_index',
]

# Columns the model is trained to predict.
TARGET_COLUMNS = ['Hydro', 'Solar', 'Wind', 'total_sol_wind_hyd']

# Re-index the rows by a "<Country>_<Month_year>" label.
# 'Country' itself remains available as a regular column.
df = df.assign(
    Country_Month=df['Country'] + '_' + df['Month_year'].astype(str)
).set_index('Country_Month')

# Feature matrix: everything that is left after removing the excluded columns.
X = df.drop(columns=EXCLUDED_COLUMNS)

# Targets on a log(1 + value) scale (np.log1p). This is a logarithmic
# transform, not a logistic one; np.expm1 is its inverse.
y = np.log1p(df[TARGET_COLUMNS])
y

# Names of every numeric column in X; these are the columns that get scaled.
num_features = list(X.select_dtypes(include='number').columns)

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing pipeline, assembled from two independent branches:
#   * numeric columns are rescaled with MinMaxScaler
#   * 'Country' is one-hot encoded; handle_unknown='ignore' lets transform
#     accept countries that were not present when the encoder was fitted
numeric_branch = Pipeline(steps=[('scaler', MinMaxScaler())])
country_branch = OneHotEncoder(handle_unknown='ignore')

preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ('num', numeric_branch, num_features),
        ('cat', country_branch, ['Country']),
    ]
)

# --- Create Model Training ---
# MultiOutputRegressor wraps a KNeighborsRegressor (20 neighbours, uniform
# weights, Minkowski distance with p=2) so that the multi-column target
# can be fitted.
knn_settings = {'n_neighbors': 20, 'p': 2, 'weights': 'uniform'}
multi_knn_regressor = MultiOutputRegressor(KNeighborsRegressor(**knn_settings))

# Degree-2 polynomial expansion applied to the preprocessed features.
polynomial_features = PolynomialFeatures(degree=2)

# Full estimator: preprocessing -> polynomial features -> multi-output KNN
# regression.
model_steps = [
    ('preprocessor', preprocessing_pipeline),
    ('poly_features', polynomial_features),
    ('multi_knn_regressor', multi_knn_regressor),
]
pipeline = Pipeline(steps=model_steps)


# --- 5-Fold Cross-Validation ---
# No `scoring` argument is given, so the estimator's default scorer is used.
# It is evaluated against y_train, i.e. on the log1p-scaled targets, not on
# the original units.
cv_scores = cross_val_score(estimator=pipeline, X=X_train, y=y_train, cv=5)
mean_cv_score = cv_scores.mean()

print(f"Cross-validated scores for 5 folds on the training data: {cv_scores}")
print(f"Mean CV Score: {mean_cv_score}")

# --- Model Training ---
# Fit every pipeline step on the training split.
pipeline.fit(X_train, y_train)

# --- Model Evaluation ---
# predict() sends X_test through the fitted preprocessing steps before the
# regressor. The model was trained on log1p-scaled targets, so expm1 is
# applied to bring the predictions back to the original units.
y_pred = np.expm1(pipeline.predict(X_test))

# --- Save Model ---
# Save fitted pipeline model as 'knn_log'
# save_model(pipeline, 'knn_log')

# --- Save Params and Metrics ---
# Parameters of the final pipeline step (the MultiOutputRegressor wrapper),
# collected into the dict 'params'.
params = pipeline.named_steps['multi_knn_regressor'].get_params()

# --- Performance metrics ---
# Undo the log1p transform once for all targets so that every metric below is
# computed in the original units, matching y_pred (already inverse-transformed).
y_test_original = np.expm1(y_test)

# metrics maps target name -> dict of metric name -> value.
metrics = {}
# Iterate over the columns of y_test itself so that the target names and their
# positional indices into y_pred cannot drift out of sync with the data.
for i, target in enumerate(y_test_original.columns):
    actual = y_test_original.iloc[:, i]
    predicted = y_pred[:, i]

    mse = mean_squared_error(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    rmse = np.sqrt(mse)

    # 'Mean CV Score' is the single pipeline-wide cross-validation score
    # (computed on the log-scaled targets); it is repeated for every target.
    metrics[target] = {
        'Mean CV Score': mean_cv_score,
        'Mean Absolute Error': mae,
        'Mean Squared Error': mse,
        'Root Mean Squared Error': rmse,
        'R-squared': r2
    }

# Call the save_performance function in registry.py to save the dicts with a
# time stamp in the correct file.
# save_performance('knn_log', params, metrics)

# Report every metric per target, formatted to four decimal places.
for target, metrics_values in metrics.items():
    print(f"Metrics for {target}:")
    for metric_name, metric_value in metrics_values.items():
        print(f"    {metric_name}: {metric_value:.4f}")
    print("\n")


Cross-validated scores for 5 folds on the training data: [0.91148104 0.91674235 0.91207561 0.91133013 0.91887947]
Mean CV Score: 0.9141017196352784
Metrics for Hydro:
    Mean CV Score: 0.9141
    Mean Absolute Error: 494.8786
    Mean Squared Error: 3008284.7158
    Root Mean Squared Error: 1734.4408
    R-squared: 0.9836


Metrics for Solar:
    Mean CV Score: 0.9141
    Mean Absolute Error: 482.3370
    Mean Squared Error: 3339206.0219
    Root Mean Squared Error: 1827.3495
    R-squared: 0.5645


Metrics for Wind:
    Mean CV Score: 0.9141
    Mean Absolute Error: 708.8939
    Mean Squared Error: 7141506.8619
    Root Mean Squared Error: 2672.3598
    R-squared: 0.7495


Metrics for total_sol_wind_hyd:
    Mean CV Score: 0.9141
    Mean Absolute Error: 1382.5777
    Mean Squared Error: 25657803.7841
    Root Mean Squared Error: 5065.3533
    R-squared: 0.9347


