In [2]:
# Parameters
# NOTE(review): these look like placeholder defaults that a notebook runner
# (e.g. papermill parameter injection) is expected to override — confirm.
# With the placeholders left as-is, the data files under BASE_PATH cannot be found.
BASE_PATH = "base_path"  # root directory that the "solar/..." data and model paths are joined onto
model_name = "model_name"  # model directory name; also shown in the plot titles
iter = 1  # iteration number used in the model path and plot titles; NOTE: shadows the built-in iter()

In [3]:
import pandas as pd
import joblib
import glob
import os 
import numpy as np
import yaml
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_pinball_loss
import plotly.graph_objects as go

In [4]:
def create_pred(BASE_PATH, model_name, iter):
    """Score the saved quantile models on the hold-out and API test sets and plot them.

    Reads ``solar/data/train_norm.csv`` and ``solar/data/test_norm.csv`` below
    ``BASE_PATH``. Depending on the ``train_test_split`` section of
    ``solar/config/dataloader_config.yaml`` either a test split is carved out of
    the first file or the whole file is used. Every pickled model matching the
    model glob is loaded, used to predict on both data sets, and scored with the
    mean pinball loss at the quantile encoded in its file name
    (``..._q1.pkl`` -> alpha 0.1, ..., ``..._q9.pkl`` -> alpha 0.9).

    Args:
        BASE_PATH: Root directory that all data, config and model paths are
            joined onto.
        model_name: Name of the model directory; also used in the plot titles.
        iter: Iteration number used in the model path and the plot titles
            (shadows the built-in ``iter`` inside this function).

    Returns:
        Tuple ``(y_test, test_predictions, test_mean_pinball_losses, y_test_api,
        test_api_predictions, test_api_mean_pinball_losses, model_paths)``.
        The three ``test_*`` / ``test_api_*`` dictionaries are keyed by model path.

    Raises:
        FileNotFoundError: If a data file or the config file does not exist.
        KeyError: If the config has no ``target_column`` or a model file name
            does not end in ``_q1`` ... ``_q9``.
    """
    # Paths
    TEST_DATA_PATH = os.path.join(BASE_PATH, "solar/data/train_norm.csv")
    API_TEST_DATA_PATH = os.path.join(BASE_PATH, "solar/data/test_norm.csv")
    MODEL_PATH = os.path.join(BASE_PATH, f"solar/models/{model_name}/{iter}")
    # Must be relative: an absolute second component makes os.path.join drop
    # BASE_PATH entirely and ties the notebook to one machine.
    CONFIG_PATH = os.path.join(BASE_PATH, "solar/config/dataloader_config.yaml")

    # Load data
    df_test = pd.read_csv(TEST_DATA_PATH)
    df_api_test = pd.read_csv(API_TEST_DATA_PATH)

    # Load configuration to check train_test_split settings
    with open(CONFIG_PATH, "r") as f:
        config = yaml.safe_load(f)

    split_config = config.get('train_test_split', {})
    train_test_split_enabled = split_config.get('enabled', False)
    ratio = split_config.get('ratio', 0.2)
    random_state = split_config.get('random_state', 0)
    # train_test_split expects a real boolean for `shuffle`, so coerce it.
    shuffle = bool(split_config.get('shuffle', False))
    target = config.get('target_column')
    if target is None:
        raise KeyError(f"'target_column' is missing from {CONFIG_PATH}")

    # Features / target are extracted the same way whether or not we split.
    X = df_test.drop(columns=[target])
    y = df_test[target]
    if train_test_split_enabled:
        _, X_test, _, y_test = train_test_split(
            X, y, test_size=ratio, random_state=random_state, shuffle=shuffle
        )
    else:
        # Use the entire dataset as test data if train_test_split is not enabled
        X_test, y_test = X, y

    X_test_api = df_api_test.drop(columns=[target])
    y_test_api = df_api_test[target]

    # Load models
    # NOTE(review): MODEL_PATH already ends in "{model_name}/{iter}" and the
    # pattern appends "{model_name}/models/i{iter}_models" again — confirm this
    # matches the real directory layout.
    model_paths = glob.glob(MODEL_PATH + f"/{model_name}/models/i{iter}_models/*.pkl")
    # SECURITY: joblib.load unpickles the file and can execute arbitrary code;
    # only load model files from a trusted location.
    models = {model_path: joblib.load(model_path) for model_path in model_paths}

    # Predictions
    test_predictions = {model_path: model.predict(X_test) for model_path, model in models.items()}
    test_api_predictions = {model_path: model.predict(X_test_api) for model_path, model in models.items()}

    # Alpha values
    alpha_val = {"q1": 0.1, "q2": 0.2, "q3": 0.3, "q4": 0.4, "q5": 0.5, "q6": 0.6, "q7": 0.7, "q8": 0.8, "q9": 0.9}

    # Quantile of each model, parsed from the file name only (e.g. "hgbr_q7.pkl"
    # -> "q7") so underscores or dots in directory names cannot interfere.
    model_alphas = {
        model_path: alpha_val[os.path.splitext(os.path.basename(model_path))[0].split("_")[-1]]
        for model_path in model_paths
    }

    # Mean pinball losses
    test_mean_pinball_losses = {
        model_path: mean_pinball_loss(y_test, test_predictions[model_path], alpha=model_alphas[model_path])
        for model_path in model_paths
    }

    test_api_mean_pinball_losses = {
        model_path: mean_pinball_loss(y_test_api, test_api_predictions[model_path], alpha=model_alphas[model_path])
        for model_path in model_paths
    }

    # Plot results based on train_test_split enabled/disabled
    if train_test_split_enabled:
        plot_results(y_test, test_predictions, test_mean_pinball_losses, y_test_api, test_api_predictions, test_api_mean_pinball_losses, model_name, iter)
    else:
        # Only plot the API test results
        plot_api_results(y_test_api, test_api_predictions, test_api_mean_pinball_losses, model_name, iter)

    return y_test, test_predictions, test_mean_pinball_losses, y_test_api, test_api_predictions, test_api_mean_pinball_losses, model_paths

def plot_results(y_test, test_predictions, test_mean_pinball_losses, y_test_api, test_api_predictions, test_api_mean_pinball_losses, model_name, iter, api_loss_scale=2778.9489032132205, test_loss_scale=2281.8743117295026):
    """Plot true values against every model's predictions for both data sets.

    Draws one Plotly figure for the API test set and one for the test set (in
    that order). After each figure the mean of the per-model pinball losses,
    multiplied by the matching scale factor and rounded to three decimals, is
    printed.

    Args:
        y_test: True target values of the test set.
        test_predictions: Mapping of model path -> predictions on the test set.
        test_mean_pinball_losses: Mapping of model path -> mean pinball loss on
            the test set.
        y_test_api: True target values of the API test set.
        test_api_predictions: Mapping of model path -> predictions on the API
            test set.
        test_api_mean_pinball_losses: Mapping of model path -> mean pinball loss
            on the API test set.
        model_name: Model name shown in the figure titles.
        iter: Iteration number shown in the figure titles.
        api_loss_scale: Factor the API test losses are multiplied by before
            display. NOTE(review): presumably converts the normalized loss back
            to MWh — confirm where this constant comes from.
        test_loss_scale: Same as ``api_loss_scale`` but for the test set.
    """
    panels = (
        ("Test Api", y_test_api, test_api_predictions, test_api_mean_pinball_losses, api_loss_scale),
        ("Test", y_test, test_predictions, test_mean_pinball_losses, test_loss_scale),
    )

    for label, y_true, predictions, losses, loss_scale in panels:
        fig = go.Figure()
        fig.add_trace(go.Scatter(y=y_true, name=f"True {label} Values", mode="lines", line=dict(color="black")))
        for model_path, prediction in predictions.items():
            fig.add_trace(go.Scatter(
                y=prediction,
                name=f"{label} Predictions {os.path.basename(model_path)}, Loss: {losses[model_path] * loss_scale}",
                mode="lines",
            ))
        fig.update_layout(
            title=f"True {label} Values vs Predictions of {model_name}, iteration {iter}",
            xaxis_title="Time",
            yaxis_title="Solar Production (MWh)",
        )
        fig.show()

        # Average over however many models were actually scored (not a fixed 9),
        # and use the built-in round() so plain Python floats work as well as
        # NumPy scalars.
        n_models = len(losses)
        mean_loss = sum(losses.values()) / n_models if n_models else float("nan")
        print(round(mean_loss * loss_scale, 3))

def plot_api_results(y_test_api, test_api_predictions, test_api_mean_pinball_losses, model_name, iter, loss_scale=2778.9489032132205):
    """Plot true values against every model's predictions for the API test set only.

    Draws one Plotly figure and then prints the mean of the per-model pinball
    losses, multiplied by ``loss_scale`` and rounded to three decimals.

    Args:
        y_test_api: True target values of the API test set.
        test_api_predictions: Mapping of model path -> predictions.
        test_api_mean_pinball_losses: Mapping of model path -> mean pinball loss.
        model_name: Model name shown in the figure title.
        iter: Iteration number shown in the figure title.
        loss_scale: Factor the losses are multiplied by before display.
            NOTE(review): presumably converts the normalized loss back to MWh —
            confirm where this constant comes from.
    """
    # Plot for API test data only
    fig_test_api = go.Figure()
    fig_test_api.add_trace(go.Scatter(y=y_test_api, name="True Test Api Values", mode="lines", line=dict(color="black")))
    for model_path, prediction in test_api_predictions.items():
        scaled_loss = test_api_mean_pinball_losses[model_path] * loss_scale
        fig_test_api.add_trace(go.Scatter(
            y=prediction,
            name=f"Test Api Predictions {os.path.basename(model_path)}, Loss: {scaled_loss}",
            mode="lines",
        ))
    fig_test_api.update_layout(title=f"True Test Api Values vs Predictions of {model_name}, iteration {iter}", xaxis_title="Time", yaxis_title="Solar Production (MWh)")
    fig_test_api.show()

    # Average over however many models were actually scored (not a fixed 9),
    # and use the built-in round() so plain Python floats work as well as
    # NumPy scalars.
    n_models = len(test_api_mean_pinball_losses)
    mean_loss = sum(test_api_mean_pinball_losses.values()) / n_models if n_models else float("nan")
    print(round(mean_loss * loss_scale, 3))

In [5]:
# Load models, score them and plot the results.
# create_pred() already draws the figures itself (both plots when the configured
# train/test split is enabled, only the API plot otherwise), so plot_results()
# is intentionally not called a second time here: doing so duplicated every
# figure and ignored the config-driven choice of which plots to show.
y_test, test_predictions, test_mean_pinball_losses, y_test_api, test_api_predictions, test_api_mean_pinball_losses, model_paths = create_pred(BASE_PATH=BASE_PATH, model_name=model_name, iter=iter)

FileNotFoundError: [Errno 2] No such file or directory: 'base_path/solar/data/train_norm.csv'

In [51]:
# Average the feature importances over all saved models and list them from
# most to least important. Models without a `feature_importances_` attribute
# are reported and skipped.
df = pd.read_csv(os.path.join(BASE_PATH, "solar/data/test_norm.csv"))
feature_importances = []

for model_path in model_paths:
    model = joblib.load(model_path)
    if not hasattr(model, 'feature_importances_'):
        print(f"Model at {model_path} does not have feature_importances_ attribute")
        continue
    feature_importances.append(model.feature_importances_)

if not feature_importances:
    print("No feature importances found for any model.")
else:
    # Element-wise mean across models -> one importance value per feature.
    mean_feature_importances = np.mean(feature_importances, axis=0)
    # Assumes the model features are all columns of df except the target.
    feature_names = df.drop(columns=["Target_Capacity_MWP_%"]).columns
    feature_importance_df = pd.DataFrame(
        {'Feature': feature_names, 'Importance': mean_feature_importances}
    ).sort_values(by='Importance', ascending=False)
    print(feature_importance_df)


Model at /Users/florian/Documents/github/DP2/Energy_production_price_prediction/Generation_forecast/Solar_forecast/models//hgbr_model/models/i5_models/hgbr_q7.pkl does not have feature_importances_ attribute
Model at /Users/florian/Documents/github/DP2/Energy_production_price_prediction/Generation_forecast/Solar_forecast/models//hgbr_model/models/i5_models/hgbr_q6.pkl does not have feature_importances_ attribute
Model at /Users/florian/Documents/github/DP2/Energy_production_price_prediction/Generation_forecast/Solar_forecast/models//hgbr_model/models/i5_models/hgbr_q4.pkl does not have feature_importances_ attribute
Model at /Users/florian/Documents/github/DP2/Energy_production_price_prediction/Generation_forecast/Solar_forecast/models//hgbr_model/models/i5_models/hgbr_q5.pkl does not have feature_importances_ attribute
Model at /Users/florian/Documents/github/DP2/Energy_production_price_prediction/Generation_forecast/Solar_forecast/models//hgbr_model/models/i5_models/hgbr_q1.pkl does 