In [None]:
#----IMPORT LIBRARIES----
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
import json

import pvlib

import mlflow
from mlflow.models.signature import infer_signature

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error

from sklearn.inspection import permutation_importance

import Model_func as mf
import boto3

from dotenv import load_dotenv
import os

# Read a local .env file (if any) into the process environment.
load_dotenv()

# Set MLflow's MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING flag to "false".
# NOTE(review): per its name this disables recording of environment variables
# during model logging -- confirm against the MLflow documentation.
os.environ.update({"MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING": "false"})


In [None]:
#---VARIABLES----
# Name of the column to predict; it is handed to mf.add_target and used to build y.
target = 'tch_solaire_(%)'

# Source of the production data, from which the target column is taken.
prod_data_path = 'https://renergies99-bucket.s3.eu-west-3.amazonaws.com/public/prod/eCO2mix_RTE_Auvergne-Rhone-Alpes_cleaned.csv'

# Sources of the explanatory data (weather, solar activity, Landsat).
landsat_data_path = 'https://renergies99-bucket.s3.eu-west-3.amazonaws.com/public/LandSat/result_EarthExplorer_region_ARA.csv'
solar_data_path = 'https://renergies99-bucket.s3.eu-west-3.amazonaws.com/public/solar/raw_solar_data.csv'
weather_data_path = 'https://renergies99-bucket.s3.eu-west-3.amazonaws.com/public/openweathermap/merge_openweathermap_cleaned.csv'


In [None]:
#--- PREPARATION ----
# Load each raw source through the project helpers in Model_func (imported as mf).
# Per the original note, the weather helper collects the data and formats the
# columns per city.
collected_weather_data = mf.data_collection_weather(weather_data_path)
collected_solar_data = mf.data_coll_solar(solar_data_path)
collected_landsat_data = mf.data_coll_landsat(landsat_data_path)

# Work on a copy so the collected Landsat frame itself is left untouched.
landsat_data = collected_landsat_data.copy()

# Weather + solar merge (kept as its own variable, separate from the 3-way merge below).
weather_solar = mf.merge_weather_solar_data(collected_weather_data, collected_solar_data)

# Reduced Landsat frame: keep the non-object columns only, then average the rows
# sharing the same 'Time' value (per the original note: one data point per day).
# NOTE(review): this requires 'Time' itself not to be of object dtype -- confirm.
columns_to_keep = landsat_data.select_dtypes(exclude=["object"]).columns
landsat_grouped_by_time = landsat_data[columns_to_keep].groupby('Time')
limited_landsat_data = landsat_grouped_by_time.mean().reset_index()

# Final feature table: weather + solar + reduced Landsat.
merged_data = mf.merge_weather_solar_landsat_data(
    collected_weather_data,
    collected_solar_data,
    limited_landsat_data,
)

In [None]:
# merged_data_copy = pd.read_csv('../../../Mes_fichiers_vrac/merged_data_copy.csv')
# merged_data_copy['Time'] = pd.to_datetime(merged_data_copy['Time'])

# merged_data = merged_data_copy.copy()

In [None]:
#---data_split: attach the target column to the merged features, then build X and y
prod_data = mf.data_collection_prod(prod_data_path)
targeted_data = mf.add_target(merged_data, prod_data, target_columns_to_use=['Time', target])

#---columns selection
col_solar = ['Ap', '10cm', 'K index Planetary']
# One entry per line, each followed by a comma: a missing comma between two
# adjacent string literals makes Python silently concatenate them
# ('feels_like' 'pressure' -> 'feels_likepressure'), which previously dropped
# both features from the selection.
col_weather = [
    'temp',
    'feels_like',
    'pressure',
    'humidity',
    'dew_point',
    'clouds',
    'wind_speed',
    'wind_deg',
]
# Human-readable feature list (used in the MLflow run description).
features = col_weather + col_solar + ['Moulins_day_length']

# Weather columns are matched by suffix: any column whose name ends with one of
# the col_weather entries is kept.
weather_suffixes = tuple(col_weather)
selected_weather_columns = [col for col in targeted_data.columns if col.endswith(weather_suffixes)]
selected_columns = selected_weather_columns + col_solar + ['Moulins_day_length']

y = targeted_data[target].to_numpy()
X = targeted_data[selected_columns]

#--- NaN handling: drop every column that contains at least one NaN
X = X.dropna(axis=1)


In [None]:
#---MLFlow params
# Tracking server URI and the experiment under which the run is recorded.
os.environ["APP_URI"] = "https://renergies99-mlflow.hf.space/"
EXPERIMENT_NAME = "all_columns_models"

mlflow.set_tracking_uri(os.environ["APP_URI"])
mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

mlflow.sklearn.autolog()  # enables automatic logging for scikit-learn

#---Preprocess
# The helper's result is indexed by key; only its "pipeline" and "preprocessor"
# entries are read here.
result_preprocess = mf.preprocessing_and_pipeline(X)
pipeline = result_preprocess["pipeline"]
preprocessor = result_preprocess["preprocessor"]

# Free-text description attached to the MLflow run.
run_description = (
    f"Features used: {features}\nTarget: {target}\n"
    f"Estimator: {pipeline.named_steps['estimator']}"
)

# 70/30 split with a fixed seed so the run is reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)
# First three training rows, logged with the model as an input example.
input_example = x_train.iloc[:3]


with mlflow.start_run(experiment_id=experiment.experiment_id, description=run_description):
    # Fit the pipeline (preprocessing + model)
    pipeline.fit(x_train, y_train)

    # Predict once on the test set and reuse the result for both the model
    # signature and the metrics (it was previously computed twice).
    y_pred = pipeline.predict(x_test)

    # signature
    signature = infer_signature(x_test, y_pred)

    #Artifact for features_names
    #mf.custom_get_feature_names(result_preprocess, artifact_name="features.json")

    # Artifact for the confidence interval: the helper's result is serialised to
    # JSON in the working directory, then uploaded to the run.
    # NOTE(review): error is presumably a pandas object (it exposes to_json) -- confirm.
    error = mf.error_stat(x_test, y_test, pipeline)
    error.to_json("error.json")
    mlflow.log_artifact("error.json")

    # metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Adjusted R2 uses n = number of test samples and p = number of feature columns.
    n = len(y_test)
    p = X.shape[1]
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

    # logging metrics
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)
    mlflow.log_metric("Adjusted_R2", adj_r2)

    # Log the full pipeline as a model
    mlflow.sklearn.log_model(pipeline, signature=signature, input_example=input_example)



# Results exploration
Coefficients

In [None]:
# Preprocessing step of the fitted pipeline
preprocessor = pipeline.named_steps['preprocessor']

# Collect the input columns of the 'num' and 'obj' transformers, in declaration
# order. Per the original notes, 'num' is a StandardScaler (column count
# unchanged) and 'obj' is a passthrough (columns kept as-is), so these names
# line up one-to-one with the model coefficients.
feature_names = []
for name, transformer, cols in preprocessor.transformers:
    if name in ('num', 'obj'):
        feature_names.extend(cols)

# Coefficients of the fitted estimator
coefs = pipeline.named_steps['estimator'].coef_

# Sanity check: both lengths must match for the table below to be built
print(len(feature_names), len(coefs))

# Coefficient table, sorted by absolute value
coef_table = pd.DataFrame({'feature': feature_names, 'coefficient': coefs})
df_coef = coef_table.sort_values(by='coefficient', key=abs)

px.bar(df_coef, x='coefficient', y='feature')

Erreur

In [None]:
# Predictions on the test set and their residuals (actual - predicted)
y_pred = pipeline.predict(x_test)
residuals = y_test - y_pred

# Residuals plotted against the predicted values
residual_trace = go.Scatter(
    x=y_pred,
    y=residuals,
    mode='markers',
    name='Résidus',
    marker=dict(color='blue', size=8)
)

# Dashed horizontal reference line at y=0, spanning the range of predictions
zero_trace = go.Scatter(
    x=[min(y_pred), max(y_pred)],
    y=[0, 0],
    mode='lines',
    line=dict(color='red', dash='dash'),
    name='Zéro'
)

# Assemble the figure from both traces, then set titles and legend
fig = go.Figure(data=[residual_trace, zero_trace])
fig.update_layout(
    title="Residual Plot",
    xaxis_title="Predictions",
    yaxis_title="Residuals",
    showlegend=True
)

fig.show()


In [None]:
# Two-column frame (actual vs predicted) consumed by Plotly Express
df_plot = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

# Scatter of predictions against actual values
axis_labels = {'y_test': 'Valeurs réelles tch', 'y_pred': 'Prédictions'}
fig = px.scatter(
    df_plot,
    x='y_test',
    y='y_pred',
    labels=axis_labels,
    title='Prédictions vs Valeurs réelles'
)

# Dashed y=x reference line (the ideal case), drawn across the range of actual values
actual_min = df_plot['y_test'].min()
actual_max = df_plot['y_test'].max()
fig.add_shape(
    type='line',
    x0=actual_min,
    y0=actual_min,
    x1=actual_max,
    y1=actual_max,
    line=dict(color='red', dash='dash')
)

fig.show()
