In [7]:
#----IMPORT LIBRARIES----
import pandas as pd
import seaborn as sns
import json
import math
from scipy.stats import zscore

import plotly
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.io as pio
from plotly.subplots import make_subplots

import pvlib

import mlflow
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error

from mlflow.models.signature import infer_signature

import boto3

from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process environment
load_dotenv()
# Must be set before any model is logged.
# NOTE(review): presumably prevents MLflow from recording environment variables
# alongside logged models — confirm against the MLflow documentation.
os.environ["MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING"] = "false"

import Model_func as mf
import func_cleaning as fc

In [None]:
#---VARIABLES----
# Common public prefix of the S3 bucket that hosts every input CSV
S3_PUBLIC_URL = 'https://renergies99-lead-bucket.s3.eu-west-3.amazonaws.com/public'

# Input datasets (one CSV per source)
weather_data_path = f'{S3_PUBLIC_URL}/openweathermap/merge_openweathermap_cleaned.csv'
solar_data_path = f'{S3_PUBLIC_URL}/solar/raw_solar_data.csv'
landsat_data_path = f'{S3_PUBLIC_URL}/LandSat/result_EarthExplorer_region_ARA.csv'
prod_data_path = f'{S3_PUBLIC_URL}/prod/eCO2mix_RTE_Auvergne-Rhone-Alpes_cleaned.csv'

# Name of the column the model predicts
target = 'tch_solaire_(%)'
# Columns kept from the solar dataset.
# ALWAYS include a 'Time' column (used to merge datasets)
col_solar = ['Time', 'Ap', '10cm', 'K index Planetary']
# Cities passed to the dataset builder. The accented name must be a properly
# encoded 'É' (the previous mis-decoded value could never match a real city name).
cities_list = ['Moulins', 'Annecy', 'Nyons', 'Saint-Étienne', 'Aurillac']

In [9]:
#--- Data Collection ----
# Assemble the working dataset from the four CSV sources via the project helper,
# restricted to the configured cities and solar columns.
full_dataset = fc.create_full_dataset(
    weather_data_path,
    solar_data_path,
    landsat_data_path,
    prod_data_path,
    cities_list,
    col_solar,
    target,
)

# Optional local export for manual inspection (kept disabled):
#full_dataset.to_csv('../../../Mes_fichiers_vrac/data_csv.csv')


In [10]:
#--- Data cleaning ---
# Work on a copy so the collected dataset itself is left untouched
df = full_dataset.copy()
print(f"df shape: {df.shape}")

# Missing-value handling
df_no_Nan = fc.handle_nan(df)
print(f"df_no_Nan shape: {df_no_Nan.shape}")

# Cleaning step (convert int to float, select type columns, remove unique values)
df_clean = fc.clean_dataframe(df_no_Nan, type="numeric")
print(f"df_clean shape: {df_clean.shape}")

# Cast any remaining int64 columns to float: change related to the MLflow
# signature, which was returning an error
cols = df_clean.select_dtypes(include="int64").columns.tolist()
df_clean[cols] = df_clean[cols].astype(float)

# Outlier removal
df_no_outliers = fc.remove_outliers(df_clean, target, method="iqr")
print(f"df_no_outliers shape: {df_no_outliers.shape}")



df shape: (1757, 193)
df_no_Nan shape: (1757, 124)
df_clean shape: (1757, 83)
df_no_outliers shape: (1389, 83)


In [11]:
#--- Features and target definition
# y is the target column; X is every other column of the outlier-free frame
y = df_no_outliers[target]
X = df_no_outliers.drop(columns=target)

In [None]:
# Train a linear-regression pipeline on all columns and record the run in MLflow:
# fit, evaluate on a hold-out split, then log metrics, helper artifacts and the model.
EXPERIMENT_NAME = "all_columns_models"
run_name = 'super_popular_panda'
threshold = [42]

#---Preprocess
# The project helper returns a dict holding the full pipeline and its preprocessor.
# NOTE(review): `suffixes` / `split_thresholds` semantics live in Model_func — confirm there.
result_preprocess = mf.preprocessing_and_pipeline(
    X, LinearRegression(), suffixes=['humidity'], split_thresholds=threshold
)
pipeline = result_preprocess["pipeline"]
preprocessor = result_preprocess["preprocessor"]

# 70/30 train/test split with a fixed seed so the run is reproducible
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)
# First three training rows, logged with the model as its input example
input_example = x_train.iloc[:3]

#---MLFlow params
# NOTE(review): this overrides any APP_URI already loaded from the environment/.env
os.environ["APP_URI"] = "https://renergies99lead-mlflow.hf.space/"

mlflow.set_tracking_uri(os.environ["APP_URI"])
mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

#mlflow.sklearn.autolog()  # enables automatic logging for scikit-learn


# Adjacent string literals are concatenated as-is, so every fragment that should
# start on its own line needs an explicit trailing "\n".
run_description = (
    f"Target: {target}\n"
    f"Estimator: {pipeline.named_steps['estimator']}\n"
    "base run with solarposition, basic cleaning and outliers removal with iqr method\n"
    "feature engineering in columns 'humidity'"
)

with mlflow.start_run(experiment_id=experiment.experiment_id, description=run_description, run_name=run_name):
    # Fit the pipeline (preprocessing + model)
    pipeline.fit(x_train, y_train)

    # signature (disabled; the model is logged with an input example instead)
    #signature = infer_signature(x_test, pipeline.predict(x_test))

    # predictions on the hold-out set
    y_pred = pipeline.predict(x_test)

    # artifact for confidence interval: error statistics from the project helper,
    # written to the working directory and then uploaded to the run
    error = mf.error_stat(x_test, y_test, pipeline)
    error.to_json("error.json")
    mlflow.log_artifact("error.json")

    # artifacts for custom functions: the helper modules the pipeline relies on
    mlflow.log_artifact("Model_func.py")
    mlflow.log_artifact("func_feat_eng.py")

    # metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Adjusted R2 penalises R2 by the number of predictors.
    # NOTE(review): p counts the raw input columns of X, not the columns produced
    # by the preprocessor — confirm this is the intended predictor count.
    n = len(y_test)
    p = X.shape[1]
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

    # logging metrics
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)
    mlflow.log_metric("Adjusted_R2", adj_r2)

    # Log the full pipeline as a model; code_paths ships the helper modules with it
    mlflow.sklearn.log_model(pipeline,
                             input_example=input_example, name="model",
                             code_paths=["func_feat_eng.py", "Model_func.py"],
                             #signature=signature
                             )




🏃 View run super_popular_panda at: https://renergies99-mlflow.hf.space/#/experiments/5/runs/78dede7e218e404ea0d5c21124f5a103
🧪 View experiment at: https://renergies99-mlflow.hf.space/#/experiments/5
