Chaque modèle aura une expérience composée d'aumoins deux (2) runs

In [2]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from typing import Dict, Any, List, Optional

from sklearn.model_selection import train_test_split
#from preprocess import load_and_preprocess

# Métriques & courbes
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, log_loss,
    confusion_matrix, ConfusionMatrixDisplay,
    RocCurveDisplay)

# import des modèles
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:

# Définir où MLflow stockera les logs et runs
#mlflow.set_tracking_uri("file:///E:/Formation_Data_Analystic/projet_MLOps_GAI/Projet_MLOps/mlflow/mlruns")

In [None]:
#mlflow server --host 127.0.0.1 --port 8080

In [11]:
from mlflow import MlflowClient
client = MlflowClient(tracking_uri="http://127.0.0.1:8080")

In [13]:
all_experiments = client.search_experiments()

print(all_experiments)

MlflowException: API request to http://127.0.0.1:8080/api/2.0/mlflow/experiments/search failed with exception HTTPConnectionPool(host='127.0.0.1', port=8080): Max retries exceeded with url: /api/2.0/mlflow/experiments/search (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000212D529CE90>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))

In [14]:
# Provide an Experiment description that will appear in the UI
experiment_description = (
    "The projet is about a default credit prediction "
    "Each experiment tests differents models to find the best."
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "credit_default",
    "team": "alex-jiwon-patricia-wai",
    "project_quarter": "sda-2024",
    "mlflow.note.content": experiment_description,
}

# # Create the Experiment, providing a unique name
prediction_credit_default_experiments = client.create_experiment(
    name="credit_default_Models", tags=experiment_tags
)


MlflowException: API request to http://127.0.0.1:8080/api/2.0/mlflow/experiments/create failed with exception HTTPConnectionPool(host='127.0.0.1', port=8080): Max retries exceeded with url: /api/2.0/mlflow/experiments/create (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000212D5DBBBD0>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))

In [None]:
# Use search_experiments() to search on the project_name tag key

credit_defaut_experiments = client.search_experiments(
    filter_string="tags.`project_name` = 'credit_default_Models'"
)

print(vars(prediction_credit_default_experiments[0]))

In [9]:
def train_and_log(model_name: str, params: dict, data_path: str):
    """Entraîne un modèle donné et logge les résultats dans MLflow."""

    data_path = "E:\Formation_Data_Analystic\projet_MLOps_GAI\Projet_MLOps\datasetfinal.csv"
    target_col = "default"
    
    # Charger et préparer les données
    df = pd.read_csv(data_path)
    X = df.drop(target_col, axis=1)
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Sélection du modèle
    if model_name == "random_forest":
        model = RandomForestClassifier(**params, random_state=42)
    elif model_name == "decision_tree":
        model = DecisionTreeClassifier(**params, random_state=42)
    elif model_name == "regression_logistique":
        model = LogisticRegression(**params, random_state=42)
    else:
        raise ValueError("Modèle inconnu : random_forest ou regression_logistque ou decision_tree ")

    # Démarrer un run MLflow
    with mlflow.start_run(run_name=f"{model_name}_run"):
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)
        f1_weight = f1_score(y_test, preds, average="weighted") 

        # Log des paramètres et métriques
        for k, v in params.items():
            mlflow.log_param(k, v)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_score", f1_weight)


        # Enregistrer le modèle
        mlflow.sklearn.log_model(model, "model")

        print(f"{model_name} ({params}) → accuracy = {acc:.4f} -> f1_score = {f1_weight:.4f}")

### Métrique de classification

In [None]:
# def metrics_classification(y_true, y_pred, y_prob=None) -> Dict[str, float]:
#     """
#     Calcule un set de métriques utiles pour comparer les modèles :
#       - accuracy
#       - f1_weighted (gère le déséquilibre entre classes)
#       - log_loss (si proba disponible)
#       - roc_auc_ovr (AUC binaire ou multi-classes en one-vs-rest si proba dispo)
#     """
#     m = {
#         "accuracy": float(accuracy_score(y_true, y_pred)),
#         "f1_weighted": float(f1_score(y_true, y_pred, average="weighted"))
#     }
#     # log_loss & AUC demandent des probabilités (ou un score de décision)
#     if y_prob is not None:
#         # log_loss (plus petit = meilleur)
#         try:
#             m["log_loss"] = float(log_loss(y_true, y_prob, labels=np.unique(y_true)))
#         except Exception:
#             pass

#         # ROC AUC : binaire -> AUC standard ; multi-classes -> AUC OvR
#         try:
#             if hasattr(y_prob, "shape") and len(y_prob.shape) == 2 and y_prob.shape[1] > 2:
#                 # multi-classes
#                 m["roc_auc_ovr"] = float(roc_auc_score(y_true, y_prob, multi_class="ovr"))
#             else:
#                 # binaire : prendre la probabilité de la classe positive (colonne 1)
#                 pos = y_prob if y_prob.ndim == 1 else y_prob[:, 1]
#                 m["roc_auc_ovr"] = float(roc_auc_score(y_true, pos))
#         except Exception:
#             pass
#     return m

In [10]:

#Lancer les Expériences pour chaque modèle
if __name__ == "__main__":

    data_path = "E:\Formation_Data_Analystic\projet_MLOps_GAI\Projet_MLOps\datasetfinal.csv"

    # ==================== Expérience 1 : Random Forest ====================
    mlflow.set_experiment("credit_random_forest_exp")

    # Run 1
    train_and_log("random_forest", {"n_estimators": 100, "max_depth": 5}, data_path)

    # Run 2
    train_and_log("random_forest", {"n_estimators": 200, "max_depth": 8}, data_path)

    # ==================== Expérience 2 : Default credit with Decision Tree ====================
    mlflow.set_experiment("credit_decision_tree_exp")

    # Run 1
    train_and_log("decision_tree", {"max_depth": 5, "criterion": "gini"}, data_path)

    # Run 2
    train_and_log("decision_tree", {"max_depth": 10, "criterion": "entropy"}, data_path)

        # ==================== Expérience 3 : Default credit with Regression Logistique ====================
    mlflow.set_experiment("credit_logreg_exp")

    # Run 1
    train_and_log("regression_logistique", {"C": 1.0, "solver": "lbfgs"}, data_path)

    # Run 2
    train_and_log("regression_logistique", {"C": 0.5, "solver": "saga"}, data_path)




random_forest ({'n_estimators': 100, 'max_depth': 5}) → accuracy = 0.9900 -> f1_score = 0.9900




random_forest ({'n_estimators': 200, 'max_depth': 8}) → accuracy = 0.9940 -> f1_score = 0.9940




decision_tree ({'max_depth': 5, 'criterion': 'gini'}) → accuracy = 0.9900 -> f1_score = 0.9900




decision_tree ({'max_depth': 10, 'criterion': 'entropy'}) → accuracy = 0.9935 -> f1_score = 0.9935


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


regression_logistique ({'C': 1.0, 'solver': 'lbfgs'}) → accuracy = 0.9860 -> f1_score = 0.9859




regression_logistique ({'C': 0.5, 'solver': 'saga'}) → accuracy = 0.8260 -> f1_score = 0.7473


### Sauvegarde du Meilleur Modèle

In [None]:
#    # Télécharger le modèle depuis MLflow
#     model_src = os.path.join("..", "mlruns", exp.experiment_id, best_run_id, "artifacts", "model")
#     model_dest = os.path.join(ARTIFACTS_DIR, f"{experiment_name}_best_model")
#     if os.path.exists(model_dest):
#         shutil.rmtree(model_dest)
#     shutil.copytree(model_src, model_dest)
#     print(f"✅ Modèle copié dans : {model_dest}")   # Télécharger le modèle depuis MLflow
#     model_src = os.path.join("..", "mlruns", exp.experiment_id, best_run_id, "artifacts", "model")
#     model_dest = os.path.join(ARTIFACTS_DIR, f"{experiment_name}_best_model")
#     if os.path.exists(model_dest):
#         shutil.rmtree(model_dest)
#     shutil.copytree(model_src, model_dest)
#     print(f"✅ Modèle copié dans : {model_dest}")