# Mettre en place MLflow

In [1]:
# Install the following librairies (it is better to create a venv (or conda) virtual environment first and install these librairies in it)
!pip install mlflow
!pip install --upgrade jinja2
!pip install --upgrade Flask
!pip install setuptools

Collecting mlflow
  Using cached mlflow-2.16.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.16.0 (from mlflow)
  Using cached mlflow_skinny-2.16.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Using cached alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Using cached docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Using cached graphene-3.3-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting matplotlib<4 (from mlflow)
  Using cached matplotlib-3.9.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting pandas<3 (from mlflow)
  Using cached pandas-2.2.2-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting pyarrow<18,>=4.0.0 (from mlflow)
  Using cached pyarrow-17.0.0-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting scikit-learn<2 (from mlflow)
  Using cached scikit_learn-1.5.1-cp311-cp311-win_amd64.whl.metadata (12 kB)
Collecting databricks-sdk<1,>=0.

In [2]:

# Starts an MLflow tracking server locally, listening on 127.0.0.1:5000.
# NOTE(review): the command runs in the foreground, so this cell does not finish until
# it is interrupted (the captured output is `^C`). Presumably the server is meant to be
# launched from a separate terminal so it stays up for the cells below; confirm.
!mlflow server --host 127.0.0.1 --port 5000


^C


Initier une nouvelle Expérience.

Démarrer des Exécutions au sein d'une Expérience.

Documenter les paramètres, les métriques et les balises pour vos Exécutions.

Enregistrer des artefacts liés aux exécutions, tels que des modèles, des tableaux, des graphiques, et plus encore.


In [3]:
from mlflow import MlflowClient
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor


In [5]:
# Connect to the tracking server through the URI it was assigned at start-up.
# The resulting MlflowClient is the programmatic interface to that server.
client = MlflowClient(
    tracking_uri="http://127.0.0.1:5000",
)

Nous avons maintenant une interface client vers le serveur de suivi qui peut à la fois envoyer des données au serveur de suivi et en récupérer.



In [6]:
# Ask the tracking server for the experiments it knows about and show them.
all_experiments = client.search_experiments()
print(all_experiments)


[<Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1725646407256, experiment_id='0', last_update_time=1725646407256, lifecycle_stage='active', name='Default', tags={}>]


### Créer une expérience

In [7]:
# Description of the experiment; it is displayed in the MLflow UI.
experiment_description = (
    "Ceci est un projet de prédiction de défaut de paiement des clients. "
    "Cette expérience contient les modèles pour la prédiction de défaut de crédit."
)

# Searchable tags that describe the runs which will belong to this experiment.
# The special "mlflow.note.content" tag carries the description above.
experiment_tags = {
    "project_name": "credit-default-prediction",
    "business_unit": "risk-management",
    "team": "data-science",
    "project_quarter": "Q3-2023",
    "mlflow.note.content": experiment_description,
}

# Experiment names must be unique: create_experiment() raises when the name is already
# taken, which made this cell fail on every re-run. Look the experiment up first and
# only create it when it does not exist yet.
experiment_name = "Credit_Default_Models"
existing_experiment = client.get_experiment_by_name(experiment_name)

if existing_experiment is None:
    # create_experiment() returns the ID of the newly created experiment.
    credit_default_experiment = client.create_experiment(
        name=experiment_name, tags=experiment_tags
    )
else:
    # Reuse the existing experiment; keep the variable holding an experiment ID.
    credit_default_experiment = existing_experiment.experiment_id

In [10]:

# Use search_experiments() to look experiments up by their `project_name` tag.
credit_default_experiment = client.search_experiments(
    filter_string="tags.`project_name` = 'credit-default-prediction'"
)

# The search yields a list that is empty when no experiment carries the tag,
# so guard the lookup instead of letting `[0]` raise an IndexError.
if credit_default_experiment:
    print(vars(credit_default_experiment[0]))
else:
    print("No experiment found with project_name = 'credit-default-prediction'")



{'_experiment_id': '892581525100417762', '_name': 'Apple_Models', '_artifact_location': 'mlflow-artifacts:/892581525100417762', '_lifecycle_stage': 'active', '_tags': {'mlflow.note.content': 'This is the grocery forecasting project. This experiment contains the produce models for apples.', 'project_name': 'grocery-forecasting', 'project_quarter': 'Q3-2023', 'store_dept': 'produce', 'team': 'stores-ml'}, '_creation_time': 1725647809507, '_last_update_time': 1725647809507}


### Dataset

In [14]:
import pandas as pd

# Read the loan dataset from the CSV file located in the working directory.
data = pd.read_csv(
    "Loan_Data(1).csv",
)



In [15]:

# Preview the first five rows of the dataset.
data.head(5)

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


### Logging our first runs with MLflow

In [16]:
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [17]:
# Point the module-level MLflow API at the local tracking server for this session.
# This configures the tracking URI globally, without creating a separate client instance.
mlflow.set_tracking_uri(
    "http://127.0.0.1:5000"
)


In [14]:
# Set the active experiment and keep the Experiment metadata that is returned.
# The name matches the "Credit_Default_Models" experiment created earlier in this
# notebook, so runs are recorded under the experiment that carries the project
# description and tags (set_experiment would otherwise create a new, untagged one).
customer_default_experiment = mlflow.set_experiment("Credit_Default_Models")

# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.
run_name = "default_rf_test"

# Define the artifact path that the model will be saved to.
artifact_path = "rf_default_model"



### Modèle

In [21]:
import pandas as pd
import numpy as np
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score



# Split the data into features (X) and target (y).
# "customer_id" is an identifier and "default" is the target, so neither is a feature.
# "Unnamed: 0" is the row index that was written into the CSV (it shows up as the first
# column of data.head()); it carries no signal and must not be fed to the model either.
non_feature_columns = ["customer_id", "default"]
if "Unnamed: 0" in data.columns:
    non_feature_columns.append("Unnamed: 0")

X = data.drop(columns=non_feature_columns)
y = data["default"]

# Hold out 20% of the rows as a validation set (fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyper-parameters of the RandomForestClassifier.
# "class_weight" lives in this dict (as in the logistic-regression cell) so that it is
# recorded by mlflow.log_params() together with every other setting of the model.
params = {
    "n_estimators": 100,
    "max_depth": 6,
    "min_samples_split": 10,
    "min_samples_leaf": 4,
    "bootstrap": True,
    "oob_score": False,
    "random_state": 888,
    "class_weight": "balanced",
}

# Build the classifier and fit it on the training data.
rf = RandomForestClassifier(**params)
rf.fit(X_train, y_train)

# Predict on the validation set.
y_pred = rf.predict(X_val)

# Compute the classification metrics.
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)

# Gather the metrics in one collection so they can be logged in a single call.
metrics = {"accuracy": accuracy, "f1_score": f1, "precision": precision, "recall": recall}

# Open the MLflow run context.
with mlflow.start_run(run_name="random_forest_default_prediction") as run:
    # Log the parameters used to train the model.
    mlflow.log_params(params)

    # Log the computed performance metrics.
    mlflow.log_metrics(metrics)

    # Log the fitted model for later use. The input example only needs a few
    # representative rows (it is stored with the model and used to infer the
    # signature), so a small sample is logged instead of the whole validation set.
    mlflow.sklearn.log_model(
        sk_model=rf, input_example=X_val.head(5), artifact_path="random_forest_model"
    )


2024/09/07 09:08:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run random_forest_default_prediction at: http://127.0.0.1:5000/#/experiments/0/runs/bb79a13ebccd4ee8a36b45ae9326efdf.
2024/09/07 09:08:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0.


In [20]:
# Score the model on the data it was trained on.
y_train_pred = rf.predict(X_train)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)

# Show training accuracy next to the validation accuracy computed earlier;
# a large gap between the two would point to overfitting.
print(f"Train Accuracy: {train_accuracy}")
print(f"Validation Accuracy: {accuracy}")


Train Accuracy: 0.995875
Validation Accuracy: 0.9915


In [25]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import mlflow
from sklearn.linear_model import LogisticRegression  # Importer LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report  # Importer classification_report

# Set the tracking URI.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Create or select the experiment dedicated to logistic regression.
mlflow.set_experiment("LogisticRegression_Models")

# Model parameters.
# Every name in this cell is prefixed with `lr_` so that the random-forest results
# (`params`, `metrics`, `accuracy`, `y_pred`, ...) produced by the earlier cell are
# not silently overwritten when this cell runs.
lr_params = {"C": 1.0, "solver": 'liblinear', "random_state": 42, "class_weight": 'balanced'}

# Start a run and train the model with the parameters defined above.
with mlflow.start_run(run_name="logistic_regression_run_1"):
    # Train the logistic-regression model.
    lr_model = LogisticRegression(**lr_params)
    lr_model.fit(X_train, y_train)

    # Predict on the validation set.
    lr_y_pred = lr_model.predict(X_val)

    # Compute the classification metrics and gather them for logging.
    lr_metrics = {
        "accuracy": accuracy_score(y_val, lr_y_pred),
        "f1_score": f1_score(y_val, lr_y_pred),
        "precision": precision_score(y_val, lr_y_pred),
        "recall": recall_score(y_val, lr_y_pred),
    }

    # Log the parameters and the metrics to MLflow.
    mlflow.log_params(lr_params)
    mlflow.log_metrics(lr_metrics)

    # Log the trained model as an artifact. A few representative rows are enough for
    # the input example; logging the whole validation set only bloats the artifact.
    mlflow.sklearn.log_model(
        sk_model=lr_model, input_example=X_val.head(5), artifact_path="logistic_regression_model"
    )

    # Show the parameters and the classification report.
    print(f"Run completed with parameters: {lr_params}")
    print(classification_report(y_val, lr_y_pred))

2024/09/07 09:20:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run logistic_regression_run_1 at: http://127.0.0.1:5000/#/experiments/286315274138648484/runs/4e7c2243f07a456cb25a630e9a05033e.
2024/09/07 09:20:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/286315274138648484.


Run completed with parameters: {'C': 1.0, 'solver': 'liblinear', 'random_state': 42, 'class_weight': 'balanced'}
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1652
           1       0.91      0.98      0.94       348

    accuracy                           0.98      2000
   macro avg       0.95      0.98      0.96      2000
weighted avg       0.98      0.98      0.98      2000



In [26]:
import pickle

# Persist the fitted random forest to a local pickle file in the working directory.
with open("random_forest_model.pkl", mode="wb") as model_file:
    pickle.dump(rf, model_file)
