In [139]:
import requests
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import xgboost as xgb
import joblib 
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [140]:
mlflow.set_tracking_uri("http://localhost:5000")  # URL du serveur MLflow


In [141]:
# API endpoint URLs
BASE_URL = "http://127.0.0.1:8081"  # Adjust the host and port if needed
TOKEN_URL = f"{BASE_URL}/token"
ALL_FILMS_URL = f"{BASE_URL}/all_films"


In [142]:
# Step 1: Authenticate and get the token
def get_token(username: str, password: str):
    response = requests.post(TOKEN_URL, data={"username": username, "password": password})
    if response.status_code == 200:
        return response.json()["access_token"]
    else:
        raise Exception("Authentication failed, check your credentials")


In [143]:
# Step 2: Fetch all films data using the token
def fetch_all_films(token: str):
    headers = {
        "Authorization": f"Bearer {token}"
    }
    response = requests.get(ALL_FILMS_URL, headers=headers)
    
    if response.status_code == 200:
        films_data = response.json()["films"]
        return films_data
    else:
        raise Exception("Failed to fetch films data")


In [144]:
# Step 3: Convert the films data into a Pandas DataFrame
def films_to_dataframe(films_data):
    return pd.DataFrame(films_data)


In [145]:
def train_model(df):
    # √âtape 1: Pr√©paration des donn√©es
    X = df.drop(columns=["evaluation"])  # Features except for the target
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df["evaluation"])
    
    # √âtape 2: S√©paration des donn√©es en train et test avec stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # V√©rification de la r√©partition des classes
    print("R√©partition des classes dans l'ensemble d'entra√Ænement :", 
          dict(zip(*np.unique(y_train, return_counts=True))))
    print("R√©partition des classes dans l'ensemble de test :", 
          dict(zip(*np.unique(y_test, return_counts=True))))
    
    # √âtape 3: Entra√Ænement des mod√®les de classification
    
    # Mod√®le RandomForestClassifier
    with mlflow.start_run(run_name="RandomForestClassifier"):
        rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
        rf_model.fit(X_train, y_train)
        y_pred_rf = rf_model.predict(X_test)
        accuracy_rf = accuracy_score(y_test, y_pred_rf)
        precision_rf = precision_score(y_test, y_pred_rf, average='weighted')
        recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
        f1_rf = f1_score(y_test, y_pred_rf, average='weighted')
        
        mlflow.log_metric("accuracy_rf", accuracy_rf)
        mlflow.log_metric("precision_rf", precision_rf)
        mlflow.log_metric("recall_rf", recall_rf)
        mlflow.log_metric("f1_rf", f1_rf)
        mlflow.sklearn.log_model(rf_model, "random_forest_model", registered_model_name="random_forest_model")
    
    # Mod√®le XGBoostClassifier
    with mlflow.start_run(run_name="XGBoostClassifier"):
        xgb_model = xgb.XGBClassifier(objective="multi:softprob", n_estimators=100, random_state=42)
        xgb_model.fit(X_train, y_train)
        y_pred_xgb = xgb_model.predict(X_test)
        accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
        precision_xgb = precision_score(y_test, y_pred_xgb, average='weighted')
        recall_xgb = recall_score(y_test, y_pred_xgb, average='weighted')
        f1_xgb = f1_score(y_test, y_pred_xgb, average='weighted')
        
        mlflow.log_metric("accuracy_xgb", accuracy_xgb)
        mlflow.log_metric("precision_xgb", precision_xgb)
        mlflow.log_metric("recall_xgb", recall_xgb)
        mlflow.log_metric("f1_xgb", f1_xgb)
        mlflow.sklearn.log_model(xgb_model, "xgboost_model", registered_model_name="xgboost_model")
    
    # Mod√®le LogisticRegression
    with mlflow.start_run(run_name="LogisticRegression"):
        lr_model = LogisticRegression(max_iter=1000, random_state=42)
        lr_model.fit(X_train, y_train)
        y_pred_lr = lr_model.predict(X_test)
        accuracy_lr = accuracy_score(y_test, y_pred_lr)
        precision_lr = precision_score(y_test, y_pred_lr, average='weighted')
        recall_lr = recall_score(y_test, y_pred_lr, average='weighted')
        f1_lr = f1_score(y_test, y_pred_lr, average='weighted')
        joblib.dump(label_encoder, "label_encoder_films.joblib")
        mlflow.log_artifact("label_encoder_films.joblib")
        
        mlflow.log_metric("accuracy_lr", accuracy_lr)
        mlflow.log_metric("precision_lr", precision_lr)
        mlflow.log_metric("recall_lr", recall_lr)
        mlflow.log_metric("f1_lr", f1_lr)
        mlflow.sklearn.log_model(lr_model, "logistic_regression_model", registered_model_name="logistic_regression_model")
    
    # Affichage des r√©sultats
    print(f"RandomForestClassifier - Accuracy: {accuracy_rf}, Precision: {precision_rf}, Recall: {recall_rf}, F1-score: {f1_rf}")
    print(f"XGBoostClassifier - Accuracy: {accuracy_xgb}, Precision: {precision_xgb}, Recall: {recall_xgb}, F1-score: {f1_xgb}")
    print(f"LogisticRegression - Accuracy: {accuracy_lr}, Precision: {precision_lr}, Recall: {recall_lr}, F1-score: {f1_lr}")
    
    return {
        "RandomForestClassifier": {"accuracy": accuracy_rf, "precision": precision_rf, "recall": recall_rf, "f1": f1_rf},
        "XGBoostClassifier": {"accuracy": accuracy_xgb, "precision": precision_xgb, "recall": recall_xgb, "f1": f1_xgb},
        "LogisticRegression": {"accuracy": accuracy_lr, "precision": precision_lr, "recall": recall_lr, "f1": f1_lr}
    }

In [146]:
# Main function to orchestrate the workflow
# Replace with actual credentials
username = "shuren"
password = "test"

# Step 1: Get the token
token = get_token(username, password)

# Step 2: Fetch films data
films_data = fetch_all_films(token)

# Step 3: Convert the data to a Pandas DataFrame
df = films_to_dataframe(films_data)
print(df.head())  # Preview the data to ensure it's loaded correctly

# Step 4: Train models on the data and log the results
results = train_model(df)
print("Model training complete. Results:")
print(results)


     budget    revenue  runtime  vote_count evaluation
0  25000000  124272124      146        1910       bien
1  45000000  788241776       89        5376       bien
2  64000000   97571250       92         756       bien
3  35000000  126216940      138        1310       bien
4  15000000   56255142      148        3045       bien
R√©partition des classes dans l'ensemble d'entra√Ænement : {np.int64(0): np.int64(151), np.int64(1): np.int64(152), np.int64(2): np.int64(152)}
R√©partition des classes dans l'ensemble de test : {np.int64(0): np.int64(38), np.int64(1): np.int64(38), np.int64(2): np.int64(38)}


Registered model 'random_forest_model' already exists. Creating a new version of this model...
2024/10/03 14:50:45 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random_forest_model, version 5
Created version '5' of model 'random_forest_model'.
2024/10/03 14:50:45 INFO mlflow.tracking._tracking_service.client: üèÉ View run RandomForestClassifier at: http://localhost:5000/#/experiments/0/runs/8d4296c5d6554d11b2b7b2d14173d848.
2024/10/03 14:50:45 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/0.
Registered model 'xgboost_model' already exists. Creating a new version of this model...
2024/10/03 14:50:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost_model, version 5
Created version '5' of model 'xgboost_model'.
2024/10/03 14:50:52 INFO mlflow.tracking._tracking_servi

RandomForestClassifier - Accuracy: 0.7456140350877193, Precision: 0.7447368421052633, Recall: 0.7456140350877193, F1-score: 0.7448176921861133
XGBoostClassifier - Accuracy: 0.7105263157894737, Precision: 0.715013794282087, Recall: 0.7105263157894737, F1-score: 0.7122317253962823
LogisticRegression - Accuracy: 0.543859649122807, Precision: 0.5190058479532164, Recall: 0.543859649122807, F1-score: 0.46940077466393254
Model training complete. Results:
{'RandomForestClassifier': {'accuracy': 0.7456140350877193, 'precision': np.float64(0.7447368421052633), 'recall': np.float64(0.7456140350877193), 'f1': np.float64(0.7448176921861133)}, 'XGBoostClassifier': {'accuracy': 0.7105263157894737, 'precision': np.float64(0.715013794282087), 'recall': np.float64(0.7105263157894737), 'f1': np.float64(0.7122317253962823)}, 'LogisticRegression': {'accuracy': 0.543859649122807, 'precision': np.float64(0.5190058479532164), 'recall': np.float64(0.543859649122807), 'f1': np.float64(0.46940077466393254)}}


In [147]:
import os
import sys
from mlflow.tracking import MlflowClient


MLFLOW_URL = "http://localhost:5000"
PROJECT_MODEL_NAME = "logistic_regression_model"
MONITORER_DIR = "mlflow"
experiment_name = "logistic_regression_model"

def recuperer_nom_modeles(MLFLOW_URL, PROJECT_MODEL_NAME):
    # R√©cup√©ration des noms des mod√®les sur MLFLow
    all_model_names=[]
    model_registry_client = mlflow.tracking.MlflowClient(MLFLOW_URL)
    model_versions = model_registry_client.search_model_versions("")
    model_names = set([mv.name for mv in model_versions])
    for name in model_names:
        if PROJECT_MODEL_NAME in name:
            all_model_names.append(name)
    return all_model_names[0], model_registry_client

def charger_modele_et_artefacts(model_name, model_registry_client, MONITORER_DIR):
    """
    Charge un mod√®le MLflow, ainsi que ses artefacts associ√©s, et importe les modules Python associ√©s.

    Cette fonction r√©cup√®re la version en production d'un mod√®le √† partir du registre des mod√®les MLflow,
    t√©l√©charge les artefacts associ√©s, et importe dynamiquement tous les modules Python contenus dans les artefacts.
    Les donn√©es de test et les pr√©dictions associ√©es au mod√®le sont √©galement extraites.

    Args:
        model_name (str): Le nom du mod√®le √† charger depuis le registre de mod√®les MLflow.
        model_registry_client (mlflow.tracking.MlflowClient): Le client MLflow pour interagir avec le registre des mod√®les.
        WORK_DIR (str): Le r√©pertoire de travail o√π les artefacts du mod√®le seront t√©l√©charg√©s.

    Returns:
        tuple: Un tuple contenant deux √©l√©ments :
            
    dict: Un dictionnaire avec les √©l√©ments suivants :
    'model' (sklearn model): Le mod√®le MLflow charg√©.
    'run_id' (str): L'identifiant de la run associ√©e au mod√®le.
    'pickle' (str): Le chemin du fichier pickle des variables sauvegard√©es.
    'X_test' (str): Les donn√©es de test utilis√©es lors de l'entra√Ænement du mod√®le.
    'y_test' (str): Les labels de test utilis√©s lors de l'entra√Ænement du mod√®le.
    'y_pred' (str): Les pr√©dictions du mod√®le sur les donn√©es de test.
    'rmse' (str or float): La valeur RMSE (Root Mean Square Error) du mod√®le, ou un message indiquant
                            qu'aucune RMSE n'a √©t√© trouv√©e.
    dict: Un dictionnaire des modules Python import√©s dynamiquement, avec les noms des modules comme cl√©s.

    Raises:
        FileNotFoundError: Si le r√©pertoire sp√©cifi√© pour les artefacts n'existe pas ou n'est pas un r√©pertoire.
        ImportError: Si l'importation des modules Python √©choue.

    """
    # R√©cup√©rer la version en production du mod√®le d√©sign√©
    prod_model_version = model_registry_client.get_model_version_by_alias(model_name, "production")

    # Charger le mod√®le MLflow et l'enregistrer dans un dictionnaire
    model={}
    model["model"] = mlflow.sklearn.load_model(prod_model_version.source)
    model["run_id"] = prod_model_version.run_id

    # R√©cup√©rer les artefacts du mod√®le
    run = mlflow.get_run(model["run_id"])
    artifact_uri = run.info.artifact_uri

    # # Supprimer le dossier des artefacts s'il existe d√©j√†
    # if os.path.exists(MONITORER_DIR+"/model_artefacts/") and os.path.isdir(MONITORER_DIR+"/model_artefacts/"):
    #     shutil.rmtree(MONITORER_DIR+"/model_artefacts/")

    # T√©l√©charger les artefacts du mod√®le
    artifact_folder = mlflow.artifacts.download_artifacts(artifact_uri, dst_path=MONITORER_DIR+"/model_artefacts/")
    sys.path.append(artifact_folder)

    # T√©l√©charger les artefacts du mod√®le
    artifact_folder = mlflow.artifacts.download_artifacts(artifact_uri, dst_path=os.path.join(MONITORER_DIR, "model_artefacts"))
    sys.path.append(artifact_folder)

    # Charger le LabelEncoder
    label_encoder_path = os.path.join(artifact_folder, "label_encoder_films.joblib")
    if os.path.exists(label_encoder_path):
        model["label_encoder"] = joblib.load(label_encoder_path)
    else:
        raise FileNotFoundError(f"Le fichier {label_encoder_path} n'a pas √©t√© trouv√©.")


    return model

def tag_latest_model_as_production(MLFLOW_URL, experiment_name):
    '''
    Marque le mod√®le le plus r√©cent comme √©tant le mod√®le en production.

    Args:
        mlflowURI (str): Adresse qui pointe vers MLFlow.
        experiment_name (str): Nom du mod√®le dans le registre MLFlow.
    '''
    client = mlflow.tracking.MlflowClient(MLFLOW_URL)
    registered_model = client.get_registered_model(experiment_name)

    model_versions = client.search_model_versions(f"name='{experiment_name}'")
    model_versions.sort(key=lambda x: x.last_updated_timestamp, reverse=True)

    latest_model_version = model_versions[0] if model_versions else None
    if latest_model_version:
        client.set_registered_model_alias(registered_model.name, 'production', latest_model_version.version)

In [149]:
tag_latest_model_as_production(MLFLOW_URL, "logistic_regression_model")

In [150]:
# Appeler la fonction pour r√©cup√©rer le nom du mod√®le et le client MLflow
try:
    model_name, client = recuperer_nom_modeles(MLFLOW_URL, PROJECT_MODEL_NAME)
    print(f"Nom du mod√®le r√©cup√©r√© : {model_name}")
except Exception as e:
    print(f"Erreur lors de la r√©cup√©ration du nom du mod√®le : {e}")


Nom du mod√®le r√©cup√©r√© : logistic_regression_model


In [151]:
# D√©finir le r√©pertoire de surveillance (assurez-vous que le chemin existe)
if not os.path.exists(MONITORER_DIR):
    os.makedirs(MONITORER_DIR)

# Appeler la fonction pour charger le mod√®le et ses artefacts
try:
    model_data = charger_modele_et_artefacts(model_name, client, MONITORER_DIR)
    label_encoder = model_data["label_encoder"]
    loaded_model = model_data["model"]
    run_id = model_data["run_id"]
    print(f"Mod√®le charg√© avec succ√®s. Run ID : {run_id}")
except Exception as e:
    print(f"Erreur lors du chargement du mod√®le et des artefacts : {e}")


Erreur lors du chargement du mod√®le et des artefacts : Le fichier c:\Users\Utilisateur\Documents\projet_ia\bdd_films\mlflow\model_artefacts\artifacts\label_encoder.joblib n'a pas √©t√© trouv√©.


In [153]:
encoder = joblib.load("label_encoder_films.joblib")
model = mlflow.sklearn.load_model("mlflow/model_artefacts/artifacts/logistic_regression_model") 
X_new = pd.DataFrame([[10550000, 15500000, 145, 75]], columns=["budget", "revenue", "runtime", "vote_count"])
try:
    prediction_numeric = loaded_model.predict(X_new)
    prediction_label = label_encoder.inverse_transform(prediction_numeric.astype(int))
    print(f"Pr√©diction pour les donn√©es {X_new.values} : {prediction_label[0]}")
except Exception as e:
    print(f"Erreur lors de la pr√©diction : {e}")


Pr√©diction pour les donn√©es [[10550000 15500000      145       75]] : moyen
