## 1. IMPORTS  

In [None]:
# --- STANDARD
import numpy as np
import pandas as pd

# --- VISUALISATION
import matplotlib.pyplot as plt
import seaborn as sns

# --- MACHINE LEARNING
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, hamming_loss, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# --- TRAITEMENT SPECIFIQUE
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer

# --- PERSISTENCE / OUTILS EXTERNES
import joblib
import pickle
import os

# --- UTILITAIRES
from scipy.sparse import load_npz
from tqdm.notebook import tqdm  # ou simplement tqdm si console

# --- TRACKING MLFLOW
import mlflow
import mlflow.sklearn  # Pour les modèles scikit-learn
import sys
import os

# --- ADD THE NOTEBOOK'S PARENT DIRECTORY TO THE PYTHON PATH
# The parent of the current working directory is put on sys.path so that the
# `src` package can be imported below. Use "../.." instead of ".." if the
# notebook lives one level deeper.
project_root = os.path.abspath("..")
# Guarded append: re-running this cell must not keep stacking duplicate
# entries onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)
# --- PROJECT MODULES
import importlib
import src.tags_suggester.modeling.modeling as mdl
# Reload so that edits made to the module since the first import are picked up
# without restarting the kernel.
importlib.reload(mdl)


## 2. MODELISATION : LOGISTIC REGRESSION

### 2.1. CHARGEMENT DES FEATURES

In [None]:
# --- Refresh the project modelling helpers so recent edits are visible
import importlib
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
import src.tags_suggester.modeling.modeling as mdl

mdl = importlib.reload(mdl)

# --- 0. MODEL UNDER TEST
# Change the three names below to benchmark another estimator:
#   model_type    -> short label used in MLflow run names
#   model_class   -> estimator class handed to the training helper
#   model_wrapper -> multilabel strategy (alternatives mentioned by the
#                    author: ClassifierChain, or None)
model_type, model_class = "logreg", LogisticRegression
model_wrapper = OneVsRestClassifier

# ------------------------------
# --- 1. SELECTION DES FEATURES
# ------------------------------
# --- COMMENTE CAR CAS 50000 QUESTIONS REND LECTURE IMPOSSIBLE
# full_df = pd.read_parquet("data/processed/full_explo_wo.parquet")
# --- DEBUT SOLUCE ADAPTEE 50 000 QUESTIONS
# import pandas as pd
# chunks = []
# chunk_size = 5000  # ou 10000 selon ta RAM

# for chunk in pd.read_csv("data/processed/full_explo_wo.csv.gz", chunksize=chunk_size):
#     chunks.append(chunk)

# full_df = pd.concat(chunks, ignore_index=True)
# --- FIN SOLUCE ADAPTEE 50 000 QUESTIONS

# print(f"Dimensions du dataframe full_df : {full_df.shape}")
# print(f" Colonnes du dataframe full_df : {full_df.columns.tolist()}")
# print(full_df[["clean_title_body"]].head(3))

# --- A. LOAD THE PRECOMPUTED FEATURE MATRICES
# BoW and TF-IDF are stored in scipy's .npz format; the other four
# representations are plain NumPy .npy files.
X_bow, X_tfidf = (
    load_npz(npz_file)
    for npz_file in ("models/bow/X_bow_full.npz", "models/tfidf/X_tfidf_full.npz")
)
X_svd, X_w2v, X_use, X_sbert = (
    np.load(npy_file)
    for npy_file in (
        "models/svd/X_titlebody_svd10k.npy",
        "models/w2v/X_w2v_full.npy",
        "models/use/embeddings_use_full.npy",
        "models/sbert/embeddings_sbert_full.npy",
    )
)

# Quick sanity report: one line per representation with its shape.
print("# --- DIMENSIONS DES VECTEURS :")
for name, mat in zip(
    ("BoW", "TF-IDF", "SVD", "Word2Vec", "USE", "SBERT"),
    (X_bow, X_tfidf, X_svd, X_w2v, X_use, X_sbert),
):
    print(f"# --- {name:<10}: {mat.shape}")


# --- B. LOAD THE MULTILABEL TARGETS AND FILTER RARE TAGS
import copy
from collections import Counter
import numpy as np

mlb = joblib.load("models/tags/multilabel_binarizer_full.pkl")
Y_full = np.load("models/tags/y_tags_full.npy")

print(f"# --- Labels multilabel chargés : {Y_full.shape}")
print(f"# --- Nombre de tags avant filtrage : {len(mlb.classes_)}")

# Per-tag totals (column sums). Assumes Y_full is a (samples x tags) indicator
# matrix whose columns follow mlb.classes_ -- TODO confirm.
tag_counts = Y_full.sum(axis=0)
tag_mask = tag_counts >= 1  # rarity threshold: raise it to drop infrequent tags
Y_full_filtered = Y_full[:, tag_mask]

# Filter a *copy* of the binarizer. The previous `mlb_filtered = mlb` was only
# an alias, so the loaded binarizer was silently overwritten as well.
mlb_filtered = copy.deepcopy(mlb)
mlb_filtered.classes_ = np.array(mlb.classes_)[tag_mask]
print(f"# --- TAGS conservés après filtrage : {len(mlb_filtered.classes_)}")
print(mlb_filtered.classes_.tolist())
print("java" in mlb_filtered.classes_)    # True or False
print("python" in mlb_filtered.classes_)  # True or False

# Names and counts must come from the same (filtered) column set; pairing the
# filtered names with the unfiltered counts reports wrong numbers as soon as
# the mask removes at least one tag.
tags_name = mlb_filtered.classes_
tag_counts_filtered = tag_counts[tag_mask]
tag_freq = Counter(dict(zip(tags_name, tag_counts_filtered)))

# A Counter yields 0 for an unknown tag, so probing a tag that was filtered
# out no longer raises ValueError.
for tag in ["java", "python"]:
    print(f"{tag} count: {tag_freq[tag]}")

print(tag_freq["asp.net-core-mvc"])  # number of occurrences of this tag

# Ten most frequent tags
top_tags = tag_freq.most_common(10)
print("🔝 Tags dominants :", top_tags)

### 2.2. DEFINITION DES VARIABLES VECTEURS A ENTRAINER

In [None]:
# -------------------------------------------
# --- RELOAD THE MODELLING MODULE
# -------------------------------------------
import importlib
import src.tags_suggester.modeling.modeling as mdl
importlib.reload(mdl)
# ----------------------------------------------
# --- 2. SEPARATE THE FEATURES FROM THE TARGET
# ----------------------------------------------
Y = Y_full_filtered

# --- Each family of vectorisers was fed a differently pre-processed text, so
# --- the matching text column is loaded for each of them.
import pandas as pd

# The three columns live in the same compressed CSV: read it once instead of
# decompressing and parsing the whole file three times.
text_columns = ["clean_title_body", "clean_title_body_embed", "clean_title_body_w2v"]
_text_df = pd.read_csv(
    "data/processed/full_explo_wo.csv.gz",
    usecols=text_columns,
    dtype={column: "str" for column in text_columns},
)
X_text = _text_df["clean_title_body"]              # for TF-IDF / SVD / BoW
X_text_embed = _text_df["clean_title_body_embed"]  # for SBERT / USE
X_text_w2v = _text_df["clean_title_body_w2v"]      # for Word2Vec

# Text series to use for each vector type
X_text_dict = {
    "bow": X_text,
    "tfidf": X_text,
    "svd": X_text,
    "w2v": X_text_w2v,
    "use": X_text_embed,
    "sbert": X_text_embed
}

print(f"# --- Matrice multilabel (Y) : {Y.shape}")
print(f"# --- Colonne textuelle (X) : {X_text.shape}")

# ---------------------------------------------------------
# --- 3. VECTOR MATRICES + SUPPORT DICTIONARIES
# ---------------------------------------------------------
# Precomputed feature matrix for each vector type
X_dict = {
    "bow": X_bow,
    "tfidf": X_tfidf,
    "svd": X_svd,
    "w2v": X_w2v,
    "use": X_use,
    "sbert": X_sbert
}

# Preprocessing flag forwarded to the training helper for each vector type.
# NOTE(review): "scale" presumably requests feature scaling of the dense
# embeddings -- confirm against the helper in the modeling module.
preproc_dict = {
    "bow": None,
    "tfidf": None,
    "svd": None,
    "w2v": "scale",
    "use": "scale",
    "sbert": "scale"
}

### 2.3. DIVISION TRAIN/ TEST POUR CHAQUE VECTEUR

In [None]:
# --- OBSOLETE: tuple-based split, kept for reference only.
# --- The next cell rebuilds splits_dict with dict values.
import importlib
import src.tags_suggester.modeling.modeling as mdl

mdl = importlib.reload(mdl)

# --- 4. TRAIN / TEST SPLIT FOR EVERY VECTOR TYPE
# One shared 80/20 split of the row indices so that every representation is
# cut on exactly the same samples.
indices = np.arange(Y.shape[0])
train_idx, test_idx = train_test_split(indices, random_state=42, test_size=0.2)

splits_dict = {}
for name in X_dict:
    X = X_dict[name]
    X_train, X_test, y_train, y_test = mdl.split_on_indices(X, Y, train_idx, test_idx)
    # Only the feature halves are stored; y_train / y_test stay available as
    # notebook-level variables after the loop.
    splits_dict[name] = (X_train, X_test)

print(f"# --- Splits prêts pour vecteurs : {list(splits_dict)}")


In [None]:
# --- Reload the modelling helpers
import importlib
import src.tags_suggester.modeling.modeling as mdl

mdl = importlib.reload(mdl)

# One shared 80/20 split of the row indices, reused for every vector type and
# for the matching text series.
indices = np.arange(Y.shape[0])
train_idx, test_idx = train_test_split(indices, random_state=42, test_size=0.2)
splits_dict = {}

for name in X_dict:
    X = X_dict[name]
    # Text series that goes with this vector type
    X_text_current = X_text_dict[name]

    # Features and labels are split by the project helper
    X_train, X_test, y_train, y_test = mdl.split_on_indices_custom(X, Y, train_idx, test_idx)

    # Same positional split for the text; the index is reset so that it
    # restarts at 0 in each part
    X_text_train = X_text_current.iloc[train_idx].reset_index(drop=True)
    X_text_test = X_text_current.iloc[test_idx].reset_index(drop=True)

    # Everything a training run needs, grouped under the vector name
    splits_dict[name] = dict(
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
        X_text_train=X_text_train,
        X_text_test=X_text_test,
    )

print(f"# --- Splits enrichis pour vecteurs : {list(splits_dict)}")


### 2.4. ENTRAINEMENT DU MODELE POUR CHAQUE VECTEUR - TRACKING - SAUVEGARDE

In [None]:
# --- FORMER "NEW SOLUTION" --- NOW OBSOLETE
# NOTE(review): this cell unpacks every splits_dict value as an
# (X_train, X_test) pair, i.e. the shape produced by the tuple-based split
# cell. It also reads y_train / y_test as notebook-level variables that are
# not defined in this cell. It will not run against a splits_dict whose
# values are dictionaries.
# -------------------------------------------
# --- RELOAD THE MODELLING MODULE
# -------------------------------------------
import importlib
import src.tags_suggester.modeling.modeling as mdl
importlib.reload(mdl)
import mlflow
import mlflow.sklearn
import os
import joblib
import pandas as pd
import warnings
from pathlib import Path
# Silence only the specific warning about labels absent from the training examples
warnings.filterwarnings("ignore", message="Label not .* is present in all training examples")

df_results = []
trained_models_dict = {}  # trained models, keyed by vector name
# Machine-specific absolute location of the MLflow file store
uri = "file:///D:/machine_learning_training/openclassrooms_projects/05_categorisez_automatiquement_question/mlruns"
mlflow.set_tracking_uri(uri)
mlflow.set_experiment("logreg_stackoverflow")
# Current working directory and its parent (used below as the project root)
notebook_path = Path().resolve()
base_path = notebook_path.parents[0]

for name, (X_train, X_test) in splits_dict.items():
    preproc = preproc_dict.get(name)

    # One MLflow run per vector type
    with mlflow.start_run(run_name=f"{model_type}_{name}"):
        mlflow.log_param("model_type", model_type)
        mlflow.log_param("vecteur", name)
        mlflow.log_param("preprocessing", preproc)

        # Training + prediction, delegated to the project helper
        scores = mdl.train_and_score_vector_full_metrics(
            name=name,
            X_train=X_train,
            X_test=X_test,
            y_train=y_train,
            y_test=y_test,
            model_class=model_class,
            model_wrapper=model_wrapper,
            preprocess=preproc
        )

        # Metric logging (the "coverage_tags" score is logged as "coverage_score")
        mlflow.log_metric("f1_micro", scores["f1_micro"])
        mlflow.log_metric("hamming_loss", scores["hamming_loss"])
        mlflow.log_metric("coverage_score", scores["coverage_tags"])
        mlflow.log_metric("f1_macro", scores["f1_macro"])
        mlflow.log_metric("precision_micro", scores["precision_micro"])
        mlflow.log_metric("recall_micro", scores["recall_micro"])

        # --- LOCAL LOCATION WHERE THE MODEL IS SAVED
        path_model = f"models/logreg/logreg_{name}.joblib"
        os.makedirs(os.path.dirname(path_model), exist_ok=True)

        # SAVE THE MODEL LOCALLY, THEN ATTACH THE FILE TO THE RUN
        joblib.dump(scores["model"], path_model)
        mlflow.log_artifact(path_model)
        # --- LOCATION REACHABLE BY THE API (computed only; nothing is written
        # --- there because the mkdir below is disabled)
        api_model_path = base_path / "src" / "tags_suggester" / "api" / "models" / name
        # api_model_path.mkdir(parents=True, exist_ok=True)
        # One row per vector type in the results table
        df_results.append({
            "vecteur": name,
            "f1_micro": round(scores["f1_micro"], 3),
            "f1_macro": round(scores["f1_macro"], 3),
            "precision_micro": round(scores["precision_micro"], 3),
            "recall_micro": round(scores["recall_micro"], 3),
            "hamming_loss": round(scores["hamming_loss"], 4),
            "coverage_tags": round(scores["coverage_tags"], 4)
        })

        # Keep the fitted model in memory for the following cells
        trained_models_dict[name] = scores["model"]
        print(f"📦 Modèle '{name}' stocké dans le dict : {type(scores['model'])}")


print(f"🧠 Dictionnaire final : {list(trained_models_dict.keys())}")

# Final dataframe (df_results changes from a list of rows to a DataFrame here)
df_results = pd.DataFrame(df_results)
display(df_results.sort_values("f1_micro", ascending=False))

In [None]:
# -------------------------------------------
# --- RELOAD THE MODELLING MODULE
# -------------------------------------------
import importlib
import src.tags_suggester.modeling.modeling as mdl
importlib.reload(mdl)
import mlflow
import mlflow.sklearn
import os
import joblib
import pandas as pd
import warnings
from pathlib import Path
# Silence only the specific warning about labels absent from the training examples
warnings.filterwarnings("ignore", message="Label not .* is present in all training examples")

df_results = []
trained_models_dict = {}  # trained models, keyed by vector name

# The historical tracking location is an absolute path on one machine's D:
# drive. It remains the default, but it can now be overridden through the
# standard MLFLOW_TRACKING_URI environment variable so the notebook can run
# elsewhere without editing the cell.
uri = os.environ.get(
    "MLFLOW_TRACKING_URI",
    "file:///D:/machine_learning_training/openclassrooms_projects/05_categorisez_automatiquement_question/mlruns",
)
mlflow.set_tracking_uri(uri)
mlflow.set_experiment("logreg_stackoverflow")
# Current working directory and its parent (used by later cells as project root)
notebook_path = Path().resolve()
base_path = notebook_path.parents[0]

# MLflow metric name -> key in the dictionary returned by the training helper
# (the "coverage_tags" score is logged under the name "coverage_score").
mlflow_metric_keys = {
    "f1_micro": "f1_micro",
    "hamming_loss": "hamming_loss",
    "coverage_score": "coverage_tags",
    "f1_macro": "f1_macro",
    "precision_micro": "precision_micro",
    "recall_micro": "recall_micro",
}
# Results-table column -> number of decimals kept (insertion order is also the
# column order of the final dataframe).
result_decimals = {
    "f1_micro": 3,
    "f1_macro": 3,
    "precision_micro": 3,
    "recall_micro": 3,
    "hamming_loss": 4,
    "coverage_tags": 4,
}

for name, split in splits_dict.items():
    preproc = preproc_dict.get(name)

    # One MLflow run per vector type
    with mlflow.start_run(run_name=f"{model_type}_{name}"):
        mlflow.log_param("model_type", model_type)
        mlflow.log_param("vecteur", name)
        mlflow.log_param("preprocessing", preproc)

        # Training + prediction, with the text series matching this vector type
        scores = mdl.train_and_score_vector_full_metrics_custom(
            name=name,
            X_train=split["X_train"],
            X_test=split["X_test"],
            y_train=split["y_train"],
            y_test=split["y_test"],
            model_class=model_class,
            model_wrapper=model_wrapper,
            preprocess=preproc,
            X_text_train=split["X_text_train"],
            X_text_test=split["X_text_test"]
        )

        # Metric logging
        for metric_name, score_key in mlflow_metric_keys.items():
            mlflow.log_metric(metric_name, scores[score_key])

        # Save the model locally, then attach the file to the run
        path_model = f"models/logreg/logreg_{name}.joblib"
        os.makedirs(os.path.dirname(path_model), exist_ok=True)
        joblib.dump(scores["model"], path_model)
        mlflow.log_artifact(path_model)

        # One row per vector type in the results table
        result_row = {"vecteur": name}
        result_row.update(
            {column: round(scores[column], decimals) for column, decimals in result_decimals.items()}
        )
        df_results.append(result_row)

        # Keep the fitted model in memory for the following cells
        trained_models_dict[name] = scores["model"]
        print(f"📦 Modèle '{name}' stocké dans le dict : {type(scores['model'])}")


print(f"🧠 Dictionnaire final : {list(trained_models_dict.keys())}")

# Final dataframe (df_results changes from a list of rows to a DataFrame here)
df_results = pd.DataFrame(df_results)
display(df_results.sort_values("f1_micro", ascending=False))

### 2.5. SAUVEGARDE DU MEILLEUR MODELE

In [None]:
# --- EXPORT OF THE BEST MODEL AND ITS COMPANION ARTIFACTS TO THE API FOLDER
# -------------------------------------------
# --- RELOAD THE MODELLING MODULE
# -------------------------------------------
import importlib
import src.tags_suggester.modeling.modeling as mdl
importlib.reload(mdl)
import json
import shutil
import joblib
from pathlib import Path

print(df_results.columns)

# ----------------------------------------------------------------
# --- PICK THE BEST MODEL ACCORDING TO THE MICRO F1 SCORE
# ----------------------------------------------------------------
best_row = df_results.sort_values("f1_micro", ascending=False).iloc[0]
print(f"# --- LA LIGNE DE MEILLEURE f1 micro EST : {best_row}")
best_vect = best_row["vecteur"]
print(f"# --- LE VECTEUR PRESENTANT LE MEILLEUT f1 micro EST : {best_vect}")
best_model = trained_models_dict[best_vect]
print(f"# --- LE MODELE DE REGRESSION LOGISTIQUE ASSOCIE A CE MEILLEUR VECTEUR EST : {best_model}")

# ----------------------------------------------------------------
# --- SOURCE (notebook artifacts) AND DESTINATION (API) FOLDERS
# ----------------------------------------------------------------
notebook_models_dir = base_path / "notebooks" / "models"
api_model_dir_config = base_path / "src" / "tags_suggester" / "api" / "models"
api_model_dir = api_model_dir_config / best_vect
api_model_dir.mkdir(parents=True, exist_ok=True)

# ----------------------------------------------------------------
# --- COPY THE TRAINED CLASSIFIER
# ----------------------------------------------------------------
source_path = notebook_models_dir / "logreg" / f"logreg_{best_vect}.joblib"
destination_path = api_model_dir / f"logreg_{best_vect}.joblib"
if source_path.exists():
    shutil.copy2(source_path, destination_path)
    print(f"✅ Modèle copié de {source_path} vers {destination_path}")
else:
    print(f"🚫 Fichier source introuvable : {source_path}")

# MultilabelBinarizer produced by the notebooks; its presence decides whether
# the config below references it.
mlb_path = notebook_models_dir / "tags" / "multilabel_binarizer_full.pkl"
mlb_exists = mlb_path.exists()

model_path = api_model_dir / f"logreg_{best_vect}.joblib"
print(f"# --- LE CHEMIN DU MEILLEUR MODELE DE REGRESSION LOGISTIQUE EST : {model_path}")

# ----------------------------------------------------------------
# --- LOCATE THE VECTORIZER THAT GOES WITH THE BEST VECTOR TYPE
# ----------------------------------------------------------------
# vector type -> (sub-folder of the notebook models dir, artifact name).
# "svd" reuses the TF-IDF vectorizer and additionally needs the SVD model.
vectorizer_artifacts = {
    "sbert": ("sbert", "sbert_model"),
    "use": ("use", "use_path.json"),
    "word2vec": ("w2v", "w2v_titlebody_full.bin"),
    "w2v": ("w2v", "w2v_titlebody_full.bin"),
    "bow": ("bow", "vectorizer_bow_full.pkl"),
    "tfidf": ("tfidf", "tfidf_vectorizer_titlebody.joblib"),
    "svd": ("tfidf", "tfidf_vectorizer_titlebody.joblib"),
}
try:
    vectorizer_subdir, vectorizer_name = vectorizer_artifacts[best_vect]
except KeyError:
    raise ValueError(f"🚫 Type de vecteur inconnu : {best_vect}") from None
vectorizer_path = notebook_models_dir / vectorizer_subdir / vectorizer_name
vectorizer_path_api = api_model_dir / vectorizer_name

if best_vect == "svd":
    svd_path = notebook_models_dir / "svd" / "svd_model_titlebody.joblib"
    svd_path_api = api_model_dir / "svd_model_titlebody.joblib"
    # Guarded like every other copy in this cell: a missing file is reported
    # instead of aborting the export with FileNotFoundError.
    if svd_path.exists():
        shutil.copy2(svd_path, svd_path_api)
        print(f"✅ Modèle SVD copié de {svd_path} vers {svd_path_api}")
    else:
        print(f"🚫 Fichier SVD introuvable : {svd_path}")

# ----------------------------------------------------------------
# --- COPY THE VECTORIZER
# ----------------------------------------------------------------
if best_vect == "sbert":
    # The SBERT artifact is a directory: replace any previous copy wholesale.
    if vectorizer_path.exists():
        if vectorizer_path_api.exists():
            shutil.rmtree(vectorizer_path_api)
        shutil.copytree(vectorizer_path, vectorizer_path_api)
        print(f"✅ Dossier SBERT copié de {vectorizer_path} vers {vectorizer_path_api}")
    else:
        print(f"🚫 Dossier SBERT introuvable : {vectorizer_path}")
elif vectorizer_path.exists():
    # Every other vectorizer is a single file.
    shutil.copy2(vectorizer_path, vectorizer_path_api)
    print(f"✅ Vectorizer copié de {vectorizer_path} vers {vectorizer_path_api}")
else:
    print(f"🚫 Fichier vectorizer introuvable : {vectorizer_path}")

# ----------------------------------------------------------------
# --- COPY THE MultilabelBinarizer
# ----------------------------------------------------------------
mlb_path_api = api_model_dir_config / "tags" / "multilabel_binarizer_full.pkl"
mlb_path_api.parent.mkdir(parents=True, exist_ok=True)

if mlb_path.exists():
    shutil.copy2(mlb_path, mlb_path_api)
    print(f"✅ MultilabelBinarizer copié de {mlb_path} vers {mlb_path_api}")
else:
    print(f"🚫 Fichier MultilabelBinarizer introuvable : {mlb_path}")

# ----------------------------------------------------------------
# --- BUILD AND SAVE THE CONFIG
# ----------------------------------------------------------------
# Paths are stored relative to a "models" folder, in POSIX form.
config = {
    "vectorizer": best_vect,
    "model_path": (Path("models") / best_vect / f"logreg_{best_vect}.joblib").as_posix(),
    "vectorizer_path": (Path("models") / best_vect / vectorizer_path_api.name).as_posix(),
    "mlb_path": (Path("models") / "tags" / "multilabel_binarizer_full.pkl").as_posix() if mlb_exists else None
}
if best_vect == "svd":
    config["svd_path"] = (Path("models") / best_vect / svd_path_api.name).as_posix()

# The location and name of this file are part of the expected behaviour: keep as is.
config_file = api_model_dir_config / "config_best_model.json"
with open(config_file, "w", encoding="utf-8") as f:
    json.dump(config, f, indent=4)

print(f"✅ Fichier config_best_model.json sauvegardé dans : {config_file}")


### 2.6. LOG MLFLOW DES MÉTRIQUES DANS UN TABLEAU COMPARATIF FINAL

In [None]:
# --- Reload the modelling helpers
import importlib
import src.tags_suggester.modeling.modeling as mdl

mdl = importlib.reload(mdl)

# --- ROUND EVERY METRIC COLUMN TO 3 DECIMALS FOR READABILITY
# (in place: df_results itself is updated)
_metric_cols = [
    "f1_micro",
    "f1_macro",
    "precision_micro",
    "recall_micro",
    "hamming_loss",
    "coverage_tags",
]
df_results[_metric_cols] = df_results[_metric_cols].round(3)

# --- LOG THE COMPARISON TABLE TO MLFLOW
model_directory = "models/logreg"
os.makedirs(model_directory, exist_ok=True)
# Called before opening the dedicated run so that no earlier run is still active.
mlflow.end_run()
with mlflow.start_run(run_name="comparatif_vecteurs_final"):
    # The table is written to disk first, then attached to the run as an artifact.
    csv_path = f"{model_directory}/comparatif_vecteurs.csv"
    df_results.to_csv(csv_path, index=False)
    mlflow.log_artifact(csv_path)

# --- SHOW THE SAME TABLE IN THE NOTEBOOK, BEST MICRO F1 FIRST
display(df_results.sort_values(by="f1_micro", ascending=False))

### 2.7 VISUALISATION DES METRIQUES DANS MLFLOW

In [None]:
# --- Reload the modelling helpers
import importlib
import src.tags_suggester.modeling.modeling as mdl

mdl = importlib.reload(mdl)

# --- LOCAL FOLDER FOR THE BARPLOT IMAGES PASSED TO THE PLOTTING HELPER
img_model_directory = f"{model_directory}/mlflow_images"
os.makedirs(img_model_directory, exist_ok=True)

# --- ONE RUN PER BARPLOT: (run name, metric column, plot title, image file)
barplot_specs = (
    (
        "comparatif_coverage_barplot",
        "coverage_tags",
        "Couverture des tags par vecteur",
        "barplot_coverage_tags.png",
    ),
    (
        "comparatif_f1_micro_barplot",
        "f1_micro",
        "Score F1 (entraînement classique) par vecteur",
        "barplot_f1_micro.png",
    ),
)
for barplot_run, barplot_metric, barplot_title, barplot_file in barplot_specs:
    # end_run() is called before each barplot run is opened, as in the
    # original sequence of calls.
    mlflow.end_run()
    with mlflow.start_run(run_name=barplot_run, nested=True):
        mdl.plot_and_log_barplot(
            df_scores=df_results,
            metric=barplot_metric,
            title=barplot_title,
            save_path=f"{img_model_directory}/{barplot_file}",
        )