In [1]:
import os
import shutil
import mlflow
import mlflow.sklearn
import pandas as pd
import scipy.sparse as sp
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import pymongo

# Connexion à MongoDB
uri = "mongodb://mongodb:27017/"
client = pymongo.MongoClient(uri)
db = client["github_issues"]
collection = db["closed_issues"]

# Extraire les données depuis MongoDB
cursor = collection.find({"duration_class": {"$exists": True}})

data = []
for issue in cursor:
    title = issue.get("title", "")
    body = issue.get("body", "")
    language = issue.get("language", "unknown")
    duration_class = issue.get("duration_class", "unknown")
    
    text = title + " " + body  # Combinaison titre + corps du texte
    data.append([text, language, duration_class])

# Convertir en DataFrame
df = pd.DataFrame(data, columns=["text", "language", "duration_class"])

# Vérifier la distribution des classes
print("Distribution des classes dans l'ensemble complet :")
print(df["duration_class"].value_counts(normalize=True) * 100)

# Séparer les features et labels
X = df["text"]
y = df["duration_class"]

# Vectorisation avec TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# Ajouter la langue comme feature catégorielle
X_language = pd.get_dummies(df["language"])
X_final = sp.hstack([X_tfidf, X_language.values])

# Séparer en train/test
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42, stratify=y)

# Vérifier la distribution des classes dans chaque ensemble
print("Distribution des classes dans l'ensemble d'entraînement :")
print(pd.Series(y_train).value_counts(normalize=True) * 100)
print("Distribution des classes dans l'ensemble de test :")
print(pd.Series(y_test).value_counts(normalize=True) * 100)

# Initialisation de MLflow
mlflow.set_tracking_uri("../mlruns")  # Stockage des logs dans 'mlruns'
mlflow.set_experiment("Classification Duration")

# Vérifier si MLflow tourne déjà
if mlflow.active_run() is None:
    mlflow.start_run()
else:
    mlflow.start_run(nested=True)

# Entraîner le modèle
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Prédire sur l'ensemble de test
y_pred = model.predict(X_test)

# Évaluer le modèle
report = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))

# Log des métriques MLflow
mlflow.log_metric("accuracy", report["accuracy"])
for label, metrics in report.items():
    if isinstance(metrics, dict):  # Vérifier si c'est une classe
        mlflow.log_metric(f"{label}_precision", metrics["precision"])
        mlflow.log_metric(f"{label}_recall", metrics["recall"])
        mlflow.log_metric(f"{label}_f1-score", metrics["f1-score"])

# Sauvegarde du modèle
model_dir = "../models"
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, "logistic_model.pkl")

import joblib
joblib.dump(model, model_path)

# Enregistrement du modèle avec MLflow
mlflow.sklearn.log_model(model, "logistic_model")

# Log du fichier modèle dans MLflow
mlflow.log_artifact(model_path)

# Fin de l'exécution MLflow
mlflow.end_run()

print(f"Modèle enregistré dans : {model_path}")


Distribution des classes dans l'ensemble complet :
duration_class
5    23.744574
3    20.150176
1    19.717707
4    18.817920
2    17.569623
Name: proportion, dtype: float64


2025/02/22 15:50:15 INFO mlflow.tracking.fluent: Experiment with name 'Classification Duration' does not exist. Creating a new experiment.
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Distribution des classes dans l'ensemble d'entraînement :
duration_class
5    23.744554
3    20.150495
1    19.716832
4    18.817822
2    17.570297
Name: proportion, dtype: float64
Distribution des classes dans l'ensemble de test :
duration_class
5    23.744654
3    20.148899
1    19.721210
4    18.818311
2    17.566925
Name: proportion, dtype: float64
              precision    recall  f1-score   support

           1       0.29      0.36      0.32      2490
           2       0.20      0.10      0.13      2218
           3       0.24      0.22      0.23      2544
           4       0.26      0.19      0.22      2376
           5       0.34      0.50      0.41      2998

    accuracy                           0.29     12626
   macro avg       0.27      0.27      0.26     12626
weighted avg       0.27      0.29      0.27     12626





Modèle enregistré dans : ../models/logistic_model.pkl
