In [2]:
import os
import mlflow
import mlflow.sklearn
import pandas as pd
import scipy.sparse as sp
import pymongo
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib
from mlflow.tracking import MlflowClient

# --- Connexion à MongoDB ---
uri = "mongodb://mongodb:27017/"
client = pymongo.MongoClient(uri)
db = client["github_issues"]
collection = db["closed_issues"]

# --- Extraction des données depuis MongoDB ---
data = []
for issue in collection.find({"duration_class": {"$exists": True}}):
    title = issue.get("title", "")
    body = issue.get("body", "")
    language = issue.get("language", "unknown")
    duration_class = issue.get("duration_class", "unknown")
    text = title + " " + body
    data.append([text, language, duration_class])

# --- Conversion en DataFrame ---
df = pd.DataFrame(data, columns=["text", "language", "duration_class"])
print("Distribution des classes dans l'ensemble complet :")
print(df["duration_class"].value_counts(normalize=True) * 100)

# --- Préparation des features et labels ---
X = df["text"]
y = df["duration_class"]

# --- Vectorisation avec TF-IDF ---
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)
print("Shape of TF-IDF features:", X_tfidf.shape)

# --- Ajout de la langue comme feature catégorielle ---
LANGUAGE_COLUMNS = ["Python", "Java", "JavaScript", "C++", "Ruby", "Go", "PHP", "Other"]

# One-hot encode and reindex to ensure consistent columns
X_language = pd.get_dummies(df["language"]).reindex(columns=LANGUAGE_COLUMNS, fill_value=0)
print("Shape of dense language features:", X_language.shape)

# Convert the dense language feature array to a sparse matrix (cast to float for consistency)
X_language_sparse = sp.csr_matrix(X_language.to_numpy().astype(float))
print("Shape of sparse language features:", X_language_sparse.shape)

# Combine the TF-IDF features and the language features
X_final = sp.hstack([X_tfidf, X_language_sparse])
print("Shape of combined features:", X_final.shape)

# --- Séparation train/test ---
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=42, stratify=y
)
print("Distribution des classes dans l'ensemble d'entraînement :")
print(pd.Series(y_train).value_counts(normalize=True) * 100)
print("Distribution des classes dans l'ensemble de test :")
print(pd.Series(y_test).value_counts(normalize=True) * 100)

# --- Configuration de MLflow ---
mlflow.set_tracking_uri("../mlruns")  # Ajustez le chemin si nécessaire
mlflow.set_experiment("Classification Duration")
mlflow.autolog()  # Activer l'autologging

with mlflow.start_run() as run:
    # --- Entraînement du modèle ---
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    
    # --- Évaluation du modèle ---
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    
    # --- Sauvegarde locale du modèle et du vectorizer ---
    model_dir = "../models"
    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, "logistic_model.pkl")
    joblib.dump(model, model_path)
    vectorizer_path = os.path.join(model_dir, "tfidf_vectorizer.pkl")
    joblib.dump(vectorizer, vectorizer_path)
    
    # --- Log du modèle et des artefacts avec MLflow ---
    mlflow.sklearn.log_model(model, artifact_path="logistic_model")
    mlflow.log_artifact(model_path)
    
    # --- Enregistrement du modèle dans le MLflow Model Registry ---
    model_uri = f"runs:/{run.info.run_id}/logistic_model"
    registered_model = mlflow.register_model(model_uri, "MyLogisticModel")
    print("Modèle enregistré dans le registre avec la version:", registered_model.version)
    
    # --- Transition du modèle vers l'état 'Production' ---
    mlflow_client = MlflowClient()
    mlflow_client.transition_model_version_stage(
        name="MyLogisticModel",
        version=registered_model.version,
        stage="Production"
    )
    print("Le modèle a été mis en production.")

print("Modèle logué et enregistré dans MLflow avec le run ID:", run.info.run_id)


Distribution des classes dans l'ensemble complet :
duration_class
5    23.744574
3    20.150176
1    19.717707
4    18.817920
2    17.569623
Name: proportion, dtype: float64
Shape of TF-IDF features: (63126, 5000)
Shape of dense language features: (63126, 8)
Shape of sparse language features: (63126, 8)
Shape of combined features: (63126, 5008)


2025/02/22 19:24:58 INFO mlflow.tracking.fluent: Experiment with name 'Classification Duration' does not exist. Creating a new experiment.


Distribution des classes dans l'ensemble d'entraînement :
duration_class
5    23.744554
3    20.150495
1    19.716832
4    18.817822
2    17.570297
Name: proportion, dtype: float64
Distribution des classes dans l'ensemble de test :
duration_class
5    23.744654
3    20.148899
1    19.721210
4    18.818311
2    17.566925
Name: proportion, dtype: float64


2025/02/22 19:24:58 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/02/22 19:24:58 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.


              precision    recall  f1-score   support

           1       0.29      0.34      0.31      2490
           2       0.20      0.10      0.13      2218
           3       0.24      0.23      0.24      2544
           4       0.25      0.18      0.21      2376
           5       0.34      0.49      0.40      2998

    accuracy                           0.28     12626
   macro avg       0.26      0.27      0.26     12626
weighted avg       0.27      0.28      0.27     12626



Successfully registered model 'MyLogisticModel'.


Modèle enregistré dans le registre avec la version: 1
Le modèle a été mis en production.
Modèle logué et enregistré dans MLflow avec le run ID: 9625c07c3b2d4f3ca23d3024c4783999


Created version '1' of model 'MyLogisticModel'.
  mlflow_client.transition_model_version_stage(
