In [1]:
import os

import joblib
import mlflow
import mlflow.keras
import numpy as np
import pandas as pd
import pymongo
import scipy.sparse as sp
from mlflow.tracking import MlflowClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed
from tensorflow.keras.utils import to_categorical

# --- MongoDB connection ---
uri = "mongodb://mongodb:27017/"
client = pymongo.MongoClient(uri)
db = client["github_issues"]
collection = db["closed_issues"]

# --- Data extraction from MongoDB ---
# Only documents that carry a `duration_class` field are used as labelled
# examples. The projection limits the transfer to the four fields read below.
data = []
labelled_issues = collection.find(
    {"duration_class": {"$exists": True}},
    {"title": 1, "body": 1, "language": 1, "duration_class": 1},
)
for issue in labelled_issues:
    # `dict.get(key, default)` only falls back when the key is ABSENT. A key
    # stored with a null value comes back as None, and `None + " "` raises
    # TypeError, so fall back with `or` to cover both cases.
    title = issue.get("title") or ""
    body = issue.get("body") or ""
    language = issue.get("language") or "unknown"
    duration_class = issue.get("duration_class", "unknown")
    text = title + " " + body
    data.append([text, language, duration_class])

# --- Conversion to a DataFrame ---
# One row per issue: concatenated title/body text, language, target class.
df = pd.DataFrame(data, columns=["text", "language", "duration_class"])
print("Distribution des classes dans l'ensemble complet :")
print(df["duration_class"].value_counts(normalize=True) * 100)

# --- Features and labels ---
X = df["text"]
y = df["duration_class"]

# --- TF-IDF vectorisation ---
# NOTE(review): the vectorizer is fitted on the full corpus, before the
# train/test split further down, so test documents contribute to the
# vocabulary and IDF weights. TODO: consider fitting on the training rows only
# if the test score is meant to be a strict hold-out estimate.
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)
print("Shape of TF-IDF features:", X_tfidf.shape)

# --- Language as a categorical feature ---
LANGUAGE_COLUMNS = ["Python", "Java", "JavaScript", "C++", "Ruby", "Go", "PHP", "Other"]

# One-hot encode the language, then force a fixed column set and order so the
# feature layout does not depend on which languages occur in the data.
# NOTE(review): a language that is not in LANGUAGE_COLUMNS (including the
# "unknown" default) becomes an all-zero row; nothing here maps it to "Other".
# Confirm that this is intended.
X_language = pd.get_dummies(df["language"]).reindex(columns=LANGUAGE_COLUMNS, fill_value=0)
print("Shape of dense language features:", X_language.shape)

# Convert to a sparse matrix so it can be stacked with the TF-IDF matrix
X_language_sparse = sp.csr_matrix(X_language.to_numpy().astype(float))
print("Shape of sparse language features:", X_language_sparse.shape)

# Combine TF-IDF and language features column-wise
X_final = sp.hstack([X_tfidf, X_language_sparse])
print("Shape of combined features:", X_final.shape)

# Dense array for the neural network.
# Cast to float32 BEFORE densifying: the dense matrix is rows x 5008 values,
# and train_test_split below makes a second copy of it, so float64 doubles the
# peak memory. Presumably Keras computes in float32 (its default floatx), so
# the extra precision would be discarded anyway -- verify if floatx is changed.
X_final_dense = X_final.astype(np.float32).toarray()

# --- Label encoding ---
# Integer codes (y_encoded) are kept alongside the one-hot matrix
# (y_categorical): the former feed the classification report, the latter the
# categorical cross-entropy loss.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
num_classes = len(le.classes_)
y_categorical = to_categorical(y_encoded, num_classes=num_classes)

# --- Train/test split ---
# The three arrays are split together so rows stay aligned; stratifying on the
# raw labels keeps the class proportions equal in both partitions.
X_train, X_test, y_train, y_test, y_train_enc, y_test_enc = train_test_split(
    X_final_dense, y_categorical, y_encoded, test_size=0.2, random_state=42, stratify=y
)
print("Distribution des classes dans l'ensemble d'entraînement :")
print(pd.Series(y_train_enc).value_counts(normalize=True) * 100)
print("Distribution des classes dans l'ensemble de test :")
print(pd.Series(y_test_enc).value_counts(normalize=True) * 100)

# --- MLflow configuration ---
mlflow.set_tracking_uri("../mlruns")  # Adjust the path if needed
mlflow.set_experiment("Classification Duration NN")
mlflow.autolog()  # Enable autologging

# Seed the Python, NumPy and TensorFlow generators so weight initialisation,
# dropout masks and batch shuffling repeat from one run to the next (as far as
# the backend allows). Same seed value as the train/test split.
set_random_seed(42)

with mlflow.start_run() as run:
    # --- Neural network construction ---
    # Two hidden ReLU layers with dropout, softmax output over the classes.
    # The input width is declared with an explicit Input layer rather than the
    # `input_dim` argument of the first Dense layer, which Keras warns about.
    input_dim = X_final_dense.shape[1]
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # --- Training ---
    history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

    # --- Evaluation ---
    loss, accuracy = model.evaluate(X_test, y_test)
    print("Test accuracy:", accuracy)

    # Predictions for the classification report: take the most probable class
    # and compare against the integer-encoded test labels.
    y_pred_prob = model.predict(X_test)
    y_pred = np.argmax(y_pred_prob, axis=1)
    # classification_report measures len() of every target name, which fails
    # for numeric class labels; convert them to strings first (a no-op when
    # the labels are already strings).
    class_names = [str(label) for label in le.classes_]
    print(classification_report(y_test_enc, y_pred, target_names=class_names))

    # --- Local save of the model, vectorizer and label encoder ---
    model_dir = "../models"
    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, "nn_model.h5")
    model.save(model_path)
    vectorizer_path = os.path.join(model_dir, "tfidf_vectorizer.pkl")
    joblib.dump(vectorizer, vectorizer_path)
    labelencoder_path = os.path.join(model_dir, "label_encoder.pkl")
    joblib.dump(le, labelencoder_path)

    # --- Log the model and artifacts with MLflow ---
    mlflow.keras.log_model(model, artifact_path="nn_model")
    mlflow.log_artifact(model_path)
    # The network cannot be used without the fitted vectorizer (to build its
    # input) and the label encoder (to decode its output), so store both with
    # the run instead of only on the local disk.
    mlflow.log_artifact(vectorizer_path)
    mlflow.log_artifact(labelencoder_path)

    # --- Register the model in the MLflow Model Registry ---
    model_uri = f"runs:/{run.info.run_id}/nn_model"
    registered_model = mlflow.register_model(model_uri, "MyNNModel")
    print("Modèle enregistré dans le registre avec la version:", registered_model.version)

    # --- Transition the model to the 'Production' stage ---
    # Archive whatever was in Production before, otherwise every rerun of this
    # cell leaves one more version in that stage.
    mlflow_client = MlflowClient()
    mlflow_client.transition_model_version_stage(
        name="MyNNModel",
        version=registered_model.version,
        stage="Production",
        archive_existing_versions=True,
    )
    print("Le modèle a été mis en production.")

print("Modèle logué et enregistré dans MLflow avec le run ID:", run.info.run_id)


2025-02-22 19:22:06.504125: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-22 19:22:06.762962: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740252126.865501   19301 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740252126.894072   19301 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-22 19:22:07.135000: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Distribution des classes dans l'ensemble complet :
duration_class
5    23.744574
3    20.150176
1    19.717707
4    18.817920
2    17.569623
Name: proportion, dtype: float64
Shape of TF-IDF features: (63126, 5000)
Shape of dense language features: (63126, 8)
Shape of sparse language features: (63126, 8)
Shape of combined features: (63126, 5008)


2025/02/22 19:22:20 INFO mlflow.tracking.fluent: Experiment with name 'Classification Duration NN' does not exist. Creating a new experiment.
2025/02/22 19:22:20 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.


Distribution des classes dans l'ensemble d'entraînement :
4    23.744554
2    20.150495
0    19.716832
3    18.817822
1    17.570297
Name: proportion, dtype: float64
Distribution des classes dans l'ensemble de test :
4    23.744654
2    20.148899
0    19.721210
3    18.818311
1    17.566925
Name: proportion, dtype: float64


2025/02/22 19:22:20 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/02/22 19:22:20 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.
2025/02/22 19:22:20 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-02-22 19:22:21.482426: E external/local_

: 