In [29]:
import pymongo
import pandas as pd
from collections import Counter
import random
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, desc
import pymongo
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Connect to MongoDB and grab the collection that holds the closed issues.
# The URI can be overridden with the MONGO_URI environment variable; when the
# variable is unset, the original hard-coded address is used unchanged.
import os

uri = os.environ.get("MONGO_URI", "mongodb://mongodb:27017/")
client = pymongo.MongoClient(uri)
db = client["github_issues"]
collection = db["closed_issues"]



In [30]:
# Pull the labelled issues out of MongoDB and flatten them into training rows.
#
# The filter also requires `duration_class`, because that is the field read as
# the label below; filtering on `duration` alone lets a document without a
# label raise KeyError in the loop. The projection limits the transfer to the
# four fields that are actually used.
cursor = collection.find(
    {"duration": {"$exists": True}, "duration_class": {"$exists": True}},
    {"_id": 0, "title": 1, "body": 1, "language": 1, "duration_class": 1},
)

data = []
for issue in cursor:
    # A missing or null title/body becomes an empty string so the
    # concatenation cannot fail with TypeError.
    title = issue.get("title") or ""
    body = issue.get("body") or ""

    # Title and body are merged into one text field for the model.
    text = f"{title} {body}"

    # A missing language is stored as None instead of raising KeyError.
    data.append([text, issue.get("language"), issue["duration_class"]])

# Convert the rows to a DataFrame.
df = pd.DataFrame(data, columns=["text", "language", "duration_class"])

# Show the first rows as a sanity check.
df.head()


Unnamed: 0,text,language,duration_class
0,Errors occured when trying to run sniffnet: in...,Rust,2
1,thread 'thread_write_report' panic when run no...,Rust,5
2,unable to install on Windows 11 ARM64 VM insta...,Rust,5
3,"The ""Open full report"" button might resize awk...",Rust,5
4,Include WPCAP on installation? wpcap can be a ...,Rust,5


In [31]:
# Share of each duration class across the complete dataset, in percent.
class_distribution = df["duration_class"].value_counts(normalize=True).mul(100)

print("Répartition des classes dans l'ensemble complet :")
print(class_distribution)


Répartition des classes dans l'ensemble complet :
duration_class
5    23.744574
3    20.150176
1    19.717707
4    18.817920
2    17.569623
Name: proportion, dtype: float64


In [32]:
# Build the feature matrix: TF-IDF of the issue text plus one-hot language.
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer

# Model input (issue text) and target (duration class label).
X = df["text"]
y = df["duration_class"]

# Vectorize the text, keeping at most 5000 terms.
# NOTE(review): the vectorizer is fitted on every row, before the train/test
# split done in a later cell, so vocabulary and IDF statistics from the test
# rows leak into the training features. Fitting on the training rows only
# (e.g. via a Pipeline) would avoid this; left as is to keep X_final's meaning.
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# One-hot encode the language. The explicit float dtype keeps the columns
# numeric regardless of the dummy dtype the installed pandas defaults to.
X_language = pd.get_dummies(df["language"], dtype=float)

# Stack text and language features side by side. CSR is requested explicitly
# because the matrix is row-sliced by the split and consumed by a linear model.
X_final = sp.hstack(
    [X_tfidf, sp.csr_matrix(X_language.to_numpy())],
    format="csr",
)

# Report the shape of the resulting matrix.
print(f"Dimensions de X_final : {X_final.shape}")


Dimensions de X_final : (63126, 5013)


In [33]:
# Hold out 20% of the rows for testing; the seed is fixed for repeatable splits.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    test_size=0.2,
    random_state=42,
)

# Print the class shares (in percent) of each partition, training set first.
for banner, labels in (
    ("Répartition des classes dans l'ensemble d'entraînement :", y_train),
    ("Répartition des classes dans l'ensemble de test :", y_test),
):
    print(banner)
    print(labels.value_counts(normalize=True) * 100)


Répartition des classes dans l'ensemble d'entraînement :
duration_class
5    23.766337
3    20.158416
1    19.764356
4    18.778218
2    17.532673
Name: proportion, dtype: float64
Répartition des classes dans l'ensemble de test :
duration_class
5    23.657532
3    20.117218
1    19.531126
4    18.976715
2    17.717409
Name: proportion, dtype: float64


In [34]:
# Fit a logistic-regression classifier and score it on the held-out rows.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# `fit` returns the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predicted labels for the test partition.
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 summary.
print("Métriques de classification sur l'ensemble de test :")
report = classification_report(y_test, y_pred)
print(report)


Métriques de classification sur l'ensemble de test :
              precision    recall  f1-score   support

           1       0.30      0.38      0.33      2466
           2       0.22      0.11      0.14      2237
           3       0.24      0.21      0.22      2540
           4       0.26      0.19      0.22      2396
           5       0.34      0.49      0.40      2987

    accuracy                           0.29     12626
   macro avg       0.27      0.28      0.26     12626
weighted avg       0.27      0.29      0.27     12626



In [35]:
# Show precision, recall and F1 for each class as a table.
from sklearn.metrics import precision_recall_fscore_support

# Fix the label order once and use it for both the metric computation and the
# 'Class' column. Deriving the column from y_test alone breaks (length
# mismatch or misaligned rows) whenever y_pred contains a class that y_test
# does not, because scikit-learn reports one entry per label seen in either.
class_labels = sorted(set(y_test) | set(y_pred))

precision, recall, fscore, _ = precision_recall_fscore_support(
    y_test, y_pred, labels=class_labels, average=None
)

# Collect the per-class results in a DataFrame.
metrics_df = pd.DataFrame({
    'Class': class_labels,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': fscore
})

# Display the metrics for every class.
print(metrics_df)


   Class  Precision    Recall  F1-Score
0      1   0.300842  0.376723  0.334534
1      2   0.215971  0.106392  0.142558
2      3   0.235189  0.206299  0.219799
3      4   0.258029  0.194491  0.221799
4      5   0.335756  0.494811  0.400054


In [39]:
import pandas as pd
from scipy import sparse

# Hand-written sample issue: [title, description, language].
test_example = [
    "Error after inserting an extern file",
    "Cant build the app /.",
    "Java",
]

# Title and description are joined with a single space, as in training.
test_title, test_description, test_language = test_example
test_text = " ".join((test_title, test_description))

# Reuse the already-fitted vectorizer so the text columns match training.
test_tfidf = vectorizer.transform([test_text])

# One-hot encode the language, aligned to the training dummy columns
# (languages absent from training end up as an all-zero row), then cast
# the result to an integer array.
test_language_feature = (
    pd.get_dummies([test_language])
    .reindex(columns=X_language.columns, fill_value=0)
    .to_numpy()
    .astype(int)
)

# Sparse version of the language row, stacked to the right of the text features.
test_language_feature_sparse = sparse.csr_matrix(test_language_feature)
test_input = sparse.hstack((test_tfidf, test_language_feature_sparse))

# Ask the trained model for the duration class of the sample.
predicted_class = model.predict(test_input)

# Report the predicted class (duration interval).
print(f"The predicted class  is: {predicted_class[0]}")


The predicted class  is: 4
