In [40]:
# Import necessary libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import hamming_loss, f1_score, precision_score
from sklearn.multioutput import MultiOutputClassifier

In [41]:
# Load the dataset
file_path = './dataset.csv'
df = pd.read_csv(file_path)

# Checking for missing values
missing_values = df.isnull().sum()
print(f"Missing values in each column:\n{missing_values}")

# Handling missing values (e.g., filling with the mean for numerical columns)
# df.fillna(df.mean(), inplace=True)

Missing values in each column:
report                     0
type_blocker               0
type_regression            0
type_bug                   0
type_documentation         0
type_enhancement           0
type_task                  0
type_dependency_upgrade    0
dtype: int64


In [42]:
# Download stopwords from NLTK
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Text Cleaning Function
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, digits, and extra spaces
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize and remove stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return " ".join(words)

# Apply text cleaning to the 'report' column
df['cleaned_report'] = df['report'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sefas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [43]:
# Vectorize text using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['cleaned_report'])

# Prepare the labels (binary matrix)
labels = df.drop(columns=['report', 'cleaned_report'])
y = labels.values

# Remove 'type_task' label
df = df.drop(columns=['type_task'])
labels = df.drop(columns=['report', 'cleaned_report'])
y = labels.values

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
# from sklearn.preprocessing import StandardScaler
#
# # Standardizing the TF-IDF features (feature scaling)
# scaler = StandardScaler(with_mean=False)  # 'with_mean=False' because sparse matrices can't be centered
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)


In [45]:
# Train and evaluate different models using MultiOutputClassifier for multi-label classification

# 1. Logistic Regression (One-vs-Rest)
lr_model = LogisticRegression(solver='liblinear')
lr_multi_model = MultiOutputClassifier(lr_model, n_jobs=-1)
lr_multi_model.fit(X_train, y_train)
lr_pred = lr_multi_model.predict(X_test)

In [46]:
# 2. Support Vector Machine (SVM)
svm_model = SVC(kernel='linear', probability=True)
svm_multi_model = MultiOutputClassifier(svm_model, n_jobs=-1)
svm_multi_model.fit(X_train, y_train)
svm_pred = svm_multi_model.predict(X_test)

In [47]:
# 3. Perceptron
perceptron_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000)
perceptron_multi_model = MultiOutputClassifier(perceptron_model, n_jobs=-1)
perceptron_multi_model.fit(X_train, y_train)
perceptron_pred = perceptron_multi_model.predict(X_test)

In [48]:
# 4. Deep Neural Network (DNN)
dnn_model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000)
dnn_multi_model = MultiOutputClassifier(dnn_model, n_jobs=-1)
dnn_multi_model.fit(X_train, y_train)
dnn_pred = dnn_multi_model.predict(X_test)

In [49]:
# Evaluate using multi-label metrics
def evaluate_model(y_true, y_pred):
    # Hamming Loss
    hamming = hamming_loss(y_true, y_pred)
    # F1 Score (Micro)
    f1_micro = f1_score(y_true, y_pred, average='micro')
    # F1 Score (Macro)
    f1_macro = f1_score(y_true, y_pred, average='macro')
    # Precision (Micro)
    precision_micro = precision_score(y_true, y_pred, average='micro')
    return hamming, f1_micro, f1_macro, precision_micro

# Logistic Regression Evaluation
lr_hamming, lr_f1_micro, lr_f1_macro, lr_precision = evaluate_model(y_test, lr_pred)
print(f"Logistic Regression - Hamming Loss: {lr_hamming}, F1 Micro: {lr_f1_micro}, F1 Macro: {lr_f1_macro}, Precision: {lr_precision}")

# SVM Evaluation
svm_hamming, svm_f1_micro, svm_f1_macro, svm_precision = evaluate_model(y_test, svm_pred)
print(f"SVM - Hamming Loss: {svm_hamming}, F1 Micro: {svm_f1_micro}, F1 Macro: {svm_f1_macro}, Precision: {svm_precision}")

# Perceptron Evaluation
perceptron_hamming, perceptron_f1_micro, perceptron_f1_macro, perceptron_precision = evaluate_model(y_test, perceptron_pred)
print(f"Perceptron - Hamming Loss: {perceptron_hamming}, F1 Micro: {perceptron_f1_micro}, F1 Macro: {perceptron_f1_macro}, Precision: {perceptron_precision}")

# DNN Evaluation
dnn_hamming, dnn_f1_micro, dnn_f1_macro, dnn_precision = evaluate_model(y_test, dnn_pred)
print(f"DNN - Hamming Loss: {dnn_hamming}, F1 Micro: {dnn_f1_micro}, F1 Macro: {dnn_f1_macro}, Precision: {dnn_precision}")


Logistic Regression - Hamming Loss: 0.12170263788968826, F1 Micro: 0.8111627906976744, F1 Macro: 0.44579475953594544, Precision: 0.8433268858800773
SVM - Hamming Loss: 0.11270983213429256, F1 Micro: 0.8278388278388278, F1 Macro: 0.608248484018128, Precision: 0.846441947565543
Perceptron - Hamming Loss: 0.12170263788968826, F1 Micro: 0.8139321723189734, F1 Macro: 0.6227471048575731, Precision: 0.8330206378986866
DNN - Hamming Loss: 0.12470023980815348, F1 Micro: 0.8077634011090573, F1 Macro: 0.5924611469639677, Precision: 0.833969465648855


In [50]:
from sklearn.metrics import precision_score
import numpy as np

def precision_at_k(y_true, y_pred, k=3):
    # Calculate Precision@k for multi-label classification
    precision_at_k = []
    for true, pred in zip(y_true, y_pred):
        top_k_preds = np.argsort(pred)[::-1][:k]  # Get indices of top-k predictions
        correct_preds = np.sum([true[i] for i in top_k_preds])  # Check how many are correct
        precision_at_k.append(correct_preds / k)  # Precision@k = correct / k
    return np.mean(precision_at_k)

# Add Precision@k to the evaluation
precision_at_k_lr = precision_at_k(y_test, lr_pred)
print(f"Logistic Regression - Precision@k: {precision_at_k_lr}")

precision_at_k_svm = precision_at_k(y_test, svm_pred)
print(f"SVM - Precision@k: {precision_at_k_svm}")


Logistic Regression - Precision@k: 0.5815347721822542
SVM - Precision@k: 0.5959232613908872
