# **CSI5137 - Applications of NLP and ML in Software Engineering**
# **Final Course Project**

**Mohammad Bin Yousuf - CU# 101239019**

**Vrishab Prasanth Davey - UO# 300438343**

**Surendar Pala Dana Sekaran - UO#300401916**


#Importing the libraries

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import pandas as pd
from sklearn.model_selection import train_test_split
import random
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer, util
import pickle
import os
import nlpaug.augmenter.word as naw
import numpy as np
from prettytable import PrettyTable
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Fetch the NLTK data packages this notebook relies on, one download call per
# package and in a fixed order. The stop-word list and Punkt data back
# `stopwords.words` / `word_tokenize` in the preprocessing step; the POS tagger
# is presumably needed by the nlpaug synonym augmenter -- TODO confirm.
for nltk_resource in (
    'averaged_perceptron_tagger_eng',
    'stopwords',
    'punkt',
    'punkt_tab',
):
    nltk.download(nltk_resource)

#Data Preprocessing

In [None]:
# Columns of the flattened Jira export that the tasks below actually use.
useful_columns = [
    'fields_summary', 'fields_description', 'fields_priority_name',
    'fields_issuetype_name', 'fields_labels'
]

# Read the CSV and immediately narrow it to the relevant columns.
file_path = 'jira_flattened_results_clean.csv'
jira_data = pd.read_csv(file_path)[useful_columns]

# Missing free-text values become empty strings so the string-based cleaning
# applied to the summary/description columns never receives NaN.
for text_column in ('fields_summary', 'fields_description', 'fields_labels'):
    jira_data[text_column] = jira_data[text_column].fillna('')

# Compiled once: matches every character that is neither a lowercase ASCII
# letter nor whitespace.
_NON_ALPHA_PATTERN = re.compile(r'[^a-z\s]')

# Filled on first use so the stop-word corpus is read at most once per session
# (and only after the NLTK data has been downloaded).
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Normalise a piece of free text for the downstream models.

    The text is lower-cased, every character other than ``a``-``z`` and
    whitespace is removed, the remainder is tokenised with
    ``word_tokenize`` and English stop words are dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    cleaned = _NON_ALPHA_PATTERN.sub('', text.lower())
    # Look the stop words up in a cached set: the previous version re-read the
    # stop-word list for every single token, which dominated the run time.
    stop_words = _english_stopwords()
    tokens = [word for word in word_tokenize(cleaned) if word not in stop_words]
    return ' '.join(tokens)

# Clean both free-text columns in place with preprocess_text.
for text_column in ('fields_summary', 'fields_description'):
    jira_data[text_column] = jira_data[text_column].apply(preprocess_text)

# Model input: cleaned summary followed by the cleaned description,
# separated by a single space.
summary_with_separator = jira_data['fields_summary'] + " "
jira_data['text_combined'] = summary_with_separator + jira_data['fields_description']

# Persist the cleaned frame (without the index column) for later reuse.
jira_data.to_csv('preprocessed_jira_data.csv', index=False)

#Bug Classification Task

In [None]:
# One seed for every random number generator this notebook touches.
SEED = 42
random.seed(SEED)  # Python's own RNG was previously left unseeded
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def preprocess_data(X, y, tokenizer, max_length=128, label_to_id=None):
    """Tokenise texts and convert their labels to integer ids.

    Args:
        X: Series-like object of texts; must provide ``tolist()``.
        y: Series-like object of labels aligned with ``X``; must provide
            ``unique()`` when ``label_to_id`` is not supplied.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=...)``
            and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'``.
        max_length: Maximum sequence length passed to the tokenizer.
        label_to_id: Optional existing ``label -> id`` mapping. Pass the
            mapping obtained from the training split when encoding any other
            split; otherwise each call numbers the labels in the order they
            first appear in *its own* ``y``, and the ids of two splits need
            not agree.

    Returns:
        A tuple ``(features, label_to_id)`` where ``features`` is a dict with
        the keys ``'input_ids'``, ``'attention_mask'`` and ``'labels'``, and
        ``label_to_id`` is the mapping that was used.

    Raises:
        ValueError: If ``label_to_id`` is supplied and ``y`` contains a label
            that is missing from it.
    """
    if label_to_id is None:
        label_to_id = {label: idx for idx, label in enumerate(y.unique())}
    else:
        # Fail with a readable message instead of a bare KeyError further down.
        unknown = [label for label in dict.fromkeys(y) if label not in label_to_id]
        if unknown:
            raise ValueError(f"Labels missing from label_to_id: {unknown}")

    encodings = tokenizer(
        X.tolist(),
        truncation=True,
        padding=True,
        max_length=max_length
    )

    labels = [label_to_id[label] for label in y]

    return {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels
    }, label_to_id

# Inputs (cleaned text) and targets (issue type) for the classification task.
X_classification = jira_data['text_combined']
y_classification = jira_data['fields_issuetype_name']

X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_classification, y_classification, test_size=0.2, random_state=42
)

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# A single label -> id mapping for the whole task, covering every issue type in
# the data. preprocess_data numbers labels in order of first appearance within
# the split it is given, so letting each split build its own mapping can assign
# different ids to the same issue type in the train and test sets.
label_to_id = {label: idx for idx, label in enumerate(y_classification.unique())}

train_encodings, _ = preprocess_data(X_train_class, y_train_class, tokenizer)
test_encodings, _ = preprocess_data(X_test_class, y_test_class, tokenizer)

# Re-encode both splits with the shared mapping so their label ids agree with
# each other and with the model's id2label table below.
train_encodings['labels'] = [label_to_id[label] for label in y_train_class]
test_encodings['labels'] = [label_to_id[label] for label in y_test_class]

train_dataset = Dataset.from_dict(train_encodings)
test_dataset = Dataset.from_dict(test_encodings)

num_labels = len(label_to_id)

# Load BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label={v: k for k, v in label_to_id.items()},
    label2id=label_to_id
).to(device)

# Hyper-parameters for fine-tuning the issue-type classifier, collected in one
# dict and expanded into TrainingArguments.
classification_training_config = {
    'output_dir': './results',
    'num_train_epochs': 3,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 64,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_dir': './logs',
    'logging_steps': 10,
    'evaluation_strategy': "epoch",
}
training_args = TrainingArguments(**classification_training_config)

# The Trainer evaluates on the held-out test split once per epoch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the classifier.
trainer.train()

# Prediction function
def predict_bert(texts, model, tokenizer, device, batch_size=32, max_length=None):
    """Predict a label name for each text with a sequence-classification model.

    Args:
        texts: A list of strings (a single string is treated as a one-item
            list).
        model: Model whose call returns an object with a ``logits`` attribute
            and whose ``config.id2label`` maps class ids to label names.
        tokenizer: Tokenizer called with ``truncation=True``, ``padding=True``
            and ``return_tensors='pt'``; its result must support ``.to(device)``.
        device: Device the encoded batch is moved to before the forward pass.
        batch_size: Number of texts per forward pass. Processing in batches
            keeps memory bounded instead of pushing the whole input through
            the model at once.
        max_length: Optional truncation length forwarded to the tokenizer;
            ``None`` keeps the tokenizer's own default.

    Returns:
        A list with one predicted label name per input text, in input order.
        An empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if isinstance(texts, str):
        # Slicing a bare string below would split it into characters.
        texts = [texts]

    tokenizer_kwargs = {'truncation': True, 'padding': True, 'return_tensors': 'pt'}
    if max_length is not None:
        tokenizer_kwargs['max_length'] = max_length

    model.eval()
    predicted_labels = []

    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            encodings = tokenizer(batch, **tokenizer_kwargs).to(device)
            logits = model(**encodings).logits
            batch_ids = torch.argmax(logits, dim=1).tolist()
            predicted_labels.extend(model.config.id2label[class_id] for class_id in batch_ids)

    return predicted_labels

# Predict an issue type for every held-out text ...
test_texts = X_test_class.tolist()
y_pred_class = predict_bert(test_texts, model, tokenizer, device)

# ... and score the predictions against the true labels (dict form, so the
# numbers can be tabulated afterwards).
classification_metrics = classification_report(
    y_test_class,
    y_pred_class,
    output_dict=True,
)

def print_classification_metrics(metrics):
    """Print a classification report dict as a formatted table.

    Args:
        metrics: Mapping with one entry per class (each holding
            ``'precision'``, ``'recall'``, ``'f1-score'`` and ``'support'``)
            plus the summary entries ``'macro avg'``, ``'weighted avg'`` and
            ``'accuracy'``.

    The table lists the per-class rows first, then a separator row, the macro
    and weighted averages, and finally the overall accuracy.
    """
    summary_keys = ('accuracy', 'macro avg', 'weighted avg')

    def metric_row(label, stats):
        # One table row: label, three scores to 3 decimals, integer support.
        return [
            label,
            f"{stats['precision']:.3f}",
            f"{stats['recall']:.3f}",
            f"{stats['f1-score']:.3f}",
            int(stats['support']),
        ]

    table = PrettyTable()
    table.field_names = ["Class", "Precision", "Recall", "F1-Score", "Support"]

    # Per-class rows, in the order they appear in the report.
    for class_name, class_metrics in metrics.items():
        if class_name in summary_keys:
            continue
        table.add_row(metric_row(class_name, class_metrics))

    # Visual separator between the per-class rows and the summary rows.
    table.add_row(['-' * 10] * 5)

    table.add_row(metric_row('Macro Avg', metrics['macro avg']))
    table.add_row(metric_row('Weighted Avg', metrics['weighted avg']))

    # Overall accuracy is a single number, shown in the F1-Score column.
    accuracy_cell = f"{metrics['accuracy']:.3f}"
    table.add_row(['Accuracy', '', '', accuracy_cell, ''])

    print("Classification Metrics:")
    print(table)

# Show the per-class and summary scores for the issue-type classifier.
print_classification_metrics(classification_metrics)

# Persist the fine-tuned model and its tokenizer, each to its own directory.
for artifact, target_dir in (
    (model, './bert_classification_model'),
    (tokenizer, './bert_classification_tokenizer'),
):
    artifact.save_pretrained(target_dir)

# Store the label -> id mapping next to them so predictions can be decoded later.
with open('label_mapping.pkl', 'wb') as mapping_file:
    pickle.dump(label_to_id, mapping_file)

#Bug Prioritization Task

In [None]:
# Inputs (cleaned text) and targets (priority name) for the prioritisation task.
X_priority = jira_data['text_combined']
y_priority = jira_data['fields_priority_name']

# Turn the priority names into integer class ids.
label_encoder_priority = LabelEncoder()
y_priority_encoded = label_encoder_priority.fit_transform(y_priority)

# First cut: 70% train, 30% held out, stratified on the encoded priority.
first_split = train_test_split(
    X_priority,
    y_priority_encoded,
    test_size=0.3,
    random_state=42,
    stratify=y_priority_encoded,
)
X_train_priority, X_temp_priority, y_train_priority, y_temp_priority = first_split

# Second cut: halve the held-out part into validation and test sets.
second_split = train_test_split(
    X_temp_priority,
    y_temp_priority,
    test_size=0.5,
    random_state=42,
    stratify=y_temp_priority,
)
X_val_priority, X_test_priority, y_val_priority, y_test_priority = second_split

class BugPriorityDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset that tokenises one bug text per item.

    The base class is spelled out as ``torch.utils.data.Dataset`` on purpose:
    the bare name ``Dataset`` in this file is imported from the Hugging Face
    ``datasets`` package, which is an Arrow-table wrapper rather than the
    simple ``__len__``/``__getitem__`` protocol this class implements, and
    this class never initialises that wrapper's internal state.

    Args:
        texts: Indexable collection of strings.
        labels: Indexable collection of integer class ids aligned with
            ``texts``.
        tokenizer: Callable invoked with ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors="pt"`` that returns a mapping
            containing ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded example at position ``idx``.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer's tensors with their leading batch dimension removed)
            and ``'labels'`` (a ``torch.long`` scalar tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        return {
            # return_tensors="pt" adds a batch dimension of 1; drop it so the
            # data loader can stack the items itself.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Loading the tokenizer and datasets
tokenizer_priority = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset_priority = BugPriorityDataset(X_train_priority.tolist(), y_train_priority, tokenizer_priority, max_len=128)
val_dataset_priority = BugPriorityDataset(X_val_priority.tolist(), y_val_priority, tokenizer_priority, max_len=128)
test_dataset_priority = BugPriorityDataset(X_test_priority.tolist(), y_test_priority, tokenizer_priority, max_len=128)

# Same device selection as the classification task: use the GPU when present,
# fall back to the CPU otherwise. A hard-coded 'cuda' fails outright on
# machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Computing the class weights for imbalanced datasets
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_priority),
    y=y_train_priority
)
# Kept on the model's device because the weighted loss combines them with the logits.
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

model_priority = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder_priority.classes_)
)
model_priority.to(device)

# Class-weighted cross-entropy used for the priority model
def custom_loss_function(outputs, labels):
    """Return the cross-entropy of ``outputs.logits`` against ``labels``.

    Each class is weighted by the module-level ``class_weights`` tensor.

    Args:
        outputs: Model output object exposing a ``logits`` attribute.
        labels: Target class ids.

    Returns:
        The weighted cross-entropy loss tensor.
    """
    return torch.nn.functional.cross_entropy(
        outputs.logits,
        labels,
        weight=class_weights,
    )

# Trainer subclass that swaps in the class-weighted loss
class CustomTrainer(Trainer):
    """Trainer whose loss is computed by ``custom_loss_function``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        The ``"labels"`` entry is removed from ``inputs`` before the forward
        pass, so the model receives only the remaining entries. Extra keyword
        arguments (such as ``num_items_in_batch``) are accepted and ignored.

        Args:
            model: The model to run.
            inputs: Batch dict that contains a ``"labels"`` entry.
            return_outputs: When true, also return the model outputs.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.pop("labels")
        model_outputs = model(**inputs)
        weighted_loss = custom_loss_function(model_outputs, targets)
        if return_outputs:
            return weighted_loss, model_outputs
        return weighted_loss

# Hyper-parameters for fine-tuning the priority model, collected in one dict
# and expanded into TrainingArguments.
priority_training_config = {
    'output_dir': './results_priority',
    # Evaluate and checkpoint once per epoch.
    'evaluation_strategy': "epoch",
    'save_strategy': "epoch",
    'logging_dir': './logs_priority',
    'learning_rate': 2e-5,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 16,
    'num_train_epochs': 20,  # Increased epochs for better learning
    'weight_decay': 0.01,
    # Reload the best checkpoint once training finishes.
    'load_best_model_at_end': True,
    'report_to': 'none',
}
training_args_priority = TrainingArguments(**priority_training_config)

def compute_priority_metrics(eval_prediction):
    """Return accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_prediction: Object exposing ``predictions`` (per-class scores,
            one row per example) and ``label_ids`` (true class ids).

    Returns:
        A dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'``
        and ``'f1'``.
    """
    # Derive the predicted ids and the weighted scores once; the earlier inline
    # lambda repeated the argmax four times and the scoring call three times.
    predicted_ids = np.argmax(eval_prediction.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        eval_prediction.label_ids, predicted_ids, average='weighted'
    )
    return {
        'accuracy': accuracy_score(eval_prediction.label_ids, predicted_ids),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


# Initializing the custom trainer
trainer_priority = CustomTrainer(
    model=model_priority,
    args=training_args_priority,
    train_dataset=train_dataset_priority,
    eval_dataset=val_dataset_priority,
    tokenizer=tokenizer_priority,
    compute_metrics=compute_priority_metrics
)

# Fine-tune the priority model.
trainer_priority.train()

# Metrics on the trainer's evaluation dataset (the validation split).
results_priority = trainer_priority.evaluate()
print("Priority Task Metrics:", results_priority)

# Metrics on the untouched test split.
test_results = trainer_priority.predict(test_dataset_priority)
print("Test Set Metrics:", test_results.metrics)

# Model and tokenizer share one output directory; the fitted label encoder is
# pickled into the same directory afterwards.
for artifact in (model_priority, tokenizer_priority):
    artifact.save_pretrained('./bert_priority_model')
with open('./bert_priority_model/label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder_priority, encoder_file)

#Team Assignment Task

In [None]:
# Sentence-embedding model used to compare bug texts with team descriptions.
semantic_model = SentenceTransformer('all-mpnet-base-v2')

# One free-text responsibility profile per team.
team_descriptions = {
    "UI": "Handles frontend, user interface design, interaction problems, CSS, HTML issues, and component rendering errors.",
    "Backend": "Responsible for server-side logic, API endpoints, database operations, backend crashes, latency, and authentication failures.",
    "DevOps": "Focuses on infrastructure setup, CI/CD pipelines, cloud deployments, system configuration, containerization issues, and monitoring failures.",
    "QA": "Manages software testing, test automation, bug reporting, regression testing, performance issues, and quality assurance processes."
}

# Embed every description once up front so assigning a bug only has to embed
# the bug text itself.
team_embeddings = {}
for team_name, team_profile in team_descriptions.items():
    team_embeddings[team_name] = semantic_model.encode(team_profile, convert_to_tensor=True)

# Assign bug to team using semantic cosine similarity
def assign_bug_to_team(text):
    """Pick the team whose description is semantically closest to a bug text.

    The text and up to two synonym-augmented variants of it are embedded, the
    embeddings are averaged, and the average is compared with every entry of
    ``team_embeddings`` by cosine similarity.

    Args:
        text: The bug description.

    Returns:
        A tuple ``(best_team, similarities)``: the name of the team with the
        highest similarity and a dict mapping every team name to its score.
    """
    aug = naw.SynonymAug(aug_src='wordnet', aug_max=3)
    augmented = aug.augment(text, n=2)
    # Normalise the augmenter's result to a list: a bare string would
    # otherwise be spread into single characters, and None cannot be
    # concatenated at all.
    if augmented is None:
        augmented = []
    elif isinstance(augmented, str):
        augmented = [augmented]
    augmented_texts = [text, *augmented]

    # Encode all variants in one batched call, then average across variants.
    variant_embeddings = semantic_model.encode(augmented_texts, convert_to_tensor=True)
    combined_embedding = torch.mean(variant_embeddings, dim=0)

    similarities = {
        team: util.cos_sim(combined_embedding, team_embedding).item()
        for team, team_embedding in team_embeddings.items()
    }

    # Assign the team with the highest similarity score
    best_team = max(similarities, key=similarities.get)
    return best_team, similarities

#Function to process all 3 tasks

In [None]:
# Load the saved BERT model, tokenizer, and label mapping
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pickle

# Use the GPU when one is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Restore the saved tokenizer and fine-tuned classifier, then move the model
# to the selected device.
tokenizer_classification = BertTokenizer.from_pretrained('./bert_classification_tokenizer')
model_classification = BertForSequenceClassification.from_pretrained('./bert_classification_model')
model_classification = model_classification.to(device)

# NOTE(review): pickle executes code on load; only open files this notebook wrote itself.
with open('label_mapping.pkl', 'rb') as mapping_file:
    label_to_id = pickle.load(mapping_file)

# Reverse lookup (id -> label) for decoding predictions.
id_to_label = dict(zip(label_to_id.values(), label_to_id.keys()))

def classify_bug_with_bert(text):
    """Return the issue-type label the reloaded BERT classifier predicts for ``text``.

    The text is padded or truncated to 128 tokens, run through
    ``model_classification`` without gradient tracking, and the highest-scoring
    class id is translated through ``id_to_label``.
    """
    model_classification.eval()

    encoded = tokenizer_classification(
        text,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        logits = model_classification(**encoded).logits

    best_id = torch.argmax(logits, dim=1).item()
    return id_to_label[best_id]

In [None]:
def process_bug(text):
    """Run all three tasks on one bug description.

    Args:
        text: The bug description.

    Returns:
        A tuple ``(classification, priority, assigned_team)`` where
        ``classification`` is the predicted issue type, ``priority`` the
        predicted priority name, and ``assigned_team`` the
        ``(best_team, similarities)`` pair returned by ``assign_bug_to_team``.
    """
    # Classification
    classification = classify_bug_with_bert(text)

    # Priority
    inputs = tokenizer_priority(
        text,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    # Follow the model to whichever device it lives on instead of assuming a GPU.
    target_device = model_priority.device
    inputs = {key: val.to(target_device) for key, val in inputs.items()}
    # Inference only: disable dropout and skip building the autograd graph.
    model_priority.eval()
    with torch.no_grad():
        outputs = model_priority(**inputs)
    predicted_label = torch.argmax(outputs.logits, dim=1).item()
    priority = label_encoder_priority.inverse_transform([predicted_label])[0]

    # Team Assignment
    assigned_team = assign_bug_to_team(text)
    return classification, priority, assigned_team

#Testing the model

In [None]:
# Run the full pipeline on one already-cleaned sample description.
bug_description = "allow copy selected lines diff like copy lines diff preview eg use changelog notes commit message"
bug_result = process_bug(bug_description)
classification, priority, assigned_team = bug_result
# assigned_team is a (team name, similarity scores) pair; only the name is printed.
print(f"Classification: {classification}, Priority: {priority}, Assigned Team: {assigned_team[0]}")