In [3]:
import os
import pickle
import random
import numpy as np
import pandas as pd
from sklearn import svm
from typing import Dict
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

import glob
import torch
import wget
import zipfile
from tensorflow.keras.preprocessing.text import Tokenizer

from gensim.models.keyedvectors import KeyedVectors
from gensim.models.fasttext import FastText

from mittens import GloVe
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import load_model
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from setfit import SetFitModel, SetFitTrainer
from datasets import load_dataset, logging
logging.set_verbosity_error()

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import XLNetForSequenceClassification, RobertaForSequenceClassification
from transformers import XLMRobertaForSequenceClassification, DistilBertForSequenceClassification
from transformers import RobertaTokenizer, XLMRobertaTokenizer, DistilBertTokenizer, XLNetTokenizer

### Common Helper Functions

In [4]:
def get_avg_report(results, folds):
    """
    Average the macro and weighted precision, recall and F1 over several folds.

    Parameters
    ----------
    results : iterable of pandas.DataFrame
        One frame per fold, as produced by transposing the dictionary output
        of ``classification_report``. Each frame must contain the rows
        'macro avg' and 'weighted avg' and the columns 'precision',
        'recall' and 'f1-score'.
    folds : int
        Divisor applied to the per-fold sums. It is used exactly as given and
        is not compared with the number of frames in ``results``.

    Returns
    -------
    tuple
        ``(weighted_average, macro_average)``; each element is a
        ``(precision, recall, f1)`` tuple rounded to two decimals.

    Raises
    ------
    KeyError
        If a frame lacks one of the required rows or columns.
    ZeroDivisionError
        If ``folds`` is 0.
    """
    # Materialise once so a generator argument can be scanned per metric.
    reports = list(results)
    metric_columns = ('precision', 'recall', 'f1-score')

    def _fold_average(row_label):
        # Rows are looked up by label, not by position, so the summary rows
        # are found wherever they sit inside the report frame.
        return tuple(
            round(sum(float(report.loc[row_label, column]) for report in reports) / folds, 2)
            for column in metric_columns
        )

    weighted_average = _fold_average('weighted avg')
    macro_average = _fold_average('macro avg')

    return weighted_average, macro_average

def get_accuracy(y_actual, y_predicted):
    """
    Return the fraction of positions at which the predicted label equals the
    actual label, rounded to two decimals.

    The labels are compared pairwise in order and the number of matches is
    divided by the length of ``y_actual``.
    """
    hits = sum(
        1
        for actual, predicted in zip(y_actual, y_predicted)
        if actual == predicted
    )
    return round(hits / len(y_actual), 2)


### ML algorithms Pipeline

In [10]:
def load_ML_model_files(model_name, model_path, pca):
    """
    Load the pickled artifacts of one traditional ML model.

    Parameters
    ----------
    model_name : str
        File stem of the model pickle (``<model_name>.pickle``).
    model_path : str
        Folder holding the pickles. File names are appended to it directly,
        so it must end with a path separator.
    pca : bool
        When truthy, ``pca_vectorizer.pickle`` is loaded as well.

    Returns
    -------
    tuple
        ``(ML_model, pca_vectorizer, tfidf_vectorizer)``;
        ``pca_vectorizer`` is ``None`` when ``pca`` is falsy.

    Raises
    ------
    OSError
        If one of the expected pickle files cannot be opened.
    """

    def _load_pickle(file_name):
        # NOTE(security): unpickling can execute arbitrary code, so only
        # point this at artifacts from a trusted source.
        # The context manager closes the file handle once loading is done.
        with open(model_path + file_name, 'rb') as handle:
            return pickle.load(handle)

    ML_model = _load_pickle(model_name + '.pickle')
    pca_vectorizer = _load_pickle('pca_vectorizer.pickle') if pca else None
    tfidf_vectorizer = _load_pickle('tfidf_vectorizer.pickle')

    return ML_model, pca_vectorizer, tfidf_vectorizer

In [11]:
# locate the per-fold folders of the test dataset
fold_parent = './data/dronology_five_folds/'

# keep every entry of fold_parent whose name contains 'fold'
sub_folders = [
    os.path.join(fold_parent, folder)
    for folder in os.listdir(fold_parent)
    if 'fold' in folder
]
        

In [77]:
# Replace the value of 'model_name' with the name of the desired traditional ML model to get its results.
# More model names are listed in: model/ML_models (examples: DT, SVM, pLR).
# Put 'p' in front of the model name to couple our pre-processing pipeline.
model_name = 'SVM'
# NOTE(review): this flag reuses the name of the PCA class imported from
# sklearn.decomposition, so that class is no longer reachable as `PCA`
# once this cell has run.
PCA = True
# keys 0/1 -> label names that are passed as target_names to classification_report
map_labels = {0: 'information', 1: 'requirement'}

In [78]:
# load test data & make prediction

# Root folder of the pickled traditional ML models, relative to the working
# directory in the same way as the dataset folder.
ml_models_root = './models/ML_models/'

ml_results = []
avg_accuracy = []

# Fold folders are visited in sorted order and numbered from 1.
for fold_count, subs in enumerate(sorted(sub_folders), start=1):
    test_path = subs + '/test_' + 'fold_' + str(fold_count) + '.csv'

    df_test = pd.read_csv(test_path)
    df_test['STR.REQ'] = df_test['STR.REQ'].str.lower()
    X_test = df_test['STR.REQ']
    y_test = df_test['class']

    # The trailing '' keeps the final path separator, which
    # load_ML_model_files needs because it appends file names directly.
    model_path = os.path.join(ml_models_root, model_name, 'fold_' + str(fold_count), '')
    ML_model, pca_vectorizer, tfidf_vectorizer = load_ML_model_files(model_name, model_path, PCA)

    tfidf_vecs = tfidf_vectorizer.transform(X_test)
    normalized_tfidf = normalize(tfidf_vecs)

    test_vecs = normalized_tfidf.toarray()
    if pca_vectorizer is not None:
        test_vecs = pca_vectorizer.transform(test_vecs)
    # else: assumes models saved without PCA were fitted on the dense,
    # normalised TF-IDF matrix - TODO confirm against the training notebook
    predicted_labels = ML_model.predict(test_vecs)

    true_labels = y_test.tolist()
    predicted_list = predicted_labels.tolist()

    evaluation_results = classification_report(true_labels, predicted_list,
                                               target_names=list(map_labels.values()),
                                               output_dict=True)

    avg_accuracy.append(get_accuracy(true_labels, predicted_list))

    report_df = pd.DataFrame(evaluation_results).transpose()
    ml_results.append(report_df)

    print('\nResults for dataset fold number :',fold_count, 'on model :', model_name)
    print('\n',report_df)
    print('--------------------------------------')




Results for dataset fold number : 1 on model : SVM

               precision    recall  f1-score    support
information    0.823529  1.000000  0.903226  56.000000
requirement    1.000000  0.400000  0.571429  20.000000
accuracy       0.842105  0.842105  0.842105   0.842105
macro avg      0.911765  0.700000  0.737327  76.000000
weighted avg   0.869969  0.842105  0.815911  76.000000
--------------------------------------

Results for dataset fold number : 2 on model : SVM

               precision    recall  f1-score    support
information    0.787879  0.928571  0.852459  56.000000
requirement    0.600000  0.300000  0.400000  20.000000
accuracy       0.763158  0.763158  0.763158   0.763158
macro avg      0.693939  0.614286  0.626230  76.000000
weighted avg   0.738437  0.763158  0.733391  76.000000
--------------------------------------

Results for dataset fold number : 3 on model : SVM

               precision    recall  f1-score  support
information    0.768116  0.963636  0.854839    

  tfidf_vectorizer = pickle.load(open(model_path + 'tfidf_vectorizer.pickle', "rb"))
  tfidf_vectorizer = pickle.load(open(model_path + 'tfidf_vectorizer.pickle', "rb"))
  tfidf_vectorizer = pickle.load(open(model_path + 'tfidf_vectorizer.pickle', "rb"))
  tfidf_vectorizer = pickle.load(open(model_path + 'tfidf_vectorizer.pickle', "rb"))
  tfidf_vectorizer = pickle.load(open(model_path + 'tfidf_vectorizer.pickle', "rb"))


In [14]:
# Average results of ML pipeline

# Use the number of reports actually collected, so the precision/recall/F1
# averages share their denominator with the accuracy average below.
n_folds = len(ml_results)
if n_folds == 0:
    raise ValueError('No fold results to average: run the ML evaluation cell first.')

avg_acc_score = round(np.mean(avg_accuracy), 2)
weighted_avg, macro_avg = get_avg_report(ml_results, folds=n_folds)

# accuracy is a single number, so it is repeated across the three columns
avg_scores = [weighted_avg, macro_avg, (avg_acc_score, avg_acc_score, avg_acc_score)]

final_df = pd.DataFrame(avg_scores,
                        columns=['Precision', 'Recall', 'F1_score'],
                        index=['weighted_avg', 'macro_avg', 'accuracy_avg'])

final_df.rename_axis(f'{n_folds}-folds')

Unnamed: 0_level_0,Precision,Recall,F1_score
5-folds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
weighted_avg,0.78,0.79,0.75
macro_avg,0.76,0.63,0.64
accuracy_avg,0.78,0.78,0.78


### BERT Family Pipeline

In [15]:
def load_tokenizer(model_name):
    """
    Return the pretrained tokenizer that belongs to the given BERT-family
    model name.

    Names that match none of the listed groups fall back to the
    XLM-RoBERTa tokenizer ('pXRBERT_base', 'XRBERT_base').
    """
    # (accepted model names, tokenizer class, checkpoint, extra keyword arguments)
    tokenizer_specs = (
        (('BERT_base_uncased', 'pBERT_base_uncased'),
         BertTokenizer, "bert-base-uncased", {'do_lower_case': True}),
        (('BERT_base_cased', 'pBERT_base_cased'),
         BertTokenizer, "bert-base-cased", {}),
        (('pXLNet_base', 'XLNet_base'),
         XLNetTokenizer, "xlnet-base-cased", {}),
        (('SciBERT_uncased', 'pSciBERT_uncased'),
         BertTokenizer, 'allenai/scibert_scivocab_uncased', {'do_lower_case': True}),
        (('pRoBERTa_base', 'RoBERTa_base'),
         RobertaTokenizer, "roberta-base", {}),
        (('DisBERT_base_cased', 'pDisBERT_base_cased'),
         DistilBertTokenizer, "distilbert-base-cased", {}),
        (('DisBERT_base_uncased', 'pDisBERT_base_uncased'),
         DistilBertTokenizer, "distilbert-base-uncased", {}),
    )

    for accepted_names, tokenizer_cls, checkpoint, extra_kwargs in tokenizer_specs:
        if model_name in accepted_names:
            return tokenizer_cls.from_pretrained(checkpoint, **extra_kwargs)

    # every other name: 'pXRBERT_base', 'XRBERT_base'
    return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
        

In [16]:
def load_BERT_model(model_name, model_path):
    """
    Load the sequence-classification model stored at ``model_path`` with the
    class that belongs to ``model_name``.

    Names that match none of the listed groups are loaded as XLM-RoBERTa
    ('pXRBERT_base', 'XRBERT_base').
    """
    bert_names = ('BERT_base_uncased', 'pBERT_base_cased',
                  'pBERT_base_uncased', 'BERT_base_cased',
                  'SciBERT_uncased', 'pSciBERT_uncased')
    xlnet_names = ('pXLNet_base', 'XLNet_base')
    roberta_names = ('pRoBERTa_base', 'RoBERTa_base')
    distilbert_names = ('DisBERT_base_cased', 'DisBERT_base_uncased',
                        'pDisBERT_base_cased', 'pDisBERT_base_uncased')

    # pick the model class first, then load once at the end
    if model_name in bert_names:
        model_cls = BertForSequenceClassification
    elif model_name in xlnet_names:
        model_cls = XLNetForSequenceClassification
    elif model_name in roberta_names:
        model_cls = RobertaForSequenceClassification
    elif model_name in distilbert_names:
        model_cls = DistilBertForSequenceClassification
    else:
        # 'pXRBERT_base', 'XRBERT_base'
        model_cls = XLMRobertaForSequenceClassification

    return model_cls.from_pretrained(model_path)

In [33]:
# Set 'model_name' to the BERT-family model whose results you want.
# More model names are listed in: model/BERT_family (example: BERT_base_cased).
# Put 'p' in front of the model name to couple our pre-processing pipeline.

map_labels = {0: 'information', 1: 'requirement'}

prefix = './models/BERT_family/'
model_name = 'DisBERT_base_uncased'

fold_parent = './data/dronology_five_folds/'

# keep every entry of fold_parent whose name contains 'fold'
sub_folders = [
    os.path.join(fold_parent, folder)
    for folder in os.listdir(fold_parent)
    if 'fold' in folder
]

tokenizer = load_tokenizer(model_name)
MAX_SEQ_LENGTH = 128

In [34]:
# Echo the tokenizer object so its settings appear in the cell output.
tokenizer

PreTrainedTokenizer(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [35]:
results = []
avg_accuracy = []

# Fold folders are visited in sorted order and numbered from 1.
for fold_count, subs in enumerate(sorted(sub_folders), start=1):
    test_path = subs + '/test_' + 'fold_' + str(fold_count) + '.csv'
    print(test_path)
    df_test = pd.read_csv(test_path)
    selected_test = df_test[['STR.REQ','class']]

    test_sequences = selected_test['STR.REQ'].tolist()
    true_labels = selected_test['class'].tolist()

    test_encodings = tokenizer(test_sequences, truncation=True,
                               padding=True,
                               max_length=MAX_SEQ_LENGTH,
                               return_tensors="pt")

    # Look for the saved model of this fold; a fold without one is reported
    # and skipped instead of stopping the whole run.
    model_pattern = prefix + model_name + '/fold_' + str(fold_count) + '/*'
    model_paths = glob.glob(model_pattern)
    print(model_pattern)
    if not model_paths:
        print(f"No model files found for fold {fold_count} and model {model_name}.")
        continue

    # The first match is used as the model location.
    # NOTE(review): presumably each fold folder holds exactly one entry - confirm.
    bert_model = load_BERT_model(model_name, model_paths[0])

    # inference only, so gradient tracking is switched off
    with torch.no_grad():
        logits = bert_model(**test_encodings).logits

    # logits is a torch tensor, so take the arg-max with torch rather than numpy
    predictions = torch.argmax(logits, dim=1).tolist()
    evaluation_results = classification_report(true_labels,
                                               predictions,
                                               target_names=list(map_labels.values()),
                                               output_dict=True)

    avg_accuracy.append(get_accuracy(true_labels, predictions))

    report_df = pd.DataFrame(evaluation_results).transpose()
    results.append(report_df)

    print('\nResults for dataset fold number :',fold_count, 'on model :', model_name)
    print('\n',report_df)
    print('--------------------------------------')

./data/dronology_five_folds/fold_1/test_fold_1.csv
./models/BERT_family/DisBERT_base_uncased/fold_1/*

Results for dataset fold number : 1 on model : DisBERT_base_uncased

               precision    recall  f1-score    support
information    0.981132  0.928571  0.954128  56.000000
requirement    0.826087  0.950000  0.883721  20.000000
accuracy       0.934211  0.934211  0.934211   0.934211
macro avg      0.903610  0.939286  0.918925  76.000000
weighted avg   0.940331  0.934211  0.935600  76.000000
--------------------------------------
./data/dronology_five_folds/fold_2/test_fold_2.csv
./models/BERT_family/DisBERT_base_uncased/fold_2/*

Results for dataset fold number : 2 on model : DisBERT_base_uncased

               precision    recall  f1-score    support
information    0.833333  0.892857  0.862069  56.000000
requirement    0.625000  0.500000  0.555556  20.000000
accuracy       0.789474  0.789474  0.789474   0.789474
macro avg      0.729167  0.696429  0.708812  76.000000
weighted a

In [36]:
# Average results of BERT model

# Use the number of reports actually collected, so the precision/recall/F1
# averages share their denominator with the accuracy average below.
n_folds = len(results)
if n_folds == 0:
    raise ValueError('No fold results to average: no BERT model was evaluated.')

avg_acc_score = round(np.mean(avg_accuracy), 2)
weighted_avg, macro_avg = get_avg_report(results, folds=n_folds)

# accuracy is a single number, so it is repeated across the three columns
avg_scores = [weighted_avg, macro_avg, (avg_acc_score, avg_acc_score, avg_acc_score)]

final_df = pd.DataFrame(avg_scores,
                        columns=['Precision', 'Recall', 'F1_score'],
                        index=['weighted_avg', 'macro_avg', 'accuracy_avg'])

final_df.rename_axis(f'{n_folds}-folds')

Unnamed: 0_level_0,Precision,Recall,F1_score
5-folds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
weighted_avg,0.85,0.86,0.85
macro_avg,0.81,0.81,0.8
accuracy_avg,0.86,0.86,0.86


### Random Pipeline

In [71]:
def get_random_label(ranges):
    """
    Draw one label at random, weighted by the width of each label's range.

    Parameters
    ----------
    ranges : list
        Entries of the form ``[label, lower, upper]``. The upper bound of the
        last entry is the largest number that can be drawn.

    Returns
    -------
    The label of the first entry whose interval ``(lower, upper]`` contains
    the drawn integer, or ``None`` when no entry contains it.
    """
    # random.randint needs integer bounds; the bound may arrive as a
    # whole-number float (e.g. 100.0), which newer Python versions reject.
    upper_bound = int(ranges[-1][-1])
    draw = random.randint(1, upper_bound)

    for entry in ranges:
        lower, upper = entry[1], entry[-1]
        if lower < draw <= upper:
            return entry[0]
    return None

def get_ranges(df):
    """
    Build cumulative percentage ranges from the class distribution of ``df``
    and draw one random label for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'class' and 'issueid'.

    Returns
    -------
    ranges : list
        ``[label, lower, upper]`` per class, where the bounds are cumulative
        rounded percentages of the class frequencies.
    r_out : list
        ``[issueid, class, [random_label]]`` per row of ``df``.
    """
    total_rows = len(df)
    cumulative = 0
    ranges = []

    for label, count in df['class'].value_counts().to_dict().items():
        lower = cumulative
        # integer percentages keep the bounds usable with random.randint
        cumulative += int(round((count / total_rows) * 100))
        ranges.append([label, lower, cumulative])

    r_out = []
    # Iterate over the frame that was passed in, not over a global variable.
    for _, row in df.iterrows():
        random_label = get_random_label(ranges)
        r_out.append([row['issueid'], row['class'], [random_label]])

    return ranges, r_out

In [72]:
# locate the per-fold folders of the dataset
fold_parent = './data/dronology_five_folds/'

# keep every entry of fold_parent whose name contains 'fold'
sub_folders = [
    os.path.join(fold_parent, folder)
    for folder in os.listdir(fold_parent)
    if 'fold' in folder
]

In [73]:
# Label names are defined here as well, so this section does not depend on a
# configuration cell of another pipeline having been run first.
map_labels = {0: 'information', 1: 'requirement'}

# Fixed seed so the random baseline prints the same numbers on every run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Fold folders are visited in sorted order and numbered from 1.
for fold_count, subs in enumerate(sorted(sub_folders), start=1):

    test_path = subs + '/test_' + 'fold_' + str(fold_count) + '.csv'
    # The name test_df is kept on purpose: helper code may look it up as a global.
    test_df = pd.read_csv(test_path)
    ranges, r_out = get_ranges(test_df)

    # each r_out entry is [issueid, class, [random_label]]
    random_out = pd.DataFrame({
        'issueid': [entry[0] for entry in r_out],
        'class': [entry[1] for entry in r_out],
        'top_label': [entry[2][0] for entry in r_out],
    })
    evaluation_results = classification_report(random_out['class'], random_out['top_label'],
                                               target_names=list(map_labels.values()),
                                               output_dict=True)

    report_df = pd.DataFrame(evaluation_results).transpose()
    print('\nResults for fold number :',fold_count)
    print('\n',report_df)
    print('--------------------------------------')


Results for fold number : 1

               precision    recall  f1-score    support
information    0.760000  0.678571  0.716981  56.000000
requirement    0.307692  0.400000  0.347826  20.000000
accuracy       0.605263  0.605263  0.605263   0.605263
macro avg      0.533846  0.539286  0.532404  76.000000
weighted avg   0.640972  0.605263  0.619835  76.000000
--------------------------------------

Results for fold number : 2

               precision    recall  f1-score    support
information    0.679245  0.642857  0.660550  56.000000
requirement    0.130435  0.150000  0.139535  20.000000
accuracy       0.513158  0.513158  0.513158   0.513158
macro avg      0.404840  0.396429  0.400043  76.000000
weighted avg   0.534821  0.513158  0.523441  76.000000
--------------------------------------

Results for fold number : 3

               precision    recall  f1-score    support
information    0.740000  0.672727  0.704762  55.000000
requirement    0.280000  0.350000  0.311111  20.000000
accu