In [1]:
import logging
import sys
import json
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score,roc_curve, auc, confusion_matrix,classification_report

def read_answers(filename):
    """Load ground-truth labels from a JSON Lines file.

    Each non-blank line must be a JSON object with the keys 'idx' and
    'target'.

    Args:
        filename: path of the JSON Lines file to read.

    Returns:
        dict mapping each line's 'idx' value to its 'target' value.
    """
    answers={}
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            line=line.strip()
            if not line:
                # Blank / whitespace-only lines (e.g. a trailing empty line)
                # are not JSON documents; skip them instead of crashing.
                continue
            js=json.loads(line)
            answers[js['idx']]=js['target']
    return answers

def read_predictions(filename):
    """Load predicted labels from a whitespace-separated text file.

    Each non-blank line must hold exactly two fields: an integer index and an
    integer label.

    Args:
        filename: path of the predictions file to read.

    Returns:
        dict mapping int index -> int label.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields or a
            field is not an integer.
    """
    predictions={}
    with open(filename) as f:
        for line in f:
            fields=line.split()
            if not fields:
                # Blank lines carry no prediction; unpacking them would raise.
                continue
            idx,label=fields
            predictions[int(idx)]=int(label)
    return predictions

def read_predictions_prob(filename):
    """Load prediction scores from a whitespace-separated text file.

    Each non-blank line must hold exactly two fields: an integer index and a
    value parseable by float().

    Args:
        filename: path of the scores file to read.

    Returns:
        dict mapping int index -> float score.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields or a
            field cannot be converted.
    """
    predictions_prob={}
    with open(filename) as f:
        for line in f:
            fields=line.split()
            if not fields:
                # Blank lines carry no score; unpacking them would raise.
                continue
            idx,prob=fields
            predictions_prob[int(idx)]=float(prob)
    return predictions_prob

def calculate_scores(answers,predictions,predictions_prob):
    """Compute and print binary-classification metrics for one set of predictions.

    The three inputs are joined on the indices found in `answers`, so the
    label, prediction and score of every example stay aligned regardless of
    the order (or extra entries) of the prediction dicts.

    Args:
        answers: dict mapping example index -> ground-truth label.
        predictions: dict mapping example index -> predicted label.
        predictions_prob: dict mapping example index -> score given to
            roc_curve.

    Returns:
        A tuple (results, sorted_zip, Pred_prob):
          * results: [AUC, accuracy, precision, recall, F1], each multiplied
            by 100.
          * sorted_zip: list of (answer, prediction, score) triples sorted by
            score, highest first.
          * Pred_prob: list of scores in the iteration order of `answers`.

    Side effects:
        Prints the confusion matrix, a header line and the results list.
        If an index of `answers` is missing from `predictions` or
        `predictions_prob`, logs an error and calls sys.exit().
    """
    Ans=[]
    Pred=[]
    Pred_prob=[]
    # Walk the answer indices once and look every value up by key; iterating
    # each dict separately would pair values purely by position.
    for key in answers:
        if key not in predictions:
            logging.error("Missing prediction for index {}.".format(key))
            sys.exit()
        if key not in predictions_prob:
            logging.error("Missing prediction probability for index {}.".format(key))
            sys.exit()
        Ans.append(answers[key])
        Pred.append(predictions[key])
        Pred_prob.append(predictions_prob[key])
    results = []
    fpr, tpr, _ = roc_curve(Ans, Pred_prob)
    results.append(auc(fpr, tpr)*100)
    results.append(accuracy_score(Ans,Pred)*100)
    results.append(precision_score(Ans,Pred,zero_division=0)*100)
    # zero_division=0 matches precision/F1 above: same value, no warning.
    results.append(recall_score(Ans,Pred,zero_division=0)*100)
    results.append(f1_score(Ans,Pred,zero_division=0)*100)
    zipped_result = zip(Ans,Pred,Pred_prob)
    # Rank examples from the highest score to the lowest.
    sorted_zip = sorted(zipped_result, key=lambda x: x[2],reverse=True)
    print(confusion_matrix(Ans,Pred))
    print("AUC, ACCURACY, PRECISION, RECALL, F1")
    print(results)
    print("END of calculate scores")
    return results,sorted_zip,Pred_prob

def read_output(test_dir,result_dir):
    """Load the labels and predictions of one run and score them.

    Args:
        test_dir: path of the JSON Lines file with the ground-truth labels
            (despite the name, it is passed directly to read_answers as a
            file path).
        result_dir: directory containing 'predictions.txt' and
            'predictions_prob.txt'; a trailing path separator is optional.

    Returns:
        The (results, sorted_zip, Pred_prob) tuple produced by
        calculate_scores.
    """
    import os  # local import: this block does not rely on a file-level `import os`

    answers=read_answers(test_dir) #+'test.jsonl'
    # os.path.join gives the same path as plain concatenation when result_dir
    # ends with a separator, and inserts one when it does not.
    predictions=read_predictions(os.path.join(result_dir, 'predictions.txt'))
    predictions_prob = read_predictions_prob(os.path.join(result_dir, 'predictions_prob.txt'))
    scores,sorted_zip, Pred_prob=calculate_scores(answers,predictions,predictions_prob)
    return scores,sorted_zip,Pred_prob

In [2]:
# MUST SELECT BEST MODEL and RUN TEST BEFORE RUNNING THIS
# Scores the saved predictions of each listed experiment against the Big-Vul
# test set; read_output prints the confusion matrix and the metric list.

# Dataset header: the output recorded for this cell begins with this line, and
# the ReVeal cell prints an equivalent header before its loop.
print("BIGVul Testing set")
result_list = [] 
test_dir = "$DIRECTORY_TO_BIGVUL_TEST_SET_IN_JSONL_FORMAT e.g ./bigvul_test.jsonl"
for i in range(44,45):  # experiment numbers to evaluate (currently only 44)
    try:
        result_dir = f'linevul/origin/saved_models_llm_{i}/'
        print(result_dir)
        result, sorted_zip, pred_prob = read_output(test_dir,result_dir)
    except Exception as e:
        # Best effort: report the failing experiment and continue with the next.
        # A sys.exit() raised inside calculate_scores is not caught here.
        print('error',i)
        print(e)
        # break

BIGVul Testing set
linevul/origin/saved_models_llm_44/
[[7918 9867]
 [ 349  730]]
AUC, ACCURACY, PRECISION, RECALL, F1
[56.388989273848935, 45.84393553859203, 6.888742096819854, 67.65523632993512, 12.504282288454949]
END of calculate scores


In [3]:
# MUST SELECT BEST MODEL and RUN TEST BEFORE RUNNING THIS
# Scores the saved predictions of each listed experiment against the ReVeal
# data set; read_output prints the confusion matrix and the metric list.
print("REVEL TESTING SET")
result_list = []
test_dir = "$DIRECTORY_TO_REVEAL_TRANSFORMED_INTO_JSONL -> example: ../reveal.jsonl"

# Experiment numbers to evaluate -> 44 was the extension without the retriever.
for i in range(44, 45):
    result_dir = f'linevul/origin/saved_models_llm_{i}/'
    print(result_dir)
    try:
        result, sorted_zip, pred_prob = read_output(test_dir, result_dir)
    except Exception as e:
        # Best effort: report the failing experiment and move on to the next.
        print('error', i)
        print(e)

REVEL TESTING SET
linevul/origin/saved_models_llm_44/
[[16674  2719]
 [  747   354]]
AUC, ACCURACY, PRECISION, RECALL, F1
[64.40774977422166, 83.08773299502293, 11.519687601692159, 32.15258855585831, 16.962146621945376]
END of calculate scores
