In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from nltk.translate.bleu_score import corpus_bleu
import sys
import os
import json
# The project root is taken to be the parent of the current working directory.
# It is put first on sys.path, and is reused further down to locate the data files.
# NOTE: re-running this cell inserts the same entry into sys.path again.
parent_dir = os.path.dirname(os.getcwd())
sys.path.insert(0, parent_dir)

## SUPPORT FUNCTIONS (METRICS)

In [3]:
# Prompting strategies to evaluate; each name is used as the key under which the
# generated text is looked up in a `functions_res` entry by the metric functions.
prompt_methods = ['zero-shot', 'few-shot-project', 'few-shot-bm25', 'few-shot-codeBERT', 'cot', 'critique', 'expert']

def compute_bleu(res, data, method='zero-shot'):
    """Compute and print the corpus-level BLEU score for one prompting method.

    :param res: list of result records; ``res[i]['functions_res'][j][method]``
        is the generated text for function ``j`` of record ``i``.
    :param data: list of reference records, index-aligned with ``res``;
        ``data[i]['functions'][j]['docstring']`` is the reference text.
    :param method: key selecting which generated text to score.
    :returns: the value returned by ``corpus_bleu`` over all functions.
    """
    hypotheses = []
    references = []
    for rec_idx, record in enumerate(res):
        for fn_idx, generated in enumerate(record['functions_res']):
            hypotheses.append(word_tokenize(generated[method]))
            # corpus_bleu takes a list of references per hypothesis; there is exactly one here.
            references.append([word_tokenize(data[rec_idx]['functions'][fn_idx]['docstring'])])
    bleu_score = corpus_bleu(references, hypotheses)
    print(f"BLEU Score ({method}):", bleu_score)
    return bleu_score

In [4]:
import nltk
from nltk.translate import meteor
from nltk import word_tokenize
# Fetch the NLTK resources used by this notebook (no re-download when already up to date).
# NOTE(review): presumably 'punkt_tab' backs word_tokenize and 'wordnet'/'omw-1.4'
# back the METEOR implementation — confirm against the NLTK documentation.
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('omw-1.4')
def compute_meteor(res, data, method="zero-shot"):
    """Compute and print the mean per-function METEOR score for one prompting method.

    Each generated text is scored against its single reference docstring; the
    individual scores are rounded to 4 decimals before being averaged.

    :param res: list of result records; ``res[i]['functions_res'][j][method]``
        is the generated text for function ``j`` of record ``i``.
    :param data: list of reference records, index-aligned with ``res``;
        ``data[i]['functions'][j]['docstring']`` is the reference text.
    :param method: key selecting which generated text to score.
    :returns: arithmetic mean of the rounded per-function scores.
    """
    per_function = []
    for rec_idx, record in enumerate(res):
        for fn_idx, generated in enumerate(record['functions_res']):
            hypothesis = word_tokenize(generated[method])
            target = word_tokenize(data[rec_idx]['functions'][fn_idx]['docstring'])
            per_function.append(round(meteor([target], hypothesis), 4))
    meteor_score = sum(per_function) / len(per_function)
    print(f"METEOR Score ({method}):", meteor_score)
    return meteor_score

[nltk_data] Downloading package wordnet to /home/quanvo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/quanvo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/quanvo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
def compute_rougel(res, data, method="zero-shot"):
    """Compute and print the mean per-function ROUGE-L score for one prompting method.

    Adapted from RENCOS:
    https://github.com/zhangj111/rencos/blob/master/evaluation/rouge/rouge.py

    Texts are tokenized by splitting on a single space character. Each
    per-function score is rounded to 4 decimals before averaging.

    :param res: list of result records; ``res[i]['functions_res'][j][method]``
        is the generated text for function ``j`` of record ``i``.
    :param data: list of reference records, index-aligned with ``res``;
        ``data[i]['functions'][j]['docstring']`` is the reference text.
    :param method: key selecting which generated text to score.
    :returns: arithmetic mean of the rounded per-function ROUGE-L scores.
    """
    def _lcs_length(left, right):
        """Return the length (int) of the longest common subsequence of two token lists."""
        # Keep the longer sequence on the outer axis, as the original table did.
        if len(left) < len(right):
            left, right = right, left

        # Rolling-row dynamic programme: `previous` is the row for the prior outer token.
        previous = [0] * (len(right) + 1)
        for outer_token in left:
            current = [0]
            for col, inner_token in enumerate(right, start=1):
                if outer_token == inner_token:
                    current.append(previous[col - 1] + 1)
                else:
                    current.append(max(previous[col], current[col - 1]))
            previous = current
        return previous[-1]

    def _rouge_l(candidate_text, reference_texts, beta=1.2):
        """Return the ROUGE-L F-score (float) of one candidate against its references.

        Precision and recall are each taken as the maximum over all references;
        ``beta`` weights recall relative to precision.
        """
        cand_tokens = candidate_text.split(" ")
        precisions = []
        recalls = []
        for ref_text in reference_texts:
            ref_tokens = ref_text.split(" ")
            common = _lcs_length(ref_tokens, cand_tokens)
            precisions.append(common / float(len(cand_tokens)))
            recalls.append(common / float(len(ref_tokens)))

        best_precision = max(precisions)
        best_recall = max(recalls)
        # No overlap at all: define the score as zero instead of dividing by zero.
        if best_precision == 0 or best_recall == 0:
            return 0.0
        return ((1 + beta**2) * best_precision * best_recall) / float(best_recall + beta**2 * best_precision)

    per_function = []
    for rec_idx, record in enumerate(res):
        for fn_idx, generated in enumerate(record['functions_res']):
            hypothesis = generated[method]
            target = data[rec_idx]['functions'][fn_idx]['docstring']
            per_function.append(round(_rouge_l(hypothesis, [target]), 4))
    rougel_score = sum(per_function) / len(per_function)
    print(f"ROUGE-L Score ({method}):", rougel_score)
    return rougel_score

In [6]:
import evaluate
from evaluate import load
# Load the BERTScore metric once at module level; compute_bertscore below reuses this object.
bertscore = load("bertscore")
def compute_bertscore(res, data, method="zero-shot"):
    """Compute BERTScore for one prompting method and print the mean F1.

    Uses the module-level ``bertscore`` metric object with ``lang="en"``.

    :param res: list of result records; ``res[i]['functions_res'][j][method]``
        is the generated text for function ``j`` of record ``i``.
    :param data: list of reference records, index-aligned with ``res``;
        ``data[i]['functions'][j]['docstring']`` is the reference text.
    :param method: key selecting which generated text to score.
    :returns: the full result of ``bertscore.compute`` (not just the mean F1);
        its ``'f1'`` entry holds the per-function F1 values averaged for the printout.
    """
    pairs = [
        (generated[method], data[rec_idx]['functions'][fn_idx]['docstring'])
        for rec_idx, record in enumerate(res)
        for fn_idx, generated in enumerate(record['functions_res'])
    ]
    predictions = [hypothesis for hypothesis, _ in pairs]
    references = [target for _, target in pairs]

    bert_score = bertscore.compute(predictions=predictions, references=references, lang="en")
    f1_values = bert_score['f1']
    f1 = sum(f1_values) / len(f1_values)
    print(f"BERT Score F1 ({method}):", f1)
    return bert_score

In [13]:
from sentence_transformers import SentenceTransformer, SimilarityFunction

def compute_sentencebert(res, data, method="zero-shot"):
    """Compute and print mean Sentence-BERT similarities for one prompting method.

    Generated texts and reference docstrings are embedded with the
    ``all-mpnet-base-v2`` model; each generated text is compared only with its
    own reference, and the similarities are averaged over all functions.

    The model is loaded once per call and the similarity function is switched
    on the same instance. Pairwise similarity is used so that only the N
    matching pairs are scored, not a full N x N matrix.

    :param res: list of result records; ``res[i]['functions_res'][j][method]``
        is the generated text for function ``j`` of record ``i``.
    :param data: list of reference records, index-aligned with ``res``;
        ``data[i]['functions'][j]['docstring']`` is the reference text.
    :param method: key selecting which generated text to score.
    :returns: tuple ``(euclidean_sim, cos_sim)`` of scalar tensors holding the
        mean Euclidean and mean cosine similarity.
    """
    candidate_list = []
    reference_list = []
    for rec_idx, record in enumerate(res):
        for fn_idx, generated in enumerate(record['functions_res']):
            candidate_list.append(generated[method])
            reference_list.append(data[rec_idx]['functions'][fn_idx]['docstring'])

    model = SentenceTransformer("all-mpnet-base-v2", similarity_fn_name=SimilarityFunction.COSINE)
    # Embeddings do not depend on the similarity function, so encode only once.
    candidate_embeddings = model.encode(candidate_list)
    reference_embeddings = model.encode(reference_list)

    # similarity_pairwise scores row i of the candidates against row i of the references.
    cos_sim = model.similarity_pairwise(candidate_embeddings, reference_embeddings).mean()

    # Switch the similarity function on the already-loaded model instead of reloading it.
    model.similarity_fn_name = SimilarityFunction.EUCLIDEAN
    euclidean_sim = model.similarity_pairwise(candidate_embeddings, reference_embeddings).mean()

    print(f"SentenceBert euclidean similarity ({method}):", euclidean_sim.item())
    print(f"SentenceBert cosine similarity ({method}):", cos_sim.item())
    return euclidean_sim, cos_sim
    

# JAVA

In [7]:
# Load the generated results: one JSON object per line of the .jsonl file.
path, file_name = 'data', 'res_java.jsonl'
file_path = os.path.join(parent_dir, path, file_name)
res = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            json_object = json.loads(line)
        except json.JSONDecodeError as e:
            # Report the malformed line and keep going with the rest of the file.
            print(f"Error decoding JSON on line: {line.strip()} - {e}")
        else:
            res.append(json_object)

In [8]:
# Load the reference test data: one JSON object per line of the .jsonl file.
path, file_name = 'data', 'java-test-train-small.jsonl'
file_path = os.path.join(parent_dir, path, file_name)
data = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            json_object = json.loads(line)
        except json.JSONDecodeError as e:
            # Report the malformed line and keep going with the rest of the file.
            print(f"Error decoding JSON on line: {line.strip()} - {e}")
        else:
            data.append(json_object)

In [9]:
# Sanity check: show the fields of the first reference function and the
# prompting-method keys of the first generated result.
print(data[0]['functions'][0].keys())
print(res[0]['functions_res'][0].keys())

dict_keys(['code', 'code_tokens', 'docstring', 'project', 'name', 'code_tokens_processed', 'bm25', 'CodeBERT'])
dict_keys(['zero-shot', 'few-shot-project', 'few-shot-codeBERT', 'cot', 'critique', 'few-shot-bm25', 'expert'])


## BLEU

In [14]:
# Report BLEU once per prompting method.
for method_name in prompt_methods:
    compute_bleu(res, data, method=method_name)

BLEU Score (zero-shot): 0.016874090156521204
BLEU Score (few-shot-project): 0.03584189326003828
BLEU Score (few-shot-bm25): 0.02243248381754831
BLEU Score (few-shot-codeBERT): 0.029398894297030725
BLEU Score (cot): 0.011396419813513766
BLEU Score (critique): 0.011497885679768026
BLEU Score (expert): 0.024701784595697784


## METEOR

In [15]:
# Report METEOR once per prompting method.
for method_name in prompt_methods:
    compute_meteor(res, data, method=method_name)

METEOR Score (zero-shot): 0.21896869032350025
METEOR Score (few-shot-project): 0.26845264357987775
METEOR Score (few-shot-bm25): 0.2342813738064698
METEOR Score (few-shot-codeBERT): 0.2537084366538404
METEOR Score (cot): 0.18621852643579878
METEOR Score (critique): 0.17721209918768746
METEOR Score (expert): 0.25225680490237995


## ROUGE-L

In [16]:
# Report ROUGE-L once per prompting method.
for method_name in prompt_methods:
    compute_rougel(res, data, method=method_name)

ROUGE-L Score (zero-shot): 0.10532109163460163
ROUGE-L Score (few-shot-project): 0.14004801197092734
ROUGE-L Score (few-shot-bm25): 0.11471065982613647
ROUGE-L Score (few-shot-codeBERT): 0.12691771412284458
ROUGE-L Score (cot): 0.09429030924896685
ROUGE-L Score (critique): 0.08378399600969066
ROUGE-L Score (expert): 0.1238815020664095


## BERT SCORE

In [30]:
# Report BERTScore F1 once per prompting method.
for method_name in prompt_methods:
    compute_bertscore(res, data, method=method_name)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Score F1 (zero-shot): 0.8190588913565945
BERT Score F1 (few-shot-project): 0.8199630769820945
BERT Score F1 (few-shot-bm25): 0.818791975913198
BERT Score F1 (few-shot-codeBERT): 0.8172474101905692
BERT Score F1 (cot): 0.8183680419417653
BERT Score F1 (critique): 0.8121273609432244
BERT Score F1 (expert): 0.8221570065392887


## SENTENCE BERT

In [14]:
# Report Sentence-BERT similarities once per prompting method.
for method_name in prompt_methods:
    compute_sentencebert(res, data, method=method_name)

SentenceBert euclidean similarity (zero-shot): -0.907550573348999
SentenceBert cosine similarity (zero-shot): 0.5725163817405701
SentenceBert euclidean similarity (few-shot-project): -0.9024014472961426
SentenceBert cosine similarity (few-shot-project): 0.5658039450645447
SentenceBert euclidean similarity (few-shot-bm25): -0.9144586324691772
SentenceBert cosine similarity (few-shot-bm25): 0.5613376498222351
SentenceBert euclidean similarity (few-shot-codeBERT): -0.9085538387298584
SentenceBert cosine similarity (few-shot-codeBERT): 0.5611753463745117
SentenceBert euclidean similarity (cot): -0.9636720418930054
SentenceBert cosine similarity (cot): 0.5176437497138977
SentenceBert euclidean similarity (critique): -0.9899874925613403
SentenceBert cosine similarity (critique): 0.490191787481308
SentenceBert euclidean similarity (expert): -0.8498820662498474
SentenceBert cosine similarity (expert): 0.6223868727684021


## UNIVERSAL SENTENCE ENCODER