In [6]:
%load_ext autoreload
%autoreload 2

In [52]:
import sys
import os
import json
parent_dir = os.path.dirname(os.getcwd())
sys.path.insert(0, parent_dir)

## SUPPORT FUNCTIONS (METRICS)

In [102]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

prompt_methods = ['zero-shot', 'few-shot-project', 'few-shot-bm25', 'few-shot-codeBERT', 'cot', 'critique', 'expert']

def compute_bleu(res, data, method='zero-shot'):
    candidate_list = []
    reference_list = []
    project_scores = []
    chencherry = SmoothingFunction()
    for i in range(0, len(res)):
        candidate_list_project = []
        reference_list_project = []
        for j in range(0, len(res[i]['functions_res'])):
            candidate = word_tokenize(res[i]['functions_res'][j][method])#.split()
            reference = word_tokenize(data[i]['functions'][j]['docstring'])#.split()
            candidate_list.append(candidate)
            reference_list.append([reference])
            candidate_list_project.append(candidate)
            reference_list_project.append([reference])
        project_scores.append(corpus_bleu(reference_list_project, candidate_list_project, smoothing_function=chencherry.method0))
    bleu_score = corpus_bleu(reference_list, candidate_list, smoothing_function=chencherry.method0)
    bleu_score_per_project = sum(project_scores) / len(res)
    print(f"BLEU Score ({method}):", bleu_score, ", average by project:", bleu_score_per_project)
    return bleu_score, bleu_score_per_project

In [103]:
import nltk
from nltk.translate import meteor
from nltk import word_tokenize
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('omw-1.4')
def compute_meteor(res, data, method="zero-shot"):
    scores = []
    project_scores = []
    for i in range(0, len(res)):
        cur_scores = []
        for j in range(0, len(res[i]['functions_res'])):
            candidate = word_tokenize(res[i]['functions_res'][j][method])#.split()
            reference = word_tokenize(data[i]['functions'][j]['docstring'])#.split()
            score = round(meteor([reference], candidate), 4)
            scores.append(score)
            cur_scores.append(score)
        cur_scores = sum(cur_scores) / len(cur_scores)
        project_scores.append(cur_scores)
    meteor_score = sum(scores) / len(scores)
    meteor_score_per_project = sum(project_scores) / len(res)
    print(f"METEOR Score ({method}):", meteor_score, ", average by project:", meteor_score_per_project)
    return meteor_score, meteor_score_per_project

[nltk_data] Downloading package wordnet to /home/quanvo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/quanvo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/quanvo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [104]:
def compute_rougel(res, data, method="zero-shot"):
    #CODE BASED ON RENCOS: https://github.com/zhangj111/rencos/blob/master/evaluation/rouge/rouge.py
    def my_lcs(string, sub):
        """
        Calculates longest common subsequence for a pair of tokenized strings
        :param string : list of str : tokens from a string split using whitespace
        :param sub : list of str : shorter string, also split using whitespace
        :returns: length (list of int): length of the longest common subsequence between the two strings

        Note: my_lcs only gives length of the longest common subsequence, not the actual LCS
        """
        if(len(string)< len(sub)):
            sub, string = string, sub

        lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)]

        for j in range(1,len(sub)+1):
            for i in range(1,len(string)+1):
                if(string[i-1] == sub[j-1]):
                    lengths[i][j] = lengths[i-1][j-1] + 1
                else:
                    lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1])

        return lengths[len(string)][len(sub)]

    def calc_score(candidate, refs, beta=1.2):
        """
        Compute ROUGE-L score given one candidate and references for an image
        :param candidate: str : candidate sentence to be evaluated
        :param refs: list of str : COCO reference sentences for the particular image to be evaluated
        :returns score: int (ROUGE-L score for the candidate evaluated against references)
        """
        assert(len(candidate)==1)
        assert(len(refs)>0)         
        prec = []
        rec = []

        # split into tokens
        token_c = candidate[0].split(" ")

        for reference in refs:
            # split into tokens
            token_r = reference.split(" ")
            # compute the longest common subsequence
            lcs = my_lcs(token_r, token_c)
            prec.append(lcs/float(len(token_c)))
            rec.append(lcs/float(len(token_r)))

        prec_max = max(prec)
        rec_max = max(rec)

        if(prec_max!=0 and rec_max !=0):
            score = ((1 + beta**2)*prec_max*rec_max)/float(rec_max + beta**2*prec_max)
        else:
            score = 0.0
        return score
    
    scores = []
    project_scores = []
    for i in range(0, len(res)):
        cur_scores = []
        for j in range(0, len(res[i]['functions_res'])):
            candidate = res[i]['functions_res'][j][method]
            reference = data[i]['functions'][j]['docstring']
            score = round(calc_score([candidate], [reference]), 4)
            scores.append(score)
            cur_scores.append(score)
        cur_scores = sum(cur_scores) / len(cur_scores)
        project_scores.append(cur_scores)
    rougel_score = sum(scores) / len(scores)
    rougel_score_per_project = sum(project_scores) / len(project_scores)
    print(f"ROUGE-L Score ({method}):", rougel_score, ", average by project:", rougel_score_per_project)
    return rougel_score, rougel_score_per_project

In [105]:
import evaluate
from evaluate import load
bertscore = load("bertscore")
def compute_bertscore(res, data, method="zero-shot"):
    candidate_list = []
    reference_list = []
    project_scores = []

    #GLOBAL
    for i in range(0, len(res)):
        for j in range(0, len(res[i]['functions_res'])):
            candidate = res[i]['functions_res'][j][method]
            reference = data[i]['functions'][j]['docstring']
            candidate_list.append(candidate)
            reference_list.append(reference)
    bert_score = bertscore.compute(predictions=candidate_list, references=reference_list, lang="en")
    f1 = sum(bert_score['f1']) / len(bert_score['f1'])
    # PER PROJECT
    k = 0
    for i in range(0, len(res)):
        score = []
        for j in range(0, len(res[i]['functions_res'])):
            score.append(bert_score['f1'][k])
            k += 1
        score = sum(score) / len(score)
        project_scores.append(score)
    bert_score_per_project = sum(project_scores) / len(project_scores)
    
    print(f"BERT Score F1 ({method}):", f1, ", average by project:", bert_score_per_project)
    return bert_score

In [106]:
from sentence_transformers import SentenceTransformer, SimilarityFunction

def compute_sentencebert(res, data, method="zero-shot"):
    candidate_list = []
    reference_list = []
    project_scores_cos = []
    project_scores_euc = []
    for i in range(0, len(res)):
        for j in range(0, len(res[i]['functions_res'])):
            candidate = res[i]['functions_res'][j][method]
            reference = data[i]['functions'][j]['docstring']
            candidate_list.append(candidate)
            reference_list.append(reference)
    #COSINE SIMILARITY
    model = SentenceTransformer("all-mpnet-base-v2", similarity_fn_name=SimilarityFunction.COSINE)
    candidate_embeddings = model.encode(candidate_list)
    reference_embeddings = model.encode(reference_list)
    # PER PROJECT COSINE SIMILARITY
    k = 0
    for i in range(0, len(res)):
        candidate_embeddings_project = []
        reference_embeddings_project = []
        for j in range(0, len(res[i]['functions_res'])):
            candidate_embeddings_project.append(candidate_embeddings[k])
            reference_embeddings_project.append(reference_embeddings[k])
            k += 1
        cos_sim_pairwise = model.similarity(candidate_embeddings_project, reference_embeddings_project)
        cos_sim = 0
        for i in range(0, len(res[i]['functions_res'])):
            cos_sim += cos_sim_pairwise[i, i]
        cos_sim /= len(candidate_embeddings_project)
        project_scores_cos.append(cos_sim)
    # GLOBAL COSINE SIMILARITY
    cos_sim_pairwise = model.similarity(candidate_embeddings, reference_embeddings)
    cos_sim = 0

    # EUCLIDEAN
    model = SentenceTransformer("all-mpnet-base-v2", similarity_fn_name=SimilarityFunction.EUCLIDEAN)
    # PER PROJECT EUCLIDEAN SIMILARITY
    k = 0
    for i in range(0, len(res)):
        candidate_embeddings_project = []
        reference_embeddings_project = []
        for j in range(0, len(res[i]['functions_res'])):
            candidate_embeddings_project.append(candidate_embeddings[k])
            reference_embeddings_project.append(reference_embeddings[k])
            k += 1
        euclidean_sim_pairwise = model.similarity(candidate_embeddings_project, reference_embeddings_project)
        euclidean_sim = 0
        for i in range(0, len(res[i]['functions_res'])):
            euclidean_sim += euclidean_sim_pairwise[i, i]
        euclidean_sim /= len(candidate_embeddings_project)
        project_scores_euc.append(euclidean_sim)

    # GLOBAL EUCLIDEAN SIMILARITY
    euclidean_sim_pairwise = model.similarity(candidate_embeddings, reference_embeddings)
    euclidean_sim = 0
    for i in range(0, len(candidate_list)):
        euclidean_sim += euclidean_sim_pairwise[i,i]
        cos_sim += cos_sim_pairwise[i,i]
    euclidean_sim /= len(candidate_list)
    cos_sim /= len(candidate_list)
    euclidean_sim_per_project = sum(project_scores_euc) / len(project_scores_euc)
    cos_sim_per_project = sum(project_scores_cos) / len(project_scores_cos)
    print(f"SentenceBert euclidean similarity ({method}):", euclidean_sim.item(), ", average by project:", euclidean_sim_per_project.item())
    print(f"SentenceBert cosine similarity ({method}):", cos_sim.item(), ", average by project:", cos_sim_per_project.item())
    return euclidean_sim, cos_sim, euclidean_sim_per_project, cos_sim_per_project 

In [107]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from numpy import dot
from numpy.linalg import norm

def compute_USE(res, data, method='zero-shot'):
    module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
    model = hub.load(module_url)
    def embed(input):
      return model(input)
    compute_cos_sim = lambda a, b: dot(a, b)/(norm(a)*norm(b))

    candidate_list = []
    reference_list = []
    project_scores = []
    for i in range(0, len(res)):
        for j in range(0, len(res[i]['functions_res'])):
            candidate = res[i]['functions_res'][j][method]
            reference = data[i]['functions'][j]['docstring']
            candidate_list.append(candidate)
            reference_list.append(reference)
    candidate_embedding = np.array(embed(candidate_list)).tolist()
    reference_embedding = np.array(embed(reference_list)).tolist()

    k = 0
    for i in range(0, len(res)):
        cur_score = 0
        for j in range(0, len(res[i]['functions_res'])):
            candidate = candidate_embedding[k]
            reference = reference_embedding[k]
            cur_score += compute_cos_sim(candidate, reference)
            k += 1
        cur_score /= len(res[i]['functions_res'])
        project_scores.append(cur_score)
    use_score_per_project = sum(project_scores) / len(res)

    use_score = 0
    for i in range(0, len(candidate_list)):
        candidate = candidate_embedding[i]
        reference = reference_embedding[i]
        use_score += compute_cos_sim(candidate, reference)
    use_score /= len(candidate_list)
    print(f"Universal Sentence Encoder Cosine Similarity: ({method}):", use_score, ", average by project:", use_score_per_project)
    return use_score, use_score_per_project

# JAVA

In [69]:
#LOAD GENERATED RESULT
res = []
path = 'data'
file_name = 'res_java.jsonl'
file_path = os.path.join(parent_dir, path, file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            json_object = json.loads(line)
            res.append(json_object)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON on line: {line.strip()} - {e}")

In [70]:
#LOAD TEST DATA
data = []
path = 'data'
file_name = 'java-test-train-small.jsonl'
file_path = os.path.join(parent_dir, path, file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            json_object = json.loads(line)
            data.append(json_object)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON on line: {line.strip()} - {e}")

In [71]:
print(data[0]['functions'][0].keys())
print(res[0]['functions_res'][0].keys())

dict_keys(['code', 'code_tokens', 'docstring', 'project', 'name', 'code_tokens_processed', 'bm25', 'CodeBERT'])
dict_keys(['zero-shot', 'few-shot-project', 'few-shot-codeBERT', 'cot', 'critique', 'few-shot-bm25', 'expert'])


## BLEU

In [72]:
for i in range(0, len(prompt_methods)):
    compute_bleu(res, data, method=prompt_methods[i])

BLEU Score (zero-shot): 0.016874090156521204 , average by project: 0.013233802055073034
BLEU Score (few-shot-project): 0.03584189326003828 , average by project: 0.049089874196136474
BLEU Score (few-shot-bm25): 0.02243248381754831 , average by project: 0.022016782167999457
BLEU Score (few-shot-codeBERT): 0.029398894297030725 , average by project: 0.02881632316649521
BLEU Score (cot): 0.011396419813513766 , average by project: 0.008050624423061611
BLEU Score (critique): 0.011497885679768026 , average by project: 0.008528160291698887
BLEU Score (expert): 0.024701784595697784 , average by project: 0.020031717196577127


## METEOR

In [73]:
for i in range(0, len(prompt_methods)):
    compute_meteor(res, data, method=prompt_methods[i])

METEOR Score (zero-shot): 0.21896869032350025 , average by project: 0.20949970035452117
METEOR Score (few-shot-project): 0.26845264357987775 , average by project: 0.29165793751041347
METEOR Score (few-shot-bm25): 0.2342813738064698 , average by project: 0.23426783267115214
METEOR Score (few-shot-codeBERT): 0.2537084366538404 , average by project: 0.25175977986845494
METEOR Score (cot): 0.18621852643579878 , average by project: 0.17471191915004153
METEOR Score (critique): 0.17721209918768746 , average by project: 0.16699517353049306
METEOR Score (expert): 0.25225680490237995 , average by project: 0.24690678078216


## ROUGE-L

In [80]:
for i in range(0, len(prompt_methods)):
    compute_rougel(res, data, method=prompt_methods[i])

ROUGE-L Score (zero-shot): 0.10532109163460163 , average by project: 0.0979719950133674
ROUGE-L Score (few-shot-project): 0.14004801197092734 , average by project: 0.15305117797474685
ROUGE-L Score (few-shot-bm25): 0.11471065982613647 , average by project: 0.11225008880629651
ROUGE-L Score (few-shot-codeBERT): 0.12691771412284458 , average by project: 0.12004186925996775
ROUGE-L Score (cot): 0.09429030924896685 , average by project: 0.08760500621067219
ROUGE-L Score (critique): 0.08378399600969066 , average by project: 0.07641002808547133
ROUGE-L Score (expert): 0.1238815020664095 , average by project: 0.11532169139159419


## BERT SCORE

In [89]:
for i in range(0, len(prompt_methods)):
    compute_bertscore(res, data, method=prompt_methods[i])

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Score F1 (zero-shot): 0.8190588910762819 , average by project: 0.8156653514041078
BERT Score F1 (few-shot-project): 0.8199630774832594 , average by project: 0.8184899001565394
BERT Score F1 (few-shot-bm25): 0.8187919765247891 , average by project: 0.8149774811369865
BERT Score F1 (few-shot-codeBERT): 0.8172474105982966 , average by project: 0.813838841140864
BERT Score F1 (cot): 0.8183680412792083 , average by project: 0.815188294083968
BERT Score F1 (critique): 0.8121273606204402 , average by project: 0.809092252568307
BERT Score F1 (expert): 0.822157006947016 , average by project: 0.8186684723482897


## SENTENCE BERT

In [90]:
for i in range(0, len(prompt_methods)):
    compute_sentencebert(res, data, method=prompt_methods[i])

  a = torch.tensor(a)


SentenceBert euclidean similarity (zero-shot): -0.907550573348999 , average by project: tensor(-0.9212)
SentenceBert cosine similarity (zero-shot): 0.5725163817405701 , average by project: tensor(0.5594)
SentenceBert euclidean similarity (few-shot-project): -0.9024014472961426 , average by project: tensor(-0.8900)
SentenceBert cosine similarity (few-shot-project): 0.5658039450645447 , average by project: tensor(0.5726)
SentenceBert euclidean similarity (few-shot-bm25): -0.9144586324691772 , average by project: tensor(-0.9138)
SentenceBert cosine similarity (few-shot-bm25): 0.5613376498222351 , average by project: tensor(0.5605)
SentenceBert euclidean similarity (few-shot-codeBERT): -0.9085538387298584 , average by project: tensor(-0.9121)
SentenceBert cosine similarity (few-shot-codeBERT): 0.5611753463745117 , average by project: tensor(0.5570)
SentenceBert euclidean similarity (cot): -0.9636720418930054 , average by project: tensor(-0.9748)
SentenceBert cosine similarity (cot): 0.5176

## UNIVERSAL SENTENCE ENCODER

In [93]:
for i in range(0, len(prompt_methods)):
    compute_USE(res, data, method=prompt_methods[i])

Universal Sentence Encoder Cosine Similarity: (zero-shot): 0.4284162148154346 , average by project: 0.4044803945941557
Universal Sentence Encoder Cosine Similarity: (few-shot-project): 0.4612214404477187 , average by project: 0.456045630839091
Universal Sentence Encoder Cosine Similarity: (few-shot-bm25): 0.44599682883566505 , average by project: 0.43100824068609406
Universal Sentence Encoder Cosine Similarity: (few-shot-codeBERT): 0.45319276200601094 , average by project: 0.4338135633976017
Universal Sentence Encoder Cosine Similarity: (cot): 0.367630400185942 , average by project: 0.344585915872059
Universal Sentence Encoder Cosine Similarity: (critique): 0.3510616932881093 , average by project: 0.32906833971913174
Universal Sentence Encoder Cosine Similarity: (expert): 0.4794636586843965 , average by project: 0.4570255705009818


# PYTHON

In [94]:
#LOAD GENERATED RESULT
res = []
path = 'data'
file_name = 'res_python.jsonl'
file_path = os.path.join(parent_dir, path, file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            json_object = json.loads(line)
            res.append(json_object)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON on line: {line.strip()} - {e}")

In [95]:
#LOAD TEST DATA
data = []
path = 'data'
file_name = 'python-test-train-small.jsonl'
file_path = os.path.join(parent_dir, path, file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            json_object = json.loads(line)
            data.append(json_object)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON on line: {line.strip()} - {e}")

In [96]:
print(data[0]['functions'][0].keys())
print(res[0]['functions_res'][0].keys())

dict_keys(['code', 'code_tokens', 'docstring', 'project', 'name', 'code_tokens_processed', 'bm25', 'CodeBERT'])
dict_keys(['zero-shot', 'few-shot-project', 'few-shot-bm25', 'few-shot-codeBERT', 'cot', 'critique', 'expert'])


## BLEU

In [97]:
for i in range(0, len(prompt_methods)):
    compute_bleu(res, data, method=prompt_methods[i])

BLEU Score (zero-shot): 0.009135714279285239 , average by project: 0.007194712177461264
BLEU Score (few-shot-project): 0.025773850387974764 , average by project: 0.023659840837820065
BLEU Score (few-shot-bm25): 0.011774038346827541 , average by project: 0.011025166442711056
BLEU Score (few-shot-codeBERT): 0.01371393496488132 , average by project: 0.010923028937304582


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU Score (cot): 0.008740850561325569 , average by project: 0.0063465703737370215
BLEU Score (critique): 0.007218017940833739 , average by project: 0.00499975416535761
BLEU Score (expert): 0.013500418938127435 , average by project: 0.010573555970738774


## METEOR

In [98]:
for i in range(0, len(prompt_methods)):
    compute_meteor(res, data, method=prompt_methods[i])

METEOR Score (zero-shot): 0.17187812437512476 , average by project: 0.16937840746435553
METEOR Score (few-shot-project): 0.2370252949410128 , average by project: 0.24044231262144627
METEOR Score (few-shot-bm25): 0.19833061387722437 , average by project: 0.19948417667357568
METEOR Score (few-shot-codeBERT): 0.2000970605878822 , average by project: 0.19966116199925527
METEOR Score (cot): 0.1664951009798039 , average by project: 0.16264873694981893
METEOR Score (critique): 0.14902243551289746 , average by project: 0.14723480921864998
METEOR Score (expert): 0.20001829634073248 , average by project: 0.19693170632801402


## ROUGE-L

In [99]:
for i in range(0, len(prompt_methods)):
    compute_rougel(res, data, method=prompt_methods[i])

ROUGE-L Score (zero-shot): 0.0737549490101978 , average by project: 0.07226701042450463
ROUGE-L Score (few-shot-project): 0.11504749050189975 , average by project: 0.11668073801715607
ROUGE-L Score (few-shot-bm25): 0.08958230353929209 , average by project: 0.0889728344541853
ROUGE-L Score (few-shot-codeBERT): 0.0886434113177364 , average by project: 0.08750540820954825
ROUGE-L Score (cot): 0.07760591881623684 , average by project: 0.07594554254745678
ROUGE-L Score (critique): 0.0626043991201759 , average by project: 0.06097140149628914
ROUGE-L Score (expert): 0.0759969806038791 , average by project: 0.07408391580450062


## BERT SCORE

In [108]:
for i in range(0, len(prompt_methods)):
    compute_bertscore(res, data, method=prompt_methods[i])

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Score F1 (zero-shot): 0.8089383118416257 , average by project: 0.8084108151062861
BERT Score F1 (few-shot-project): 0.8128694572989356 , average by project: 0.812612724294526
BERT Score F1 (few-shot-bm25): 0.8093283590949123 , average by project: 0.8073308567236545
BERT Score F1 (few-shot-codeBERT): 0.8046031195720276 , average by project: 0.8044130419853818
BERT Score F1 (cot): 0.8109882023901349 , average by project: 0.8104600911764411




BERT Score F1 (critique): 0.8008047559337124 , average by project: 0.8005093521595632
BERT Score F1 (expert): 0.7952278106242651 , average by project: 0.7944018327103624


## SENTENCE BERT

In [100]:
for i in range(0, len(prompt_methods)):
    compute_sentencebert(res, data, method=prompt_methods[i])

SentenceBert euclidean similarity (zero-shot): -0.982590913772583 , average by project: tensor(-0.9870)
SentenceBert cosine similarity (zero-shot): 0.5028473734855652 , average by project: tensor(0.4985)
SentenceBert euclidean similarity (few-shot-project): -0.9129919409751892 , average by project: tensor(-0.9081)
SentenceBert cosine similarity (few-shot-project): 0.5645687580108643 , average by project: tensor(0.5668)
SentenceBert euclidean similarity (few-shot-bm25): -0.9531254172325134 , average by project: tensor(-0.9526)
SentenceBert cosine similarity (few-shot-bm25): 0.5321130752563477 , average by project: tensor(0.5323)
SentenceBert euclidean similarity (few-shot-codeBERT): -0.9548503160476685 , average by project: tensor(-0.9559)
SentenceBert cosine similarity (few-shot-codeBERT): 0.528922975063324 , average by project: tensor(0.5282)
SentenceBert euclidean similarity (cot): -0.9914019107818604 , average by project: tensor(-0.9980)
SentenceBert cosine similarity (cot): 0.49311

## UNIVERSAL SENTENCE ENCODER

In [101]:
for i in range(0, len(prompt_methods)):
    compute_USE(res, data, method=prompt_methods[i])

Universal Sentence Encoder Cosine Similarity: (zero-shot): 0.3320116902873427 , average by project: 0.3340418949858086
Universal Sentence Encoder Cosine Similarity: (few-shot-project): 0.4375413893169157 , average by project: 0.44132198794163524
Universal Sentence Encoder Cosine Similarity: (few-shot-bm25): 0.4093193930521929 , average by project: 0.41336426823032774
Universal Sentence Encoder Cosine Similarity: (few-shot-codeBERT): 0.4106005694359953 , average by project: 0.41411143963281416
Universal Sentence Encoder Cosine Similarity: (cot): 0.329668331338655 , average by project: 0.3284375239602013
Universal Sentence Encoder Cosine Similarity: (critique): 0.29909015337150474 , average by project: 0.2977883428698053
Universal Sentence Encoder Cosine Similarity: (expert): 0.4206500335883102 , average by project: 0.422800042445348
