In [None]:
!pip install datasets evaluate

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading

In [None]:
import datasets
import sys, math, re, xml.sax.saxutils
import numpy as np
from nltk.translate import meteor_score
from packaging import version

import evaluate


if evaluate.config.PY_VERSION < version.parse("3.8"):
    import importlib_metadata
else:
    import importlib.metadata as importlib_metadata


NLTK_VERSION = version.parse(importlib_metadata.version("nltk"))
if NLTK_VERSION >= version.Version("3.6.4"):
    from nltk import word_tokenize

import nltk
from nltk.translate import meteor_score
from nltk import word_tokenize
import numpy as np
nltk.download("punkt")
nltk.download("omw-1.4")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Data setup

In [None]:
def format_code_snippet(snippet):
    """Flatten ``snippet`` to one line and pad selected symbols with spaces.

    Every run of whitespace (newlines and indentation included) is squeezed to
    a single space. Afterwards each of ``= + - * / ( ) [ ] , : @ .`` is
    replaced by the same character with one space on either side. Neighbouring
    padded symbols therefore produce double spaces, which are kept as they are.

    Returns the resulting string.
    """
    # None of the replacements introduces another padded symbol, so a single
    # translation pass yields the same text as replacing symbol by symbol.
    padding_table = str.maketrans({mark: f' {mark} ' for mark in '=+-*/()[],:@.'})
    single_line = ' '.join(snippet.split())
    return single_line.translate(padding_table)

In [None]:
# Demo: run format_code_snippet on a multi-line snippet; as the cell's last
# expression, the returned string is displayed as the cell output.
format_code_snippet('''n = pts_des.shape[1]
J = np.empty((0, 6))
for i in range(n):
    tmp_J = ibvs_jacobian(K, pts_obs[:, i].reshape(-1, 1), zs[i])
    J = np.vstack((J, tmp_J))''')

'n  =  pts_des . shape [ 1 ]  J  =  np . empty (  ( 0 ,  6 )  )  for i in range ( n )  :  tmp_J  =  ibvs_jacobian ( K ,  pts_obs [  :  ,  i ]  . reshape (  - 1 ,  1 )  ,  zs [ i ]  )  J  =  np . vstack (  ( J ,  tmp_J )  ) '

# BLEU Score (Abandoned)

In [None]:
# Module-level settings and regex tables read by normalize() and cook_test().

# Added to bypass NIST-style pre-processing of hyp and ref files -- wade
# When truthy, normalize() only whitespace-splits its input.
nonorm = 0

# When False, normalize() lower-cases the text before the normalize2 rules run.
preserve_case = False
# Selects how cook_test() derives the reference length; the values it handles
# are "shortest", "average" and "closest".
eff_ref_len = "shortest"

# First rule set: applied by normalize() before XML entities are unescaped.
normalize1 = [
    ('<skipped>', ''),         # strip "skipped" tags
    (r'-\n', ''),              # strip end-of-line hyphenation and join lines
    (r'\n', ' '),              # join lines
]
# Replace the pattern strings with compiled regex objects (same list name).
normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1]

# Second rule set: applied by normalize() after the text has been padded with
# a space on each side and (optionally) lower-cased.
normalize2 = [
    (r'([\{-\~\[-\` -\&\(-\+\:-\@\/])',r' \1 '), # tokenize punctuation. apostrophe is missing
    (r'([^0-9])([\.,])',r'\1 \2 '),              # tokenize period and comma unless preceded by a digit
    (r'([\.,])([^0-9])',r' \1 \2'),              # tokenize period and comma unless followed by a digit
    (r'([0-9])(-)',r'\1 \2 ')                    # tokenize dash when preceded by a digit
]
# Replace the pattern strings with compiled regex objects (same list name).
normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2]

def normalize(s):
    '''Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl.

    Args:
        s: Either a string, or an iterable of string tokens (which is joined
            with single spaces before any further processing).

    Returns:
        A list of string tokens. When the module-level ``nonorm`` flag is
        truthy the text is only split on whitespace; otherwise the
        ``normalize1`` rules, XML unescaping, optional lower-casing
        (``preserve_case``) and the ``normalize2`` rules are applied first.
    '''
    # Join pre-tokenized input up front so that BOTH paths below operate on a
    # string. isinstance() (rather than an exact type check) keeps str
    # subclasses from being mistaken for token sequences and joined
    # character by character.
    if not isinstance(s, str):
        s = " ".join(s)
    # Added to bypass NIST-style pre-processing of hyp and ref files -- wade
    if nonorm:
        return s.split()
    # language-independent part:
    for (pattern, replace) in normalize1:
        s = re.sub(pattern, replace, s)
    s = xml.sax.saxutils.unescape(s, {'&quot;':'"'})
    # language-dependent part (assuming Western languages):
    # pad with spaces so the context-sensitive rules also fire at the edges
    s = " %s " % s
    if not preserve_case:
        s = s.lower()
    for (pattern, replace) in normalize2:
        s = re.sub(pattern, replace, s)
    return s.split()

def count_ngrams(words, n=4):
    """Tally every n-gram of order 1 through ``n`` found in ``words``.

    Args:
        words: A sliceable sequence of tokens.
        n: Highest n-gram order to count.

    Returns:
        A dict mapping each n-gram (a tuple of consecutive items of ``words``)
        to the number of times it occurs. Orders longer than ``words``
        contribute no entries.
    """
    tally = {}
    for order in range(1, n + 1):
        last_start = len(words) - order
        for start in range(last_start + 1):
            gram = tuple(words[start:start + order])
            if gram in tally:
                tally[gram] += 1
            else:
                tally[gram] = 1
    return tally

def cook_refs(refs, n=4):
    '''Pre-digest the reference sentences of one segment for BLEU scoring.

    Each reference is tokenized with normalize(); for every n-gram (orders
    1..n) the largest count observed in any single reference is recorded.

    Returns a tuple ``(lengths, max_counts)``: the token length of every
    reference, in input order, and the dict of per-n-gram maximum counts.
    '''
    tokenized = [normalize(ref) for ref in refs]

    ceiling = {}
    for tokens in tokenized:
        for ngram, seen in count_ngrams(tokens, n).items():
            # counts are always >= 1, so an unseen n-gram is always stored
            if seen > ceiling.get(ngram, 0):
                ceiling[ngram] = seen

    lengths = [len(tokens) for tokens in tokenized]
    return (lengths, ceiling)

def cook_test(test, item, n=4):
    '''Takes a test sentence and returns an object that
    encapsulates everything that BLEU needs to know about it.

    Args:
        test: The candidate sentence; it is tokenized with normalize().
        item: The ``(reflens, refmaxcounts)`` tuple produced by cook_refs().
        n: Highest n-gram order to consider.

    Returns:
        A dict with the keys
        ``"testlen"`` (number of candidate tokens),
        ``"reflen"`` (effective reference length, chosen according to the
        module-level ``eff_ref_len`` setting),
        ``"guess"`` (number of candidate n-grams per order) and
        ``"correct"`` (clipped n-gram matches per order).

    Raises:
        ValueError: If ``eff_ref_len`` is not one of ``"shortest"``,
            ``"average"`` or ``"closest"``, or if ``reflens`` is empty for
            ``"shortest"``/``"closest"``. Failing here avoids handing back a
            result without ``"reflen"`` that only breaks later during scoring.
        ZeroDivisionError: If ``reflens`` is empty for ``"average"``.
    '''
    (reflens, refmaxcounts) = item
    test = normalize(test)
    testlen = len(test)

    # Calculate effective reference sentence length.
    if eff_ref_len == "shortest":
        reflen = min(reflens)
    elif eff_ref_len == "average":
        reflen = float(sum(reflens)) / len(reflens)
    elif eff_ref_len == "closest":
        # min() returns the first of several equally close lengths
        reflen = min(reflens, key=lambda length: abs(length - testlen))
    else:
        raise ValueError(
            "eff_ref_len must be 'shortest', 'average' or 'closest', got %r"
            % (eff_ref_len,)
        )

    # Clip each candidate n-gram count by the maximum count in the references.
    correct = [0] * n
    for (ngram, count) in count_ngrams(test, n).items():
        correct[len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count)

    return {
        "testlen": testlen,
        "reflen": reflen,
        # a sentence of length L holds L-k+1 n-grams of order k (never below 0)
        "guess": [max(testlen - k + 1, 0) for k in range(1, n + 1)],
        "correct": correct,
    }

def score_cooked(allcomps, n=4, ground=0, smooth=1):
    """Aggregate per-segment statistics and turn them into BLEU scores.

    Args:
        allcomps: Iterable of dicts with the keys ``'testlen'``, ``'reflen'``,
            ``'guess'`` and ``'correct'`` (the last two indexable by n-gram
            order minus one).
        n: Highest n-gram order.
        ground: Accepted for signature compatibility; not used by this function.
        smooth: When equal to 1, add one to the matched and guessed counts of
            every order above unigrams in the combined score.

    Returns:
        A list of ``n + 1`` floats. Index 0 is the combined score (mean of the
        smoothed log precisions plus the brevity penalty, exponentiated);
        indices 1..n are the unsmoothed per-order precisions, with 0.0 for
        orders that have no guessed n-grams.
    """
    tiny = sys.float_info.min  # keeps log() defined when a count is zero

    total_testlen = 0
    total_reflen = 0
    guess_totals = [0] * n
    correct_totals = [0] * n
    for comps in allcomps:
        total_testlen += comps['testlen']
        total_reflen += comps['reflen']
        for order in range(n):
            guess_totals[order] += comps['guess'][order]
        for order in range(n):
            correct_totals[order] += comps['correct'][order]

    smoothed_log_sum = 0.0
    log_precisions = []
    for order in range(n):
        hits = correct_totals[order]
        tries = guess_totals[order]
        bonus = 1 if (smooth == 1 and order > 0) else 0
        smoothed_log_sum += math.log(hits + bonus + tiny) - math.log(tries + bonus + tiny)
        if tries == 0:
            # large negative log value; exponentiates to 0.0 below
            log_precisions.append(-10000000)
        else:
            log_precisions.append(math.log(hits + tiny) - math.log(tries))

    combined_log = smoothed_log_sum / float(n)

    # Log-domain brevity penalty: non-positive, applied to the combined score only.
    brevity = min(0, 1 - float(total_reflen + 1) / (total_testlen + 1))

    return [math.exp(combined_log + brevity)] + [math.exp(value) for value in log_precisions]

def compute_bleu_for_pair(code_snippet, summary, n=4, smooth=1):
    '''Computes BLEU score for a single pair of code snippet and its summary.

    Args:
        code_snippet: Text used as the single reference.
        summary: Text scored as the candidate.
        n: Highest n-gram order.
        smooth: Forwarded to score_cooked().

    Returns:
        The combined BLEU score (first element of score_cooked()'s result).
    '''
    # cook_refs() and cook_test() normalize their inputs themselves, so the
    # raw strings are handed over directly. Normalizing here as well would
    # tokenize everything twice and, with ``nonorm`` enabled, make the second
    # pass call ``.split()`` on an already-split list.
    refs = cook_refs([code_snippet], n)
    test = cook_test(summary, refs, n)
    score = score_cooked([test], n=n, smooth=smooth)

    # Index 0 is the combined score; the remaining entries are per-order values.
    return score[0]

# Smoke test: score one short sentence pair and print the resulting BLEU value.
if __name__ == '__main__':
    code_snippet = '''I love you'''
    summary = '''I hate you'''
    bleu_score = compute_bleu_for_pair(code_snippet, summary)
    print("BLEU Score:", bleu_score)


BLEU Score: 0.5773502691896257


# METEOR Score

In [None]:
class Meteor:
    """Thin wrapper around NLTK's single-sentence METEOR score."""

    def __init__(self):
        pass

    def compute_score(self, prediction, reference, alpha=0.9, beta=3, gamma=0.5):
        """Return the METEOR score of ``prediction`` against one ``reference``.

        Args:
            prediction: Candidate sentence as a string.
            reference: Reference sentence as a string.
            alpha, beta, gamma: Forwarded unchanged to
                ``meteor_score.single_meteor_score``.

        Returns:
            The value returned by ``meteor_score.single_meteor_score``.
        """
        if version.Version(nltk.__version__) >= version.Version("3.6.5"):
            # NLTK >= 3.6.5 expects both sentences to be pre-tokenized.
            hypothesis = word_tokenize(prediction)
            target = word_tokenize(reference)
        else:
            # Older NLTK releases lower-case and split the sentences themselves
            # and therefore need the raw strings, not token lists.
            hypothesis = prediction
            target = reference

        # single_meteor_score takes the reference first, then the hypothesis.
        return meteor_score.single_meteor_score(
            target, hypothesis, alpha=alpha, beta=beta, gamma=gamma
        )

## Zero-shot results

In [None]:
# Score ten (prediction, reference) pairs; compute_score takes the prediction
# first and the reference second. Each result is stored in its own
# module-level variable score1..score10.
meteor = Meteor()
score1 = meteor.compute_score("Reverses a list and finds the index of a value in it, considering the reversed order", "return the last index of the value in the list")
score2 = meteor.compute_score("Retrieves a value from a map using a string key; if not found, returns a default value", "returns the object associated with the specified name of the child 's node")
score3 = meteor.compute_score("Populates a NumPy matrix with ratings based on user and item IDs from a dataset", "create a user-item matrix from the dataset")
score4 = meteor.compute_score("Returns the value of a variable called frame rate", "returns the frame rate value for the encoding process")
score5 = meteor.compute_score("Adds a key-value pair to a parameter dictionary", "sets the value of a parameter")
score6 = meteor.compute_score("Returns the name of the parent directory of a file path", "get the topic name from the file path")
score7 = meteor.compute_score("Performs a series of mathematical operations on a NumPy array", "project points into camera")
score8 = meteor.compute_score("Reads data from an input stream and handles exceptions", "tests if the content length set in the stream equals the bytes read from the stream, if any exception is thrown, then the test fails.")
score9 = meteor.compute_score("Computes a direction cosine matrix (DCM) from roll pitch yaw angles", "generate rotation matrix from roll, pitch, yaw Euler angles")
score10 = meteor.compute_score("Creates a server socket with an arbitrary port and starts a Python process","starts the python script")

# Print the ten scores by looking the variables up by name in globals().
for i in range(1, 11):
    score_var_name = f"score{i}"
    print(f"{score_var_name}: {globals()[score_var_name]}")

score1: 0.5349990463475109
score2: 0.07352941176470588
score3: 0.25641025641025644
score4: 0.41333333333333333
score5: 0.30241935483870974
score6: 0.4481927710843373
score7: 0.0
score8: 0.07662835249042145
score9: 0.38070436507936506
score10: 0.20408163265306123


## Few-shot results

In [None]:
# Score ten (prediction, reference) pairs; compute_score takes the prediction
# first and the reference second. Each result is stored in its own
# module-level variable score1..score10.
meteor = Meteor()
score1 = meteor.compute_score("Defines a function rindex that returns the last index of a value in a list lst", "return the last index of the value in the list")
score2 = meteor.compute_score("Defines an object function that retrieves a value from a map if it exists, otherwise returns a default value", "returns the object associated with the specified name of the child 's node")
score3 = meteor.compute_score("Initializes a matrix with zeros and populates it with ratings from a dataset", "create a user-item matrix from the dataset")
score4 = meteor.compute_score("Defines an integer function that returns the value of frame rate", "returns the frame rate value for the encoding process")
score5 = meteor.compute_score("Defines a void function that puts key-value pairs into a parameter map", "sets the value of a parameter")
score6 = meteor.compute_score("Defines a function get_topic_name that extracts the name of the parent directory from a file path", "get the topic name from the file path")
score7 = meteor.compute_score("Performs a series of operations on arrays pts and pts_cam", "project points into camera")
score8 = meteor.compute_score("Reads data from an input stream, checks its length, and consumes the stream using utility methods", "tests if the content length set in the stream equals the bytes read from the stream, if any exception is thrown, then the test fails.")
score9 = meteor.compute_score("Defines a function dcm_from_rpy that computes a Direction Cosine Matrix from roll, pitch, and yaw angles", "generate rotation matrix from roll, pitch, yaw Euler angles")
score10 = meteor.compute_score("Creates a server socket on an available port and starts a Python server","starts the python script")

# Print the ten scores by looking the variables up by name in globals().
for i in range(1, 11):
    score_var_name = f"score{i}"
    print(f"{score_var_name}: {globals()[score_var_name]}")

score1: 0.6625884433962265
score2: 0.072992700729927
score3: 0.26315789473684215
score4: 0.4043478260869566
score5: 0.3872053872053872
score6: 0.48453282828282834
score7: 0.0
score8: 0.14814814814814814
score9: 0.6657318376068376
score10: 0.20408163265306123


## Chain-of-thought results

In [None]:
# Score ten (prediction, reference) pairs; compute_score takes the prediction
# first and the reference second. Each result is stored in its own
# module-level variable score1..score10.
meteor = Meteor()
score1 = meteor.compute_score("The code's purpose is to return the last index of the given value in the list", "return the last index of the value in the list")
score2 = meteor.compute_score("The purpose of this code is to get a value from a map with a fallback value if the key is not present", "returns the object associated with the specified name of the child 's node")
score3 = meteor.compute_score("The purpose is to create a user-item rating matrix from a dataset", "create a user-item matrix from the dataset")
score4 = meteor.compute_score("The purpose is to retrieve and return the value of the frame rate", "returns the frame rate value for the encoding process")
score5 = meteor.compute_score("The purpose is to set key value pairs in parameters", "sets the value of a parameter")
score6 = meteor.compute_score("The purpose is to obtain the name of the parent directory from the file path", "get the topic name from the file path")
score7 = meteor.compute_score("The purpose seems to be some form of geometric or mathematical transformation on data represented by arrays", "project points into camera")
score8 = meteor.compute_score("The purpose appears to be related to managing input streams and handling exceptions", "tests if the content length set in the stream equals the bytes read from the stream, if any exception is thrown, then the test fails.")
score9 = meteor.compute_score("The purpose is to calculate a direction cosine matrix from roll pitch yaw angles", "generate rotation matrix from roll, pitch, yaw Euler angles")
score10 = meteor.compute_score("The purpose seems to be setting up a server socket and initializing a python interpreter","starts the python script")

# Print the ten scores by looking the variables up by name in globals().
for i in range(1, 11):
    score_var_name = f"score{i}"
    print(f"{score_var_name}: {globals()[score_var_name]}")

score1: 0.9308411214953272
score2: 0.17857142857142858
score3: 0.6394557823129252
score4: 0.5437352245862884
score5: 0.3125
score6: 0.7295331925873797
score7: 0.0
score8: 0.07547169811320754
score9: 0.45231071779744353
score10: 0.19607843137254904
