Code for Figure 1

In [None]:
# Figure 1, first listing: split `s` on single space characters, sort the
# characters of each piece with `sorted`, and join the pieces back together
# with single spaces. Splitting on ' ' (rather than on whitespace runs) keeps
# an empty piece for every repeated space, so the spacing of `s` is preserved.
def anti_shuffle(s):
    return ' '.join([''.join(sorted(list(i))) for i in s.split(' ')])

In [None]:
# Figure 1, second listing. It splits with `s.split()` (runs of whitespace,
# no empty pieces) and hands the lists returned by `sorted(word)` straight to
# `' '.join`. `str.join` only accepts strings, so this raises TypeError
# whenever `s` contains at least one word; only empty or whitespace-only
# input returns ''.
# NOTE(review): this looks like a deliberately flawed snippet kept for the
# figure -- confirm it is intentional before "fixing" it.
def anti_shuffle(s):
    return ' '.join([sorted(word) for word in s.split()])

Code for Figure 2

In [17]:
# reference
# Splits `s` on single space characters, sorts the characters of every piece
# with `sorted`, and re-joins the pieces with single spaces. Returns a string
# with the same number of space-separated pieces as `s`.
def anti_shuffle(s):
    return ' '.join([
        ''.join(sorted(list(i))) 
        for i in s.split(' ')
    ])

In [19]:
# partially correct code
# The per-piece `''.join(...)` is missing, so `' '.join` receives lists
# instead of strings. `s.split(' ')` always yields at least one piece, hence
# this raises TypeError for every string input (including "").
# NOTE(review): "partially correct" presumably refers to how close the text
# is to a working solution, not to its runtime behaviour -- confirm.
def anti_shuffle(s):
    return ' '.join([sorted(list(i)) for i in s.split(' ')])

In [22]:
# totally wrong code
# Empty stub: ignores `s` and implicitly returns None for every input.
def anti_shuffle(s):
    pass

In [24]:
# lexically different code
# Splits `s` on single spaces, sorts the characters of each piece and
# re-joins the pieces with single spaces; the comprehension variable is
# spelled `word`.
def anti_shuffle(s):
    return ' '.join([
        ''.join(sorted(list(word))) 
        for word in s.split(' ')
    ])

In [26]:
# semantically different code
# Hand-written scan instead of `str.split`: walks `s` one character at a
# time, collecting characters into `current_word` until a space is seen.
# Produces one sorted piece per space-delimited segment (empty segments
# included) and joins the pieces with single spaces.
def anti_shuffle(s):
    def sort(word):
        # Return the characters of `word` in sorted order as a single string.
        return ''.join(sorted(list(word)))
    word_list = []
    current_word = ""
    for i in range(len(s)):
        if s[i] != " ":
            current_word += s[i]
        else:
            # A space closes the current piece (possibly empty) and starts a new one.
            word_list.append(sort(current_word))
            current_word = ""
    # Flush the piece after the last space (or the whole input if it had no space).
    word_list.append(sort(current_word))
    return ' '.join(word_list)

In [27]:
def check(anti_shuffle):
    """Verify a candidate ``anti_shuffle`` implementation against fixed cases.

    Args:
        anti_shuffle: Callable that takes a string and returns a string.

    Raises:
        AssertionError: On the first case whose result differs from the
            expected string. The message names the input, the value that was
            returned and the value that was expected.
    """
    # (input, expected output) pairs, evaluated in this order.
    cases = [
        ("Hi", "Hi"),
        ("hello", "ehllo"),
        ("number", "bemnru"),
        ("abcd", "abcd"),
        ("Hello World!!!", "Hello !!!Wdlor"),
        ("", ""),
        (
            "Hi. My name is Mister Robot. How are you?",
            ".Hi My aemn is Meirst .Rboot How aer ?ouy",
        ),
    ]
    for text, expected in cases:
        actual = anti_shuffle(text)
        assert actual == expected, (
            f"anti_shuffle({text!r}) returned {actual!r}, expected {expected!r}"
        )

# Runs the assertions against whichever `anti_shuffle` definition is currently
# bound in the kernel, i.e. the definition cell that was executed last.
check(anti_shuffle)

Test for Figure 2

In [9]:
import code_bert_score
from codegen_metrics import (
    codebleu,
    ruby,
    chrf,
    bleu,
    meteor,
    rougel,
)
import os
# Sets the `https_proxy` environment variable for this process.
# NOTE(review): the address is machine-specific; presumably it is only needed
# so the scoring libraries can reach the network from the author's machine --
# confirm, and adjust or remove when running elsewhere.
os.environ["https_proxy"] = "127.0.0.1:17890"

# Candidate: per-piece `''.join(...)` is missing, so lists reach `' '.join`.
partially_correct_prediction = """\
def anti_shuffle(s):
    return ' '.join([sorted(list(i)) for i in s.split(' ')])
"""

# Candidate: same expression as `reference`, loop variable named `word`.
lexically_different_prediction = """\
def anti_shuffle(s):
    return ' '.join([''.join(sorted(list(word))) for word in s.split(' ')])
"""
# Candidate: character-by-character scan instead of `str.split`.
semantically_different_prediction = """\
def anti_shuffle(s):
    word_list = []
    current_word = ""
    for i in range(len(s)):
        if s[i] != " ":
            current_word += s[i]
        else:
            word_list.append("".join(sorted(list(current_word))))
            current_word = ""
    word_list.append("".join(sorted(list(current_word))))
    return ' '.join(word_list)
"""

# Candidate: empty stub whose body is just `pass`.
totally_wrong_prediction = """\
def anti_shuffle(s):
    pass
"""

# Scoring order; the columns of the printed table follow this order.
predictions = [
    partially_correct_prediction,
    totally_wrong_prediction,
    lexically_different_prediction,
    semantically_different_prediction,
]

# One short label per entry of `predictions`.
# NOTE(review): not referenced anywhere in the visible part of this cell --
# presumably the sub-figure letters; confirm whether it is still needed.
name = [
    "a",
    "b",
    "c",
    "d",
]

# Reference solution that every candidate is compared against.
reference = """\
def anti_shuffle(s):
    return ' '.join([''.join(sorted(list(i))) for i in s.split(' ')])
"""

# Lexical metrics, each called as metric(reference, candidate); the order here
# is the key order of every result row.
lexical_metrics = (
    ("bleu", bleu),
    ("codebleu", codebleu),
    ("chrf", chrf),
    ("rougel", rougel),
    ("ruby", ruby),
    ("meteor", meteor),
)

# One dict of metric name -> score per candidate, in `predictions` order.
out = []
for candidate in predictions:
    # The scorer's result is unpacked into exactly four values; only the last
    # two are kept (presumably F1 and F3 -- see the code_bert_score docs).
    _first, _second, f1_scores, f3_scores = code_bert_score.score(
        cands=[candidate], refs=[reference], lang="python"
    )
    # A single candidate was passed in, so take the first (only) entry.
    bert_f1 = f1_scores.tolist()[0]
    bert_f3 = f3_scores.tolist()[0]

    row = {key: metric(reference, candidate) for key, metric in lexical_metrics}
    row["code_bert_score_f1"] = bert_f1
    row["code_bert_score_f3"] = bert_f3
    out.append(row)

# Result-row key -> label used in the first column of the LaTeX table.
methods = {
    "bleu": "BLEU",
    "codebleu": "CodeBLEU",
    "chrf": "chrF",
    "rougel": "ROUGE-L",
    "ruby": "RUBY",
    "meteor": "METEOR",
    "code_bert_score_f1": "CodeBERTScore$_{f1}$",
    "code_bert_score_f3": "CodeBERTScore$_{f3}$",
}

# Emit one LaTeX table row per metric: the label, then one score per entry of
# `out` formatted to three decimals, terminated by a literal `\\`.
for key, label in methods.items():
    cells = [f"{scores[key]:.3f}" for scores in out]
    print(label + " & " + " & ".join(cells) + " \\\\")



BLEU & 0.779 & 0.010 & 0.858 & 0.231 \\
CodeBLEU & 0.852 & 0.052 & 0.983 & 0.851 \\
chrF & 0.852 & 0.266 & 0.891 & 0.466 \\
ROUGE-L & 0.914 & 0.267 & 0.947 & 0.431 \\
RUBY & 0.811 & 0.364 & 0.990 & 0.533 \\
METEOR & 0.846 & 0.164 & 0.947 & 0.705 \\
CodeBERTScore$_{f1}$ & 0.990 & 0.796 & 0.976 & 0.800 \\
CodeBERTScore$_{f3}$ & 0.988 & 0.746 & 0.976 & 0.841 \\
