In [19]:

import rouge

# NOTE: this rebinds the name `rouge` from the imported module to a Rouge
# scorer instance; the rest of the notebook calls `rouge.get_scores(...)` on
# that instance, so the name is kept as-is.
rouge = rouge.Rouge()

# Toy example: the reference is the original text and the intended edit is
# black -> red. Each hypothesis probes a different kind of change.
hyp_list = [
    'A black fox',  # identical
    'A  black\t fox',  # identical apart from whitespace differences
    'A red fox',  # correct update
    'A cute black fox jumps over the lazy dog',  # extension only, no update
    'A cute red fox jumps over the lazy dog',  # correct update plus extension
    'A crimson vulpine',  # correct meaning, but a rewrite rather than an edit
]
ref = ['A black fox'] * len(hyp_list)

# One score dict per (hypothesis, reference) pair, in the same order as hyp_list.
scores = rouge.get_scores(refs=ref, hyps=hyp_list)

print(f"Original: {ref[0]}")
print(f"Expected: {hyp_list[2]}")
for hyp, score in zip(hyp_list, scores):
    rouge_l = score['rouge-l']
    # Recall is the number of interest here; precision and F are shown for context.
    recall, precision, f = rouge_l['r'], rouge_l['p'], rouge_l['f']
    print(f'p: {precision:.2f} r: {recall:.2f} f: {f:.2f} - {hyp}')

Original: A black fox
Expected: A red fox
p: 1.00 r: 1.00 f: 1.00 - A black fox
p: 1.00 r: 1.00 f: 1.00 - A  black	 fox
p: 0.67 r: 0.67 f: 0.67 - A red fox
p: 0.33 r: 1.00 f: 0.50 - A cute black fox jumps over the lazy dog
p: 0.22 r: 0.67 f: 0.33 - A cute red fox jumps over the lazy dog
p: 0.33 r: 0.33 f: 0.33 - A crimson vulpine


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cal_cosine_similarity(text1, text2):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    A fresh ``TfidfVectorizer`` is fitted on just the two given texts, so the
    vocabulary and IDF weights come from this pair alone.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The single similarity value taken from the 1x1 pairwise matrix.
    """
    # Row 0 holds text1, row 1 holds text2.
    vectors = TfidfVectorizer().fit_transform([text1, text2])

    # Slices (not plain indexing) keep each operand a 2-D, one-row matrix.
    first_row, second_row = vectors[0:1], vectors[1:2]
    pairwise = cosine_similarity(first_row, second_row)

    return pairwise[0][0]


In [41]:
import difflib


def calculate_diff(original: str, updated: str, verbose: bool = False):
    """Return the character-level edits that turn ``original`` into ``updated``.

    The two strings are aligned with ``difflib.SequenceMatcher`` (a single
    space character is treated as junk) and every opcode whose tag is not
    ``'equal'`` is reported.

    Args:
        original: Text before the change.
        updated: Text after the change.
        verbose: When true, print the raw opcode list for debugging. Defaults
            to ``False`` so that calling the function produces no output.

    Returns:
        A list of ``(old, new)`` substring pairs in left-to-right order.
        ``old`` is empty for an insertion and ``new`` is empty for a deletion.
        The list is empty when the strings are identical.

    Note:
        ``SequenceMatcher`` is used with its default ``autojunk`` setting; per
        the difflib documentation that heuristic can affect the alignment when
        ``updated`` is 200 or more characters long.
    """
    matcher = difflib.SequenceMatcher(lambda x: x == " ", original, updated)
    opcodes = matcher.get_opcodes()
    if verbose:
        print(opcodes)

    # Keep only the spans that actually differ, as (old, new) pairs.
    return [
        (original[i1:i2], updated[j1:j2])
        for tag, i1, i2, j1, j2 in opcodes
        if tag != 'equal'
    ]


# Java method source before the change, held as one string; it starts with an
# @ApiOperation annotation line.
src_method = "@ApiOperation(value = \"Save cluster basic configuration.\")\n    @PutMapping(path = \"/clusters/basic\", consumes = APPLICATION_JSON_VALUE)\n    public ResponseEntity<Void> saveBasicCluster(\n        @AuthenticationPrincipal Account acc,\n        @RequestHeader(value = \"demoMode\", defaultValue = \"false\") boolean demo,\n        @RequestBody JsonObject changedItems\n    ) {\n        cfgsSrv.saveBasicCluster(new ConfigurationKey(acc.getId(), demo), changedItems);\n\n        return ResponseEntity.ok().build();\n    }"
# The same method after the change: the @ApiOperation annotation line is gone.
dst_method =  "@PutMapping(path = \"/clusters/basic\", consumes = APPLICATION_JSON_VALUE)\n    public ResponseEntity<Void> saveBasicCluster(\n        @AuthenticationPrincipal Account acc,\n        @RequestHeader(value = \"demoMode\", defaultValue = \"false\") boolean demo,\n        @RequestBody JsonObject changedItems\n    ) {\n        cfgsSrv.saveBasicCluster(new ConfigurationKey(acc.getId(), demo), changedItems);\n\n        return ResponseEntity.ok().build();\n    }"
# Existing javadoc text for the method; the candidate comments in this cell
# are scored against it.
src_javadoc = 'Save basic clusters.'

# TF-IDF cosine similarity between the two versions of the method.
print(cal_cosine_similarity(src_method, dst_method))

# Candidate replacement comments, from light edits of the existing javadoc
# (including the unchanged javadoc itself) to full paraphrases.
candidates = [
    'Save cluster basic configuration.',
    'Save basic configuration.',
    'Save cluster configuration.',
    'Save configuration.',
    'Save basic clusters.',
    'Save basic cluster.',
    'Save cluster.',
    'Preserve fundamental setup.',
    'Preserve fundamental cluster setup.',
    'Preserve and maintain the primary, fundamental setup and configuration of the computing cluster, ensuring its initial, basic settings are securely stored and kept intact.'
]

TOP_K = 5  # number of highest-recall candidates kept for the final ranking

# ROUGE scores of every candidate against the existing javadoc.
scores = rouge.get_scores(hyps=candidates,
                          refs=[src_javadoc] * len(candidates))

# (candidate, ROUGE-L recall vs. the javadoc, TF-IDF cosine similarity vs. the
# updated method) for each candidate, in list order.
cand_tuples = [
    (hyp, score['rouge-l']['r'], cal_cosine_similarity(dst_method, hyp))
    for hyp, score in zip(candidates, scores)
]

# Rank by recall (the sort is stable, so ties keep list order), drop the
# candidate that merely repeats the existing javadoc, and keep the best TOP_K.
by_recall = sorted(cand_tuples, key=lambda t: t[1], reverse=True)
changed_only = [t for t in by_recall if t[0] != src_javadoc]
shortlist = changed_only[:TOP_K]  # slicing already clamps to the list length

# Final order: most similar to the updated method first.
for hyp, recall, cs in sorted(shortlist, key=lambda t: t[2], reverse=True):
    print(f'recall: {recall:.2f} cs: {cs:.2f} - {hyp}')


0.9129122931152769
recall: 0.67 cs: 0.05 - Save basic configuration.
recall: 0.67 cs: 0.05 - Save basic cluster.
recall: 0.67 cs: 0.04 - Save cluster basic configuration.
recall: 0.33 cs: 0.00 - Save cluster configuration.
recall: 0.33 cs: 0.00 - Save configuration.
