In [1]:
import os
from difflib import SequenceMatcher

In [5]:

def get_similarity_scores(model_type, split,
                          experiments_root='../experiments/repo_eval',
                          data_root='../custom_data_eval'):
    """Score generated completions against the expected code, per scenario.

    For every scenario directory under
    ``{experiments_root}/repo-eval-350m-{model_type}/trained/functions`` the
    expected code is read from ``{data_root}/{split}/{scenario}/completed.py``
    and compared with each generated file using
    ``difflib.SequenceMatcher(...).ratio()`` (a float in ``[0, 1]``).

    Args:
        model_type: ``'prefix'`` (completions are read from the scenario's
            ``in_repo_output`` directory) or ``'lm'`` (read from
            ``orig_output``).
        split: Name of the sub-directory of ``data_root`` holding the
            expected code, e.g. ``'test'``.
        experiments_root: Directory containing the ``repo-eval-350m-*``
            result directories. Defaults to the original hard-coded path.
        data_root: Directory containing the per-split expected code.
            Defaults to the original hard-coded path.

    Returns:
        dict mapping scenario name to a dict with the keys ``'min_score'``,
        ``'max_score'`` and ``'avg_score'``. Scenarios without any generated
        completion are omitted.

    Raises:
        ValueError: If ``model_type`` is not ``'prefix'`` or ``'lm'``.
        OSError: If an expected directory or file is missing.
    """
    output_subdirs = {'prefix': 'in_repo_output', 'lm': 'orig_output'}
    # Fail early with a clear message instead of hitting an unbound local
    # variable further down when the model type is not recognised.
    if model_type not in output_subdirs:
        raise ValueError(
            f"unknown model_type {model_type!r}; "
            f"expected one of {sorted(output_subdirs)}"
        )

    result_dir = f'{experiments_root}/repo-eval-350m-{model_type}/trained/functions'

    scenario_scores = {}

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for scenario in sorted(os.listdir(result_dir)):
        scenario_dir = f'{result_dir}/{scenario}'
        # Skip the aggregated result.jsonl and any other stray non-directory
        # entry: only directories are scenarios.
        if scenario == 'result.jsonl' or not os.path.isdir(scenario_dir):
            continue

        # read in the expected code
        with open(f'{data_root}/{split}/{scenario}/completed.py') as infile:
            expected_output = infile.read()

        # score each generated code example for the scenario
        scenario_output_dir = f'{scenario_dir}/{output_subdirs[model_type]}'
        scores = []
        for result in sorted(os.listdir(scenario_output_dir)):
            with open(f'{scenario_output_dir}/{result}') as res_file:
                completion = res_file.read()
            scores.append(
                SequenceMatcher(None, expected_output, completion).ratio()
            )

        # sometimes we get no completions
        if scores:
            scenario_scores[scenario] = {
                'min_score': min(scores),
                'max_score': max(scores),
                'avg_score': sum(scores) / len(scores),
            }

    return scenario_scores



In [6]:
# Similarity scores for the 'prefix' model type on the 'test' split.
split, model_type = 'test', 'prefix'

get_similarity_scores(model_type, split)

{'example2': {'min_score': 0.7185840707964601,
  'max_score': 0.8115384615384615,
  'avg_score': 0.7577841036726565},
 'example1': {'min_score': 0.12418300653594772,
  'max_score': 0.47843137254901963,
  'avg_score': 0.3803422237642011},
 'example3': {'min_score': 0.4178049929345266,
  'max_score': 0.5957805907172996,
  'avg_score': 0.4850626251830469},
 'example4': {'min_score': 0.43316831683168316,
  'max_score': 0.8720626631853786,
  'avg_score': 0.7754845860184766}}

In [7]:
# Similarity scores for the 'lm' model type on the 'test' split.
split, model_type = 'test', 'lm'

get_similarity_scores(model_type, split)

{'example2': {'min_score': 0.6197183098591549,
  'max_score': 0.9140625,
  'avg_score': 0.7479658035898037},
 'example1': {'min_score': 0.06841686555290374,
  'max_score': 0.48725212464589235,
  'avg_score': 0.24105947833959585},
 'example3': {'min_score': 0.42092746730083236,
  'max_score': 0.505175983436853,
  'avg_score': 0.46549576853199454},
 'example4': {'min_score': 0.59375,
  'max_score': 0.8493333333333334,
  'avg_score': 0.797072631689971}}