In [None]:
import os
import re
import json
import editdistance
import pandas as pd

from tqdm import trange
from bert_score import BERTScorer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

In [None]:
def exact_match(results, config, average='macro'):
    """Score each prediction by checking which answer option it starts with.

    Args:
        results: dict of parallel lists under 'input', 'prediction', 'target'
            and 'options' (each 'options' entry is one tab-separated string).
        config: run settings; the keys read here are 'filename', 'model',
            'prompt_type' and 'run_id'.
        average: averaging mode forwarded to precision_recall_fscore_support.

    Returns:
        Tuple (accuracy, precision, recall, f1) over all examples. A prediction
        that cannot be mapped to exactly one option is recorded as '-100'.

    Side effects:
        Stores the per-example 'T'/'F' flags in
        results['exact_match_correctness'] and rewrites
        results/results-<run_id>.json with the updated dict.
    """
    actuals, predictions, correctness = [], [], []
    for i in range(len(results['input'])):
        question, answer, actual = results['input'][i], results['prediction'][i], results['target'][i]
        raw_options = results['options'][i].split('\t')
        if config['filename'] in ['english_proverbs/task.json', 'tracking_shuffled_objects/task.json', 'logical_deduction/task.json']:
            # These tasks list options with a trailing period; drop it before quoting.
            options = ['\"{}\"'.format(option.rstrip('.')) for option in raw_options]
        else:
            options = [f'\"{option}\"' for option in raw_options]
        if config['filename'] in ['modified_arithmetic/task.json']:
            # Remove the echoed question so only the model's continuation remains.
            question = question.replace(' ?', '')
            answer = answer.replace('?', '').replace(question, '')
        if config['model'] in [
            'chavinlo/alpaca-native',
            'chavinlo/alpaca-13b',
            'models--llama/7B',
            'models--llama/13B',
            'models--llama/30B'
        ]:
            # Drop a truncated leading "unk>" token as a *prefix*. str.lstrip('unk>')
            # treats its argument as a character set and would also eat the first
            # letters of real answers (e.g. "no" -> "o").
            if answer.startswith('unk>'):
                answer = answer[len('unk>'):]
            answer = answer.replace('<unk>', '').replace('<s>', '').replace('</s>', '').replace('⁇', '')
        # Sequential strips: the order (',' then '.' then ' ' then newline) is significant.
        answer = answer.lstrip(',').lstrip('.').lstrip(' ').lstrip('\n').lower()
        prediction = '-100'
        if config['prompt_type'] in ['closed', 'open', 'closed-info_removed']:
            # NOTE(review): only options listed *after* the matched one can veto the
            # match, so the result depends on option order -- confirm this is intended.
            for option in options:
                bare = option.lstrip('\"').rstrip('\"') # some models did not output quotation marks
                if prediction == '-100':
                    # The answer must start with the option (quoted or bare).
                    if option.lower() == answer[:len(option)] or bare.lower() == answer[:len(bare)]:
                        prediction = option
                elif option.lower() in answer[len(option):] or bare.lower() in answer[len(bare):]:
                    # A second option shows up later in the answer: treat as ambiguous.
                    prediction = '-100'
                    break
        if config['prompt_type'] in ['closed-adv']:
            # Correct only if the answer starts with the target and names no choice label afterwards.
            if (actual == answer[:len(actual)] and
                '(a)' not in answer[len(actual):] and
                '(b)' not in answer[len(actual):] and
                '(c)' not in answer[len(actual):]):
                prediction = actual
        actuals.append(actual)
        predictions.append(prediction)
        correctness.append('T' if actual == prediction else 'F')
    accuracy = accuracy_score(actuals, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(actuals, predictions, average=average, zero_division=0)
    results['exact_match_correctness'] = correctness
    with open('results/results-{}.json'.format(config['run_id']), 'w') as file:
        json.dump(results, file, indent=4, ensure_ascii=False)
    return accuracy, precision, recall, f1

def bert_score(results, config, scorer, average='macro'):
    """Pick, for every example, the option whose BERTScore F1 against the prediction is highest.

    Args:
        results: dict of parallel lists under 'input', 'prediction', 'target'
            and 'options' (each 'options' entry is one tab-separated string).
        config: run settings; the keys read here are 'filename' and 'run_id'.
        scorer: object whose score(candidates, references) returns a 3-tuple
            with the F1 values last (they must support .argmax()).
        average: averaging mode forwarded to precision_recall_fscore_support.

    Returns:
        Tuple (accuracy, precision, recall, f1) over all examples.

    Side effects:
        Stores the per-example 'T'/'F' flags in
        results['bert_score_correctness'] and rewrites
        results/results-<run_id>.json with the updated dict.
    """
    gold, chosen, flags = [], [], []
    for idx in trange(len(results['input'])):
        target = results['target'][idx]
        raw_options = results['options'][idx].split('\t')
        if config['filename'] in ['english_proverbs/task.json', 'tracking_shuffled_objects/task.json', 'logical_deduction/task.json']:
            # These tasks list options with a trailing period; drop it before quoting.
            candidates = ['\"{}\"'.format(text.rstrip('.')) for text in raw_options]
        else:
            candidates = [f'\"{text}\"' for text in raw_options]
        # Score the same prediction once against every candidate option.
        hypothesis = results['prediction'][idx]
        _, _, f1_scores = scorer.score([hypothesis] * len(candidates), candidates)
        best = candidates[f1_scores.argmax()]
        gold.append(target)
        chosen.append(best)
        flags.append('T' if target == best else 'F')
    accuracy = accuracy_score(gold, chosen)
    precision, recall, f1, _ = precision_recall_fscore_support(gold, chosen, average=average, zero_division=0)
    results['bert_score_correctness'] = flags
    with open('results/results-{}.json'.format(config['run_id']), 'w') as file:
        json.dump(results, file, indent=4, ensure_ascii=False)
    return accuracy, precision, recall, f1

def average_edit_distance(results, config):
    """Return the mean edit distance between each target and its cleaned prediction.

    Args:
        results: dict of parallel lists under 'input', 'prediction' and 'target'.
        config: run settings; the keys read here are 'filename' and 'model'.

    Returns:
        Sum of editdistance.eval(target, prediction) divided by the number of
        examples, or 0.0 when there are no examples.
    """
    total = len(results['input'])
    if total == 0:
        # Nothing to average; avoid a ZeroDivisionError.
        return 0.0
    distances = 0
    for i in range(total):
        actual = results['target'][i]
        prediction = results['prediction'][i]
        if config['filename'] in ['modified_arithmetic/task.json']:
            # Remove the echoed question so only the model's continuation remains.
            # This cleanup used to be overwritten by the model branch below and
            # never took effect; the two steps are now applied in sequence.
            question = results['input'][i].replace(' ?', '')
            prediction = prediction.replace('?', '').replace(question, '')
        if config['model'] in [
            'chavinlo/alpaca-native',
            'chavinlo/alpaca-13b',
            'models--llama/7B',
            'models--llama/13B',
            'models--llama/30B'
        ]:
            # Drop a truncated leading "unk>" token as a *prefix*. str.lstrip('unk>')
            # treats its argument as a character set and would also eat the first
            # letters of real answers (e.g. "no" -> "o").
            if prediction.startswith('unk>'):
                prediction = prediction[len('unk>'):]
            # lstrip('?? ') removes any leading '?' and space characters.
            # NOTE(review): possibly meant to target the '⁇' glyph -- confirm.
            prediction = prediction.lstrip('?? ').replace('<unk>', '').replace('<s>', '').replace('</s>', '')
        distances += editdistance.eval(actual, prediction)
    return distances / total

def average_single_bert_score(results, scorer, config):
    """Return the mean BERTScore F1 between the cleaned predictions and the targets.

    Args:
        results: dict of parallel lists under 'input', 'prediction' and 'target'.
            It is not modified; cleaning is done on a copy of the predictions.
        scorer: object whose score(candidates, references) returns a 3-tuple
            with the F1 values last (they must support .mean()).
        config: run settings; the keys read here are 'filename' and 'model'.

    Returns:
        float: mean of the F1 values returned by the scorer.
    """
    # Work on a copy so the caller's results['prediction'] is left untouched.
    predictions = list(results['prediction'])
    if config['filename'] in ['modified_arithmetic/task.json']:
        for i in range(len(results['input'])):
            # Remove the echoed question so only the model's continuation remains.
            question = results['input'][i].replace(' ?', '')
            predictions[i] = predictions[i].replace('?', '').replace(question, '')
    if config['model'] in [
            'chavinlo/alpaca-native',
            'chavinlo/alpaca-13b',
            'models--llama/7B',
            'models--llama/13B',
            'models--llama/30B'
        ]:
        for i in range(len(results['input'])):
            cleaned = predictions[i]
            # Drop a truncated leading "unk>" token as a *prefix*. str.lstrip('unk>')
            # treats its argument as a character set and would also eat the first
            # letters of real answers (e.g. "no" -> "o").
            if cleaned.startswith('unk>'):
                cleaned = cleaned[len('unk>'):]
            # lstrip('?? ') removes any leading '?' and space characters.
            predictions[i] = cleaned.lstrip('?? ').replace('<unk>', '').replace('<s>', '').replace('</s>', '')
    _, _, f1s = scorer.score(predictions, results['target'])
    return float(f1s.mean())
    
def loss_and_gain(results, config, evaluation_scores_df, metric):
    """Count examples a candidate run lost or gained relative to its reference run.

    The reference run is the row of evaluation_scores_df whose model equals
    pairs[config['model']] and whose prompt_type, number_of_shots and seed
    match config. The seed is compared as given first and as a string second.

    NOTE(review): relies on a module-level mapping `pairs` that is not defined
    in this part of the notebook -- presumably candidate model -> reference
    model; make sure it is defined before calling.

    Args:
        results: candidate results dict; reads 'input' (for the example count)
            and results[metric], a list of 'T'/'F' flags.
        config: candidate run settings; reads 'model', 'prompt_type',
            'number_of_shots' and 'seed'.
        evaluation_scores_df: DataFrame with columns run_id, model,
            prompt_type, number_of_shots and seed.
        metric: key of the correctness list to compare in both result files.

    Returns:
        Tuple (loss, gain): loss counts examples that are 'F' here but 'T' in
        the reference; gain counts the opposite.

    Raises:
        IndexError: if no reference run matches.
        FileNotFoundError: if the reference results file is in neither
            results/ nor results_20230618/.
    """
    def _reference_run_ids(seed):
        # run_ids of all rows matching the paired model and the run settings.
        mask = (
            (evaluation_scores_df.model == pairs[config['model']]) &
            (evaluation_scores_df.prompt_type == config['prompt_type']) &
            (evaluation_scores_df.number_of_shots == config['number_of_shots']) &
            (evaluation_scores_df.seed == seed)
        )
        return evaluation_scores_df['run_id'][mask].tolist()

    # The seed column may hold strings; retry with str(seed) only when nothing matched.
    matches = _reference_run_ids(config['seed']) or _reference_run_ids(str(config['seed']))
    if not matches:
        raise IndexError('no reference run found for model {!r}'.format(config['model']))
    run_id_reference = matches[0]

    # Fall back to the archived directory only when the file is actually missing,
    # so that e.g. a corrupt JSON file is reported instead of silently bypassed.
    try:
        with open(f'results/results-{run_id_reference}.json') as file:
            results_reference = json.load(file)
    except FileNotFoundError:
        with open(f'results_20230618/results-{run_id_reference}.json') as file:
            results_reference = json.load(file)

    loss, gain = 0, 0
    for i in range(len(results['input'])):
        outcome = (results[metric][i], results_reference[metric][i])
        if outcome == ('F', 'T'):
            # candidate model got it wrong when reference model got it right
            loss += 1
        elif outcome == ('T', 'F'):
            # candidate model got it right when reference model got it wrong
            gain += 1
    return loss, gain

## accuracy

In [None]:
# One row per run in evaluation_scores.csv; a single BERTScore model is shared by all runs.
config_df = pd.read_csv('evaluation_scores.csv')
scorer = BERTScorer(model_type='roberta-large', device='cuda')

for run_id in config_df.run_id.tolist():
    # Flatten the first row matching this run_id into {column: value}.
    row = config_df[config_df.run_id == run_id].to_dict(orient='list')
    config = {k: v[0] for k, v in row.items()}
    name = config['filename'].replace('/task.json', '')
    with open(f'results/results-{run_id}.json') as file:
        results = json.load(file)

    exact_scores = exact_match(results, config)
    config['exact_match_accuracy'], config['exact_match_f1'] = exact_scores[0], exact_scores[3]

    # -100 marks runs for which the BERTScore-based metrics are not computed.
    config['bert_score_accuracy'], config['bert_score_f1'] = -100, -100
    skip_bert_score = config['prompt_type'] == 'closed-adv' or name in ['gsm8k', 'codenames', 'modified_arithmetic']
    if not skip_bert_score:
        bert_scores = bert_score(results, config, scorer)
        config['bert_score_accuracy'], config['bert_score_f1'] = bert_scores[0], bert_scores[3]

    # Append one tab-separated, header-less line per run to the task's score file.
    with open(f'evaluation_scores/evaluation_scores_{name}.txt', 'a') as file:
        template = '\t'.join('{' + key + '}' for key in config) + '\n'
        file.write(template.format(**config))

## edit distance

In [None]:
# Add the average edit distance to every run listed in the per-task score files.
# sorted() gives a reproducible row order; os.listdir order is unspecified.
for filename in sorted(os.listdir('evaluation_scores/')):
    # The output folders live inside evaluation_scores/ as well; skip anything
    # that is not a regular file so pd.read_csv is never handed a directory.
    if not os.path.isfile('evaluation_scores/' + filename):
        continue
    # The per-task files are written without a header row, so name the columns here.
    config_df = pd.read_csv(
        'evaluation_scores/' + filename,
        sep='\t',
        names=['run_id', 'filename', 'number_of_data',
                'model', 'prompt_type', 'number_of_shots',
                'temperature', 'max_new_tokens', 'batch_size',
                'pad_token', 'pad_token_id', 'eos_token_id',
                'seed', 'device', 'exact_match_accuracy',
                'exact_match_f1', 'bert_score_accuracy',
                'bert_score_f1']
    )
    for run_id in config_df.run_id.tolist():
        # Flatten the first row matching this run_id into {column: value}.
        config = {k: v[0] for k, v in config_df[config_df.run_id == run_id].to_dict(orient='list').items()}
        with open(f'results/results-{run_id}.json') as file:
            results = json.loads(file.read())
        config['edit_distance'] = average_edit_distance(results, config)
        # Append one tab-separated, header-less line per run.
        with open('evaluation_scores/plus edit distance/plus_edit_distance.txt', 'a') as file:
            file.write(('\t'.join(['{' + i + '}' for i in config.keys()]) + '\n').format(**config))

## single bert score

In [None]:
# Add the mean prediction-vs-target BERTScore F1 to every run.
scorer = BERTScorer(model_type='roberta-large', device='cuda')
# plus_edit_distance.txt is appended to without a header row, so the column
# names must be supplied; otherwise the first run would be consumed as the header.
config_df = pd.read_csv(
    'evaluation_scores/plus edit distance/plus_edit_distance.txt',
    sep='\t',
    header=None,
    names=['run_id', 'filename', 'number_of_data',
            'model', 'prompt_type', 'number_of_shots',
            'temperature', 'max_new_tokens', 'batch_size',
            'pad_token', 'pad_token_id', 'eos_token_id',
            'seed', 'device', 'exact_match_accuracy',
            'exact_match_f1', 'bert_score_accuracy',
            'bert_score_f1', 'edit_distance']
)
# Tolerate a header line that was added to the file by hand.
config_df = config_df[config_df.run_id.astype(str) != 'run_id']
for run_id in config_df.run_id.tolist():
    # Flatten the first row matching this run_id into {column: value}.
    config = {k: v[0] for k, v in config_df[config_df.run_id == run_id].to_dict(orient='list').items()}
    with open(f'results/results-{run_id}.json') as file:
        results = json.loads(file.read())
    config['single_bert_score'] = average_single_bert_score(results, scorer, config)
    # Append one tab-separated, header-less line per run.
    with open('evaluation_scores/plus 2/plus_2.txt', 'a') as file:
        file.write(('\t'.join(['{' + i + '}' for i in config.keys()]) + '\n').format(**config))