In [None]:
from datasets import load_dataset
import numpy as np
import json
import pandas as pd

In [None]:
# Where the aggregated metrics table (one row per input file) is written.
final_filepath = "/kaggle/working/qwen2-0.5B_eval.csv"

# Input files to evaluate: one results.parquet per iteration directory, iter_0 .. iter_10.
# Alternative input set, kept for reference:
# dataset_filepaths = ["/kaggle/input/flan-t5/flan-t5-base/iter_0_results.parquet",
#                     "/kaggle/input/flan-t5/flan-t5-base/iter_1_results_refined.parquet",
#                     "/kaggle/input/flan-t5/flan-t5-base/iter_2_results_refined.parquet"]
dataset_filepaths = [
    f"/kaggle/input/flan-t5/qwen2-0.5B/iter_{iter_i}/results.parquet"
    for iter_i in range(11)
]

# When True, every dataset is truncated to a small prefix for a quick smoke run.
RUN_LIMITED_TEST = False

# Column holding the reference text and column holding the model output.
evaluate_labels = dict(
    truth_label='expected_answer',
    predicted_label='predicted_answer',
)

# Alternative multi-pairing setup, kept for reference:
# evaluate_labels = [{'truth_label': 'expected_answer',
#                    'predicted_label': 'predicted_answer'},
#                   {'truth_label': 'context',
#                    'predicted_label': 'predicted_answer'},
#                   {'truth_label': 'context',
#                    'predicted_label': 'question_predicted_answer'}]
# final_filepath_endings = ['_answer_vs_answer',
#                          '_context_vs_answer',
#                          '_context_vs_qpa']

# Prepping for Evaluation

In [None]:
!pip install evaluate
!pip install rouge_score
!pip install bert_score

In [None]:
from evaluate import load

In [None]:
def obtain_basic_metrics(dataset, truth_label, predicted_label):
    """Score one prediction column against one reference column.

    Runs, in this order, the ``exact_match``, ``rouge``, ``meteor`` and
    ``bertscore`` metrics obtained through ``load`` and collects their results.

    Parameters
    ----------
    dataset
        Object indexable by column name; ``dataset[truth_label]`` and
        ``dataset[predicted_label]`` are passed to the metrics as
        ``references`` and ``predictions``.
    truth_label
        Name of the column holding the reference texts.
    predicted_label
        Name of the column holding the predicted texts.

    Returns
    -------
    tuple[list, list]
        ``(values, names)``: two parallel lists of equal length. ``names`` is
        ``exact_match``, ``rouge1``, ``rouge2``, ``rougeL``, ``rougeLsum``,
        ``meteor``, ``bert_score_model``, ``bert_score_avg_precision``,
        ``bert_score_avg_recall``, ``bert_score_avg_f1``. ``bert_score_model``
        is the ``hashcode`` string returned by the bertscore metric; the three
        ``bert_score_avg_*`` values are ``np.mean`` over its per-example lists.
    """
    # Read each column once and reuse it for all four metrics.
    predictions = dataset[predicted_label]
    references = dataset[truth_label]

    # Insertion-ordered mapping: names and values are derived from the same
    # object at the end, so the two returned lists cannot drift apart.
    scores = {}

    exact_match_result = load("exact_match").compute(
        predictions=predictions,
        references=references,
        ignore_case=True,
        ignore_punctuation=True,
    )
    scores['exact_match'] = exact_match_result['exact_match']

    rouge_result = load("rouge").compute(predictions=predictions, references=references)
    for rouge_key in ('rouge1', 'rouge2', 'rougeL', 'rougeLsum'):
        scores[rouge_key] = rouge_result[rouge_key]

    meteor_result = load("meteor").compute(predictions=predictions, references=references)
    scores['meteor'] = meteor_result['meteor']

    bertscore_result = load("bertscore").compute(
        predictions=predictions,
        references=references,
        lang="en",
    )
    scores['bert_score_model'] = bertscore_result['hashcode']
    scores['bert_score_avg_precision'] = np.mean(bertscore_result['precision'])
    scores['bert_score_avg_recall'] = np.mean(bertscore_result['recall'])
    scores['bert_score_avg_f1'] = np.mean(bertscore_result['f1'])

    return list(scores.values()), list(scores.keys())

In [None]:
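# Mock of obtain_basic_metrics (left commented out): returns fixed placeholder values so the cells below can be dry-run without computing real metrics.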
# def obtain_basic_metrics(dataset_filepath, truth_label, predicted_label):
#     return [dataset_filepath,2,3], ['a','b','c']

In [None]:
def evaluate_dataset(dataset_filepath, truth_label, predicted_label, test_limit=150):
    """Load one parquet results file and compute its basic metrics.

    Parameters
    ----------
    dataset_filepath
        Path passed to ``load_dataset("parquet", data_files=...)``; the
        ``"train"`` split of the result is evaluated.
    truth_label
        Name of the reference column, forwarded to ``obtain_basic_metrics``.
    predicted_label
        Name of the prediction column, forwarded to ``obtain_basic_metrics``.
    test_limit
        Maximum number of leading rows kept when the module-level flag
        ``RUN_LIMITED_TEST`` is true. Ignored otherwise. Defaults to 150.

    Returns
    -------
    tuple
        ``(metric_values, metric_names)`` exactly as returned by
        ``obtain_basic_metrics``.
    """
    dataset = load_dataset("parquet", data_files=dataset_filepath)["train"]

    if RUN_LIMITED_TEST:
        # Clamp to the dataset size: selecting indices past the end raises
        # IndexError, which would break the smoke run on small files.
        n_examples = min(test_limit, len(dataset))
        dataset = dataset.select(range(n_examples))
        print(f"Running a test: computing {n_examples} examples.")

    metric_values, metric_names = obtain_basic_metrics(dataset, truth_label, predicted_label)
    return metric_values, metric_names

# Run for All Datasets

In [None]:
# Evaluate every input file with the configured column pairing and stack the
# per-file metric rows into one table, tagged with the file's position in
# dataset_filepaths.
truth_column = evaluate_labels['truth_label']
predicted_column = evaluate_labels['predicted_label']

all_metrics = []
for filepath in dataset_filepaths:
    metric_values, metric_names = evaluate_dataset(filepath, truth_column, predicted_column)
    all_metrics.append(metric_values)

# Column names come from the last evaluate_dataset call.
all_metrics_df = pd.DataFrame(all_metrics, columns=metric_names).assign(
    iteration=range(len(dataset_filepaths))
)
all_metrics_df

In [None]:
# Write the metrics table to the configured CSV path (DataFrame index included).
all_metrics_df.to_csv(path_or_buf=final_filepath)

## Obtain SummaC Scores

In [None]:
# Banner in the cell output marking where the SummaC stage starts.
print('COMPUTING SUMMAC SCORES')

In [None]:
!pip install summac

In [None]:
from summac.model_summac import SummaCConv

In [None]:
import torch

# Use the first GPU when one is visible; otherwise fall back to CPU rather than
# failing on a hard-coded "cuda:0" in a session without an accelerator.
summac_device = "cuda:0" if torch.cuda.is_available() else "cpu"

model_conv = SummaCConv(
    models=["vitc"],
    bins='percentile',
    granularity="sentence",
    nli_labels="e",
    device=summac_device,
    start_file="default",
    agg="mean",
)

In [None]:
# Mean SummaC-Conv score per input file, appended to the metrics table as a new column.
summac_scores = []

for iter_i, filepath in enumerate(dataset_filepaths):
    print(f"- Computing summac for {iter_i}")
    dataset = load_dataset("parquet", data_files=filepath)["train"]

    if RUN_LIMITED_TEST:
        # Clamp to the dataset size: selecting indices past the end raises
        # IndexError on files with fewer than 150 rows.
        n_examples = min(150, len(dataset))
        dataset = dataset.select(range(n_examples))
        print(f"Running a test: computing {n_examples} examples.")

    # NOTE(review): the truth column is passed first and the predicted column
    # second — confirm this matches the argument order SummaC's score() expects.
    score_conv1 = model_conv.score(dataset[evaluate_labels['truth_label']],
                                   dataset[evaluate_labels['predicted_label']])
    summac_scores.append(np.mean(score_conv1['scores']))

# One score per file, in the same order as the rows built from dataset_filepaths.
all_metrics_df['summac_conv_scores'] = summac_scores
all_metrics_df

In [None]:
# Write the table again to the same CSV path, now including the summac_conv_scores column.
all_metrics_df.to_csv(path_or_buf=final_filepath)