In [1]:
import os
import json
import numpy as np
import random
from tqdm.notebook import tqdm
from datasets import load_from_disk
import gc
from collections import Counter
import re
import string
from rouge import Rouge

  from .autonotebook import tqdm as notebook_tqdm


# SUMMARIZATION RESULTS (ROUGE)

In [38]:
def rouge(path):
    """Report mean ROUGE scores for a JSONL file of predictions.

    Every line of the file at ``path`` is parsed as a JSON object. Its
    ``pred_clust`` value is collected as the hypothesis and its ``answers``
    value as the reference; both lists are then scored in a single
    ``Rouge.get_scores`` call.

    Args:
        path: Path of the UTF-8 encoded JSONL file to score.

    Returns:
        A formatted string with the mean ``'f'`` value of rouge-1, rouge-2 and
        rouge-l over all lines, followed by the mean of those three means.
    """
    scorer = Rouge()
    hypotheses, references = [], []
    with open(path, "r", encoding="utf-8") as handle:
        # map() is lazy, so lines are still parsed one at a time, in file order.
        for record in map(json.loads, handle):
            hypotheses.append(record['pred_clust'])
            references.append(record['answers'])

    per_sample = scorer.get_scores(hypotheses, references)

    def mean_f(metric):
        # Average the 'f' entry of one ROUGE variant across all samples.
        return np.mean([sample[metric]['f'] for sample in per_sample])

    r1 = mean_f('rouge-1')
    r2 = mean_f('rouge-2')
    rl = mean_f('rouge-l')
    mean_r = np.mean([r1, r2, rl])

    return f'mean rouge-1 : {r1} \n mean rouge-2 : {r2} \n mean rouge-l : {rl} \n mean rouge : {mean_r}'


# BEFORE

In [39]:
# ROUGE report for the predictions stored under BEFORE/; the cell output is the returned report string.
rouge("BEFORE/QasperSumInstructLONGBENCH.jsonl")
'mean rouge-1 : 0.22230406054912186 \n mean rouge-2 : 0.06967721115827072 \n mean rouge-l : 0.20778082247617455 \n mean rouge : 0.1665873647278557'


'mean rouge-1 : 0.22230406054912186 \n mean rouge-2 : 0.06967721115827072 \n mean rouge-l : 0.20778082247617455 \n mean rouge : 0.1665873647278557'

# AFTER

In [40]:
# ROUGE report for the predictions stored under AFTER/; the cell output is the returned report string.
rouge("AFTER/QasperSumInstructLONGBENCH.jsonl")
'mean rouge-1 : 0.20684256953651334 \n mean rouge-2 : 0.06720703041953023 \n mean rouge-l : 0.19641296510546705 \n mean rouge : 0.15682085502050355'


'mean rouge-1 : 0.20684256953651334 \n mean rouge-2 : 0.06720703041953023 \n mean rouge-l : 0.19641296510546705 \n mean rouge : 0.15682085502050355'

# QA RESULTS (qa_f1_score)

In [45]:
def normalize_answer(s):
    """Normalize an answer string before token-level comparison.

    The steps run in this order: lowercase the text, delete every character
    found in ``string.punctuation``, replace the standalone words "a", "an"
    and "the" with a space, and finally collapse all whitespace runs into
    single spaces (leading and trailing whitespace is dropped).

    Args:
        s: The text to normalize.

    Returns:
        The normalized string.
    """
    lowered = s.lower()
    # Punctuation is stripped before the article pass, so "the," is already
    # the bare word "the" by the time the regex sees it.
    punctuation_free = lowered.translate(str.maketrans("", "", string.punctuation))
    article_free = re.sub(r"\b(a|an|the)\b", " ", punctuation_free)
    return " ".join(article_free.split())

def f1_score(prediction, ground_truth, **kwargs):
    """Compute the F1 score between two token collections, counting duplicates.

    Args:
        prediction: Predicted tokens.
        ground_truth: Reference tokens.
        **kwargs: Accepted and ignored.

    Returns:
        The integer ``0`` when the two collections share no token, otherwise
        the harmonic mean of precision and recall as a float.
    """
    # Counter intersection keeps, per token, the smaller of the two counts.
    overlap = sum((Counter(prediction) & Counter(ground_truth)).values())
    if not overlap:
        return 0
    precision = 1.0 * overlap / len(prediction)
    recall = 1.0 * overlap / len(ground_truth)
    return (2 * precision * recall) / (precision + recall)

def qa_f1_score(prediction, ground_truth):
    """Score a predicted answer against a reference answer with token-level F1.

    Both strings are passed through ``normalize_answer`` and split on
    whitespace; the resulting token lists are handed to ``f1_score``.

    Args:
        prediction: Predicted answer text.
        ground_truth: Reference answer text.

    Returns:
        Whatever ``f1_score`` returns for the two token lists.
    """
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    return f1_score(prediction_tokens, ground_truth_tokens)

def qa(path):
    """Report the mean QA F1 score for a JSONL file of predictions.

    Every line of the file at ``path`` is parsed as a JSON object, and its
    ``pred_clust`` value is scored against its ``answers`` value with
    ``qa_f1_score``.

    Args:
        path: Path of the UTF-8 encoded JSONL file to score.

    Returns:
        A formatted string containing the mean of the per-line scores.
    """
    with open(path, "r", encoding="utf-8") as handle:
        # map() is lazy, so each line is parsed and scored before the next one is read.
        scores = [
            qa_f1_score(record['pred_clust'], record['answers'])
            for record in map(json.loads, handle)
        ]

    return f'mean qa_f1_score: {np.mean(scores)}'


# BEFORE

In [46]:
# Mean QA F1 report for the predictions stored under BEFORE/; the cell output is the returned report string.
qa("BEFORE/QasperQAInstructLONGBENCH.jsonl")
'mean qa_f1_score: 0.11678618735369924'


'mean qa_f1_score: 0.11678618735369924'

In [47]:
# AFTER counterpart of the QA cell above: mean QA F1 report for the predictions stored under AFTER/.
qa("AFTER/QasperQAInstructLONGBENCH.jsonl")
'mean qa_f1_score: 0.05164489309181426'

'mean qa_f1_score: 0.05164489309181426'