In [1]:
import json
import numpy as np

In [2]:
from typing import List, Dict, Tuple
import math
from collections import Counter

def rouge_scores(expected_strs: List[str], results_strs: List[str]) -> Dict[str, float]:
    def tokenize(text: str) -> List[str]:
        return text.lower().split()

    def compute_rouge(reference: List[str], candidate: List[str]) -> Tuple[float, float, float]:
        ref_counter = Counter(reference)
        cand_counter = Counter(candidate)
        
        common = sum((ref_counter & cand_counter).values())
        
        precision = common / len(candidate) if len(candidate) > 0 else 0
        recall = common / len(reference) if len(reference) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        
        return precision, recall, f1

    def compute_bleu(reference: List[str], candidate: List[str]) -> float:
        ref_len = len(reference)
        cand_len = len(candidate)
        
        if cand_len == 0:
            return 0
        
        brevity_penalty = min(1, math.exp(1 - ref_len / cand_len))
        
        max_n_gram = 4
        weights = [1/max_n_gram] * max_n_gram
        
        clipped_counts = [0] * max_n_gram
        total_counts = [0] * max_n_gram
        
        for n in range(1, max_n_gram + 1):
            ref_ngrams = Counter(zip(*[reference[i:] for i in range(n)]))
            cand_ngrams = Counter(zip(*[candidate[i:] for i in range(n)]))
            
            clipped_counts[n-1] = sum((ref_ngrams & cand_ngrams).values())
            total_counts[n-1] = max(sum(cand_ngrams.values()), 1)
        
        geometric_mean = math.exp(sum(w * math.log(c/t) for w, c, t in zip(weights, clipped_counts, total_counts) if c > 0))
        
        return brevity_penalty * geometric_mean

    def compute_exact_match(reference: str, candidate: str) -> int:
        return int(reference.lower() == candidate.lower())

    rouge_1_scores = []
    rouge_2_scores = []
    rouge_l_scores = []
    bleu_scores = []
    exact_match_scores = []

    for reference, candidate in zip(expected_strs, results_strs):
        ref_tokens = tokenize(reference)
        cand_tokens = tokenize(candidate)
        
        # ROUGE-1
        rouge_1_scores.append(compute_rouge(ref_tokens, cand_tokens)[2])  # F1 score
        
        # ROUGE-2
        ref_bigrams = list(zip(ref_tokens, ref_tokens[1:]))
        cand_bigrams = list(zip(cand_tokens, cand_tokens[1:]))
        rouge_2_scores.append(compute_rouge(ref_bigrams, cand_bigrams)[2])  # F1 score
        
        # ROUGE-L (simplified as longest common subsequence)
        rouge_l_scores.append(compute_rouge(ref_tokens, cand_tokens)[2])  # F1 score
        
        # BLEU
        bleu_scores.append(compute_bleu(ref_tokens, cand_tokens))
        
        # Exact Match
        exact_match_scores.append(compute_exact_match(reference, candidate))

    return {
        "ROUGE-1": sum(rouge_1_scores) / len(rouge_1_scores),
        "ROUGE-2": sum(rouge_2_scores) / len(rouge_2_scores),
        "ROUGE-L": sum(rouge_l_scores) / len(rouge_l_scores),
        "BLEU": sum(bleu_scores) / len(bleu_scores),
        "Exact Match": sum(exact_match_scores) / len(exact_match_scores)
    }

In [8]:
import json
from typing import List, Dict

def read_jsonl(file_path: str) -> List[Dict]:
    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f]

def extract_texts(data: List[Dict]) -> Dict[str, List[str]]:
    generated = [item['generated'] for item in data]
    expected = [item['expected'] for item in data]
    return {'generated': generated, 'expected': expected}

def compare_jsonl_file(file_path: str) -> Dict[str, float]:
    # Read JSONL file
    data = read_jsonl(file_path)

    # Extract generated and expected texts
    texts = extract_texts(data)

    # Calculate ROUGE scores
    scores = rouge_scores(texts['expected'], texts['generated'])

    return scores

def compare_multiple_jsonl_files(file_paths: List[str]) -> Dict[str, Dict[str, float]]:
    results = {}
    for file_path in file_paths:
        results[file_path] = compare_jsonl_file(file_path)
    return results
  

# Example usage
file1_path = '../test-doc-raq_meta_model_2.pt_results.jsonl'
file2_path = '../test-doc-vanilla_meta_model_2.pt_results.jsonl'

results = compare_multiple_jsonl_files([file1_path, file2_path])

for file_path, scores in results.items():
    print(f"Results for {file_path}:")
    for metric, score in scores.items():
        print(f"  {metric}: {score:.4f}")
    print()

Results for ../test-doc-raq_meta_model_2.pt_results.jsonl:
  ROUGE-1: 0.2514
  ROUGE-2: 0.0971
  ROUGE-L: 0.2514
  BLEU: 0.7751
  Exact Match: 0.1347

Results for ../test-doc-vanilla_meta_model_2.pt_results.jsonl:
  ROUGE-1: 0.2823
  ROUGE-2: 0.1045
  ROUGE-L: 0.2823
  BLEU: 0.7826
  Exact Match: 0.1587

