In [2]:
%pip install huggingface_hub
%pip install rouge_score bert_score sentence-transformers

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=5980114e72a831316938ee628cce3c4e6bd4dbf2ecf3513060a77999383480ac
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score, bert_score
Successfully installed bert_score-0.3.13 rouge_score-0.1.2


In [3]:
import os

from huggingface_hub import login

# Never store an access token in the notebook itself: it ends up in version
# control and in shared copies. Read it from the HF_TOKEN environment variable
# instead; when the variable is unset, token=None makes login() ask for the
# token interactively.
login(token=os.environ.get("HF_TOKEN"))


In [4]:
import torch
import json
import random

# Single seed shared by the random number generators seeded below, so that
# repeated runs of the notebook start from the same RNG state.
SEED = 2024
# Seed PyTorch's global generator.
torch.manual_seed(SEED)
# Seed Python's built-in `random` module.
random.seed(SEED)

# Evaluation Class

In [5]:
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bertscore
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import precision_score, recall_score, f1_score

In [6]:
# Download the tokenizer data used by nltk.word_tokenize, which the BLEU
# computation in this notebook calls to split sentences into tokens.
nltk.download('punkt')
# NOTE(review): 'punkt_tab' is presumably the resource name expected by newer
# NLTK releases; both are fetched so either layout works. Confirm against the
# installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [14]:
class RAGEvaluateMetrics:
    """
    Score candidate texts against reference texts with several similarity metrics.

    references[i] is always compared with candidates[i]. Every metric is
    computed per pair and then averaged over all pairs.

    Metrics: sentence-level BLEU, ROUGE-L, BERTScore and SBERT cosine similarity.
    """

    def __init__(self, references, candidates, sbert_path):
        """
        :param references: Iterable of reference strings.
        :param candidates: Iterable of candidate strings, aligned with `references`.
        :param sbert_path: Model name or path passed to SentenceTransformer.
        :raises TypeError: If `references` or `candidates` is a single string.
        :raises ValueError: If the inputs are empty or differ in length.
        """
        # A bare string would be iterated character by character by the metric
        # loops and silently produce meaningless scores.
        if isinstance(references, str) or isinstance(candidates, str):
            raise TypeError(
                "references and candidates must be sequences of strings, not a single string"
            )

        # Materialise the inputs so they can be traversed once per metric
        # (a generator would be exhausted after the first one).
        references = list(references)
        candidates = list(candidates)

        # zip() and diagonal() would silently drop the unpaired tail otherwise.
        if len(references) != len(candidates):
            raise ValueError(
                f"references and candidates must have the same length "
                f"(got {len(references)} and {len(candidates)})"
            )
        # Averaging over zero pairs is undefined.
        if not references:
            raise ValueError("references and candidates must not be empty")

        self.references = references
        self.candidates = candidates
        # Validation happens first so that bad input fails before the
        # (potentially slow) model load.
        self.sbert_model = SentenceTransformer(sbert_path)

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of a non-empty list of numbers."""
        return sum(values) / len(values)

    def compute_bleu(self):
        """
        Compute the mean sentence-level BLEU score over all pairs.

        Texts are lower-cased and tokenized with nltk.word_tokenize. Smoothing
        (SmoothingFunction().method1) is applied so that a missing higher-order
        n-gram match does not collapse the score of a short sentence to zero.

        :return: Average BLEU score as a float.
        """
        smooth = SmoothingFunction().method1
        scores = []

        for ref, cand in zip(self.references, self.candidates):
            # sentence_bleu expects a list of tokenized references per candidate.
            reference_tokens = [nltk.word_tokenize(ref.lower())]
            candidate_tokens = nltk.word_tokenize(cand.lower())
            scores.append(
                sentence_bleu(reference_tokens, candidate_tokens, smoothing_function=smooth)
            )

        return self._mean(scores)

    def compute_rouge_l(self):
        """
        Compute the mean ROUGE-L precision, recall and F1 over all pairs.

        Texts are lower-cased and scored with stemming enabled.

        :return: Tuple (precision, recall, f1) of averaged floats.
        """
        rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
        precisions, recalls, f1_scores = [], [], []

        for ref, cand in zip(self.references, self.candidates):
            # RougeScorer.score takes the target (reference) first, then the prediction.
            rouge_l = rouge.score(ref.lower(), cand.lower())['rougeL']

            precisions.append(rouge_l.precision)
            recalls.append(rouge_l.recall)
            f1_scores.append(rouge_l.fmeasure)

        return self._mean(precisions), self._mean(recalls), self._mean(f1_scores)

    def compute_bertscore(self, lang="en"):
        """
        Compute the mean BERTScore precision, recall and F1 over all pairs.

        :param lang: Language code forwarded to bert_score (defaults to "en").
        :return: Tuple (precision, recall, f1) of averaged floats.
        """
        # bert_score takes candidates first, then references.
        P, R, F1 = bertscore(self.candidates, self.references, lang=lang, verbose=False)
        return P.mean().item(), R.mean().item(), F1.mean().item()

    def compute_sbert(self):
        """
        Compute the mean SBERT cosine similarity between each reference and its candidate.

        :return: Average cosine similarity as a float.
        """
        # Encode reference and candidate sentences using SBERT
        ref_embeddings = self.sbert_model.encode(self.references, convert_to_tensor=True)
        cand_embeddings = self.sbert_model.encode(self.candidates, convert_to_tensor=True)

        # Only the similarity of ref[i] with cand[i] is needed, so compute just
        # those N values instead of a full N x N matrix and reading its diagonal.
        paired_similarities = util.pairwise_cos_sim(ref_embeddings, cand_embeddings)

        return paired_similarities.mean().item()

    def evaluate(self):
        """
        Run all evaluation metrics (BLEU, ROUGE-L, BERTScore, SBERT) and return their results.

        :return: Dictionary with the keys 'BLEU' ({'Precision'}),
                 'ROUGE-L' and 'BERTScore' ({'Precision', 'Recall', 'F1'}),
                 and 'SBERT' ({'Cosine Similarity'}).
        """
        results = {}

        # The BLEU score is stored under 'Precision'; the key is kept unchanged
        # so existing lookups such as results['BLEU']['Precision'] keep working.
        results['BLEU'] = {
            'Precision': self.compute_bleu()
        }

        rouge_p, rouge_r, rouge_f1 = self.compute_rouge_l()
        results['ROUGE-L'] = {
            'Precision': rouge_p,
            'Recall': rouge_r,
            'F1': rouge_f1
        }

        bert_p, bert_r, bert_f1 = self.compute_bertscore()
        results['BERTScore'] = {
            'Precision': bert_p,
            'Recall': bert_r,
            'F1': bert_f1
        }

        results['SBERT'] = {
            'Cosine Similarity': self.compute_sbert()
        }

        return results

In [23]:
# Load the paired evaluation texts. The file is read as UTF-8 explicitly so
# decoding does not depend on the platform's default encoding.
with open("/content/references_candidates.json", "r", encoding="utf-8") as f:
    data = json.load(f)

references_list = data["references"]
candidates_list = data["candidates"]

# The metrics compare references_list[i] with candidates_list[i], so the two
# lists must line up one-to-one.
assert len(references_list) == len(candidates_list), (
    f"references ({len(references_list)}) and candidates ({len(candidates_list)}) "
    "must have the same length"
)

In [26]:
# Build the evaluator over the loaded reference/candidate pairs, using the
# given SBERT checkpoint for the embedding-based similarity metric.
evaluator = RAGEvaluateMetrics(
    references=references_list,
    candidates=candidates_list,
    sbert_path="sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
)

# Compute every metric in one pass and show the full result dictionary.
results = evaluator.evaluate()
print(results)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'BLEU': {'Precision': 0.4067465768082245}, 'ROUGE-L': {'Precision': 0.7059594799995719, 'Recall': 0.7059594799995719, 'F1': 0.7059594799995719}, 'BERTScore': {'Precision': 0.9580601453781128, 'Recall': 0.9522309303283691, 'F1': 0.9551219940185547}, 'SBERT': {'Cosine Similarity': 0.9596773982048035}}


In [27]:
# Average smoothed sentence-level BLEU score over all pairs
# (the evaluator stores it under the 'Precision' key).
results['BLEU']["Precision"]

0.4067465768082245

In [28]:
# Average ROUGE-L F1 over all reference/candidate pairs.
results['ROUGE-L']['F1']

0.7059594799995719

In [29]:
# Average SBERT cosine similarity between each reference and its paired candidate.
results['SBERT']['Cosine Similarity']

0.9596773982048035