In [24]:
import math
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from bert_score import score

# Function Definitions
def calculate_precision_recall_ndcg(retrieved_contexts, reference, reference_answer, k=3):
    relevant_docs = set(reference)
    retrieved_docs = retrieved_contexts[0][:k]  # Top-k retrieved contexts
    relevance_scores = [1 if doc in relevant_docs else 0 for doc in retrieved_docs]

    # Precision@k
    precision_at_k = sum(relevance_scores) / len(retrieved_docs) if len(retrieved_docs) > 0 else 0

    # Recall@k
    recall_at_k = sum(relevance_scores) / len(relevant_docs) if len(relevant_docs) > 0 else 0

    # DCG@k
    def dcg(scores):
        return sum(score / math.log2(idx + 2) for idx, score in enumerate(scores))

    dcg_k = dcg(relevance_scores)
    idcg_k = dcg(sorted(relevance_scores, reverse=True))  # Ideal DCG

    # nDCG@k
    ndcg_at_k = dcg_k / idcg_k if idcg_k > 0 else 0

    return precision_at_k, recall_at_k, ndcg_at_k


# BLEU Metric
def evaluate_bleu(output, reference):
    reference_tokens = [ref.split() for ref in reference]
    output_tokens = output[0].split()
    return sentence_bleu(reference_tokens, output_tokens)

# ROUGE Metric
def evaluate_rouge(output, reference):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference[0], output[0])
    return scores

# BERTScore Metric
def evaluate_bertscore(output, reference):
    P, R, F1 = score(output, reference, lang="en", rescale_with_baseline=True)
    return {
        "precision": P.mean().item(),
        "recall": R.mean().item(),
        "f1": F1.mean().item()
    }

# Load Excel File
file_path = "/content/query.xlsx"  # Path to your Excel file
data = pd.read_excel(file_path)

assert all(col in data.columns for col in ['Question', 'RetrievedContexts', 'Reference', 'ReferenceAnswer', 'GeneratedOutput'])

results = []

for index, row in data.iterrows():
    question = row['Question']
    retrieved_contexts = eval(row['RetrievedContexts'])  # Convert string representation to list
    reference = eval(row['Reference'])                  # Convert string representation to list
    reference_answer = [row['ReferenceAnswer']]
    generated_output = [row['GeneratedOutput']]

    # Calculate Metrics
    precision, recall, ndcg = calculate_precision_recall_ndcg(retrieved_contexts, reference,reference_answer ,k=3)
    bleu = evaluate_bleu(generated_output, reference_answer)
    rouge = evaluate_rouge(generated_output, reference_answer)
    bertscore = evaluate_bertscore(generated_output, reference_answer)

    results.append({
        "Question": question,
        "Precision@k": precision,
        "Recall@k": recall,
        "nDCG@k": ndcg,
        "BLEU": bleu,
        "ROUGE-1": rouge['rouge1'].fmeasure,
        "ROUGE-2": rouge['rouge2'].fmeasure,
        "ROUGE-L": rouge['rougeL'].fmeasure,
        "BERTScore-F1": bertscore['f1']
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Save to Excel
output_file = "evaluation_results.xlsx"
results_df.to_excel(output_file, index=False)

print(f"Evaluation results saved to {output_file}")


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation results saved to evaluation_results.xlsx


In [7]:
pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.0.0 sacrebleu-2.4.3


In [6]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=7be6924ed041e0971a73b2e1052b142a2865f350509fedb37ff2a94cb7c237ac
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [5]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [4]:
pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [2]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)



In [3]:
from datasets import Dataset

In [1]:
pip install ragas

Collecting ragas
  Downloading ragas-0.2.9-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets (from ragas)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting tiktoken (from ragas)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting langchain-community (from ragas)
  Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain_openai (from ragas)
  Downloading langchain_openai-0.2.14-py3-none-any.whl.metadata (2.7 kB)
Collecting appdirs (from ragas)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting pysbd>=0.3.4 (from ragas)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting diskcache>=5.6.3 (from ragas)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->ragas)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->ragas