In [None]:
import nltk
import torch
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from rouge_score import rouge_scorer
from nltk.tokenize import word_tokenize
from bert_score import score as bert_score
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
df = pd.read_excel("rag_evaluation_results_OpenAI.xlsx")
df.head()

Unnamed: 0,Question,Ground Truth Answer,ChatGBT,KnowledgeBase,Gemini
0,كيف يمكن تحديد الكيانات التي تصنف كجماعات إرها...,تُحدد الكيانات التي تصنف كجماعات إرهابية بأنها...,تحديد الكيانات التي تصنف كجماعات إرهابية في مص...,تحديد الكيانات التي تصنف كجماعات إرهابية في م...,تحديد الكيانات التي تصنف كجماعات إرهابية وفقًا...
1,ما هي أنواع الأسلحة التي تُعتبر غير تقليدية وم...,الأسلحة غير التقليدية تشمل الأسلحة والمواد الن...,الأسلحة غير التقليدية تشمل الأسلحة والمواد الن...,الأسلحة غير التقليدية تشمل مجموعة من الأسلحة ...,بناءً على النصوص المقدمة، يُضاف إلى القانون رق...
2,ما هي الإجراءات التي يمكن اتخاذها للحفاظ على ا...,يمكن اتخاذ عدة إجراءات للحفاظ على الأمن والنظا...,عند وقوع خطر إرهابي أو كارثة بيئية، يمكن اتخاذ...,عند وقوع خطر إرهابي أو كارثة بيئية، يمكن اتخا...,وفقًا للقانون المصري، تتخذ الإجراءات للحفاظ عل...
3,كيف يتم التعامل مع قرارات اتخاذ التدابير الأمن...,يجب عرض قرار اتخاذ التدابير الأمنية الطارئة عل...,تتطلب قرارات اتخاذ التدابير الأمنية الطارئة في...,يجب عرض قرار اتخاذ التدابير الأمنية الطارئة عل...,تتخذ التدابير الأمنية الطارئة عادة في إطار إعل...
4,ما هي العقوبات المقررة لمن يستولي أو يهاجم مقا...,العقوبات تتراوح بين السجن المشدد الذي لا يقل ع...,تُعاقب القوانين المصرية كل من يستولي أو يهاجم ...,يعاقب كل من يستولي أو يهاجم مقار الجهات الحكو...,يعد الاستيلاء على مقار الجهات الحكومية أو مهاج...


In [None]:
def precision(predicted_tokens, ground_truth_tokens):
    common = set(predicted_tokens) & set(ground_truth_tokens)
    return len(common) / len(predicted_tokens) if predicted_tokens else 0

def recall(predicted_tokens, ground_truth_tokens):
    common = set(predicted_tokens) & set(ground_truth_tokens)
    return len(common) / len(ground_truth_tokens) if ground_truth_tokens else 0

def f1_score(predicted, ground_truth):
    predicted_tokens = predicted.split()
    ground_truth_tokens = ground_truth.split()
    p = precision(predicted_tokens, ground_truth_tokens)
    r = recall(predicted_tokens, ground_truth_tokens)
    f1 = 2 * p * r / (p + r) if (p + r) > 0 else 0
    return p, r, f1

def intersection_over_union(predicted_span, ground_truth_span):
    predicted_start, predicted_end = predicted_span
    ground_truth_start, ground_truth_end = ground_truth_span

    intersection_start = max(predicted_start, ground_truth_start)
    intersection_end = min(predicted_end, ground_truth_end)

    intersection = max(0, intersection_end - intersection_start)
    union = max(predicted_end, ground_truth_end) - min(predicted_start, ground_truth_start)

    return intersection / union if union > 0 else 0

def compute_text_iou(predicted_text, ground_truth_text):
    predicted_span = (0, len(predicted_text))
    ground_truth_span = (0, len(ground_truth_text))
    return intersection_over_union(predicted_span, ground_truth_span)

def compute_metrics_for_text(ground_truth, predicted):
    p, r, f1 = f1_score(predicted, ground_truth)
    iou = compute_text_iou(predicted, ground_truth)
    return p, r, f1, iou

In [None]:
results = []

for index, row in df.iterrows():
    ground_truth = row["Ground Truth Answer"]

    KnowledgeBase_model = row["KnowledgeBase"]
    GPT_model = row["ChatGBT"]
    Gemini_model = row["Gemini"]

    # Compute metrics for Model A
    p_a, r_a, f1_a, iou_a = compute_metrics_for_text(ground_truth, KnowledgeBase_model)
    results.append({
        "Model": "Knowledge Base",
        "Precision": p_a,
        "Recall": r_a,
        "F1": f1_a,
        "IoU": iou_a,
        "Question": index+1,
    })

    # Compute metrics for Model A
    p_b, r_b, f1_b, iou_b = compute_metrics_for_text(ground_truth, GPT_model)
    results.append({
        "Model": "GPT",
        "Precision": p_b,
        "Recall": r_b,
        "F1": f1_b,
        "IoU": iou_b,
        "Question": index+1,
    })

    p_c, r_c, f1_c, iou_c = compute_metrics_for_text(ground_truth, Gemini_model)
    results.append({
        "Model": "Gemini",
        "Precision": p_c,
        "Recall": r_c,
        "F1": f1_c,
        "IoU": iou_c,
        "Question": index+1,
    })

In [None]:
metrics_df = pd.DataFrame(results)

avg_metrics = metrics_df.groupby("Model").mean().reset_index()

avg_metrics = avg_metrics.drop("Question", axis=1)

print("\nAverage Metrics per Model:")
avg_metrics


Average Metrics per Model:


Unnamed: 0,Model,Precision,Recall,F1,IoU
0,GPT,0.213509,0.355733,0.257256,0.560288
1,Gemini,0.195284,0.227171,0.190028,0.585837
2,Knowledge Base,0.311688,0.435334,0.349405,0.615622


In [None]:
metrics_df.head()

Unnamed: 0,Model,Precision,Recall,F1,IoU,Question
0,Knowledge Base,0.158416,0.333333,0.214765,0.472081,1
1,GPT,0.14433,0.291667,0.193103,0.478559,1
2,Gemini,0.134146,0.229167,0.169231,0.551383,1
3,Knowledge Base,0.144578,0.266667,0.1875,0.547135,2
4,GPT,0.866667,0.866667,0.866667,1.0,2


In [None]:
def precision(predicted_tokens, ground_truth_tokens):
    common = set(predicted_tokens) & set(ground_truth_tokens)
    return len(common) / len(predicted_tokens) if predicted_tokens else 0

def recall(predicted_tokens, ground_truth_tokens):
    common = set(predicted_tokens) & set(ground_truth_tokens)
    return len(common) / len(ground_truth_tokens) if ground_truth_tokens else 0

def f1_score(predicted, ground_truth):
    p = precision(predicted, ground_truth)
    r = recall(predicted, ground_truth)
    f1 = 2 * p * r / (p + r) if (p + r) > 0 else 0
    return p, r, f1

def compute_text_iou(predicted_text, ground_truth_text):
    predicted_tokens = set(predicted_text)
    ground_truth_tokens = set(ground_truth_text)

    intersection = predicted_tokens.intersection(ground_truth_tokens)
    union = predicted_tokens.union(ground_truth_tokens)

    iou = len(intersection) / len(union)
    return iou

def compute_metrics_for_text(ground_truth, predicted):
    p, r, f1 = f1_score(predicted, ground_truth)
    iou = compute_text_iou(predicted, ground_truth)
    return p, r, f1, iou

def compute_token_metrics(reference, candidate):

    from nltk.tokenize import word_tokenize

    ref_tokens = word_tokenize(reference.lower())
    cand_tokens = word_tokenize(candidate.lower())

    ref_counts = Counter(ref_tokens)
    cand_counts = Counter(cand_tokens)

    overlap = sum((ref_counts & cand_counts).values())

    p, r, f1 = f1_score(cand_tokens, ref_tokens)
    iou = compute_text_iou(cand_tokens, ref_tokens)
    return p, r, f1, iou

In [None]:
def compute_bleu3(reference, candidate):
    smoothing = SmoothingFunction().method1
    ref_tokens = nltk.word_tokenize(reference.lower())
    cand_tokens = nltk.word_tokenize(candidate.lower())
    try:
        bleu_score = sentence_bleu(
            [ref_tokens],
            cand_tokens,
            weights=(1/3, 1/3, 1/3),
            smoothing_function=smoothing
        )
    except ZeroDivisionError:
        bleu_score = 0
    return bleu_score

def compute_rouge(reference, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    rouge1 = scores['rouge1'].fmeasure
    rouge2 = scores['rouge2'].fmeasure
    rougeL = scores['rougeL'].fmeasure
    return rouge1, rouge2, rougeL

In [None]:
def compute_bertscore(reference, candidate):
    P, R, F1 = bert_score([candidate], [reference], lang='ar', verbose=False)
    return P[0].item(), R[0].item(), F1[0].item()

In [None]:
def compute_all_metrics(reference, candidate):
    token_precision, token_recall, token_f1, iou = compute_token_metrics(reference, candidate)
    bleu3   = compute_bleu3(reference, candidate)
    rouge1, rouge2, rougeL = compute_rouge(reference, candidate)
    bert_p, bert_r, bert_f1 = compute_bertscore(reference, candidate)

    return {
        "precision": token_precision,
        "recall": token_recall,
        "f1": token_f1,
        "iou": iou,
        "bleu3": bleu3,
        "rouge1": rouge1,
        "rouge2": rouge2,
        "rougeL": rougeL,
        "bertscore_precision": bert_p,
        "bertscore_recall": bert_r,
        "bertscore_f1": bert_f1,
    }

In [None]:
results = []

for index, row in df.iterrows():
    ground_truth = row["Ground Truth Answer"]
    answers = {
        "KnowledgeBase": row["KnowledgeBase"],
        "GPT Model": row["ChatGBT"],
        "Gemini Model": row["Gemini"]
    }
    for model_name, candidate in answers.items():
        metrics = compute_all_metrics(ground_truth, candidate)
        metrics["Model"] = model_name
        metrics["Question"] = index + 1
        results.append(metrics)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [None]:
metrics_df = pd.DataFrame(results)
metrics_df.head(6)

Unnamed: 0,precision,recall,f1,iou,bleu3,rouge1,rouge2,rougeL,bertscore_precision,bertscore_recall,bertscore_f1,Model,Question
0,0.163462,0.346939,0.222222,0.163462,0.063214,0.0,0.0,0.0,0.727283,0.743032,0.735073,KnowledgeBase,1
1,0.15,0.306122,0.201342,0.135135,0.063804,0.0,0.0,0.0,0.723293,0.747923,0.735402,GPT Model,1
2,0.144578,0.244898,0.181818,0.118812,0.07249,0.0,0.0,0.0,0.671129,0.706605,0.68841,Gemini Model,1
3,0.151163,0.282609,0.19697,0.132653,0.076042,0.0,0.0,0.0,0.721554,0.759465,0.740024,KnowledgeBase,2
4,0.869565,0.869565,0.869565,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,GPT Model,2
5,0.085714,0.130435,0.103448,0.064516,0.036522,0.0,0.0,0.0,0.622921,0.661206,0.641492,Gemini Model,2


In [None]:
avg_metrics = metrics_df.groupby("Model").mean().reset_index()
avg_metrics = avg_metrics.drop("Question", axis=1)
print("\nAverage Metrics per Model:")
avg_metrics


Average Metrics per Model:


Unnamed: 0,Model,precision,recall,f1,iou,bleu3,rouge1,rouge2,rougeL,bertscore_precision,bertscore_recall,bertscore_f1
0,GPT Model,0.221251,0.374476,0.268115,0.215066,0.143804,0.016946,0.004836,0.016946,0.744383,0.779854,0.761475
1,Gemini Model,0.220137,0.250491,0.212467,0.141109,0.06978,0.001934,0.0,0.001451,0.726841,0.738158,0.731858
2,KnowledgeBase,0.316891,0.450236,0.357318,0.326978,0.265942,0.023877,0.008704,0.023877,0.780531,0.811703,0.795543


In [None]:
avg_metrics.T

Unnamed: 0,0,1,2
Model,GPT Model,Gemini Model,KnowledgeBase
precision,0.221251,0.220137,0.316891
recall,0.374476,0.250491,0.450236
f1,0.268115,0.212467,0.357318
iou,0.215066,0.141109,0.326978
bleu3,0.143804,0.06978,0.265942
rouge1,0.016946,0.001934,0.023877
rouge2,0.004836,0.0,0.008704
rougeL,0.016946,0.001451,0.023877
bertscore_precision,0.744383,0.726841,0.780531


Since Model A is the best for 5 metrics and Model C is the same so I will choose with another way to choose the best model by computing the mean value of all the metrics for each model.