# GPT-4o QA + context Evaluator (Batch Mode) v2
This notebook uses GPT-4o to evaluate the relevance, faithfulness and completeness of RAG-generated answers based on the provided context and question.
It batches 10 items per call for cost and performance efficiency.

v0 - baseline few shot only <br>
v1 - prompt tuning (gpt-4o)<br>
*   add explanations to few shot examples
*   improve prompt (system message, all rubrics)

v2 -
*   List item
*   List item





In [716]:
!pip install openai pandas



# Imports

In [719]:
import os
import pandas as pd
import json
import time
import re
import numpy as np

from collections import defaultdict, Counter

# openAI
from openai import OpenAI

# drive
from google.colab import drive
from google.colab import userdata

# stats
from scipy.stats import spearmanr, pearsonr, kendalltau

In [720]:
# Google Drive Setup

# mount google drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [721]:
!ls '/content/drive/MyDrive/Colab Notebooks/final_project_266'

 200_questions.csv
 266_FP_Phase_Z_Scores.xlsx
 266_FP_SO_evaluator_gpt4.ipynb
 266_FP_SO_QExtract_gpt4.ipynb
 266_SO_RAGAS_evaluator.ipynb
 batch_requests_eval.jsonl
 batch_requests.jsonl
 cleaned_batch_requests_eval.jsonl
 context_question_answer_only.csv
 context_question_answer_only.gsheet
 extracted_passages_Jnana.csv
 final_scored_all_20_qa_pairs_v3.xlsx
'full_generated_q_a_jnana (1).csv'
 full_generated_q_a_jnana.csv
 generated_q_a_df_25passages.csv
 generated_q_a_df.csv
 generated_q_a_df.gsheet
 get_qa.ipynb
 golden_scores_20_eval_7-30-25.csv
 golden_scores_20_eval_7-30-25.gsheet
 golden_scores_20_eval_7-31-25_v0.csv
 golden_scores_20_eval_8-1-25_v1.csv
 gpt4o-mini_QA_evaluator_v0.ipynb
 gpt4o-mini_QA_evaluator_v1.ipynb
 gpt4o-mini_QA_evaluator_v2.ipynb
 gpt_evaluator_scores_v1.xlsx
 gpt_evaluator_scores_v2_0-explained.xlsx
 gpt_evaluator_scores.xlsx
 llama8B_control_group_generate_answers.ipynb
 llama_baseline_eval_7-30-25.csv
 llama_baseline_eval_7-30-25.gsheet
 llama_baselin

# Data Setup - File Paths and Data Frames

## Data Setup: Unit Tests

In [722]:
def fix_mojibake(text):
    if not isinstance(text, str):
        return text
    return (text.replace('‚Äú', '"')
                .replace('‚Äù', '"')
                .replace('‚Äò', "'")
                .replace('‚Äô', "'")
                .replace('â€”', '—')
                .replace('â€“', '–')
                .replace('Ã©', 'é')
                .replace('â€˜', "'")
                .replace('â€™', "'")
                .replace('â€œ', '"')
                .replace('â€', '"')
                .replace('Â', ''))

In [723]:
golden_scores_path = '/content/drive/MyDrive/Colab Notebooks/final_project_266/final_scored_all_20_qa_pairs_v3.xlsx'

In [757]:
golden_scores_df = pd.read_excel(golden_scores_path)
golden_scores_df = golden_scores_df.rename(columns={'Passage': 'context',
                                                    'Insightful Question': 'question',
                                                    'Answer': 'answer',
                                                    'Relevance': 'relevance',
                                                    'Factual Accuracy': 'faithfulness',
                                                    'Completeness': 'completeness'})

golden_scores_df = golden_scores_df[['context', 'question', 'answer', 'relevance', 'faithfulness', 'completeness']]
golden_scores_df['relevance'] = golden_scores_df['relevance'].round(0)
golden_scores_df['faithfulness'] = golden_scores_df['faithfulness'].round(0)
golden_scores_df['completeness'] = golden_scores_df['completeness'].round(0)

for col in golden_scores_df.columns:
  golden_scores_df[col] = golden_scores_df[col].apply(fix_mojibake)

# golden_scores_df


In [758]:
q_a_only = golden_scores_df[['context', 'question', 'answer']]
# q_a_only

In [2]:
pip install krippendorff


Collecting krippendorff
  Downloading krippendorff-0.8.1-py3-none-any.whl.metadata (3.0 kB)
Downloading krippendorff-0.8.1-py3-none-any.whl (18 kB)
Installing collected packages: krippendorff
Successfully installed krippendorff-0.8.1


## Data Setup: Llama baseline (no tuning)


In [759]:
llama_baseline_path = '/content/drive/MyDrive/Colab Notebooks/final_project_266/266_FP_Phase_Z_Scores.xlsx'

In [876]:
llama_baseline_experiment_path = '/content/drive/MyDrive/Colab Notebooks/final_project_266/llama_control_generated_answers_200.csv'

In [880]:
llama_base_exp_df = pd.read_csv(llama_baseline_experiment_path)
llama_base_exp_df = llama_base_exp_df.rename(columns={'generated_answer': 'answer'})
llama_base_exp_df.head()


Unnamed: 0,context,question,answer
0,"When you turn over the pages, you are actually...",What is the role of the Sadhaka in the context...,The Sadhaka is a seeker of spiritual knowledge...
1,"You have the unique chance of meeting Him, at ...",What implication does the passage have for the...,The passage does not have any implication for ...
2,"In the pages of the magazine, published with H...",What challenge does the passage identify in th...,The passage identifies cynicism and credal fan...
3,Until one gets firmly fixed in the path that g...,What is the significance of attuning the mind ...,The passage does not mention attuning the mind...
4,When Ignorance and its concomitant Delusion di...,In what way does deep sleep affect the percept...,Deep sleep does not affect the perception of t...


In [760]:
llama_baseline_results_df = pd.read_excel(llama_baseline_path, sheet_name='HUMAN.CG.BASE.LLAMA')
llama_baseline_results_df = llama_baseline_results_df.rename(columns={'passage': 'context',
                                                                      'generated_answer': 'answer'})
llama_baseline_results_df = llama_baseline_results_df[['context', 'question', 'answer', 'relevance', 'faithfulness', 'completeness']]
llama_baseline_results_df = llama_baseline_results_df[0:50]

for col in llama_baseline_results_df.columns:
  llama_baseline_results_df[col] = llama_baseline_results_df[col].apply(fix_mojibake)

# moved to unit test
llama_baseline_results_df = llama_baseline_results_df.drop(index=[8, 12])
llama_baseline_results_df = llama_baseline_results_df.drop(index=[18])
llama_baseline_results_df = llama_baseline_results_df.drop(index=[19])
llama_baseline_results_df.reset_index(drop=True, inplace=True)

llama_baseline_results_df

Unnamed: 0,context,question,answer,relevance,faithfulness,completeness
0,"N. KASTURI Editor, Sanathana Sarathi JNANA V A...",What role do practices like Bhajan play in the...,The passage does not mention Bhajan or any oth...,2.0,2.0,1.0
1,It is only when full knowledge is won that one...,What is necessary to overcome the challenges t...,To overcome the challenges that arise during D...,4.0,4.0,3.0
2,It is only when full knowledge is won that one...,What is the result of attaining Moksha as desc...,The result of attaining Moksha (Liberation) is...,3.0,1.0,2.0
3,The Sadhaka (aspirant for spiritual progress) ...,What is the source of joy for the Jnani?,The context does not mention the source of joy...,2.0,1.0,2.0
4,"Overfeeding,exhaustion through too much of mov...","What is the significance of the phrase ""Brahma...","There is no mention of the phrase ""Brahmavith ...",3.0,1.0,2.0
5,The basis for both is the attraction of the ou...,What implications does the passage suggest abo...,The passage suggests that attachment to the ou...,4.0,3.0,4.0
6,"Therefore, the Jnani who has had a vision of t...",What role does practice play in managing agita...,The passage does not mention practice as a mea...,3.0,3.0,2.0
7,The mind must be attuned to the contemplation ...,"How are Vrittis described in the passage, and ...",Vrittis are not explicitly described in the pa...,3.0,4.0,2.0
8,These processes are for the evanescent body; t...,"What is implied by the phrase ""There is no dua...","The phrase ""There is no duality there"" implies...",5.0,4.0,4.0
9,"The second, the Brahmavidvara has at- tained t...",What does the dark night of ignorance signify ...,The dark night of ignorance is not mentioned i...,2.0,4.0,2.0


In [761]:
llama_baseline_qa_df = llama_baseline_results_df[['context', 'question', 'answer']]
llama_baseline_qa_df.tail()

Unnamed: 0,context,question,answer
41,It isenough joy to act as a handicap. The joy ...,What is the symbolic significance of the sun r...,The symbolic significance of the sun rising is...
42,"Through attachments and affection, and even en...",Why is desire considered the primary obstacle ...,Desire is considered the primary obstacle to L...
43,He has achieved it by dwelling on the Mithya (...,What qualities are necessary for achieving the...,The passage mentions the following qualities a...
44,He has achieved it by dwelling on the Mithya (...,How does the individual perceive wealth and wo...,"According to the passage, the individual perce..."
45,He will not mistake the rope for the snake. He...,"What does the statement about ""effulgence of B...",The statement implies that the individual (Jiv...


## Data Setup: Llama fine-tuned model results


In [762]:
llama_fine_tuned_path = '/content/drive/MyDrive/Colab Notebooks/final_project_266/266_FP_Phase_Z_Scores.xlsx'

In [763]:
llama_tuned_scored_df = pd.read_excel(llama_fine_tuned_path, sheet_name='HUMAN.fine_tuned_llama')
# llama_tuned_scored_df
llama_tuned_scored_df = llama_tuned_scored_df.rename(columns={'passage': 'context',
                                                              'gen_answer': 'answer',
                                                              'Relevance' : 'relevance',
                                                              'Faithfulness' : 'faithfulness',
                                                              'Completeness' : 'completeness'})
llama_tuned_scored_df = llama_tuned_scored_df[['context','question', 'answer', 'relevance', 'faithfulness', 'completeness']]
llama_tuned_scored_df = llama_tuned_scored_df[0:50]

for col in llama_tuned_scored_df.columns:
  llama_tuned_scored_df[col] = llama_tuned_scored_df[col].apply(fix_mojibake)

# move questions to unit test
golden_scores_df = pd.concat([golden_scores_df, llama_tuned_scored_df.iloc[[8]]], ignore_index=True)
golden_scores_df = pd.concat([golden_scores_df, llama_tuned_scored_df.iloc[[12]]], ignore_index=True)

# questions moved to unit test
llama_tuned_scored_df = llama_tuned_scored_df.drop(index=[8, 12])
llama_tuned_scored_df = llama_tuned_scored_df.drop(index=[18])
llama_tuned_scored_df = llama_tuned_scored_df.drop(index=[19])
llama_tuned_scored_df.reset_index(drop=True, inplace=True)
# llama_tuned_scored_df

In [764]:
llama_tuned_q_a_only = llama_tuned_scored_df[['context', 'question', 'answer']]
llama_tuned_q_a_only

Unnamed: 0,context,question,answer
0,"N. KASTURI Editor, Sanathana Sarathi JNANA V A...",What role do practices like Bhajan play in the...,The passage does not mention Bhajan as a pract...
1,It is only when full knowledge is won that one...,What is necessary to overcome the challenges t...,"According to the passage, consulting the Sadgu..."
2,It is only when full knowledge is won that one...,What is the result of attaining Moksha as desc...,"According to the passage, the result of attain..."
3,The Sadhaka (aspirant for spiritual progress) ...,What is the source of joy for the Jnani?,The passage does not mention the source of joy...
4,"Overfeeding,exhaustion through too much of mov...","What is the significance of the phrase ""Brahma...","The passage does not mention the phrase ""Brahm..."
5,The basis for both is the attraction of the ou...,What implications does the passage suggest abo...,The passage suggests that attachment to the ou...
6,"Therefore, the Jnani who has had a vision of t...",What role does practice play in managing agita...,"According to the passage, the Sadhaka (spiritu..."
7,The mind must be attuned to the contemplation ...,"How are Vrittis described in the passage, and ...",The passage does not describe Vrittis. It ment...
8,These processes are for the evanescent body; t...,"What is implied by the phrase ""There is no dua...","The phrase ""There is no duality there"" implies..."
9,"The second, the Brahmavidvara has at- tained t...",What does the dark night of ignorance signify ...,"The passage does not mention the ""dark night o..."


## Data Setup: RAG

In [765]:
rag_scored_path = '/content/drive/MyDrive/Colab Notebooks/final_project_266/266_FP_Phase_Z_Scores.xlsx'

In [766]:
rag_scored_df = pd.read_excel(rag_scored_path, sheet_name='HUMAN.ph_z_50_rag')
rag_scored_df = rag_scored_df.rename(columns={'generated_answer' : 'answer',
                                              'Relevance' : 'relevance',
                                              'Faithfulness' : 'faithfulness',
                                              'Completeness' : 'completeness'})
rag_scored_df = rag_scored_df[['question', 'answer', 'relevance', 'faithfulness', 'completeness']]
rag_scored_df = rag_scored_df[0:50]

for col in rag_scored_df.columns:
  rag_scored_df[col] = rag_scored_df[col].apply(fix_mojibake)

# rag_scored_df.head()

In [767]:
# add context column for RAG evaluation
rag_scored_df['context'] = llama_tuned_q_a_only['context']
rag_qa_df = rag_scored_df[['context', 'question', 'answer']]
# rag_qa_df.head()

In [768]:
## add rag 18,19 to golden scores --> few shot examples

golden_scores_df = pd.concat([golden_scores_df, rag_scored_df.iloc[[18]]], ignore_index=True)
golden_scores_df = pd.concat([golden_scores_df, rag_scored_df.iloc[[19]]], ignore_index=True)


In [769]:
# remove questions from data
# rag_scored_df = rag_scored_df.drop(index=[19])
# rag_qa_df = rag_qa_df.drop(index=[19])
rag_scored_df = rag_scored_df.drop(index=[8, 12, 18, 19])
rag_qa_df = rag_qa_df.drop(index=[8, 12, 18, 19])

rag_scored_df.reset_index(drop=True, inplace=True)
rag_qa_df.reset_index(drop=True, inplace=True)

rag_scored_df.tail()

Unnamed: 0,question,answer,relevance,faithfulness,completeness,context
41,What is the symbolic significance of the sun r...,The symbolic significance of the sun rising in...,5.0,5.0,5.0,He will not mistake the rope for the snake. He...
42,Why is desire considered the primary obstacle ...,Desire is considered the primary obstacle to L...,4.0,4.0,4.0,
43,What qualities are necessary for achieving the...,The passage mentions that a Jnani (a seeker of...,4.0,4.0,3.0,
44,How does the individual perceive wealth and wo...,"According to the passage, individuals who have...",4.0,5.0,5.0,
45,"What does the statement about ""effulgence of B...","The statement about the ""effulgence of Brahman...",4.0,3.0,4.0,


# OpenAI setup

In [770]:
# OpenAI setup

# Set your OpenAI API key here
os.environ['OPENAI_API_KEY'] = '' # your key here


# Functions

In [771]:
def generate_few_shot_prompt(few_shot_df, metric):
    """
    Generate few-shot prompt examples for a specific metric.

    Args:
        few_shot_df (pd.DataFrame): DataFrame with few-shot examples.
        metric (str): One of 'relevance', 'faithfulness', or 'completeness'.

    Returns:
        str: Formatted few-shot prompt with examples.
    """

    assert metric in {"relevance", "faithfulness", "completeness"}, "Invalid metric."

    lines = [f"Examples for evaluating {metric}:"]

    for i, row in few_shot_df.iterrows():
        lines.append(f"Item {i}:")
        lines.append(f"Question: {row['question']}")
        lines.append(f"Context: {row['context']}")
        lines.append(f"Answer: {row['answer']}")
        lines.append(f"{metric.capitalize()}: {row[metric]}")

        # include explanations if available
        explanation_col = f"{metric}_explanation"
        if explanation_col in row and pd.notnull(row[explanation_col]):
            lines.append(f"Explanation: {row[explanation_col]}")

        lines.append("")  # blank line between examples

    return "\n".join(lines).strip()


In [772]:
def format_score_descriptions(rubric_dict):
    lines = []
    for category, scores in rubric_dict.items():
        lines.append(f"{category.capitalize()} (1-5):")
        for score in sorted(scores.keys(), reverse=True):
            lines.append(f"  {score}: {scores[score]}")
        lines.append("")
    return "\n".join(lines).strip()

In [773]:
def count_big_delta(diff_df):
  numeric_diff = diff_df.copy()

  # Flatten the column index (if it's a MultiIndex)
  if isinstance(numeric_diff.columns, pd.MultiIndex):
      numeric_diff.columns = ['_'.join(col).strip() for col in numeric_diff.columns.values]

  # Get all pairs of self/other columns
  self_cols = [col for col in numeric_diff.columns if col.endswith('_self')]
  other_cols = [col.replace('_self', '_other') for col in self_cols]

  # Count where absolute difference > 1
  count = 0
  for s_col, o_col in zip(self_cols, other_cols):
      diff = (numeric_diff[s_col] - numeric_diff[o_col]).abs()
      count += (diff > 1).sum()

  # print("Number of self-to-other differences greater than 1:", count)

  return count



# GPT Evaluation Prompt

In [775]:
# few_shot_df = golden_scores_df[7:13]
# few_shot_df = golden_scores_df.iloc[[0,1,7,8,9,10,11,12]]
few_shot_df = golden_scores_df
few_shot_df.tail()

Unnamed: 0,context,question,answer,relevance,faithfulness,completeness
19,No attention is paid to the internal and the e...,In what way is self-realization described not ...,"Wise don't know they be wise, thinking they un...",4.0,4.0,4.0
20,"When one takes in an intoxicant, one is not aw...",How does the mind contribute to the cycles of ...,"According to the passage, the mind contributes...",1.0,1.0,3.0
21,"They are like the mirage, which superimposes w...","What is implied by the phrase ""it is ever free...","The phrase ""it is ever free"" implies that Brah...",3.0,1.0,3.0
22,Not to know that this human interlude is but t...,What is the significance of the examples of Sa...,"Based on the provided context, I don't have an...",3.0,4.0,4.0
23,If it is stated that women are not entitledto ...,How does the analogy of the jack tree contribu...,I don't know. The passage does not mention the...,2.0,3.0,4.0


In [776]:
few_shot_rel_df = golden_scores_df.iloc[[0, 9, 14, 19, 23]]
few_shot_faith_df = golden_scores_df.iloc[[0, 9, 14, 19, 20, 22, 23]]
few_shot_comp_df = golden_scores_df.iloc[[0, 9, 14, 19, 20]]

# few_shot_rel_df = golden_scores_df.iloc[[0, 9, 14, 19, 20, 21, 22, 23]]
# few_shot_faith_df = golden_scores_df.iloc[[0, 9, 14, 19, 20, 21, 22, 23]]
# few_shot_comp_df = golden_scores_df.iloc[[0, 9, 14, 19, 20, 21, 22, 23]]

In [777]:
unit_test_scores_df = golden_scores_df.iloc[[1, 2, 4, 6, 7, 8,  11, 12, 13, 16, 17, 18, 10, 3, 5, 15]]
q_a_only = q_a_only.iloc[[1, 2, 4, 6, 7, 8, 11, 12, 13, 16, 17, 18, 10, 3, 5, 15]]

In [778]:
# remove unecessary columns
few_shot_rel_df = few_shot_rel_df[['context', 'question', 'answer', 'relevance']]
few_shot_faith_df = few_shot_faith_df[['context', 'question', 'answer', 'faithfulness']]
few_shot_comp_df = few_shot_comp_df[['context', 'question', 'answer', 'completeness']]
few_shot_rel_df

Unnamed: 0,context,question,answer,relevance
0,If it is stated that women are not entitled to...,How does the denial of scriptural access to wo...,The exclusion of women from Vedic study reflec...,5.0
9,Without a clear understanding of the play in w...,What does the metaphor of life as a 'play in w...,"Somethings move, other things stay stationary,...",3.0
14,"Deprive renunciation (Vairagya) of that basis,...",Is there a contradiction between bhakti (devot...,"Some say bhakti and jnana are alike, but other...",1.0
19,No attention is paid to the internal and the e...,In what way is self-realization described not ...,"Wise don't know they be wise, thinking they un...",4.0
23,If it is stated that women are not entitledto ...,How does the analogy of the jack tree contribu...,I don't know. The passage does not mention the...,2.0


In [779]:
# add explanatioins to few shot examples - relevance
few_shot_rel_df['relevance_explanation'] = '1'

few_shot_rel_df.loc[0, 'relevance_explanation'] = 'The answer directly and specifically addresses the question by linking gender exclusion to Vedantic metaphysics, as described in the context. No irrelevant information is introduced.'
few_shot_rel_df.loc[9, 'relevance_explanation'] = 'The answer touches on the metaphor of movement but shifts entirely off-topic with the reference to “trucks and cars,” which is unrelated to the question or context. Partial topical overlap, but unfocused.'
few_shot_rel_df.loc[14, 'relevance_explanation'] = 'The answer does not address the question of whether bhakti and jnana are stages or contradictory. It makes a vague generalization and does not refer to any concepts in the passage.'
few_shot_rel_df.loc[19, 'relevance_explanation'] = 'The answer loosely aligns with the theme of self-realization as remembering, but the poetic phrasing is indirect and lacks full engagement with the context’s key ideas. Mostly on-topic but underdeveloped.'
# few_shot_rel_df.loc[20, 'relevance_explanation'] = 'The answer focuses on the mind’s role but strays significantly from the specific question about bondage and liberation cycles, making it topically related but not relevant to the core inquiry.'
# few_shot_rel_df.loc[21, 'relevance_explanation'] = 'The answer stays generally on-topic by discussing Brahmam’s nature, but it does not directly engage with the meaning of “ever free” in context, making it only partially relevant.'
# few_shot_rel_df.loc[22, 'relevance_explanation'] = 'Partially relevant; references Arjuna and Krishna, who aren''t in the passage.'
few_shot_rel_df.loc[23, 'relevance_explanation'] = 'Answer avoids the topic and doesn’t engage with the passage.'

few_shot_rel_df

Unnamed: 0,context,question,answer,relevance,relevance_explanation
0,If it is stated that women are not entitled to...,How does the denial of scriptural access to wo...,The exclusion of women from Vedic study reflec...,5.0,The answer directly and specifically addresses...
9,Without a clear understanding of the play in w...,What does the metaphor of life as a 'play in w...,"Somethings move, other things stay stationary,...",3.0,The answer touches on the metaphor of movement...
14,"Deprive renunciation (Vairagya) of that basis,...",Is there a contradiction between bhakti (devot...,"Some say bhakti and jnana are alike, but other...",1.0,The answer does not address the question of wh...
19,No attention is paid to the internal and the e...,In what way is self-realization described not ...,"Wise don't know they be wise, thinking they un...",4.0,The answer loosely aligns with the theme of se...
23,If it is stated that women are not entitledto ...,How does the analogy of the jack tree contribu...,I don't know. The passage does not mention the...,2.0,Answer avoids the topic and doesn’t engage wit...


In [780]:
# add explanatioins to few shot examples - faithfulness
few_shot_faith_df['faithfulness_explanation'] = '1'

few_shot_faith_df.loc[0, 'faithfulness_explanation'] = 'The answer faithfully paraphrases the passage’s examples (e.g., Maithreyi, Parvathi) and correctly interprets the Vedantic principle that the Self is genderless. All claims are directly supported by the context.'
few_shot_faith_df.loc[9, 'faithfulness_explanation'] = 'The answer introduces fabricated imagery (“trucks and cars”) that is unrelated to the context and unsupported by any passage content. No part of the answer is grounded in the source.'
few_shot_faith_df.loc[14, 'faithfulness_explanation'] = 'The answer gestures toward the topic but lacks specific grounding. It does not refer to the context’s emphasis on the unity of bhakti, jnana, and vairagya, making it only partially faithful.'
few_shot_faith_df.loc[19, 'faithfulness_explanation'] = '	The answer paraphrases the concept of self-realization as remembering and indirectly reflects the context''s ideas. However, it omits key terms like “Thuriya” and lacks full grounding in specific phrasing.'
few_shot_faith_df.loc[20, 'faithfulness_explanation'] = 'The answer introduces concepts not grounded in the passage, like deterministic behavior of the mind and its spiritual effects, which are not explicitly mentioned in the context.'
# few_shot_faith_df.loc[21, 'faithfulness_explanation'] = '	The answer presents vague metaphorical ideas that are not supported by the text. Key terms and ideas in the passage (like illusion or misidentification) are ignored.'
few_shot_faith_df.loc[22, 'faithfulness_explanation'] = 'Most of the content is grounded in the context’s omission of the named individuals. However, mentioning Arjuna and Krishna introduces an unsupported claim, even though the denial is otherwise justified.'
few_shot_faith_df.loc[23, 'faithfulness_explanation'] = 'The answer correctly states that the jack tree analogy is not mentioned in the passage, but it does not reference any actual content. The absence claim is plausible, but not deeply grounded.'

few_shot_faith_df


Unnamed: 0,context,question,answer,faithfulness,faithfulness_explanation
0,If it is stated that women are not entitled to...,How does the denial of scriptural access to wo...,The exclusion of women from Vedic study reflec...,5.0,The answer faithfully paraphrases the passage’...
9,Without a clear understanding of the play in w...,What does the metaphor of life as a 'play in w...,"Somethings move, other things stay stationary,...",1.0,The answer introduces fabricated imagery (“tru...
14,"Deprive renunciation (Vairagya) of that basis,...",Is there a contradiction between bhakti (devot...,"Some say bhakti and jnana are alike, but other...",3.0,The answer gestures toward the topic but lacks...
19,No attention is paid to the internal and the e...,In what way is self-realization described not ...,"Wise don't know they be wise, thinking they un...",4.0,\tThe answer paraphrases the concept of self-r...
20,"When one takes in an intoxicant, one is not aw...",How does the mind contribute to the cycles of ...,"According to the passage, the mind contributes...",1.0,The answer introduces concepts not grounded in...
22,Not to know that this human interlude is but t...,What is the significance of the examples of Sa...,"Based on the provided context, I don't have an...",4.0,Most of the content is grounded in the context...
23,If it is stated that women are not entitledto ...,How does the analogy of the jack tree contribu...,I don't know. The passage does not mention the...,3.0,The answer correctly states that the jack tree...


In [781]:
# add explanatioins to few shot examples - completeness
few_shot_comp_df['completeness_explanation'] = '1'

few_shot_comp_df.loc[0, 'completeness_explanation'] = 'Answer addresses all parts of the question.'
few_shot_comp_df.loc[9, 'completeness_explanation'] = 'Answer completes some of the question but is missing important details.'
few_shot_comp_df.loc[14, 'completeness_explanation'] = 'Answer is brief and while a complete sentence, does not fully answer the question. Does not address the connection between bhakti and jnana'
few_shot_comp_df.loc[19, 'completeness_explanation'] = 'Answers is mostly complete but missing minor details. Does not mention the Thuriya stage explicitly'
few_shot_comp_df.loc[20, 'completeness_explanation'] = 'While the answer mentions the mind''s role, it does not clearly explain the cycle or how the mind facilitates bondage or liberation. Partial content is present but incomplete.'
# few_shot_comp_df.loc[21, 'completeness_explanation'] = 'The answer touches on the concept of Brahmam being unchanging, but fails to explain how "ever free" relates to the context’s discussion of bondage, illusion, or perception. Important details are omitted.'
# few_shot_comp_df.loc[22, 'completeness_explanation'] = 'Acknowledges missing info, but doesn''t fully explore the question.'
# few_shot_comp_df.loc[23, 'completeness_explanation'] = 'Notes the omission but misses tying it back to core themes.'

few_shot_comp_df

Unnamed: 0,context,question,answer,completeness,completeness_explanation
0,If it is stated that women are not entitled to...,How does the denial of scriptural access to wo...,The exclusion of women from Vedic study reflec...,5.0,Answer addresses all parts of the question.
9,Without a clear understanding of the play in w...,What does the metaphor of life as a 'play in w...,"Somethings move, other things stay stationary,...",3.0,Answer completes some of the question but is m...
14,"Deprive renunciation (Vairagya) of that basis,...",Is there a contradiction between bhakti (devot...,"Some say bhakti and jnana are alike, but other...",2.0,"Answer is brief and while a complete sentence,..."
19,No attention is paid to the internal and the e...,In what way is self-realization described not ...,"Wise don't know they be wise, thinking they un...",4.0,Answers is mostly complete but missing minor d...
20,"When one takes in an intoxicant, one is not aw...",How does the mind contribute to the cycles of ...,"According to the passage, the mind contributes...",3.0,"While the answer mentions the minds role, it d..."


In [782]:
# few_shot_comp_df['completeness_explanation'] = '1'

# few_shot_comp_df.loc[0, 'completeness_explanation'] = 'The answer fully addresses the question, explaining both the exclusion of women and its contradiction with Vedantic principles. No relevant information from the context is omitted.'
# few_shot_comp_df.loc[9, 'completeness_explanation'] = 'The answer only loosely touches on the metaphor of life as a “play,” but fails to explain its implications for identity or detachment, which are central to the question. Important aspects are missing.'
# few_shot_comp_df.loc[14, 'completeness_explanation'] = 'The answer vaguely references disagreement but does not explain the relationship between bhakti and jnana or how the passage describes their interdependence. Covers only a small part of the question.'
# few_shot_comp_df.loc[19, 'completeness_explanation'] = 'The answer captures the idea of self-realization as remembering, but it does not mention specific contextual elements such as the loss of sensory awareness or the "Fourth (Thuriya) stage." Mostly complete but lacking one key detail.'

# few_shot_comp_df

In [783]:
client = OpenAI()

SYSTEM_MSG = '''You are a strict, careful evaluator for a RAG-based QA system. Your task is to assess answers using only the rubric and examples provided.
You must not use outside knowledge, assumptions, or interpretations. Base every score only on what is explicitly or clearly stated in the context.
Evaluate each item independently. Do not explain or justify scores unless instructed. Return only a JSON list of numeric scores in the specified format.
Be consistent. Do not guess. Follow the rubric exactly.
'''

FEW_SHOT_PROMPT_REL = generate_few_shot_prompt(few_shot_rel_df, 'relevance')
FEW_SHOT_PROMPT_FAITH = generate_few_shot_prompt(few_shot_faith_df, 'faithfulness')
FEW_SHOT_PROMPT_COMP = generate_few_shot_prompt(few_shot_comp_df, 'completeness')

# SCORE_GUIDELINES = format_score_descriptions(score_descriptions)

RELEVANCE_RUBRIC = f'''You are a grading assistant for a subject matter expert.
Your job is to apply the rubric carefully and consistently using only the context provided. Do not use external knowledge or reward fluency alone.
Relevance is about how well the answer directly addresses the question, not just the general topic.

Rate each answer from 1 to 5:
5 – Fully answers the question, highly focused and on-topic.
4 – Mostly answers the question, but includes minor unrelated information.
3 – Partially addresses the question, some digression or vagueness.
2 – Weakly related to the question; mostly off-topic.
1 – Irrelevant or unrelated to the question.

Return only a JSON list in this format:
[
  {{"index": 0, "relevance": 5,"relevance_explanation": "Your explanation here."}},
  ...
]


'''

FAITHFULNESS_RUBRIC = f'''You are a grading assistant for a subject matter expert.
Evaluate how factually consistent the answer is with the provided context. Do not rely on outside knowledge. Do not assume — only accept content that is explicitly or clearly supported.

Faithfulness scores:
5 – Fully supported by the context; no hallucination or unsupported claims.
4 – Mostly supported; minor inference or vague phrasing not directly grounded.
3 – Partially supported; contains some unsupported or ambiguous statements.
2 – Minimally supported; includes clear inaccuracies or weak grounding.
1 – Contradicts or fabricates information; no meaningful support from context.

Return only a JSON list in this format:
[
  {{"index": 0, "faithfulness": 5, "faithfulness_explanation": "Your explanation here."}},
  ...
]

'''

COMPLETENESS_RUBRIC = f'''You are a grading assistant for a subject matter expert.
Assess how completely the answer addresses all parts of the question using only the provided context. Do not assume or reward unnecessary elaboration. A brief but complete answer may receive a 5.

Completeness scores:
5 – Fully answers all aspects of the question; nothing important is missing.
4 – Mostly complete; one minor detail or part may be missing.
3 – Partially complete; some important information is missing.
2 – Incomplete; only covers a small portion of the necessary content.
1 – Severely incomplete; fails to address the question meaningfully.

Return only a JSON list in this format:
[
  {{"index": 0, "completeness": 5, "completeness_explanation": "Your explanation here."}},
  ...
]


'''

In [784]:
def batch_prompt(batch, metric, debug_mode=False):

    if metric == 'relevance':
        RUBRIC = RELEVANCE_RUBRIC
        FEW_SHOT_PROMPT = FEW_SHOT_PROMPT_REL
    elif metric == 'faithfulness':
        RUBRIC = FAITHFULNESS_RUBRIC
        FEW_SHOT_PROMPT = FEW_SHOT_PROMPT_FAITH
    elif metric == 'completeness':
        RUBRIC = COMPLETENESS_RUBRIC
        FEW_SHOT_PROMPT = FEW_SHOT_PROMPT_COMP
    else:
        raise ValueError(f"Invalid metric: {metric}")

    # Base prompt: few-shot examples + rubric
    prompt = FEW_SHOT_PROMPT + "\n\n" + RUBRIC

    # Add instruction to generate explanations if needed
    if debug_mode:
        prompt += "\n\nInclude a brief explanation for each score."

    if debug_mode:
        prompt += (
            "\n\nThis is a debugging session. For each item, explain why the score was given "
            "based only on the context. Be concise and tie the reasoning to the rubric."
        )

    # Add the actual items to evaluate
    for i, item in enumerate(batch):
        prompt += f"\n\nItem {i}:\n"
        prompt += f"Question: {item['question']}\n"
        prompt += f"Context: {item['context']}\n"
        prompt += f"Answer: {item['answer']}"

    return prompt.strip()

def extract_json_list(text):
    match = re.search(r"\[\s*{.*?}\s*\]", text, re.DOTALL)
    if match:
        return json.loads(match.group(0))
    else:
        raise ValueError("No JSON list found in output.")

# def evaluate_batch(batch, metric, debug_mode):
#     prompt = batch_prompt(batch, metric, debug_mode)
#     response = client.chat.completions.create(
#         model="gpt-4o-mini",
#         temperature=0,
#         messages=[
#             {"role": "system", "content": SYSTEM_MSG},
#             {"role": "user", "content": prompt}
#         ]
#     )
#     return extract_json_list(response.choices[0].message.content)

def evaluate_batch(batch, metric, debug_mode=False, n_completions=3, aggregation="average"):
    prompt = batch_prompt(batch, metric, debug_mode=debug_mode)

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        n=n_completions,
        messages=[
            {"role": "system", "content": SYSTEM_MSG},
            {"role": "user", "content": prompt}
        ]
    )

    parsed_completions = []
    for i in range(n_completions):
        try:
            content = response.choices[i].message.content
            parsed = extract_json_list(content)
            parsed_completions.append(parsed)
        except Exception as e:
            print(f"Failed to parse completion {i}: {e}")

    if not parsed_completions:
        raise ValueError("No valid completions parsed.")

    # Aggregate scores (and explanations if present)
    num_items = len(parsed_completions[0])
    aggregated = []

    for i in range(num_items):
        item_result = {"index": i}
        for key in [metric]:
            values = [comp[i][key] for comp in parsed_completions if key in comp[i]]
            if aggregation == "average":
                item_result[key] = round(sum(values) / len(values))
            else:  # fallback to majority vote
                item_result[key] = Counter(values).most_common(1)[0][0]

            # Optional: aggregate explanations
            exp_key = f"{key}_explanation"
            explanations = [comp[i].get(exp_key) for comp in parsed_completions if exp_key in comp[i]]
            if explanations:
                item_result[exp_key] = Counter(explanations).most_common(1)[0][0]

        aggregated.append(item_result)

    return aggregated

def run_on_dataframe(df, batch_size=10, debug=False, n_completions=3):
    rows = df.to_dict("records")
    results = []

    for i in range(0, len(rows), batch_size):
        batch = rows[i:i + batch_size]
        try:
            batch_result_rel = evaluate_batch(batch, 'relevance', debug_mode=debug, n_completions=n_completions)
            batch_result_faith = evaluate_batch(batch, 'faithfulness', debug_mode=debug, n_completions=n_completions)
            batch_result_comp = evaluate_batch(batch, 'completeness', debug_mode=debug, n_completions=n_completions)

            for j in range(len(batch)):
                base = {
                    "context": batch[j]["context"],
                    "question": batch[j]["question"],
                    "answer": batch[j]["answer"],

                    "relevance": batch_result_rel[j].get("relevance"),
                    "faithfulness": batch_result_faith[j].get("faithfulness"),
                    "completeness": batch_result_comp[j].get("completeness"),

                    "relevance_explanation": batch_result_rel[j].get("relevance_explanation"),
                    "faithfulness_explanation": batch_result_faith[j].get("faithfulness_explanation"),
                    "completeness_explanation": batch_result_comp[j].get("completeness_explanation"),
                }
                results.append(base)

        except Exception as e:
            print(f"Batch {i}-{i + batch_size} failed: {e}")

    return pd.DataFrame(results)

## Unit Test

In [785]:
test_eval_df= run_on_dataframe(q_a_only, batch_size=5, debug=False, n_completions=3)
# test_eval_df

In [786]:
# test_eval_df

Unnamed: 0,context,question,answer,relevance,faithfulness,completeness,relevance_explanation,faithfulness_explanation,completeness_explanation
0,"That is to say, he who has known Brahmam be- c...",If the realized soul becomes indistinguishable...,Although the realized soul has merged complete...,5,4,5,The answer directly addresses the question by ...,The answer reflects the idea that individualit...,The answer fully addresses the question by exp...
1,Arguments JNANA VAHINI 31 and discussions mult...,Why are philosophical arguments seen as a dist...,Vedanta views philosophical argument as a dive...,5,5,5,The answer fully addresses the question by exp...,The answer accurately captures the essence of ...,The answer completely explains why philosophic...
2,The ever-moving waves of the lake have to be s...,Why does Vedanta treat the intellect (buddhi) ...,Vedanta regards the intellect as a necessary t...,5,5,5,The answer clearly explains the dual role of t...,"The answer is fully supported by the context, ...",The answer fully addresses the dual role of th...
3,You may know from the Sastras that He who has ...,Why is turning the gaze inward portrayed as mo...,Turning inward is seen as revolutionary becaus...,5,5,5,The answer effectively addresses the question ...,"The answer is fully supported by the context, ...",The answer completely explains why turning the...
4,JNANA VAHINI 19 All agitations will cease the ...,How does understanding the world as mithya (il...,Recognizing the world as mithya dissolves atta...,5,5,5,The answer directly relates to the question by...,"The answer is fully supported by the context, ...",The answer fully addresses how understanding t...
5,"Thus, the mind has to be caged in the cave of ...",Why is silence considered a higher mode of com...,Silence is valued above words in self-realizat...,5,4,5,The answer directly addresses the question by ...,The answer captures the essence of silence as ...,The answer fully addresses the question by exp...
6,The Jiva is a bull reclining in the shade of M...,What does the imagery of the sun reflecting in...,Cousin reflected in many vessels illustrates ...,5,5,5,The answer effectively explains the imagery of...,The answer accurately reflects the context's m...,The answer completely explains the imagery of ...
7,Samam - Control of the senses. Samsara - Chang...,What shifts in perspective occur when death is...,"Death means something stops, and people feel d...",1,1,1,The answer does not address the question regar...,The answer contradicts the context by oversimp...,The answer fails to meaningfully address the q...
8,One will fail to benefit even from one’s actua...,How does the ideal teacher in Advaita Vedanta ...,Teachers sometimes talk and sometimes do not. ...,2,2,2,"The answer is weakly related to the question, ...",The answer minimally supports the context by a...,The answer is brief and does not adequately ex...
9,"This is the discipline called Brahmabhyasa, th...",How does the dissolution of the ego lead to ac...,"Without ego, one may act differently.",2,3,2,"The answer is weakly related to the question, ...",The answer partially supports the context by i...,The answer is incomplete and does not sufficie...


In [787]:
unit_scores = unit_test_scores_df

relavance_res = spearmanr(test_eval_df['relevance'], unit_scores['relevance'])
faithfulness_res = spearmanr(test_eval_df['faithfulness'], unit_scores['faithfulness'])
completeness_res = spearmanr(test_eval_df['completeness'], unit_scores['completeness'])

print('Sperarmans rho correlation - baseline')
print(f'Relevance: {relavance_res.statistic}')
print(f'Faithfulness: {faithfulness_res.statistic}')
print(f'Completeness: {completeness_res.statistic}')

Sperarmans rho correlation - baseline
Relevance: 0.7564093584889487
Faithfulness: 0.7743330496523382
Completeness: 0.8352193110742215


In [788]:
# kendall tau, ndcg
relavance_res = kendalltau(test_eval_df['relevance'], unit_scores['relevance'])
faithfulness_res = kendalltau(test_eval_df['faithfulness'], unit_scores['faithfulness'])
completeness_res = kendalltau(test_eval_df['completeness'], unit_scores['completeness'])

print('Kendall Tau correlation - baseline')
print(f'Relevance: {relavance_res.statistic}')
print(f'Faithfulness: {faithfulness_res.statistic}')
print(f'Completeness: {completeness_res.statistic}')

Kendall Tau correlation - baseline
Relevance: 0.7242486824147758
Faithfulness: 0.7112431779901639
Completeness: 0.7672697427301893


In [789]:
print('Relevance: ', test_eval_df['relevance'].mean())
print('Faithfulness: ', test_eval_df['faithfulness'].mean())
print('Completeness: ', test_eval_df['completeness'].mean())

Relevance:  3.5625
Faithfulness:  3.375
Completeness:  3.5625


In [851]:
unit_test_scores_df.reset_index(drop=True, inplace=True)

In [843]:
test_eval_df = test_eval_df[['context', 'question', 'answer', 'relevance', 'faithfulness', 'completeness']]

In [848]:
test_eval_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   context       16 non-null     object
 1   question      16 non-null     object
 2   answer        16 non-null     object
 3   relevance     16 non-null     int64 
 4   faithfulness  16 non-null     int64 
 5   completeness  16 non-null     int64 
dtypes: int64(3), object(3)
memory usage: 900.0+ bytes


In [854]:
diff_df = test_eval_df.compare(unit_test_scores_df)
diff_count = diff_df.notna().sum().sum()
print(f'{diff_count/2}/60')
# diff_df

test_correct = 60 - (diff_count/2)
test_ob1 = (diff_count/2) - count_big_delta(diff_df)

print(f'accuracy: {test_correct/60.0:.4f}')
print(f'accuracy +/- 1: {(test_correct + test_ob1)/60.0:.4f}')

29.0/60
Number of self-to-other differences greater than 1: 7
accuracy: 0.5167
accuracy +/- 1: 0.8833


In [699]:
# test_eval_df.to_csv("/content/drive/MyDrive/Colab Notebooks/final_project_266/golden_scores_20_eval_8-1-25_v1.csv", index=False)

# Evaluation

## Evaluate: Llama 8B (no tuning)


In [881]:
# llama_baseline_eval_df = run_on_dataframe(llama_baseline_results_df, batch_size=10, debug=False, n_completions=3)
llama_baseline_eval_df = run_on_dataframe(llama_base_exp_df, batch_size=10, debug=False, n_completions=3)
llama_baseline_eval_df.tail()

Batch 60-70 failed: list index out of range


Unnamed: 0,context,question,answer,relevance,faithfulness,completeness,relevance_explanation,faithfulness_explanation,completeness_explanation
199,"Through inquiry on these lines, knowledge is r...",What is the significance of inquiry in achievi...,"According to the passage, inquiry on the lines...",5,5,5,The answer fully addresses the question by cle...,The answer accurately reflects the significanc...,The answer fully addresses the significance of...
200,It cannot be got in Swarga (heaven) or in Moun...,"According to the passage, where can true knowl...","According to the passage, true knowledge of th...",5,5,5,The answer directly answers the question by st...,The answer correctly states that true knowledg...,The answer completely addresses where true kno...
201,He hasachieved it by dwelling on the Mithya (u...,"What does the term ""Mithya"" refer to in the pa...","The term ""Mithya"" refers to the unreality of t...",5,5,5,The answer accurately defines the term 'Mithya...,The answer accurately defines 'Mithya' as the ...,The answer fully explains the term 'Mithya' as...
202,The Jivanmuktha has no trace of the “will toli...,What transformation occurs in the aspirant's u...,The aspirant's understanding transforms in tha...,5,5,5,The answer comprehensively describes the trans...,The answer clearly describes the transformatio...,The answer comprehensively describes the trans...
203,Hewill not mistake the rope for the snake. He ...,"What is the significance of the term ""Abhasa A...","The term ""Abhasa Avaranam"" refers to the mista...",5,5,5,The answer effectively explains the significan...,The answer accurately explains 'Abhasa Avarana...,The answer fully explains the significance of ...


In [882]:
llama_baseline_eval_df.to_csv("/content/drive/MyDrive/Colab Notebooks/final_project_266/llama_baseline_eval_v2_experiment_set.csv", index=False)

In [857]:
# spearmans correlation comparison
relevance_res_base = spearmanr(llama_baseline_eval_df['relevance'], llama_baseline_results_df['relevance'])
faithfulness_res_base = spearmanr(llama_baseline_eval_df['faithfulness'], llama_baseline_results_df['faithfulness'])
completeness_res_base = spearmanr(llama_baseline_eval_df['completeness'], llama_baseline_results_df['completeness'])

print('Sperarmans correlation - llama baseline')
print(f'Relevance: {relevance_res_base.statistic}')
print(f'Faithfulness: {faithfulness_res_base.statistic}')
print(f'Completeness: {completeness_res_base.statistic}')

Sperarmans correlation - llama baseline
Relevance: 0.6697996974859218
Faithfulness: 0.18019928034806182
Completeness: 0.7621951375855086


In [858]:
# kendall tau comparison
relevance_res_base_kt = kendalltau(llama_baseline_eval_df['relevance'], llama_baseline_results_df['relevance'])
faithfulness_res_base_kt = kendalltau(llama_baseline_eval_df['faithfulness'], llama_baseline_results_df['faithfulness'])
completeness_res_base_kt = kendalltau(llama_baseline_eval_df['completeness'], llama_baseline_results_df['completeness'])

print('Kendall Tau correlation - llama baseline')
print(f'Relevance: {relevance_res_base_kt.statistic}')
print(f'Faithfulness: {faithfulness_res_base_kt.statistic}')
print(f'Completeness: {completeness_res_base_kt.statistic}')

Kendall Tau correlation - llama baseline
Relevance: 0.6049994313639846
Faithfulness: 0.16467161375070685
Completeness: 0.7013317858594427


In [859]:
print('Relevance: ', llama_baseline_eval_df['relevance'].mean())
print('Faithfulness: ', llama_baseline_eval_df['faithfulness'].mean())
print('Completeness: ', llama_baseline_eval_df['completeness'].mean())

Relevance:  3.0652173913043477
Faithfulness:  4.934782608695652
Completeness:  2.9782608695652173


In [860]:
llama_baseline_eval_df = llama_baseline_eval_df[['context','question', 'answer', 'relevance', 'faithfulness', 'completeness']]

In [861]:
# comparison
llama_base_diff = llama_baseline_eval_df.compare(llama_baseline_results_df)
llama_base_diff_count = llama_base_diff.notna().sum().sum()
print(f'{llama_base_diff_count/2}/{50*3}')
print(f'accuracy: {(llama_base_diff_count/2)/(50*3):.4f}')

c = count_big_delta(llama_base_diff)
print(f'(off by one) accuracy: {((llama_base_diff_count/2) + c)/150.0:.4f}')

llama_base_correct = (150 - (llama_base_diff_count/2))
llama_base_ob1 = (llama_base_diff_count/2) - c

print(f'accuracy: {llama_base_correct/150.0:.4f}')
print(f'accuracy +/- 1: {(llama_base_correct + llama_base_ob1)/150.0:.4f}')


106.0/150
accuracy: 0.7067
Number of self-to-other differences greater than 1: 39
(off by one) accuracy: 0.9667
accuracy: 0.2933
accuracy +/- 1: 0.7400


## Evaluate: Llama 8B fine-tuned

In [862]:
llama_tuned_eval_df = run_on_dataframe(llama_tuned_q_a_only, batch_size=10, debug=False, n_completions=3)
llama_tuned_eval_df.head()

Unnamed: 0,context,question,answer,relevance,faithfulness,completeness,relevance_explanation,faithfulness_explanation,completeness_explanation
0,"N. KASTURI Editor, Sanathana Sarathi JNANA V A...",What role do practices like Bhajan play in the...,The passage does not mention Bhajan as a pract...,2,5,5,The answer states that the passage does not me...,The answer accurately reflects the context by ...,The answer fully addresses the question by sta...
1,It is only when full knowledge is won that one...,What is necessary to overcome the challenges t...,"According to the passage, consulting the Sadgu...",5,5,5,The answer directly addresses the question by ...,"The answer is fully supported by the context, ...",The answer fully addresses the question by sta...
2,It is only when full knowledge is won that one...,What is the result of attaining Moksha as desc...,"According to the passage, the result of attain...",5,5,5,The answer accurately reflects the context by ...,"The answer is fully supported by the context, ...",The answer fully addresses the question by sta...
3,The Sadhaka (aspirant for spiritual progress) ...,What is the source of joy for the Jnani?,The passage does not mention the source of joy...,2,2,2,The answer states that the passage does not me...,The answer contradicts the context by stating ...,The answer does not address the source of joy ...
4,"Overfeeding,exhaustion through too much of mov...","What is the significance of the phrase ""Brahma...","The passage does not mention the phrase ""Brahm...",1,2,1,The answer states that the passage does not me...,The answer incorrectly states that the passage...,The answer fails to address the question meani...


In [863]:
relavance_res = spearmanr(llama_tuned_eval_df['relevance'], llama_tuned_scored_df['relevance'])
faithfulness_res = spearmanr(llama_tuned_eval_df['faithfulness'], llama_tuned_scored_df['faithfulness'])
completeness_res = spearmanr(llama_tuned_eval_df['completeness'], llama_tuned_scored_df['completeness'])

print('Sperarmans correlation - llama fine-tuned')
print(f'Relevance: {relavance_res.statistic}')
print(f'Faithfulness: {faithfulness_res.statistic}')
print(f'Completeness: {completeness_res.statistic}')

Sperarmans correlation - llama fine-tuned
Relevance: 0.7984673762811771
Faithfulness: 0.34448103660361407
Completeness: 0.5505502311991098


In [864]:
relevance_res_llama_kt = kendalltau(llama_tuned_eval_df['relevance'], llama_tuned_scored_df['relevance'])
faithfulness_res_llama_kt = kendalltau(llama_tuned_eval_df['faithfulness'], llama_tuned_scored_df['faithfulness'])
completeness_res_llama_kt = kendalltau(llama_tuned_eval_df['completeness'], llama_tuned_scored_df['completeness'])

print('Kendall Tau correlation - llama fine-tuned')
print(f'Relevance: {relevance_res_llama_kt.statistic}')
print(f'Faithfulness: {faithfulness_res_llama_kt.statistic}')
print(f'Completeness: {completeness_res_llama_kt.statistic}')

Kendall Tau correlation - llama fine-tuned
Relevance: 0.7179710521800868
Faithfulness: 0.31487641574216935
Completeness: 0.46760141215839707


In [865]:
print('Relevance: ', llama_tuned_eval_df['relevance'].mean())
print('Faithfulness: ', llama_tuned_eval_df['faithfulness'].mean())
print('Completeness: ', llama_tuned_eval_df['completeness'].mean())

Relevance:  3.347826086956522
Faithfulness:  4.5
Completeness:  3.4565217391304346


In [866]:
llama_tuned_eval_df.to_csv("/content/drive/MyDrive/Colab Notebooks/final_project_266/llama_tuned_eval_v2.csv", index=False)

In [867]:
llama_tuned_eval_df = llama_tuned_eval_df[['context', 'question', 'answer', 'relevance', 'faithfulness', 'completeness']]


In [868]:
# comparison
llama_diff_df = llama_tuned_eval_df.compare(llama_tuned_scored_df)
llama_diff_count = llama_diff_df.notna().sum().sum()

llama_ft_correct = 150 - (llama_diff_count/2)
llama_ob1 = (llama_diff_count/2) - count_big_delta(llama_diff_df)

print(f'accuracy: {llama_ft_correct/150.0:.4f}')
print(f'accuracy +/- 1: {(llama_ft_correct + llama_ob1)/150.0:.4f}')

Number of self-to-other differences greater than 1: 54
accuracy: 0.3800
accuracy +/- 1: 0.6400


In [827]:
llama_diff_df

Unnamed: 0_level_0,relevance,relevance,faithfulness,faithfulness,completeness,completeness
Unnamed: 0_level_1,self,other,self,other,self,other
0,2.0,4.0,5.0,3.0,,
1,5.0,3.0,5.0,1.0,5.0,3.0
2,,,5.0,3.0,5.0,3.0
3,2.0,4.0,5.0,2.0,2.0,3.0
4,1.0,4.0,5.0,2.0,1.0,4.0
5,,,5.0,4.0,,
7,2.0,4.0,5.0,3.0,2.0,5.0
8,,,5.0,3.0,5.0,2.0
9,2.0,4.0,5.0,3.0,1.0,3.0
10,1.0,4.0,1.0,3.0,2.0,4.0


In [None]:
# self = evaluator, other = person
llama_diff_df[['relevance']].value_counts()

In [None]:
# self = evaluator, other = person
lama_diff_df[['faithfulness']].value_counts()

In [None]:
# self = evaluator, other = person
llama_diff_df[['completeness']].value_counts()

## Evaluation: RAG

In [869]:
rag_eval_df = run_on_dataframe(rag_qa_df, batch_size=10, debug=False, n_completions=3)
rag_eval_df.head()

Unnamed: 0,context,question,answer,relevance,faithfulness,completeness,relevance_explanation,faithfulness_explanation,completeness_explanation
0,"N. KASTURI Editor, Sanathana Sarathi JNANA V A...",What role do practices like Bhajan play in the...,The passage does not explicitly mention Bhajan...,3,4,4,The answer discusses related practices like Ab...,The answer acknowledges the lack of explicit m...,The answer discusses practices related to mind...
1,It is only when full knowledge is won that one...,What is necessary to overcome the challenges t...,To overcome the challenges that arise during D...,2,2,2,The answer mentions Nishkama Karma but does no...,"The answer introduces Nishkama Karma, which is...",The answer only mentions mastering the senses ...
2,It is only when full knowledge is won that one...,What is the result of attaining Moksha as desc...,"According to the passage, the result of attain...",5,5,5,The answer clearly outlines the results of att...,The answer accurately reflects the results of ...,The answer fully addresses the result of attai...
3,The Sadhaka (aspirant for spiritual progress) ...,What is the source of joy for the Jnani?,"According to the provided context, the source ...",5,5,5,The answer accurately describes the source of ...,The answer correctly identifies that the sourc...,The answer fully explains the source of joy fo...
4,"Overfeeding,exhaustion through too much of mov...","What is the significance of the phrase ""Brahma...","The phrase ""Brahmavith Brahmaiva bhavathi"" is ...",4,1,4,The answer explains the phrase's significance ...,The answer introduces concepts not present in ...,The answer explains the significance of the ph...


In [870]:
relevance_res_rag = spearmanr(rag_eval_df['relevance'], rag_scored_df['relevance'])
faithfulness_res_rag = spearmanr(rag_eval_df['faithfulness'], rag_scored_df['faithfulness'])
completeness_res_rag = spearmanr(rag_eval_df['completeness'], rag_scored_df['completeness'])

relevance_res_rag_kt = kendalltau(rag_eval_df['relevance'], rag_scored_df['relevance'])
faithfulness_res_rag_kt = kendalltau(rag_eval_df['faithfulness'], rag_scored_df['faithfulness'])
completeness_res_rag_kt = kendalltau(rag_eval_df['completeness'], rag_scored_df['completeness'])

In [871]:
print('Sperarmans correlation - RAG')
print(f'Relevance: {relevance_res_rag.statistic}')
print(f'Faithfulness: {faithfulness_res_rag.statistic}')
print(f'Completeness: {completeness_res_rag.statistic}')

Sperarmans correlation - RAG
Relevance: 0.3467189080723678
Faithfulness: 0.16665188937123487
Completeness: 0.4350706761439559


In [872]:
print('Kendall Tau correlation - RAG')
print(f'Relevance: {relevance_res_rag_kt.statistic}')
print(f'Faithfulness: {faithfulness_res_rag_kt.statistic}')
print(f'Completeness: {completeness_res_rag_kt.statistic}')

Kendall Tau correlation - RAG
Relevance: 0.3008809228585808
Faithfulness: 0.14252421488904793
Completeness: 0.37989349191925387


In [873]:
print('Relevance: ', np.mean(rag_eval_df['relevance']))
print('Faithfulness: ', np.mean(rag_eval_df['faithfulness']))
print('Completeness: ', np.mean(rag_eval_df['completeness']))

Relevance:  3.782608695652174
Faithfulness:  3.0434782608695654
Completeness:  3.891304347826087


In [875]:
rag_eval_df = rag_eval_df[['question', 'answer', 'relevance', 'faithfulness', 'completeness']]
rag_scored_df = rag_scored_df[['question', 'answer', 'relevance', 'faithfulness', 'completeness']]

# find incorrect evaluations
rag_diff_df = rag_eval_df.compare(rag_scored_df)
rag_diff_count = rag_diff_df.notna().sum().sum()
print(f'incorrect: {rag_diff_count/2}/{50*3.0}') # number incorrect
rag_correct = (150 - (rag_diff_count/2))

c = count_big_delta(rag_diff_df)
off_by_one = rag_correct + (rag_diff_count/2 - c)

print(f'accuracy: {rag_correct/150.0:.4f}')
print(f'accuracy +/- 1: {off_by_one/150.0:.4f}')


incorrect: 100.0/150.0
Number of self-to-other differences greater than 1: 32
accuracy: 0.3333
accuracy +/- 1: 0.7867


In [874]:
rag_eval_df.to_csv("/content/drive/MyDrive/Colab Notebooks/final_project_266/rag_eval_v2.csv", index=False)

### RAG testing

In [None]:
num = 47

rag_eval_df.iloc[num]['question']

In [None]:
rag_eval_df.iloc[num]['answer']

In [None]:
rag_eval_df.iloc[num]

In [None]:
rag_scored_df.iloc[num]

In [446]:
rag_diff_df = rag_eval_df.compare(rag_scored_df)
# len(rag_compare)
rag_diff_df

Unnamed: 0_level_0,relevance,relevance,faithfulness,faithfulness,completeness,completeness
Unnamed: 0_level_1,self,other,self,other,self,other
0,,,4.0,3.0,4.0,1.0
1,2.0,4.0,2.0,3.0,2.0,3.0
2,5.0,4.0,5.0,4.0,5.0,4.0
4,2.0,4.0,1.0,4.0,5.0,4.0
5,5.0,4.0,,,5.0,4.0
6,5.0,4.0,,,5.0,4.0
7,1.0,3.0,2.0,4.0,2.0,1.0
8,,,3.0,4.0,5.0,4.0
9,2.0,5.0,2.0,4.0,2.0,4.0
10,,,3.0,4.0,,


In [447]:
rag_diff_df[['relevance']].value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
"(relevance, self)","(relevance, other)",Unnamed: 2_level_1
5.0,4.0,11
2.0,4.0,5
4.0,5.0,5
5.0,3.0,3
1.0,3.0,2
1.0,2.0,1
2.0,5.0,1
2.0,3.0,1
1.0,4.0,1
3.0,4.0,1


In [448]:
rag_diff_df[['faithfulness']].value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
"(faithfulness, self)","(faithfulness, other)",Unnamed: 2_level_1
4.0,3.0,5
3.0,4.0,5
2.0,4.0,5
5.0,4.0,4
3.0,5.0,3
4.0,5.0,3
2.0,3.0,3
1.0,3.0,2
2.0,5.0,2
1.0,4.0,1


In [449]:
rag_diff_df[['completeness']].value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
"(completeness, self)","(completeness, other)",Unnamed: 2_level_1
5.0,4.0,15
2.0,3.0,4
2.0,4.0,2
4.0,5.0,2
4.0,3.0,2
2.0,1.0,1
2.0,5.0,1
4.0,1.0,1
3.0,5.0,1
3.0,4.0,1


### Export Excel

In [422]:
with pd.ExcelWriter("/content/drive/MyDrive/Colab Notebooks/final_project_266/gpt_evaluator_scores_v2_0-explained.xlsx", engine="openpyxl") as writer:
    test_eval_df.to_excel(writer, index=False, sheet_name='unit_test_eval')
    llama_baseline_eval_df.to_excel(writer, index=False, sheet_name='llama_baseline_eval')
    llama_tuned_eval_df.to_excel(writer, index=False, sheet_name='llama_fine_tuned_eval')
    rag_eval_df.to_excel(writer, index=False, sheet_name='rag_eval')

In [878]:
with pd.ExcelWriter("/content/drive/MyDrive/Colab Notebooks/final_project_266/reference_scores_v2_8-2.xlsx", engine="openpyxl") as writer:
    unit_scores.to_excel(writer, index=False, sheet_name='unit_test_reference')
    llama_baseline_results_df.to_excel(writer, index=False, sheet_name='llama_baseline_reference')
    llama_tuned_scored_df.to_excel(writer, index=False, sheet_name='llama_fine_tuned_reference')
    rag_scored_df.to_excel(writer, index=False, sheet_name='rag_reference')

In [None]:
# rag_eval_df.to_excel("/content/drive/MyDrive/Colab Notebooks/final_project_266/rag_eval_7-30-25.xlsx", index=False, sheet_name='rag_eval')

# testing...

In [None]:
# Run evaluation on your DataFrame
# results_df = run_on_dataframe(df, batch_size=5)
# results_df.head()

In [None]:
# Save to CSV
# results_df.to_csv("faithfulness_results.csv", index=False)

In [None]:
test_eval_df.to_csv("test_results_20.csv", index=False)

In [None]:
import matplotlib.pyplot as plt

# Create a figure with 3 rows and 2 columns for side-by-side plots
fig, axes = plt.subplots(3, 2, figsize=(12, 12))

# Plot Relevance
axes[0, 0].hist(llama_tuned_scored_df['relevance'], bins=5, range=(0.5, 5.5), edgecolor='black', color='gold')
axes[0, 0].set_title('Relevance (Human Scores)')
axes[0, 0].set_xlabel('Score')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_xticks([1, 2, 3, 4, 5])

axes[0, 1].hist(llama_tuned_eval_df['relevance'], bins=5, range=(0.5, 5.5), edgecolor='black')
axes[0, 1].set_title('Relevance (Evaluator Scores)')
axes[0, 1].set_xlabel('Score')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_xticks([1, 2, 3, 4, 5])

# Plot Faithfulness
axes[1, 0].hist(llama_tuned_scored_df['faithfulness'], bins=5, range=(0.5, 5.5), edgecolor='black', color='gold')
axes[1, 0].set_title('Faithfulness (Human Scores)')
axes[1, 0].set_xlabel('Score')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].set_xticks([1, 2, 3, 4, 5])

axes[1, 1].hist(llama_tuned_eval_df['faithfulness'], bins=5, range=(0.5, 5.5), edgecolor='black')
axes[1, 1].set_title('Faithfulness (Evaluator Scores)')
axes[1, 1].set_xlabel('Score')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_xticks([1, 2, 3, 4, 5])

# Plot Completeness
axes[2, 0].hist(llama_tuned_scored_df['completeness'], bins=5, range=(0.5, 5.5), edgecolor='black', color='gold')
axes[2, 0].set_title('Completeness (Human Scores)')
axes[2, 0].set_xlabel('Score')
axes[2, 0].set_ylabel('Frequency')
axes[2, 0].set_xticks([1, 2, 3, 4, 5])

axes[2, 1].hist(llama_tuned_eval_df['completeness'], bins=5, range=(0.5, 5.5), edgecolor='black')
axes[2, 1].set_title('Completeness (Evaluator Scores)')
axes[2, 1].set_xlabel('Score')
axes[2, 1].set_ylabel('Frequency')
axes[2, 1].set_xticks([1, 2, 3, 4, 5])


plt.tight_layout()
plt.show()

In [None]:
# Create histograms for each metric
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

axes[0].hist(llama_tuned_eval_df['relevance'], bins=5, range=(0.5, 5.5), edgecolor='black')
axes[0].set_title('Relevance Score Distribution (Llama Fine-tuned)')
axes[0].set_xlabel('Score')
axes[0].set_ylabel('Frequency')
axes[0].set_xticks([1, 2, 3, 4, 5])


axes[1].hist(llama_tuned_eval_df['faithfulness'], bins=5, range=(0.5, 5.5), edgecolor='black')
axes[1].set_title('Faithfulness Score Distribution (Llama Fine-tuned)')
axes[1].set_xlabel('Score')
axes[1].set_ylabel('Frequency')
axes[1].set_xticks([1, 2, 3, 4, 5])


axes[2].hist(llama_tuned_eval_df['completeness'], bins=5, range=(0.5, 5.5), edgecolor='black')
axes[2].set_title('Completeness Score Distribution (Llama Fine-tuned)')
axes[2].set_xlabel('Score')
axes[2].set_ylabel('Frequency')
axes[2].set_xticks([1, 2, 3, 4, 5])


plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_confusion_matrix(diff_df, metric):
    # Extract self and other columns for the given metric using MultiIndex
    # Use .loc to access columns by label from the MultiIndex
    compare_df = diff_df.loc[:, (metric, ['self', 'other'])].dropna()
    # Flatten the column names for easier plotting if needed, but not strictly necessary for crosstab
    compare_df.columns = ['_'.join(col).strip() for col in compare_df.columns.values]


    if compare_df.empty:
        print(f"No differences found for {metric}. Cannot create confusion matrix.")
        return

    # Create a confusion matrix using pd.crosstab
    # Use the flattened column names
    confusion_matrix = pd.crosstab(compare_df[f'{metric}_other'], compare_df[f'{metric}_self'],
                                   rownames=['Human Scores'], colnames=['Evaluator Scores'],
                                   dropna=False) # Keep all possible score combinations

    # Fill in missing combinations with 0
    # Determine all possible scores from both 'self' and 'other' columns
    all_scores = sorted(list(set(compare_df[f'{metric}_other']).union(set(compare_df[f'{metric}_self']))))
    # Reindex the confusion matrix to include all possible scores, filling missing with 0
    confusion_matrix = confusion_matrix.reindex(index=all_scores, columns=all_scores, fill_value=0)


    # Plot the confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(confusion_matrix, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix for {metric.capitalize()}')
    plt.show()

# Generate confusion matrices for each metric
plot_confusion_matrix(rag_diff_df, 'relevance')
print()
plot_confusion_matrix(rag_diff_df, 'faithfulness')
print()
plot_confusion_matrix(rag_diff_df, 'completeness')

In [None]:
rag_diff_flat = rag_diff_df.stack()
rag_diff_flat

In [None]:
score_descriptions = {
    "faithfulness": {
        5: "Fully supported by context, no hallucination",
        4: "Mostly supported, minor missing info",
        3: "Partially supported, important gaps",
        2: "Minimally supported, contains incorrect info",
        1: "Contradicts context or fabricated"
    },
    "relevance": {
        5: "Directly answers the question with full focus",
        4: "Mostly relevant but includes extra",
        3: "Somewhat relevant, partial focus",
        2: "Mostly irrelevant with some overlap",
        1: "Completely off-topic"
    },
    "completeness": {
        5: "Fully answers the question",
        4: "Almost complete, missing one part",
        3: "Answers part of the question",
        2: "Provides only one part",
        1: "Does not answer the question"
    }
}

# score_descriptions_2 = {
#     "faithfulness": {
#         5: "Fully supported by context, no hallucination",
#         4: "Mostly supported",
#         3: "Partially supported, important gaps",
#         2: "Minimally supported",
#         1: "Contradicts context or fabricated"
#     },
#     "relevance": {
#         5: "Directly answers the question with full focus",
#         4: "Mostly relevant",
#         3: "Somewhat relevant, partial focus",
#         2: "Mostly irrelevant",
#         1: "Completely off-topic"
#     },
#     "completeness": {
#         5: "Fully answers the question",
#         4: "Almost complete",
#         3: "Answers part of the question",
#         2: "Provides only a minimal part",
#         1: "Does not answer the question"
#     }
# }

score_descriptions_3 = {
    "faithfulness": {
        5: "Fully supported by context, no hallucination",
        4: "Mostly supported, minor missing info",
        3: "Partially supported, important gaps",
        2: "Minimally supported, contains incorrect info",
        1: "Contradicts context or fabricated"
    },
    "relevance": {
        5: "Fully relevant",
        4: "Mostly relevant",
        3: "Somewhat relevant",
        2: "Minimally relevant",
        1: "Irrelevant"
    },
    "completeness": {
        5: "Fully answers the question",
        4: "Almost complete, missing one part",
        3: "Answers part of the question",
        2: "Provides only one part",
        1: "Does not answer the question"
    }
}