Advanced RAG Implementation

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from pymilvus import MilvusClient
from difflib import SequenceMatcher
import warnings
warnings.filterwarnings("ignore")

# Config
DB_NAME = "rag_experiments_384d.db"
COLLECTION_NAME = 'rag_mini_384d'
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

print("Loading models")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
llm_model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name)

client = MilvusClient(DB_NAME)
client.load_collection(collection_name=COLLECTION_NAME)

queries_df = pd.read_parquet(
    "hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet"
).head(150)

print(f"Loaded {len(queries_df)} queries\n")

#Fuzzy Matching instead of Exact Matching
def fuzzy_match(pred, truth):
    """Check if prediction is contained in truth or vice versa"""
    pred_lower = pred.lower().strip()
    truth_lower = truth.lower().strip()
    
    #Exact match
    if pred_lower == truth_lower:
        return 1.0
    
    #Substring match (answer contains ground truth or vice versa)
    if pred_lower in truth_lower or truth_lower in pred_lower:
        return 0.8
    
    #Sequence similarity
    similarity = SequenceMatcher(None, pred_lower, truth_lower).ratio()
    return similarity if similarity > 0.7 else 0.0

def calculate_metrics(predictions, ground_truths):
    """Calculate multiple evaluation metrics"""
    exact_match = 0
    fuzzy_correct = 0
    f1_scores = []
    
    for pred, truth in zip(predictions, ground_truths):
        #Exact match
        if pred.lower().strip() == truth.lower().strip():
            exact_match += 1
            fuzzy_correct += 1
            f1_scores.append(1.0)
        else:
            #Fuzzy match
            score = fuzzy_match(pred, truth)
            if score >= 0.8:
                fuzzy_correct += 1
            f1_scores.append(score)
    
    return {
        'exact_match': (exact_match / len(predictions)) * 100,
        'fuzzy_match': (fuzzy_correct / len(predictions)) * 100,
        'avg_f1': (sum(f1_scores) / len(f1_scores)) * 100
    }




#RAG function
def create_persona_prompt(context, query):
    return f"You are an expert encyclopedia. Answer the question based on the context.\n\nContext:\n{context}\n\nQuestion:\n{query}\n\nAnswer:"

def generate_naive_rag(query, top_k=3):
    query_embedding = embedding_model.encode(query)
    search_results = client.search(
        collection_name=COLLECTION_NAME,
        data=[query_embedding],
        limit=top_k,
        output_fields=["passage"]
    )
    contexts = [hit['entity']['passage'] for hit in search_results[0]]
    context_str = "\n".join(contexts)
    prompt = create_persona_prompt(context_str, query)
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).input_ids
    outputs = model.generate(input_ids, max_length=128)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer, contexts

def rewrite_query(query):
    rewrite_prompt = f"Rephrase this question 2 different ways:\nQuestion: {query}\nRephrasings:"
    input_ids = tokenizer(rewrite_prompt, return_tensors="pt", truncation=True, max_length=512).input_ids
    outputs = model.generate(input_ids, max_length=150)
    rewritten = tokenizer.decode(outputs[0], skip_special_tokens=True)
    alternatives = [q.strip() for q in rewritten.split('\n') if q.strip() and len(q.strip()) > 10]
    return [query] + alternatives[:2]

def rerank_passages(query, passages, top_k=3):
    if len(passages) <= top_k:
        return passages
    pairs = [[query, passage] for passage in passages]
    scores = cross_encoder.predict(pairs)
    reranked = sorted(zip(scores, passages), reverse=True)
    return [passage for _, passage in reranked[:top_k]]

def generate_advanced_rag(query, top_k=3):
    queries = rewrite_query(query)
    all_passages = []
    for q in queries:
        query_embedding = embedding_model.encode(q)
        results = client.search(
            collection_name=COLLECTION_NAME,
            data=[query_embedding],
            limit=10,
            output_fields=["passage"]
        )
        all_passages.extend([hit['entity']['passage'] for hit in results[0]])
    unique_passages = list(dict.fromkeys(all_passages))
    contexts = rerank_passages(query, unique_passages, top_k=top_k)
    context_str = "\n".join(contexts)
    prompt = create_persona_prompt(context_str, query)
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).input_ids
    outputs = model.generate(input_ids, max_length=128)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer, contexts


#Evaluation on 150 queries
print("Running both pipelines")

naive_answers = []
advanced_answers = []

for i in range(len(queries_df)):
    question = queries_df.iloc[i]['question']
    
    # Naive
    n_ans, _ = generate_naive_rag(question, top_k=3)
    naive_answers.append(n_ans)
    
    # Advanced
    a_ans, _ = generate_advanced_rag(question, top_k=3)
    advanced_answers.append(a_ans)
    
    if (i + 1) % 5 == 0:
        print(f"  Processed {i+1}/150 queries")

print(f"\nCGenerated {len(naive_answers)} predictions per pipeline.\n")




#Evaluate
ground_truths = queries_df['answer'].tolist()

naive_metrics = calculate_metrics(naive_answers, ground_truths)
advanced_metrics = calculate_metrics(advanced_answers, ground_truths)

print("Results")

results_df = pd.DataFrame({
    'Pipeline': ['Naive RAG (Baseline)', 'Advanced RAG (Rewrite+Rerank)'],
    'Exact Match (%)': [naive_metrics['exact_match'], advanced_metrics['exact_match']],
    'Fuzzy Match (%)': [naive_metrics['fuzzy_match'], advanced_metrics['fuzzy_match']],
    'Avg F1 (%)': [naive_metrics['avg_f1'], advanced_metrics['avg_f1']]
})

print(results_df.to_markdown(index=False))

em_improvement = advanced_metrics['exact_match'] - naive_metrics['exact_match']
fuzzy_improvement = advanced_metrics['fuzzy_match'] - naive_metrics['fuzzy_match']
f1_improvement = advanced_metrics['avg_f1'] - naive_metrics['avg_f1']

print(f"\nIMPROVEMENTS:")
print(f"Exact Match: {em_improvement:+.1f} percentage points")
print(f"Fuzzy Match: {fuzzy_improvement:+.1f} percentage points")
print(f"Average F1: {f1_improvement:+.1f} percentage points")



# #Detailed Comparison
# print("DETAILED ANSWER COMPARISON (First 10)")

# for i in range(10):
#     gt = ground_truths[i]
#     n_ans = naive_answers[i]
#     a_ans = advanced_answers[i]
    
#     n_match = "Right" if n_ans.lower().strip() == gt.lower().strip() else "Wrong"
#     a_match = "Right" if a_ans.lower().strip() == gt.lower().strip() else "Wrong"
    
#     print(f"\n[{i+1}] {queries_df.iloc[i]['question'][:60]}...")
#     print(f"Ground Truth: '{gt}'")
#     print(f"Naive:  '{n_ans}' {n_match}")
#     print(f"Advanced: '{a_ans}' {a_match}")


#Save Results
results_df.to_csv("advanced_rag_comparison.csv", index=False)
print("\nAdvanced_rag_comparison.csv")

detailed = pd.DataFrame({
    'question': queries_df['question'].tolist(),
    'ground_truth': ground_truths,
    'naive_answer': naive_answers,
    'advanced_answer': advanced_answers,
    'naive_exact': [n.lower().strip() == g.lower().strip() for n, g in zip(naive_answers, ground_truths)],
    'advanced_exact': [a.lower().strip() == g.lower().strip() for a, g in zip(advanced_answers, ground_truths)]
})
detailed.to_csv("advanced_rag_detailed_comparison.csv", index=False)
print("Saved advanced_rag_detailed_comparison.csv")




#Report Summary
print("Summary")

print(f"""
Results:

Naive RAG (Baseline):
   - Exact Match: {naive_metrics['exact_match']:.1f}%
   - Fuzzy Match: {naive_metrics['fuzzy_match']:.1f}%
   - Average F1: {naive_metrics['avg_f1']:.1f}%

Advanced RAG (Rewrite + Rerank):
   - Exact Match: {advanced_metrics['exact_match']:.1f}%
   - Fuzzy Match: {advanced_metrics['fuzzy_match']:.1f}%
   - Average F1: {advanced_metrics['avg_f1']:.1f}%

""")

client.close()

Loading models
Loaded 150 queries

Running both pipelines
  Processed 5/150 queries
  Processed 10/150 queries
  Processed 15/150 queries
  Processed 20/150 queries
  Processed 25/150 queries
  Processed 30/150 queries
  Processed 35/150 queries
  Processed 40/150 queries
  Processed 45/150 queries
  Processed 50/150 queries
  Processed 55/150 queries
  Processed 60/150 queries
  Processed 65/150 queries
  Processed 70/150 queries
  Processed 75/150 queries
  Processed 80/150 queries
  Processed 85/150 queries
  Processed 90/150 queries
  Processed 95/150 queries
  Processed 100/150 queries
  Processed 105/150 queries
  Processed 110/150 queries
  Processed 115/150 queries
  Processed 120/150 queries
  Processed 125/150 queries
  Processed 130/150 queries
  Processed 135/150 queries
  Processed 140/150 queries
  Processed 145/150 queries
  Processed 150/150 queries

CGenerated 150 predictions per pipeline.

Results
| Pipeline                      |   Exact Match (%) |   Fuzzy Match (%)

RAGAs

In [None]:
import os
from openai import OpenAI

os.environ["OPENAI_API_KEY"] = ""

print(f"API Key set: {bool(os.environ.get('OPENAI_API_KEY'))}")
print(f"Key starts with: {os.environ.get('OPENAI_API_KEY', '')[:10]}...")


API Key set: True
Key starts with: sk-proj-5p...


In [4]:
#RAGAs Implementation
import pandas as pd
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from pymilvus import MilvusClient
from datasets import Dataset
import os
import warnings
warnings.filterwarnings("ignore")


print("Importing RAGAs")
from langchain_openai import ChatOpenAI
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall
)

print("RAGAs imported successfully!\n")

#Load Models and Database
DB_NAME = "rag_experiments_384d.db"
COLLECTION_NAME = 'rag_mini_384d'
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

print("Loading models")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
llm_model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name)

client = MilvusClient(DB_NAME)
client.load_collection(collection_name=COLLECTION_NAME)

#Taking 150 queries
queries_df = pd.read_parquet(
    "hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet"
).head(150)

print(f"Loaded {len(queries_df)} test queries\n")


#Initializing RAGAs
print("Initializing RAGAs with gpt-4o-mini")

ragas_llm = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=os.environ["OPENAI_API_KEY"],
    temperature=0
)
print("RAGAs judge model ready!\n")



#RAGAs Pipeline Model
def create_persona_prompt(context, query):
    return f"You are an expert encyclopedia. Answer the question based on the context.\n\nContext:\n{context}\n\nQuestion:\n{query}\n\nAnswer:"

def generate_naive_rag(query, top_k=3):
    query_embedding = embedding_model.encode(query)
    search_results = client.search(
        collection_name=COLLECTION_NAME,
        data=[query_embedding],
        limit=top_k,
        output_fields=["passage"]
    )
    contexts = [hit['entity']['passage'] for hit in search_results[0]]
    context_str = "\n".join(contexts)
    prompt = create_persona_prompt(context_str, query)
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).input_ids
    outputs = model.generate(input_ids, max_length=128)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer, contexts

def rewrite_query(query):
    rewrite_prompt = f"Rephrase this question 2 different ways:\nQuestion: {query}\nRephrasings:"
    input_ids = tokenizer(rewrite_prompt, return_tensors="pt", truncation=True, max_length=512).input_ids
    outputs = model.generate(input_ids, max_length=150)
    rewritten = tokenizer.decode(outputs[0], skip_special_tokens=True)
    alternatives = [q.strip() for q in rewritten.split('\n') if q.strip() and len(q.strip()) > 10]
    return [query] + alternatives[:2]

def rerank_passages(query, passages, top_k=3):
    if len(passages) <= top_k:
        return passages
    pairs = [[query, passage] for passage in passages]
    scores = cross_encoder.predict(pairs)
    reranked = sorted(zip(scores, passages), reverse=True)
    return [passage for _, passage in reranked[:top_k]]

def generate_advanced_rag(query, top_k=3):
    queries = rewrite_query(query)
    all_passages = []
    for q in queries:
        query_embedding = embedding_model.encode(q)
        results = client.search(
            collection_name=COLLECTION_NAME,
            data=[query_embedding],
            limit=10,
            output_fields=["passage"]
        )
        all_passages.extend([hit['entity']['passage'] for hit in results[0]])
    unique_passages = list(dict.fromkeys(all_passages))
    contexts = rerank_passages(query, unique_passages, top_k=top_k)
    context_str = "\n".join(contexts)
    prompt = create_persona_prompt(context_str, query)
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).input_ids
    outputs = model.generate(input_ids, max_length=128)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer, contexts



#Generate Predictions for Both the Naive and Advanced
print("GENERATING PREDICTIONS FOR RAGAS EVALUATION")

naive_answers = []
naive_contexts = []
advanced_answers = []
advanced_contexts = []

for i in range(len(queries_df)):
    question = queries_df.iloc[i]['question']
    
    #Naive RAG
    n_ans, n_ctx = generate_naive_rag(question, top_k=3)
    naive_answers.append(n_ans)
    naive_contexts.append(n_ctx)
    
    #Advanced RAG
    a_ans, a_ctx = generate_advanced_rag(question, top_k=3)
    advanced_answers.append(a_ans)
    advanced_contexts.append(a_ctx)
    
    if (i + 1) % 5 == 0:
        print(f"  Generated {i+1}/150 predictions")

print(f"\nGenerated {len(naive_answers)} predictions per pipeline.\n")


#Preparing Dataset for RAGAs
print("Preparing datasets for RAGAs evaluation")


base_data = {
    'question': queries_df['question'].tolist(),
    'ground_truth': queries_df['answer'].tolist()  # Note: 'ground_truth' not 'ground_truths'
}

naive_dataset = Dataset.from_dict({
    **base_data,
    'answer': naive_answers,
    'contexts': naive_contexts
})

advanced_dataset = Dataset.from_dict({
    **base_data,
    'answer': advanced_answers,
    'contexts': advanced_contexts
})

print(f"Naive dataset: {len(naive_dataset)} samples")
print(f"Advanced dataset: {len(advanced_dataset)} samples\n")



#RAGAs Evaluation
print("RUNNING RAGAS EVALUATION")
print("Using OpenAI gpt-4o-mini to judge the quality of RAG outputs.")

#Metrics we want evaluation on
metrics = [
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall
]

print("Evaluating Naive RAG (Baseline)")
naive_results = evaluate(
    naive_dataset,
    metrics=metrics,
    llm=ragas_llm
)

print("\nEvaluating Advanced RAG (Rewrite+Rerank)")
advanced_results = evaluate(
    advanced_dataset,
    metrics=metrics,
    llm=ragas_llm
)

print("\nRAGAs evaluation done\n")



#Display Results
print("RAGAS EVALUATION RESULTS")

comparison = pd.DataFrame({
    'Naive RAG': {
        'Faithfulness': naive_results['faithfulness'],
        'Answer Relevancy': naive_results['answer_relevancy'],
        'Context Precision': naive_results['context_precision'],
        'Context Recall': naive_results['context_recall']
    },
    'Advanced RAG': {
        'Faithfulness': advanced_results['faithfulness'],
        'Answer Relevancy': advanced_results['answer_relevancy'],
        'Context Precision': advanced_results['context_precision'],
        'Context Recall': advanced_results['context_recall']
    }
}).T

comparison['Avg Score'] = comparison.mean(axis=1)
print(comparison.to_markdown())

#Calculate improvements
print("IMPROVEMENTS (Advanced - Naive)")

improvements = {
    'Faithfulness': advanced_results['faithfulness'] - naive_results['faithfulness'],
    'Answer Relevancy': advanced_results['answer_relevancy'] - naive_results['answer_relevancy'],
    'Context Precision': advanced_results['context_precision'] - naive_results['context_precision'],
    'Context Recall': advanced_results['context_recall'] - naive_results['context_recall']
}

for metric, value in improvements.items():
    print(f"{metric:20s}: {value:+.4f}")


#Saving Results 
print("Saving results to CSV files")
comparison.to_csv("ragas_evaluation_comparison.csv")
print("Saved ragas_evaluation_comparison.csv")

#saved detailed per query result
naive_df = naive_results.to_pandas()
advanced_df = advanced_results.to_pandas()

detailed = pd.DataFrame({
    'question': queries_df['question'].tolist(),
    'ground_truth': queries_df['answer'].tolist(),
    'naive_answer': naive_answers,
    'advanced_answer': advanced_answers,
    'naive_faithfulness': naive_df['faithfulness'].tolist(),
    'naive_answer_relevancy': naive_df['answer_relevancy'].tolist(),
    'naive_context_precision': naive_df['context_precision'].tolist(),
    'naive_context_recall': naive_df['context_recall'].tolist(),
    'adv_faithfulness': advanced_df['faithfulness'].tolist(),
    'adv_answer_relevancy': advanced_df['answer_relevancy'].tolist(),
    'adv_context_precision': advanced_df['context_precision'].tolist(),
    'adv_context_recall': advanced_df['context_recall'].tolist()
})

detailed.to_csv("ragas_detailed_per_query.csv", index=False)
print("Saved ragas_detailed_per_query.csv")


#Final Summary
print("Final Summary")

print(f"""

Naive RAG (Baseline):
   Faithfulness:      {naive_results['faithfulness']:.4f}
   Answer Relevancy:  {naive_results['answer_relevancy']:.4f}
   Context Precision: {naive_results['context_precision']:.4f}
   Context Recall:    {naive_results['context_recall']:.4f}
   Average:           {comparison.loc['Naive RAG', 'Avg Score']:.4f}

Advanced RAG (Query Rewriting + Reranking):
   Faithfulness:      {advanced_results['faithfulness']:.4f}
   Answer Relevancy:  {advanced_results['answer_relevancy']:.4f}
   Context Precision: {advanced_results['context_precision']:.4f}
   Context Recall:    {advanced_results['context_recall']:.4f}
   Average:           {comparison.loc['Advanced RAG', 'Avg Score']:.4f}

""")

client.close()
print("Evaluations done and Files saved")

Importing RAGAs
RAGAs imported successfully!

Loading models
Loaded 150 test queries

Initializing RAGAs with gpt-4o-mini
RAGAs judge model ready!

GENERATING PREDICTIONS FOR RAGAS EVALUATION
  Generated 5/150 predictions
  Generated 10/150 predictions
  Generated 15/150 predictions
  Generated 20/150 predictions
  Generated 25/150 predictions
  Generated 30/150 predictions
  Generated 35/150 predictions
  Generated 40/150 predictions
  Generated 45/150 predictions
  Generated 50/150 predictions
  Generated 55/150 predictions
  Generated 60/150 predictions
  Generated 65/150 predictions
  Generated 70/150 predictions
  Generated 75/150 predictions
  Generated 80/150 predictions
  Generated 85/150 predictions
  Generated 90/150 predictions
  Generated 95/150 predictions
  Generated 100/150 predictions
  Generated 105/150 predictions
  Generated 110/150 predictions
  Generated 115/150 predictions
  Generated 120/150 predictions
  Generated 125/150 predictions
  Generated 130/150 predicti

Evaluating:   0%|          | 0/600 [00:00<?, ?it/s]

Exception raised in Job[1]: IndexError(list index out of range)
Exception raised in Job[9]: IndexError(list index out of range)
Exception raised in Job[5]: IndexError(list index out of range)
Exception raised in Job[13]: IndexError(list index out of range)
Exception raised in Job[21]: IndexError(list index out of range)
Exception raised in Job[17]: IndexError(list index out of range)
Exception raised in Job[29]: IndexError(list index out of range)
Exception raised in Job[37]: IndexError(list index out of range)
Exception raised in Job[49]: IndexError(list index out of range)
Exception raised in Job[61]: IndexError(list index out of range)
Exception raised in Job[57]: IndexError(list index out of range)
Exception raised in Job[65]: IndexError(list index out of range)
Exception raised in Job[69]: IndexError(list index out of range)
Exception raised in Job[73]: IndexError(list index out of range)
Exception raised in Job[77]: IndexError(list index out of range)
Exception raised in Job[81]:


Evaluating Advanced RAG (Rewrite+Rerank)


Evaluating:   0%|          | 0/600 [00:00<?, ?it/s]

Exception raised in Job[1]: IndexError(list index out of range)
Exception raised in Job[13]: IndexError(list index out of range)
Exception raised in Job[9]: IndexError(list index out of range)
Exception raised in Job[5]: IndexError(list index out of range)
Exception raised in Job[17]: IndexError(list index out of range)
Exception raised in Job[25]: IndexError(list index out of range)
Exception raised in Job[21]: IndexError(list index out of range)
Exception raised in Job[37]: IndexError(list index out of range)
Exception raised in Job[33]: IndexError(list index out of range)
Exception raised in Job[41]: IndexError(list index out of range)
Exception raised in Job[45]: IndexError(list index out of range)
Exception raised in Job[49]: IndexError(list index out of range)
Exception raised in Job[53]: IndexError(list index out of range)
Exception raised in Job[57]: IndexError(list index out of range)
Exception raised in Job[73]: IndexError(list index out of range)
Exception raised in Job[77]:


RAGAs evaluation done

RAGAS EVALUATION RESULTS


TypeError: Could not convert [list([0.5, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.6666666666666666, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.5, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.5, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.5, 1.0, 1.0, 1.0, 0.5, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, nan, nan, nan, nan, nan, nan, 0.790231405515927, nan, 0.7504596024364599, nan, 0.7436686867676713, 0.726065956526272, nan, 0.9390847505571212, nan, nan, nan, nan, nan, nan, nan, nan, 0.8092584301875235, nan, nan, nan, nan, nan, 0.703719808354991, nan, nan, nan, nan, nan, 0.7556624112350016, 0.9194750871444589, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 0.9292853286778119, nan, nan, 0.0, nan, nan, nan, 0.726841808433011, nan, nan, nan, nan, 0.7670868783182766, 0.8649944987272046, nan, 0.7409715575840807, 0.7582459694650537, nan, nan, 0.9999999999999996, 0.7685469841879057, 0.7559677223852882, nan, nan, 0.7679964351787575, 0.8280394397759703, nan, nan, nan, 0.9460970722032842, nan, nan, nan, nan, 0.7647869861120177, 0.7411100400487461, 0.7328823970774668, nan, nan, nan, 0.8324313619988138, 0.7682591745927082, nan, nan, nan, nan, 0.733340400387636, 0.0, 0.7448274706906067, 0.8035885798625167, nan, nan, nan, 0.7540554893474805, 0.0, nan, nan, 0.8778617585967566, 0.8080290900882118, 0.7961265124309943, nan, 0.7979426766389878, nan, nan, nan, 0.7645712991253117, nan, nan, nan, 0.0, nan, nan, nan, nan, 0.7643505189004799, 0.7835839424977179, 0.8143147077466034, nan, nan, nan, 0.0, 0.7816196609417391, nan, nan, 0.8136508012440276, nan, nan, nan, nan, nan, 0.8870379649364635, 0.7360329678023557, 0.0, nan, 0.7755542350754362, 0.7592308488103875, nan, nan, 0.0, 0.5833333333041666, 0.9999999999, 0.49999999995, 0.9999999999, 0.9999999999, 0.9999999999, 0.9999999999, 0.9999999999, 0.99999999995, 0.49999999995, 0.9999999999, 0.3333333333, 0.9999999999, 0.49999999995, 0.5833333333041666, 0.8333333332916666, 0.9999999999, 0.9999999999666667, 0.9999999999666667, 0.9999999999, 0.9999999999666667, 0.49999999995, 0.9999999999, 0.99999999995, 0.0, 0.3333333333, 0.8333333332916666, 0.9999999999666667, 0.0, 0.9999999999, 0.0, 0.49999999995, 0.99999999995, 0.9999999999, 0.49999999995, 0.9999999999, 0.99999999995, 0.8333333332916666, 0.99999999995, 0.0, 0.49999999995, 0.0, 0.0, 0.3333333333, 0.49999999995, 0.9999999999, 0.99999999995, 0.9999999999, 0.99999999995, 0.9999999999, 0.9999999999, 0.5833333333041666, 0.99999999995, 0.9999999999666667, 0.0, 0.0, 0.9999999999666667, 0.9999999999, 0.99999999995, 0.99999999995, 0.8333333332916666, 0.8333333332916666, 0.9999999999, 0.8333333332916666, 0.0, 0.99999999995, 0.99999999995, 0.99999999995, 0.9999999999666667, 0.99999999995, 0.99999999995, 0.9999999999666667, 0.49999999995, 0.9999999999, 0.3333333333, 0.9999999999, 0.0, 0.0, 0.99999999995, 0.9999999999, 0.9999999999, 0.8333333332916666, 0.99999999995, 0.9999999999, 0.9999999999, 0.0, 0.0, 0.9999999999, 0.49999999995, 0.0, 0.9999999999, 0.9999999999, 0.49999999995, 0.3333333333, 0.49999999995, 0.0, 0.0, 0.0, 0.0, 0.0, 0.49999999995, 0.0, 0.3333333333, 0.49999999995, 0.49999999995, 0.0, 0.9999999999, 0.9999999999, 0.8333333332916666, 0.0, 0.0, 0.49999999995, 0.0, 0.49999999995, 0.49999999995, 0.99999999995, 0.9999999999, 0.9999999999, 0.0, 0.3333333333, 0.9999999999, 0.9999999999, 0.99999999995, 0.9999999999, 0.9999999999666667, 0.8333333332916666, 0.99999999995, 0.9999999999, 0.0, 0.9999999999, 0.0, 0.99999999995, 0.9999999999, 0.9999999999, 0.0, 0.0, 0.3333333333, 0.9999999999, 0.0, 0.99999999995, 0.9999999999, 0.99999999995, 0.9999999999, 0.9999999999, 0.0, 0.49999999995, 0.3333333333, 0.9999999999, 0.99999999995, 0.99999999995, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0])
 list([1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 1.0, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.3333333333333333, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, nan, nan, nan, nan, nan, nan, nan, 0.8379166742872712, nan, nan, nan, nan, nan, nan, nan, 0.8926040363649793, 0.7444352103587205, 0.7539749817755698, nan, nan, 0.8961055324696265, 0.7289981644217085, nan, nan, 0.0, 0.7952564493906923, 0.7630672580459482, nan, nan, nan, 0.7928240981761444, nan, nan, nan, 0.7638844374706212, 0.7527254695451299, nan, nan, nan, nan, 0.806070617743562, 0.806845332748615, nan, nan, nan, 0.7299916110245647, 0.7240311716901834, 0.7500544520682663, nan, 0.0, nan, 0.7530855521563146, 0.7491680075226194, 0.7478956842528289, 0.0, nan, 0.8403994460614884, 0.7424326023228511, 0.7401133450336189, nan, nan, 0.9180463484299454, nan, 0.7670868783182766, 0.9204202145603658, nan, nan, nan, nan, nan, nan, 0.7724229264939374, 0.7570240897163054, 0.7318946491689681, nan, 0.7703183731448089, nan, 0.7849953178237253, nan, nan, 0.9496737028965653, 0.9477732896541319, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 0.7969693307792766, nan, nan, 0.7597844621104501, 0.0, nan, 0.7807415571493689, nan, nan, 0.7410679963907395, nan, nan, nan, nan, 0.7518902090685232, 0.779881094336433, 0.8778052594297933, nan, nan, nan, 0.780484669160851, nan, 0.7578349817443475, nan, nan, nan, nan, 0.7500572303737703, nan, nan, 0.8968622713536692, 0.748417615292456, nan, nan, nan, nan, nan, 0.8503178693757462, nan, nan, 0.7583090787338115, nan, nan, 0.7490051987722713, nan, 0.8307252994906489, 0.7552370780213926, nan, nan, nan, nan, 0.7460816469181258, nan, nan, nan, nan, 0.7439119525377144, 0.0, 0.99999999995, 0.9999999999, 0.9999999999, 0.9999999999, 0.9999999999, 0.9999999999, 0.9999999999, 0.9999999999, 0.9999999999, 0.8333333332916666, 0.9999999999, 0.9999999999, 0.9999999999, 0.3333333333, 0.9999999999666667, 0.8333333332916666, 0.9999999999, 0.8333333332916666, 0.99999999995, 0.9999999999, 0.9999999999666667, 0.9999999999, 0.9999999999, 0.9999999999, 0.0, 0.9999999999666667, 0.8333333332916666, 0.9999999999666667, 0.0, 0.9999999999, 0.9999999999, 0.9999999999, 0.99999999995, 0.9999999999, 0.9999999999, 0.9999999999, 0.9999999999666667, 0.9999999999666667, 0.8333333332916666, 0.0, 0.9999999999, 0.49999999995, 0.0, 0.3333333333, 0.9999999999, 0.9999999999, 0.99999999995, 0.9999999999, 0.99999999995, 0.9999999999, 0.9999999999, 0.9999999999, 0.9999999999, 0.9999999999, 0.0, 0.0, 0.9999999999666667, 0.9999999999, 0.9999999999, 0.99999999995, 0.9999999999, 0.8333333332916666, 0.9999999999, 0.9999999999666667, 0.9999999999, 0.9999999999666667, 0.9999999999666667, 0.99999999995, 0.9999999999666667, 0.8333333332916666, 0.9999999999666667, 0.9999999999666667, 0.49999999995, 0.9999999999, 0.3333333333, 0.9999999999, 0.0, 0.0, 0.99999999995, 0.9999999999, 0.99999999995, 0.9999999999666667, 0.9999999999666667, 0.9999999999, 0.9999999999666667, 0.0, 0.0, 0.8333333332916666, 0.9999999999, 0.0, 0.9999999999, 0.9999999999, 0.9999999999, 0.9999999999666667, 0.9999999999, 0.9999999999, 0.9999999999666667, 0.9999999999666667, 0.99999999995, 0.8333333332916666, 0.9999999999, 0.9999999999, 0.49999999995, 0.3333333333, 0.9999999999, 0.0, 0.9999999999, 0.9999999999, 0.99999999995, 0.9999999999, 0.9999999999, 0.9999999999, 0.0, 0.49999999995, 0.9999999999, 0.8333333332916666, 0.9999999999, 0.9999999999, 0.0, 0.8333333332916666, 0.9999999999, 0.9999999999, 0.9999999999666667, 0.9999999999, 0.9999999999666667, 0.9999999999666667, 0.99999999995, 0.9999999999, 0.0, 0.9999999999, 0.9999999999, 0.99999999995, 0.9999999999, 0.9999999999, 0.9999999999, 0.0, 0.99999999995, 0.9999999999, 0.0, 0.9999999999666667, 0.9999999999, 0.99999999995, 0.9999999999, 0.9999999999, 0.9999999999, 0.99999999995, 0.9999999999, 0.9999999999, 0.99999999995, 0.99999999995, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0])] to numeric

Extracting and Clearning RAGAs Results

In [11]:
#Extracting and Cleaning Results
import pandas as pd
import numpy as np

print("Extracting RAGAs results from your evaluation\n")

#Naive results and advanced results are in the variables. 

#Handling the Data
try:
    naive_df = naive_results.to_pandas()
    advanced_df = advanced_results.to_pandas()
    
    print("RAW RAGAS RESULTS (with NaN values)")
    print(f"\nNaive RAG - Raw scores per query:")
    print(f"  Faithfulness: {naive_df['faithfulness'].tolist()[:10]}... (showing first 10)")
    print(f"  Answer Relevancy: {naive_df['answer_relevancy'].tolist()[:10]}...")
    print(f"  Context Precision: {naive_df['context_precision'].tolist()[:10]}...")
    print(f"  Context Recall: {naive_df['context_recall'].tolist()[:10]}...")
    
    print("Cleaned RAGAs Results (NaN values removed)")
    
    naive_clean = {
        'Faithfulness': naive_df['faithfulness'].dropna().mean(),
        'Answer Relevancy': naive_df['answer_relevancy'].dropna().mean(),
        'Context Precision': naive_df['context_precision'].dropna().mean(),
        'Context Recall': naive_df['context_recall'].dropna().mean()
    }
    
    advanced_clean = {
        'Faithfulness': advanced_df['faithfulness'].dropna().mean(),
        'Answer Relevancy': advanced_df['answer_relevancy'].dropna().mean(),
        'Context Precision': advanced_df['context_precision'].dropna().mean(),
        'Context Recall': advanced_df['context_recall'].dropna().mean()
    }
    
    #Count successful evaluations
    naive_success = naive_df['faithfulness'].notna().sum()
    advanced_success = advanced_df['faithfulness'].notna().sum()
    
    print(f"\nSuccessful evaluations:")
    print(f"  Naive RAG: {naive_success}/150 queries")
    print(f"  Advanced RAG: {advanced_success}/150 queries")
    
    #Create clean comparison
    comparison = pd.DataFrame({
        'Naive RAG': naive_clean,
        'Advanced RAG': advanced_clean,
        'Improvement': {
            k: advanced_clean[k] - naive_clean[k] 
            for k in naive_clean.keys()
        }
    }).T


    print("\nFinal RAGAs Comparison")
    print(comparison.to_markdown())
    
    print("\nDETAILED METRIC BREAKDOWN")
    
    for metric in ['Faithfulness', 'Answer Relevancy', 'Context Precision', 'Context Recall']:
        naive_val = naive_clean[metric]
        adv_val = advanced_clean[metric]
        diff = adv_val - naive_val
        direction = "↑ IMPROVED" if diff > 0 else "↓ DECREASED" if diff < 0 else "→ NO CHANGE"
        
        print(f"\n{metric}:")
        print(f"  Naive:    {naive_val:.4f}")
        print(f"  Advanced: {adv_val:.4f}")
        print(f"  Change:   {diff:+.4f} ({diff*100:+.1f}%) {direction}")
    
    # Save results
    comparison.to_csv("ragas_final_comparison.csv")
    print("\nSaved ragas_final_comparison.csv")
    
    # Save detailed per-query results
    detailed = pd.DataFrame({
        'question': queries_df['question'].tolist(),
        'ground_truth': queries_df['answer'].tolist(),
        'naive_faithfulness': naive_df['faithfulness'].tolist(),
        'naive_answer_relevancy': naive_df['answer_relevancy'].tolist(),
        'naive_context_precision': naive_df['context_precision'].tolist(),
        'naive_context_recall': naive_df['context_recall'].tolist(),
        'adv_faithfulness': advanced_df['faithfulness'].tolist(),
        'adv_answer_relevancy': advanced_df['answer_relevancy'].tolist(),
        'adv_context_precision': advanced_df['context_precision'].tolist(),
        'adv_context_recall': advanced_df['context_recall'].tolist()
    })
    detailed.to_csv("ragas_per_query_details.csv", index=False)
    print("Saved ragas_per_query_details.csv")
    
    # Generate assignment summary
    print()
    print("Summary")
    
    avg_naive = np.mean(list(naive_clean.values()))
    avg_advanced = np.mean(list(advanced_clean.values()))
    
    print(f"""
RAGAS EVALUATION RESULTS

EVALUATION SETUP:
- Framework: RAGAs (Retrieval-Augmented Generation Assessment)
- Judge Model: OpenAI gpt-4o-mini

RAGAS METRICS:

Naive RAG (Baseline):
  Faithfulness:      {naive_clean['Faithfulness']:.4f}
  Answer Relevancy:  {naive_clean['Answer Relevancy']:.4f}
  Context Precision: {naive_clean['Context Precision']:.4f}
  Context Recall:    {naive_clean['Context Recall']:.4f}
  Average Score:     {avg_naive:.4f}

Advanced RAG (Query Rewriting + Reranking):
  Faithfulness:      {advanced_clean['Faithfulness']:.4f}
  Answer Relevancy:  {advanced_clean['Answer Relevancy']:.4f}
  Context Precision: {advanced_clean['Context Precision']:.4f}
  Context Recall:    {advanced_clean['Context Recall']:.4f}
  Average Score:     {avg_advanced:.4f}

Performance Change:
  Overall: {(avg_advanced - avg_naive):+.4f} ({(avg_advanced - avg_naive)*100:+.1f}%)

""")

    print("\nAll the results are extracted and saved")
    print("  1. ragas_final_comparison.csv - Summary table")
    print("  2. ragas_per_query_details.csv - Per-query breakdown")

except Exception as e:
    print(f"Error extracting results: {e}")

Extracting RAGAs results from your evaluation

RAW RAGAS RESULTS (with NaN values)

Naive RAG - Raw scores per query:
  Faithfulness: [0.5, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0]... (showing first 10)
  Answer Relevancy: [nan, nan, nan, nan, nan, nan, 0.790231405515927, nan, 0.7504596024364599, nan]...
  Context Precision: [0.5833333333041666, 0.9999999999, 0.49999999995, 0.9999999999, 0.9999999999, 0.9999999999, 0.9999999999, 0.9999999999, 0.99999999995, 0.49999999995]...
  Context Recall: [1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]...
Cleaned RAGAs Results (NaN values removed)

Successful evaluations:
  Naive RAG: 150/150 queries
  Advanced RAG: 150/150 queries

Final RAGAs Comparison
|              |   Faithfulness |   Answer Relevancy |   Context Precision |   Context Recall |
|:-------------|---------------:|-------------------:|--------------------:|-----------------:|
| Naive RAG    |      0.704444  |          0.686802  |            0.671667 |        0.6       |
| A