In [28]:
from scripts.rag.hybridrag import hybrid_rag_sc
from scripts.rag.naiverag import naive_rag_sc
from scripts.rag.pretty_print import pretty_print_rag_answer

In [19]:
# Question sent to the RAG systems; `query` is reused by the later cells
# that run the naive system and pretty-print both answers.
query = "Who uses ChatGPT more, men or women, and how does this change by 2025"
# Run the hybrid system once and keep the raw result for display further down.
hybrid_answer = hybrid_rag_sc.run(query=query)

Batches: 100%|██████████| 1/1 [00:00<00:00, 11.69it/s]


In [29]:
# Display the hybrid result: raw answer, a label for the printed header,
# and the original query (relies on `hybrid_answer` and `query` from the cell above).
pretty_print_rag_answer(hybrid_answer, "Hybrid RAG", query)

🔍 HYBRID RAG ANSWER
📝 Query: Who uses ChatGPT more, men or women, and how does this change by 2025
--------------------------------------------------------------------------------
💬 Answer:
   Initially, a significant share (around 80%) of weekly active users of
   ChatGPT in the first few months after its release were individuals with
   typically masculine first names. However, by the first half of 2025,
   this gap began to close, and the share of active users with typically
   feminine names reached near parity with those having typically masculine
   names. By June 2025, active users were more likely to have typically
   feminine names, indicating that gender gaps in ChatGPT usage had closed
   substantially over time.

📚 Source Documents (3 found):
--------------------------------------------------
1. Source: Unknown source
   Preview: The prompts for each of these automated classifiers (with the
exception of interaction quality) are available in Appendix A. Values represent the 

In [26]:
# Run the same `query` through the naive system so its answer can be compared
# with `hybrid_answer`.
naive_answer = naive_rag_sc.run(query=query)

Batches: 100%|██████████| 1/1 [00:00<00:00,  4.86it/s]



In [30]:
# Display the naive result with the same helper used for the hybrid answer.
pretty_print_rag_answer(naive_answer, "Naive RAG", query)

🔍 NAIVE RAG ANSWER
📝 Query: Who uses ChatGPT more, men or women, and how does this change by 2025
--------------------------------------------------------------------------------
💬 Answer:
   Based on the provided information, initially, around 80% of the weekly
   active users (WAU) in the first few months after ChatGPT was released
   were by users with typically masculine first names. However, by the
   first half of 2025, the share of active users with typically feminine
   and typically masculine names reached near-parity. By June 2025, it was
   observed that active users were more likely to have typically feminine
   names, suggesting that gender gaps in ChatGPT usage have closed
   substantially over time.



In [32]:
from ragas import evaluation

In [143]:
from haystack.components.generators import OpenAIGenerator

from ragas import EvaluationDataset
import pandas as pd

# Load the full synthetic test set from disk.
dataset = pd.read_csv("./data_for_eval/synthetic_tests_advanced_branching_50.csv")

# Independent copy of the first five rows, so columns added later do not touch `dataset`.
dataset_5 = dataset.iloc[:5].copy()


In [144]:
# Helper that runs a given RAG system on one query
def run_rag_system(query, rag_system):
    """Run one RAG system on a query and return its answer plus retrieved texts.

    Args:
        query: Question passed to ``rag_system.run`` as the ``query`` keyword.
        rag_system: Object exposing ``run(query=...)`` that returns a dict-like
            result which may contain ``'replies'`` (sequence of answers) and
            ``'documents'`` (objects with a ``content`` attribute).

    Returns:
        dict with two keys:
            ``'response'``: the first reply, or ``''`` when there are no replies.
            ``'reference'``: list of ``content`` values of the returned
            documents (empty list when there are none).
    """
    response = rag_system.run(query=query)
    # 'replies' / 'documents' may be missing, None, or empty; normalise all of
    # those to an empty list so indexing and iteration below cannot raise.
    replies = response.get('replies') or []
    documents = response.get('documents') or []
    return {
        'response': replies[0] if replies else '',
        'reference': [doc.content for doc in documents],
    }

# One-argument wrappers so each RAG system can be passed directly to Series.apply.
# Written as `def`s instead of lambdas bound to names (PEP 8), which gives
# them real names in tracebacks.
def run_hybrid_rag(query):
    """Run the hybrid RAG system on a single query."""
    return run_rag_system(query, hybrid_rag_sc)


def run_naive_rag(query):
    """Run the naive RAG system on a single query."""
    return run_rag_system(query, naive_rag_sc)


# One RAG call per row of the evaluation subset.
print("Running Hybrid RAG on all queries...")
hybrid_results = dataset_5['user_input'].apply(run_hybrid_rag)

# Split each result dict into two columns. The retrieved contexts are stored
# under the column name 'reference'; the RAGAS formatting cell reads them
# from that column.
# NOTE(review): if the CSV already ships a 'reference' column, this assignment
# overwrites it -- confirm that is intended.
dataset_5['response'] = hybrid_results.apply(lambda x: x['response'])
dataset_5['reference'] = hybrid_results.apply(lambda x: x['reference'])

# Report the column names that were actually written above.
print("Added columns: response, reference")
print(f"Dataset shape: {dataset_5.shape}")

Running Hybrid RAG on all queries...


Batches: 100%|██████████| 1/1 [00:00<00:00,  4.22it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 22.00it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 18.38it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 20.75it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 75.25it/s]



Added columns: response, retrieved_contexts
Dataset shape: (5, 5)


In [None]:
# Fix dataset format for RAGAS SingleTurnSample requirements
import ast

def parse_contexts(context_str):
    """Convert a stored contexts value into a real list.

    Args:
        context_str: Either a list (returned unchanged), a string holding the
            Python literal of a list/tuple (e.g. ``"['a', 'b']"``), any other
            string, or some other value such as ``None``/NaN.

    Returns:
        list: The parsed list for list/tuple literals; ``[context_str]`` for a
        string that is not a list/tuple literal (treated as one context);
        ``[]`` for any non-string, non-list input.
    """
    if isinstance(context_str, list):
        return context_str
    if not isinstance(context_str, str):
        return []

    try:
        parsed = ast.literal_eval(context_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval documents all of these for malformed input; in every
        # case fall back to treating the raw text as a single context.
        return [context_str]

    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    # A valid literal that is not a sequence (number, quoted string, dict, ...)
    # must not leak out: callers index the result as a list.
    return [context_str]

def _unwrap_single(value):
    """Return the first element when given a list, otherwise the value unchanged."""
    if isinstance(value, list):
        return value[0]
    return value


def _first_or_blank(items):
    """Return the first item, or "" when the container is empty/falsy."""
    if not items:
        return ""
    return items[0]


# Work on a copy so the frame holding the raw RAG outputs is left as it was.
eval_dataset = dataset_5.copy()

# reference_contexts may be the text form of a list; turn it back into a list.
parsed_reference_contexts = eval_dataset['reference_contexts'].apply(parse_contexts)
eval_dataset['reference_contexts_parsed'] = parsed_reference_contexts

# Column layout RAGAS expects for a SingleTurnSample:
#   user_input          (str)
#   response            (str)
#   retrieved_contexts  (List[str])
#   reference           (str)

# user_input must be a plain string; unwrap it if it arrived as a list.
eval_dataset['user_input'] = eval_dataset['user_input'].apply(_unwrap_single)

# NOTE(review): 'reference' below is the first parsed reference context, not a
# separately written answer -- confirm this is the intended ground truth.
reference_text = eval_dataset['reference_contexts_parsed'].apply(_first_or_blank)

# Assemble the four columns in the order listed above. The retrieved contexts
# live in eval_dataset['reference'] and are exposed under the RAGAS name.
ragas_columns = {
    'user_input': eval_dataset['user_input'],
    'response': eval_dataset['response'],
    'retrieved_contexts': eval_dataset['reference'],
    'reference': reference_text,
}
ragas_dataset = pd.DataFrame(ragas_columns)


Fixing dataset format for RAGAS SingleTurnSample...
Dataset formatted for RAGAS!
Shape: (5, 4)
Columns: ['user_input', 'response', 'retrieved_contexts', 'reference']

Data type verification:
user_input type: <class 'str'>
response type: <class 'str'>
retrieved_contexts type: <class 'list'>
reference type: <class 'str'>


Unnamed: 0,user_input,response,retrieved_contexts,reference
0,What OpenAI do?,OpenAI is an artificial intelligence research ...,"[What is AI, how does it work and why are some...","What is AI, how does it work and why are some ..."
1,What concerns did Yann LeCun dismiss regarding...,Yann LeCun dismissed the concern expressed by ...,"[What is AI, how does it work and why are some...",Why is AI controversial?\nWhile acknowledging ...
2,What significant developments regarding AI reg...,I don't have enough information to answer.,[This article was published in 2018. To read m...,Are there laws governing AI?\nSome governments...
3,Who is Shubhendu and what role does he play in...,I don't have enough information to answer.,"[What is AI, how does it work and why are some...",This article was published in 2018. To read mo...
4,Can you explain how Vijay's research contribut...,I don't have enough information to answer.,"[What is AI, how does it work and why are some...","Intelligence, please visit the AI topic page.\..."


In [155]:
from ragas import EvaluationDataset

# Wrap the formatted frame in the dataset object that is later passed to
# `evaluate(dataset=...)`.
evaluation_dataset = EvaluationDataset.from_pandas(ragas_dataset)
# Quick sanity report: number of samples and the sample type the dataset reports.
print("EvaluationDataset created successfully!")
print(f"Dataset size: {len(evaluation_dataset)}")
print(f"Sample type: {evaluation_dataset.get_sample_type()}")

EvaluationDataset created successfully!
Dataset size: 5
Sample type: <class 'ragas.dataset_schema.SingleTurnSample'>


In [156]:
from ragas import evaluate
from ragas.llms import HaystackLLMWrapper
from haystack.components.generators import OpenAIGenerator

# Build the Haystack generator first, then wrap it so it can be handed to
# `evaluate(llm=...)`.
openai_generator = OpenAIGenerator(model="gpt-4.1-mini")
evaluator_llm = HaystackLLMWrapper(openai_generator)

In [157]:
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity
from ragas import evaluate, RunConfig

# Raise the timeout for the evaluation run
# (presumably seconds -- confirm against the RunConfig documentation).
custom_run_config = RunConfig(timeout=360)

# Metrics scored for the baseline, one instance each with default settings.
baseline_metrics = [
    LLMContextRecall(),
    Faithfulness(),
    FactualCorrectness(),
    ResponseRelevancy(),
    ContextEntityRecall(),
    NoiseSensitivity(),
]

# Score every sample with every metric, using the wrapped generator as the LLM.
baseline_result = evaluate(
    dataset=evaluation_dataset,
    metrics=baseline_metrics,
    llm=evaluator_llm,
    run_config=custom_run_config,
)
# Bare expression so the notebook displays the aggregate scores.
baseline_result

Evaluating:   0%|          | 0/30 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:   3%|▎         | 1/30 [00:02<00:59,  2.07s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:   3%|▎         | 1/30 [00:02<00:59,  2.07s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  20%|██        | 6/30 [00:13<00:52,  2.19s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  43%|████▎     | 13/30 [00:23<00:19,  1.16s/it]LLM returned 1 generations instead of requested 3. Proceeding

{'context_recall': 0.8444, 'faithfulness': 0.6000, 'factual_correctness(mode=f1)': 0.0000, 'answer_relevancy': 0.1913, 'context_entity_recall': 0.3015, 'noise_sensitivity(mode=relevant)': 1.0000}