In [2]:
from datasets import Dataset

# HR-policy questions used for the RAG vs fine-tuned comparison.
eval_questions = [
    "How many paid leave days are allowed?",
    "What is the process for IT asset return during exit?",
    "Can employees work remotely full-time?",
    "What is the timeline for expense reimbursements?",
]

# Policy snippets that stand in for retrieved context. The complete set is
# attached to every question (one independent copy per row).
policy_snippets = [
    "Employees are allowed 24 days of paid leave per year.",
    "Upon resignation, all company IT assets must be returned before the last working day.",
    "Remote work is permitted up to 3 days a week with manager approval.",
    "Expense claims must be submitted within 10 working days of travel.",
]

# Column-oriented evaluation table: one entry per question in every column.
enterprise_eval_data = {
    "question": eval_questions,
    "contexts": [list(policy_snippets) for _ in range(4)],
    "rag_answer": [
        "Employees can take 24 paid leave days in a year.",
        "IT assets need to be returned before the final day at work.",
        "Employees may work remotely three days weekly after manager approval.",
        "Claims must be filed within 10 business days post travel.",
    ],
    "finetuned_answer": [
        "Employees get 30 paid leave days yearly.",
        "IT return is part of the exit checklist submitted to HR.",
        "Remote work is available full-time based on role.",
        "Reimbursements must be completed in the same quarter.",
    ],
    "ground_truth": [
        "24 days of paid leave per year.",
        "IT assets must be returned before the last working day.",
        "Remote work is allowed up to 3 days per week.",
        "Reimbursements must be filed within 10 working days.",
    ],
}

# Materialise as a Hugging Face dataset and write it out as JSON Lines
# (one record per line).
dataset = Dataset.from_dict(enterprise_eval_data)
dataset.to_json("ragas_rag_vs_finetune_demo.json", orient="records", lines=True)


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

2261

# 🔍 RAG vs Fine-Tuned Model Evaluation using RAGAS

This notebook evaluates a RAG pipeline against fine-tuned model outputs using:
- ✅ RAGAS Metrics (Faithfulness, Context Precision, Answer Relevancy, Answer Correctness)
- 🔵 BLEU Score
- 🔴 ROUGE Score

The per-sample RAGAS scores are saved to CSV for further benchmarking; BLEU and ROUGE are printed inline.


In [1]:
# Use the %pip magic rather than !pip so the packages are installed into the
# environment of the running kernel, not whichever pip is first on PATH.
%pip install ragas datasets evaluate transformers faiss-cpu langchain-openai -q

In [2]:

from datasets import Dataset

# Two hand-written Super Bowl QA samples, kept as one record per question so
# each answer sits next to its context and ground truth.
samples = [
    {
        "question": "When was the first super bowl?",
        "answer": "The first superbowl was held on Jan 15, 1967",
        "contexts": [
            "The First AFL–NFL World Championship Game was played on January 15, 1967, at the Los Angeles Coliseum."
        ],
        "ground_truth": "The first superbowl was held on January 15, 1967",
    },
    {
        "question": "Who won the most super bowls?",
        "answer": "The most super bowls have been won by The New England Patriots",
        "contexts": [
            "The New England Patriots have won the Super Bowl a record six times."
        ],
        "ground_truth": "The New England Patriots have won the Super Bowl a record six times",
    },
]

# Pivot the records into the column-oriented dict that Dataset.from_dict takes.
data_samples = {
    column: [sample[column] for sample in samples]
    for column in ("question", "answer", "contexts", "ground_truth")
}

dataset = Dataset.from_dict(data_samples)
dataset


Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truth'],
    num_rows: 2
})

In [3]:

from ragas import evaluate
from ragas.metrics import faithfulness, context_precision, answer_relevancy, answer_correctness

# Score every sample in `dataset` with the four imported RAGAS metrics.
ragas_metrics = [faithfulness, context_precision, answer_relevancy, answer_correctness]
results = evaluate(dataset, metrics=ragas_metrics)
results


Evaluating:   0%|          | 0/8 [00:00<?, ?it/s]

{'faithfulness': 0.5000, 'context_precision': 1.0000, 'answer_relevancy': 0.9619, 'answer_correctness': 0.9901}

In [4]:

from evaluate import load

# Load the two n-gram overlap metrics.
bleu = load("bleu")
rouge = load("rouge")

# Both metrics compare the same generated answers with the same ground truths.
predictions = data_samples['answer']
references = data_samples['ground_truth']

# For BLEU each ground truth is wrapped in its own single-item list;
# ROUGE receives the flat list of ground-truth strings.
bleu_result = bleu.compute(
    predictions=predictions,
    references=[[reference] for reference in references],
)
rouge_result = rouge.compute(predictions=predictions, references=references)

print("🔵 BLEU:", bleu_result['bleu'])
print("🔴 ROUGE:", rouge_result)


🔵 BLEU: 0.4077184582232638
🔴 ROUGE: {'rouge1': 0.7644444444444445, 'rouge2': 0.5054347826086957, 'rougeL': 0.6044444444444445, 'rougeLsum': 0.6044444444444445}


In [7]:

# Flatten the RAGAS result into a DataFrame and persist it without the row index.
df = results.to_pandas()
df.to_csv(
    "ragas_vs_finetuned_results.csv",
    index=False,
)
print("✅ Exported to ragas_vs_finetuned_results.csv")


✅ Exported to ragas_vs_finetuned_results.csv


# 🧠 Integrating RAG Pipeline with RAGAS Evaluation

In this notebook, we:
- Build a custom RAG pipeline (FAISS retriever + OpenAI chat model)
- Use it to generate answers from real documents
- Evaluate RAG pipeline outputs using RAGAS metrics
- Compare with fine-tuned outputs (optional)


In [5]:
# %pip installs into the running kernel's environment. langchain-community and
# langchain-openai are listed explicitly because the import cell that follows
# pulls FAISS/TextLoader and the OpenAI wrappers from those two packages.
%pip install langchain langchain-community langchain-openai ragas datasets faiss-cpu python-dotenv -U -q

In [7]:

import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment before any
# client object is created. Presumably this supplies OPENAI_API_KEY for the
# OpenAI embeddings/chat model used below — confirm against your .env.
load_dotenv()

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader

# Corpus location and chunking parameters, gathered in one place for easy tuning.
CORPUS_PATH = "17sample_rag_corpus.txt"
CHUNK_SIZE = 300
CHUNK_OVERLAP = 50

# Read the raw text corpus (can be a .txt or .md file).
loader = TextLoader(CORPUS_PATH)
docs = loader.load()

# Split the loaded documents into overlapping chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = splitter.split_documents(docs)

# Embed the chunks, index them in FAISS, and expose the index as a retriever.
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(chunks, embeddings)
retriever = vectorstore.as_retriever()


In [8]:

from langchain.chains import RetrievalQA

# Chat model that writes the final answer.
llm = ChatOpenAI(model="gpt-3.5-turbo")

# Retrieval-augmented QA chain; the retrieved source documents are returned
# with each answer so they can be used as evaluation contexts.
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)


In [9]:

# Questions to run through the RAG chain.
questions = [
    "When was the first super bowl?",
    "Who introduced the theory of relativity?"
]

# One chain call per question, in order. Each output holds the generated answer
# under "result" and the retrieved documents under "source_documents".
rag_outputs = [rag_chain.invoke(question) for question in questions]

# Column-oriented view of the run: question, answer, and retrieved chunk texts.
qa_data = {
    "question": list(questions),
    "answer": [output["result"] for output in rag_outputs],
    "contexts": [
        [doc.page_content for doc in output["source_documents"]]
        for output in rag_outputs
    ],
}

qa_data


{'question': ['When was the first super bowl?',
  'Who introduced the theory of relativity?'],
 'answer': ['The first Super Bowl, known as the AFL-NFL World Championship Game, was held on January 15, 1967, at the Los Angeles Memorial Coliseum.',
  'Albert Einstein introduced the theory of relativity in the early 20th century.'],
 'contexts': [['The first Super Bowl, known as the AFL-NFL World Championship Game, was held on January 15, 1967, at the Los Angeles Memorial Coliseum.\n\nAlbert Einstein proposed the theory of relativity in the early 20th century, fundamentally changing our understanding of space, time, and gravity.',
   'Ada Lovelace is widely regarded as the first computer programmer, writing the first algorithm intended for a machine.',
   "Marie Curie was a pioneering physicist and chemist who conducted groundbreaking research on radioactivity, earning two Nobel Prizes.\n\nThe Eiffel Tower is located in Paris, France, and was completed in 1889 for the World's Fair."],
  ['

In [None]:
# Disabled draft: the same two samples in the older question/answer/contexts/ground_truth layout; nothing here executes.
# qa_data = [
#     {
#         "question": "Who proposed the theory of relativity?",
#         "answer": "Albert Einstein proposed the theory of relativity.",
#         "contexts": ["Albert Einstein proposed the theory of relativity, which transformed our understanding of time, space, and gravity."],
#         "ground_truth": "Albert Einstein proposed the theory of relativity."
#     },
#     {
#         "question": "Where is the Eiffel Tower located?",
#         "answer": "The Eiffel Tower is in Paris.",
#         "contexts": ["The Eiffel Tower is located in Paris, France, and was completed in 1889 for the World's Fair."],
#         "ground_truth": "The Eiffel Tower is located in Paris, France."
#     }
# ]


In [10]:
# Use the %pip magic rather than !pip so the packages are installed into the
# environment of the running kernel, not whichever pip is first on PATH.
%pip install ragas datasets evaluate -q

In [11]:
from ragas import evaluate, EvaluationDataset
from ragas.metrics import faithfulness, answer_correctness, answer_relevancy, context_precision

# Hand-written samples using the user_input / response / retrieved_contexts /
# reference field names consumed by `EvaluationDataset` (the column names that
# RAGAS adopted in v0.2, replacing question / answer / contexts / ground_truth).
# NOTE(review): this rebinds `qa_data`, replacing the dict collected from
# `rag_chain` earlier in the notebook, so the scores below describe these fixed
# samples rather than the live pipeline output — confirm that is intended.
qa_data = [
    dict(
        user_input="Who proposed the theory of relativity?",
        response="Albert Einstein proposed the theory of relativity.",
        retrieved_contexts=[
            "Albert Einstein proposed the theory of relativity, which transformed our understanding of time, space, and gravity."
        ],
        reference="Albert Einstein proposed the theory of relativity.",
    ),
    dict(
        user_input="Where is the Eiffel Tower located?",
        response="The Eiffel Tower is in Paris.",
        retrieved_contexts=[
            "The Eiffel Tower is located in Paris, France, and was completed in 1889 for the World's Fair."
        ],
        reference="The Eiffel Tower is located in Paris, France.",
    ),
]

# Wrap the records in the dataset type that `evaluate` is given below.
dataset = EvaluationDataset.from_list(qa_data)

# Score each sample with the four imported RAGAS metrics.
ragas_metrics = [faithfulness, context_precision, answer_relevancy, answer_correctness]
results = evaluate(dataset, metrics=ragas_metrics)

# Per-sample scores as a DataFrame (rendered as the cell output).
results.to_pandas()


Evaluating:   0%|          | 0/8 [00:00<?, ?it/s]

Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,context_precision,answer_relevancy,answer_correctness
0,Who proposed the theory of relativity?,[Albert Einstein proposed the theory of relati...,Albert Einstein proposed the theory of relativ...,Albert Einstein proposed the theory of relativ...,1.0,1.0,0.910911,1.0
1,Where is the Eiffel Tower located?,"[The Eiffel Tower is located in Paris, France,...",The Eiffel Tower is in Paris.,"The Eiffel Tower is located in Paris, France.",1.0,1.0,1.0,0.74353


In [12]:

# Flatten the RAGAS result into a DataFrame and persist it without the row index.
df = results.to_pandas()
df.to_csv(
    "ragas_real_rag_pipeline_eval.csv",
    index=False,
)
print("✅ Exported to ragas_real_rag_pipeline_eval.csv")


✅ Exported to ragas_real_rag_pipeline_eval.csv


In [27]:
# Display the exported score table.
# NOTE(review): the saved output for this cell has a much later execution count
# than the export cell and a slightly different answer_relevancy value, which
# suggests the evaluation was re-run in between; restart and run all cells
# before relying on these numbers.
df

Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,context_precision,answer_relevancy,answer_correctness
0,Who proposed the theory of relativity?,[Albert Einstein proposed the theory of relati...,Albert Einstein proposed the theory of relativ...,Albert Einstein proposed the theory of relativ...,1.0,1.0,0.91087,1.0
1,Where is the Eiffel Tower located?,"[The Eiffel Tower is located in Paris, France,...",The Eiffel Tower is in Paris.,"The Eiffel Tower is located in Paris, France.",1.0,1.0,1.0,0.74353
