<a href="https://colab.research.google.com/github/T-K-O-H/youtube_to_linkedin/blob/main/RAGAS_Airplane_Data_KnowledgeGraph_Testset_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 📦 Install latest RAGAS and dependencies (includes generate + transforms)
#!pip uninstall -y ragas -q
#!pip install --no-cache-dir --force-reinstall git+https://github.com/explodinggradients/ragas.git@main
#!pip install -U langchain openai datasets faiss-cpu sentence-transformers beautifulsoup4 feedparser rapidfuzz


In [2]:
import os
from getpass import getpass

OPENAI_API_KEY = getpass("🔑 Paste your OpenAI API key (input hidden): ")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


🔑 Paste your OpenAI API key (input hidden): ··········


In [3]:
!git clone https://huggingface.co/datasets/explodinggradients/ragas-airline-dataset


fatal: destination path 'ragas-airline-dataset' already exists and is not an empty directory.


In [4]:
#!pip install unstructured

In [5]:
from langchain_community.document_loaders import DirectoryLoader

path = "ragas-airline-dataset"
loader = DirectoryLoader(path, glob="**/*.md")
docs = loader.load()
print(f"✅ Loaded {len(docs)} documents")


✅ Loaded 9 documents


In [6]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-small"))


In [7]:
from ragas.testset.graph import KnowledgeGraph, Node, NodeType

kg = KnowledgeGraph()
for doc in docs:
    kg.nodes.append(
        Node(
            type=NodeType.DOCUMENT,
            properties={"page_content": doc.page_content, "document_metadata": doc.metadata}
        )
    )
kg


KnowledgeGraph(nodes: 9, relationships: 0)

In [8]:
from ragas.testset.transforms import apply_transforms, HeadlinesExtractor, HeadlineSplitter, KeyphrasesExtractor

headline_extractor = HeadlinesExtractor(llm=generator_llm, max_num=20)
headline_splitter = HeadlineSplitter(max_tokens=1500)
keyphrase_extractor = KeyphrasesExtractor(llm=generator_llm)

transforms = [headline_extractor, headline_splitter, keyphrase_extractor]
apply_transforms(kg, transforms=transforms)


Applying HeadlinesExtractor:   0%|          | 0/9 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/9 [00:00<?, ?it/s]

Applying KeyphrasesExtractor:   0%|          | 0/27 [00:00<?, ?it/s]



In [9]:
from ragas.testset.persona import Persona

persona_first_time = Persona(
    name="First Time Flier",
    role_description="Is flying for the first time and may feel anxious. Needs clear guidance on flight procedures, safety protocols, and what to expect."
)
persona_frequent = Persona(
    name="Frequent Flier",
    role_description="Travels regularly and values efficiency and comfort. Interested in loyalty programs, express services, and a seamless travel experience."
)
persona_angry = Persona(
    name="Angry Business Class Flier",
    role_description="Demands top-tier service and is easily irritated by delays or issues. Expects immediate resolutions."
)
personas = [persona_first_time, persona_frequent, persona_angry]


In [10]:
from ragas.testset.synthesizers.single_hop.specific import SingleHopSpecificQuerySynthesizer

query_distibution = [
    (SingleHopSpecificQuerySynthesizer(llm=generator_llm, property_name="headlines"), 0.5),
    (SingleHopSpecificQuerySynthesizer(llm=generator_llm, property_name="keyphrases"), 0.5),
]


In [11]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embeddings,
    knowledge_graph=kg,
    persona_list=personas,
)

testset = generator.generate(testset_size=10, query_distribution=query_distibution)
testset_df = testset.to_pandas()
testset_df.head(10)


Generating Scenarios:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What does Step 1: Notify Passengers entail in ...,[Flight Delays\n\nFlight delays can be caused ...,Step 1: Notify Passengers involves Ragas Airli...,single_hop_specifc_query_synthesizer
1,What are the potential issues and resolutions ...,[Special Assistance\n\nRagas Airlines provides...,Potential issues for special assistance includ...,single_hop_specifc_query_synthesizer
2,What are the potential issues and resolutions ...,[Schedule and Flight Date Changes\n\nA schedul...,Potential issues and resolutions for schedule ...,single_hop_specifc_query_synthesizer
3,How can I manage my reservations with Ragas Ai...,[Managing Reservations\n\nManaging your reserv...,To manage your reservations with Ragas Airline...,single_hop_specifc_query_synthesizer
4,What are the potential issues and resolutions ...,[Baggage Policies\n\nThis section provides a d...,"As a first-time flier, it is important to unde...",single_hop_specifc_query_synthesizer
5,What steps does Ragas Airlines take to assist ...,[Flight Delays Flight delays can be caused by ...,Ragas Airlines takes several steps to assist p...,single_hop_specifc_query_synthesizer
6,How do I submit a reimbursement request for ex...,[2. Additional Expenses Incurred Due to Delay ...,To submit a reimbursement request for expenses...,single_hop_specifc_query_synthesizer
7,Wut services do Ragas Airlines offer for passa...,[Special Assistance Ragas Airlines provides sp...,Ragas Airlines provides special assistance ser...,single_hop_specifc_query_synthesizer
8,What steps should I take if I have Late Reques...,[Potential Issues and Resolutions for Special ...,"If you did not request assistance in advance, ...",single_hop_specifc_query_synthesizer
9,What should I do if Ragas Airlines changes my ...,[Schedule and Flight Date Changes A schedule c...,If Ragas Airlines changes your flight schedule...,single_hop_specifc_query_synthesizer


In [12]:
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness,
    answer_correctness,
    answer_similarity,
)

In [13]:
all_metrics = [
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness,
    answer_correctness,
    answer_similarity,
]

In [14]:
from langchain.vectorstores import FAISS

def build_vs(embed_model):
    texts = [d.page_content for d in docs]
    return FAISS.from_texts(texts, embedding=embed_model)

In [16]:
from ragas.generation import generate
import pandas as pd
import matplotlib.pyplot as plt

def run_eval(embed_model, name):
    vs = build_vs(embed_model)
    retriever = vs.as_retriever()

    print(f"📡 Running generation for: {name}")
    predicted = generate(testset, retriever=retriever, llm=generator_llm)

    print("✅ Sample predictions:")
    display(predicted.to_pandas()[['user_input', 'predicted']].head())

    results = evaluate(predicted, metrics=all_metrics)
    df = results.to_pandas()
    df["model"] = name
    display(df)

    csv_name = f"ragas_eval_{name.replace(' ', '_').lower()}.csv"
    df.to_csv(csv_name, index=False)
    print(f"💾 Saved to {csv_name}")
    return df

ModuleNotFoundError: No module named 'ragas.generation'

In [None]:
# Embedding wrappers
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import OpenAIEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings

openai_embed = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-small"))
mpnet_base = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
finetuned_mpnet = HuggingFaceEmbeddings(model_name="Shipmaster1/finetuned_mpnet_matryoshka_mnr")

# Evaluate all
openai_results = run_eval(openai_embed, "OpenAI text-embedding-3-small")
mpnet_results = run_eval(mpnet_base, "MPNet Base")
finetuned_results = run_eval(finetuned_mpnet, "Finetuned MPNet")

In [None]:
openai_results["model"] = "OpenAI"
mpnet_results["model"] = "MPNet"
finetuned_results["model"] = "Finetuned MPNet"

merged_df = pd.concat([openai_results, mpnet_results, finetuned_results])
pivot_df = merged_df.pivot(index="metric", columns="model", values="score").round(3)
display(pivot_df)

pivot_df.plot(kind="bar", figsize=(12, 6))
plt.title("RAGAS Evaluation Scores by Embedding Model")
plt.ylabel("Score")
plt.ylim(0, 1)
plt.xticks(rotation=45)
plt.grid(True, axis='y')
plt.tight_layout()
plt.show()