## Simple RAG playground

In [1]:
%load_ext autoreload
%autoreload 2

## Customer support Agent

In [2]:
from dotenv import load_dotenv
import os

load_dotenv()
PROJECT_ID = os.environ.get("PROJECT_ID")
NOTION_TOKEN = os.environ.get("NOTION_TOKEN")

In [3]:
!pip uninstall -y ragas_annotator

Found existing installation: ragas_annotator 0.0.1
Uninstalling ragas_annotator-0.0.1:
  Successfully uninstalled ragas_annotator-0.0.1


In [4]:
# Install ragas_annotator from source
!git clone https://github.com/explodinggradients/ragas_annotator
!cd ragas_annotator && pip install -e .

Cloning into 'ragas_annotator'...
remote: Enumerating objects: 650, done.[K
remote: Counting objects: 100% (650/650), done.[K
remote: Compressing objects: 100% (398/398), done.[K
remote: Total 650 (delta 385), reused 487 (delta 234), pack-reused 0 (from 0)[K
Receiving objects: 100% (650/650), 662.23 KiB | 4.22 MiB/s, done.
Resolving deltas: 100% (385/385), done.
Obtaining file:///Users/nirantk/Desktop/scratchpad/ragas/notes/rag-playground/ragas_annotator
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: ragas_annotator
  Building editable for ragas_annotator (pyproject.toml) ... [?25ldone
[?25h  Created wheel for ragas_annotator: filename=ragas_annotator-0.0.1-0.editable-py3-none-any.whl size=7944 sha256=26e3bd9d16b78b5845fb93d56a3883b98b28f0c3

In [5]:
!git clone https://huggingface.co/datasets/explodinggradients/ragas-airline-dataset

fatal: destination path 'ragas-airline-dataset' already exists and is not an empty directory.


In [6]:
from src import AgentAI

customer_support_agent = AgentAI()

In [7]:
response = await customer_support_agent.ask("Can i get a refund for my missed flight?")
print(response)

If you missed your flight, whether you can get a refund depends on the ticket type you purchased:

1. **Refundable Ticket**: You may be eligible for a **full refund**.
2. **Non-Refundable Ticket**: You may receive a **partial refund** or travel credit, but cancellation fees will apply.
3. **Basic Economy & Promo Fares**: Typically, these tickets are **not refundable**.

For more detailed information, you should contact Ragas Airlines customer support or check your ticket conditions in "Manage My Booking".


## Setup sdk

In [14]:
from ragas_annotator.project.core import Project
from ragas_annotator.project.experiments import *

In [15]:
project = Project(
    name="Customer support RAG",
    notion_api_key=NOTION_TOKEN,
    notion_root_page_id=PROJECT_ID,
)
project

Project(name='Customer support RAG', root_page_id=1b35d9bf94ff801792bfd1824fac0c96)

## Read dataset

In [16]:
from ragas_annotator.model.notion_model import NotionModel
from ragas_annotator.model import notion_typing as nmt


class Dataset(NotionModel):
    id: str = nmt.ID()
    query: str = nmt.Title()
    expected_answer: str = nmt.Text()

In [17]:
dataset = project.get_dataset(
    name="RAG Dataset",
    model=Dataset,
)
dataset.load()

In [18]:
dir(project)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_create_project_structure',
 '_notion_backend',
 'comparisons_page_id',
 'create_dataset',
 'create_experiment',
 'datasets_page_id',
 'experiment',
 'experiments_page_id',
 'get_dataset',
 'get_experiment',
 'initialize',
 'langfuse_experiment',
 'name']

In [19]:
project.langfuse_experiment

<bound method Project.langfuse_experiment of Project(name='Customer support RAG', root_page_id=1b35d9bf94ff801792bfd1824fac0c96)>

In [20]:
len(dataset)

15

## LLM as judge

In [21]:
from ragas_annotator.llm import ragas_llm
from ragas_annotator.metric import DiscreteMetric
from openai import AsyncOpenAI

llm = ragas_llm(provider="openai", model="gpt-4o", client=AsyncOpenAI())

my_metric = DiscreteMetric(
    llm=llm,
    name="correctness",
    prompt="Evaluate if given answer {response} is same as expected answer {expected_answer}",
    values=["pass", "fail"],
)


# test LLM as judge
result = my_metric.score(
    response="this is my response", expected_answer="this is not my response"
)
result

'fail'

### Writing custom logic with metric (optional)

In [22]:
# #| eval: false
# @discrete_metric(llm=llm,
#     prompt="Evaluate if given answer is helpful\n\n{response}",
#     name='new_metric',
#     values=["low","med","high"]
#     )
# def my_metric(llm,prompt,example_store, **kwargs):

#         class response_model(BaseModel):
#              output: t.List[bool]
#              reason: str

#         response = llm.generate(
# 	        prompt.format(**kwargs),response_model=response_model
# 	      )
#         total = sum(response.output)
#         if total < 1:
#             score = 'low'
#         else:
#             score = 'high'
#         return score,"reason",

# result = my_metric.score(response='my response') # result
# print(result)
# print(result.reason)

## Setup an experiment

In [23]:
import typing as t


class Experiment(Dataset):
    response: str = nmt.Text()
    correctness: t.Literal["pass", "fail"] = nmt.Select()
    correctness_reason: str = nmt.Text()


In [24]:
@project.langfuse_experiment(Experiment, name_prefix="Workshop")
async def run_experiment(row: Dataset):
    response = await customer_support_agent.ask(row.query)
    score = await my_metric.ascore(
        response=response, expected_answer=row.expected_answer
    )

    experiment_view = Experiment(
        id=row.id,
        query=row.query,
        expected_answer=row.expected_answer,
        response=response,
        correctness=score.result,
        correctness_reason=score.reason,
    )

    return experiment_view

## Run the experiments

In [30]:
experiment_name = "setting-up-ragas-annotator"

In [None]:
await run_experiment.run_async(name=experiment_name, dataset=dataset)

100%|██████████| 15/15 [00:09<00:00,  1.53it/s]


Experiment(name=setting-up-ragas-annotator, model=Experiment)

You may make any changes to AgentAI class like prompt, model, etc and run any number of experiments. Experiment now would have recorded in the Notion UI

### Train LLM as judge

In [26]:
from ragas_annotator.embedding import ragas_embedding

from openai import OpenAI

embedding = ragas_embedding(
    provider="openai", client=OpenAI(), model="text-embedding-3-small"
)


In [27]:
my_metric.train(
    project,
    experiment_names=[experiment_name],
    embedding_model=embedding,
    model=Experiment,
    method={},
)

Processing examples: 100%|██████████| 15/15 [00:00<00:00, 145635.56it/s]


### Compare experiments
Hack to do in notebook, will be done in the UI once we have the UI ready

In [32]:
import matplotlib.pyplot as plt
import numpy as np

In [35]:
def compare_and_plot(exp_x: str, exp_y: str, metric):
    # Load experiments
    exp_x_data = project.get_experiment(exp_x, Experiment)
    exp_y_data = project.get_experiment(exp_y, Experiment)
    exp_x_data.load()
    exp_y_data.load()

    # Compare experiments (assuming this is a function that exists)
    project.compare_experiments(exp_x_data, exp_y_data)

    # Extract metrics from both experiments
    results = {
        "exp_x": [],
        "exp_y": [],
    }
    for i in range(len(exp_x_data)):
        results["exp_x"].append(getattr(exp_x_data[i], metric.name))
        results["exp_y"].append(getattr(exp_y_data[i], metric.name))

    # Calculate counts for each category
    exp_x_counts = {}
    exp_y_counts = {}

    # For categorical data like 'pass'/'fail' or 'good'/'okay'/'bad'
    # Get unique categories
    all_categories = set(results["exp_x"] + results["exp_y"])

    # Count occurrences of each category
    for category in all_categories:
        exp_x_counts[category] = results["exp_x"].count(category)
        exp_y_counts[category] = results["exp_y"].count(category)

    # Create stacked bar chart


    # Set up colors based on categories
    if all(cat in ["pass", "fail"] for cat in all_categories):
        colors = {"pass": "#2196F3", "fail": "#FF5722"}
    elif all(cat in ["good", "okay", "bad"] for cat in all_categories):
        colors = {"good": "#4CAF50", "okay": "#FFC107", "bad": "#F44336"}
    else:
        # Generate colors if categories are unknown
        import matplotlib.colors as mcolors

        colors = {
            cat: list(mcolors.TABLEAU_COLORS.values())[i % len(mcolors.TABLEAU_COLORS)]
            for i, cat in enumerate(all_categories)
        }

    # Set up the figure
    fig, ax = plt.subplots(figsize=(10, 6))

    # Plot stacked bars
    experiments = [exp_x, exp_y]
    exp_counts = [exp_x_counts, exp_y_counts]

    # Calculate totals for percentage
    totals = [sum(counts.values()) for counts in exp_counts]

    # Sort categories for consistent stacking (e.g., 'pass' always at bottom, then 'fail')
    sorted_categories = sorted(all_categories)

    # Plot each category as a segment in the stack
    bottoms = np.zeros(len(experiments))
    for category in sorted_categories:
        values = [
            counts.get(category, 0) / total * 100
            for counts, total in zip(exp_counts, totals)
        ]
        ax.bar(
            experiments,
            values,
            bottom=bottoms,
            label=category.capitalize(),
            color=colors[category],
        )

        # Add text labels inside the bars
        for i, v in enumerate(values):
            if v > 5:  # Only add label if segment is large enough
                ax.text(
                    i,
                    bottoms[i] + v / 2,
                    f"{int(exp_counts[i].get(category, 0))}\n({v:.1f}%)",
                    ha="center",
                    va="center",
                    color="white",
                    fontweight="bold",
                )

        bottoms += values

    # Customize the chart
    ax.set_title(
        f"Comparison of {metric.name.capitalize()} between Experiments", fontsize=14
    )
    ax.set_ylabel("Percentage (%)", fontsize=12)
    ax.set_ylim(0, 100)
    ax.legend(title=metric.name.capitalize())

    # Add totals on top of each bar
    for i, total in enumerate(totals):
        ax.text(i, 101, f"Total: {total}", ha="center", va="bottom")

    plt.tight_layout()
    plt.show()
    plt.savefig(f"comparison_{metric.name}.png")

compare_and_plot(exp_x="setting-up-ragas-annotator", exp_y="setting-up-ragas-annotator", metric=my_metric)