In [42]:
import opik
# Configure the Opik SDK for a non-local deployment (use_local=False).
# NOTE(review): the trace URL logged later in this notebook points at comet.com,
# so this presumably targets Comet-hosted Opik — confirm.
opik.configure(use_local=False)

OPIK: Existing Opik clients will not use updated values for "url", "api_key", "workspace".
OPIK: Opik is already configured. You can check the settings by viewing the config file at /Users/akshay/.opik.config


In [43]:
from dotenv import load_dotenv
import nest_asyncio

# Patch asyncio via nest_asyncio (presumably so async code can run inside the
# notebook's already-running event loop — confirm this is still required).
nest_asyncio.apply()
# Load environment variables from a .env file; load_llm reads GROQ_API_KEY from the environment.
load_dotenv()

True

# Setup Workflow

In [44]:
import os
from llama_index.llms.groq import Groq

def load_llm(model_option):
    """Build the Groq LLM client for the requested model option.

    Args:
        model_option: Model selector. ``"Llama-4"`` picks the Llama 4 Scout
            instruct model; every other value falls back to the DeepSeek-R1
            distilled Llama 70B model.

    Returns:
        A ``Groq`` instance created with the chosen model id and the value of
        the ``GROQ_API_KEY`` environment variable (``None`` when it is unset).
    """
    model_id = (
        "meta-llama/llama-4-scout-17b-16e-instruct"
        if model_option == "Llama-4"
        else "deepseek-r1-distill-llama-70b"
    )
    return Groq(model=model_id, api_key=os.getenv("GROQ_API_KEY"))

In [45]:
# Select the model under evaluation. load_llm maps 'Llama-4' to the Llama 4 Scout
# model and any other value to the DeepSeek-R1 distill model. The same name is
# later passed to evaluate() as the experiment name.
model_name = 'Llama-4'
# model_name = 'DeepSeek-R1'
llm  = load_llm(model_name)

# Trace RAG calls 

In [46]:
from llama_index.core import Settings
from llama_index.core.callbacks import CallbackManager
from opik.integrations.llama_index import LlamaIndexCallbackHandler

# A callback handler to automatically log all LlamaIndex operations to Opik
opik_callback_handler = LlamaIndexCallbackHandler()

# Integrate handler into LlamaIndex's settings
# (assigned on the shared Settings object, so it must run before the index/query engine is built)
Settings.callback_manager = CallbackManager([opik_callback_handler])

# Evaluation

In [47]:
from opik import Opik

# Opik client created with default arguments.
client = Opik()
# Going by the method name, this fetches the dataset called "Test dataset" or
# creates it when missing; the resulting handle is later passed to evaluate().
dataset = client.get_or_create_dataset(name="Test dataset")

In [48]:
import pandas as pd

# Load the evaluation set; the next cell reads its "Question", "Answer" and "Context" columns.
df = pd.read_csv("./eval-data/test.csv")

In [49]:
# Build the dataset records from the CSV (nothing is inserted here).
# The columns are renamed to the field names stored in the dataset items:
# Question -> input, Answer -> expected_output, Context -> context.
# Column selection + to_dict("records") avoids row-wise iterrows(), which
# constructs a Series per row and does not preserve column dtypes.
qa_pairs = (
    df[["Question", "Answer", "Context"]]
    .rename(
        columns={
            "Question": "input",
            "Answer": "expected_output",
            "Context": "context",
        }
    )
    .to_dict(orient="records")
)

# Show the first record as a sanity check.
qa_pairs[0]


{'input': 'What was the very first programming language Paul Graham used when he began learning to program on the IBM 1401?',
 'expected_output': 'He used an early version of Fortran on the IBM 1401.',
 'context': 'The language we used was an early version of Fortran. You had to type programs on punch cards, then stack them in the card reader and press a button to load the program into memory and run it.'}

Use insert if you're creating the dataset for the first time

In [50]:

# Populate the dataset with the QA pairs built in the previous cell.
# Opik's Python SDK de-duplicates identical items on insert, so re-running this
# cell (e.g. Restart & Run All) should not create duplicate rows, and a fresh
# workspace no longer ends up evaluating an empty dataset.
dataset.insert(qa_pairs)

In [51]:
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader


# Register the Groq LLM created by load_llm and the embedding model on
# LlamaIndex's shared Settings object before the index is built.
Settings.llm = llm
Settings.embed_model = FastEmbedEmbedding(model_name="nomic-ai/nomic-embed-text-v1")

# Read the source documents from ./eval-data/paul_graham and build a vector index from them.
documents = SimpleDirectoryReader("./eval-data/paul_graham").load_data()
index = VectorStoreIndex.from_documents(documents)

# Query engine with default settings; my_llm_application calls query_engine.query().
query_engine = index.as_query_engine()

OPIK: Started logging traces to the "Default Project" project at https://www.comet.com/opik/api/v1/session/redirect/projects/?trace_id=01960a26-2400-7d46-8307-f339aa10934c&path=aHR0cHM6Ly93d3cuY29tZXQuY29tL29waWsvYXBpLw==.


In [52]:
from opik import track

@track
def my_llm_application(input: str) -> str:
    """Answer a question using the notebook's RAG query engine.

    Wrapped with Opik's ``track`` decorator.

    Args:
        input: The question text passed to ``query_engine.query``.

    Returns:
        The query engine's response converted to ``str``.
    """
    return str(query_engine.query(input))

def evaluation_task(x):
    """Run the RAG application on one dataset item.

    Args:
        x: A dataset item; only its ``'input'`` entry is read.

    Returns:
        A dict with a single ``"output"`` key holding the application's answer.
    """
    answer = my_llm_application(x['input'])
    return {"output": answer}

In [53]:
from opik.evaluation.metrics import (
    Hallucination,
    AnswerRelevance,
    ContextPrecision,
    ContextRecall
)

# Define the metrics
# Each metric is instantiated with its default arguments; all four instances are
# passed to evaluate() as scoring_metrics in the next cell.
hallucination_metric = Hallucination()
answer_relevance_metric = AnswerRelevance()
context_precision_metric = ContextPrecision()
context_recall_metric = ContextRecall()

In [54]:
from opik.evaluation import evaluate

# Run every dataset item through evaluation_task and score the answers with
# the metrics defined in the previous cell. The experiment is named after the
# model under test.
evaluation = evaluate(
    dataset=dataset,
    task=evaluation_task,
    experiment_name=model_name,
    scoring_metrics=[
        hallucination_metric,
        answer_relevance_metric,
        context_precision_metric,
        context_recall_metric,
    ],
    experiment_config={
        # Record the model that actually generated the answers. This was
        # previously hard-coded to "gpt-3.5-turbo", which mislabels the experiment.
        "model": model_name,
    },
    # Process items one at a time. With the default concurrency this cell hit
    # Groq's 6000 tokens-per-minute limit (HTTP 429) on the first items.
    # Running sequentially reduces the burst; it may still not be enough on the
    # on-demand tier, in which case a higher-tier key or throttling is needed.
    task_threads=1,
)

Evaluation:   0%|          | 0/5 [00:00<?, ?it/s]Retrying llama_index.llms.openai.base.OpenAI._chat in 1.0 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jr4pf4d3fy5sdn50h7p56rqm` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Used 11726, Requested 2376. Please try again in 1m21.024s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}.
Retrying llama_index.llms.openai.base.OpenAI._chat in 1.5539399740503337 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jr4pf4d3fy5sdn50h7p56rqm` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Used 11616, Requested 2376. Please try again in 1m19.927s. Need more tokens? Upgrade to D

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jr4pf4d3fy5sdn50h7p56rqm` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Used 11451, Requested 2376. Please try again in 1m18.278s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}