In [32]:
%pip install llama-index-llms-openai
!pip install llama-index

from tqdm.asyncio import tqdm_asyncio


Note: you may need to restart the kernel to use updated packages.


In [33]:
import os
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding

# Both the chat model and the embedding model are served by the same local
# Ollama instance, so the endpoint is named once and reused.
OLLAMA_BASE_URL = "http://localhost:11434"

# Chat/completion model used for answering queries (and, later, for judging).
ollama_llm = Ollama(
    model="llama3.2:latest",
    base_url=OLLAMA_BASE_URL,
    temperature=0.4,
)

# Embedding model used when building the vector index.
ollama_embedding = OllamaEmbedding(
    model_name="nomic-embed-text:latest",
    base_url=OLLAMA_BASE_URL,
    ollama_additional_kwargs={"mirostat": 0},
)

# Register both as the llama-index global defaults.
Settings.llm, Settings.embed_model = ollama_llm, ollama_embedding

In [34]:
def displayify_df(df):
    """Show ``df`` in the notebook with width-limited, word-wrapped cells.

    The frame's Styler is given the CSS properties ``inline-size: 300px`` and
    ``overflow-wrap: break-word`` and the styled object is handed to
    ``display``. Nothing is returned.
    """
    cell_css = {
        "inline-size": "300px",
        "overflow-wrap": "break-word",
    }
    styled = df.style.set_properties(**cell_css)
    display(styled)

In [35]:
from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.core.llama_pack import download_llama_pack
from llama_index.core import VectorStoreIndex

# Fetch the benchmark dataset into a local directory. The call returns the
# labelled RAG dataset together with the source documents.
rag_dataset, documents = download_llama_dataset(
    llama_dataset_class="EvaluatingLlmSurveyPaperDataset",
    download_dir="./data_survey_paper",
)

In [36]:
# Sanity check: number of source documents returned by the dataset download.
len(documents)

111

In [37]:
# Preview the first five labelled examples as a DataFrame.
rag_dataset.to_pandas().head(5)


Unnamed: 0,query,reference_contexts,reference_answer,reference_answer_by,query_by
0,What are the potential risks associated with l...,[Evaluating Large Language Models: A\nComprehe...,"According to the context information, the pote...",ai (gpt-3.5-turbo),ai (gpt-3.5-turbo)
1,How does the survey categorize the evaluation ...,[Evaluating Large Language Models: A\nComprehe...,The survey categorizes the evaluation of LLMs ...,ai (gpt-3.5-turbo),ai (gpt-3.5-turbo)
2,What are the different types of reasoning disc...,[Contents\n1 Introduction 4\n2 Taxonomy and Ro...,The different types of reasoning discussed in ...,ai (gpt-3.5-turbo),ai (gpt-3.5-turbo)
3,How is toxicity evaluated in language models a...,[Contents\n1 Introduction 4\n2 Taxonomy and Ro...,Toxicity is evaluated in language models accor...,ai (gpt-3.5-turbo),ai (gpt-3.5-turbo)
4,"In the context of specialized LLMs evaluation,...",[5.1.3 Alignment Robustness . . . . . . . . . ...,"In the context of specialized LLMs evaluation,...",ai (gpt-3.5-turbo),ai (gpt-3.5-turbo)


In [38]:
# Build a vector index over the downloaded documents, then expose it as a
# query engine; this is the RAG pipeline being evaluated.
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

In [20]:
rag_dataset_subset = rag_dataset.model_copy()
rag_dataset_subset.examples = rag_dataset.examples[:35]  # Update with the first 40 examples


# Run predictions on the subset
prediction_dataset_subset = await rag_dataset_subset.amake_predictions_with(
    predictor=query_engine, batch_size=10, show_progress=True
)

In [None]:
# Count the examples held by rag_dataset_subset and print the total.
size = len(rag_dataset_subset.examples)

print(f"Size of rag_dataset_subset: {size}")

In [None]:
# Run the query engine over every example in the full rag_dataset (not just
# the subset), 20 queries per batch, and keep the result in prediction_dataset.
prediction_dataset = await rag_dataset.amake_predictions_with(
    predictor=query_engine, batch_size=20, show_progress=True
)

In [22]:
# instantiate the judges (both evaluators are given the local Ollama LLM, not gpt-4)
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import (
    AnswerRelevancyEvaluator,
    ContextRelevancyEvaluator,
)

# One evaluator per metric, keyed by metric name; both share the same LLM.
judges = {
    "answer_relevancy": AnswerRelevancyEvaluator(llm=ollama_llm),
    "context_relevancy": ContextRelevancyEvaluator(llm=ollama_llm),
}

In [29]:
# Queue two evaluation coroutines per (example, prediction) pair: answer
# relevancy first, then context relevancy. Nothing is awaited here; the list
# is consumed by a later gather call.
eval_tasks = []
example_prediction_pairs = zip(
    rag_dataset_subset.examples, prediction_dataset_subset.predictions
)
for example, prediction in example_prediction_pairs:
    answer_task = judges["answer_relevancy"].aevaluate(
        query=example.query,
        response=prediction.response,
        sleep_time_in_seconds=1.0,
    )
    context_task = judges["context_relevancy"].aevaluate(
        query=example.query,
        contexts=prediction.contexts,
        sleep_time_in_seconds=1.0,
    )
    eval_tasks.extend((answer_task, context_task))

In [None]:
# Await the first 50 queued evaluation coroutines concurrently, with a tqdm
# progress bar.
# NOTE(review): eval_tasks holds two coroutines per example, so a 35-example
# subset queues 70 tasks; this call covers only the first 50. Presumably the
# remainder is gathered in a later cell -- confirm, otherwise those coroutines
# are never awaited.
eval_results1 = await tqdm_asyncio.gather(*eval_tasks[:50])
