In [None]:
%pip install llama-index-llms-openai
!pip install llama-index

from tqdm.asyncio import tqdm_asyncio

import logging
import sys

# Route log records of level DEBUG and above to the notebook output.
# force=True replaces whatever handlers are already on the root logger, so the
# root logger ends up with exactly one stdout handler and re-running this cell
# does not stack further handlers (which would print every record repeatedly).
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)

# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably required because the notebook kernel already runs a
# loop while later cells drive async llama-index calls — confirm.
import nest_asyncio
nest_asyncio.apply()


In [None]:
import os
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding

# Single local Ollama endpoint shared by the LLM and the embedding model.
OLLAMA_BASE_URL = "http://localhost:11434"

# Generation model. Alternative noted by the author: model="llama3.2:latest".
ollama_llm = Ollama(
    model="mistral:7b",
    base_url=OLLAMA_BASE_URL,
    temperature=0.4,
)

# Embedding model.
ollama_embedding = OllamaEmbedding(
    model_name="nomic-embed-text:latest",
    base_url=OLLAMA_BASE_URL,
    ollama_additional_kwargs={"mirostat": 0},
)

# Install both on the global llama-index Settings object.
Settings.llm = ollama_llm
Settings.embed_model = ollama_embedding

In [28]:
def displayify_df(df):
    """Show ``df`` in the notebook with fixed-width, word-wrapped cells.

    Args:
        df: Object exposing a pandas-style ``.style`` accessor (a DataFrame).

    Returns:
        None. The styled frame is passed to the notebook's ``display``.
    """
    cell_css = {"inline-size": "300px", "overflow-wrap": "break-word"}
    styled = df.style.set_properties(**cell_css)
    display(styled)

In [None]:
from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.core.llama_pack import download_llama_pack
from llama_index.core import VectorStoreIndex

# Fetch the benchmark dataset by name into a local directory. The call returns
# the labelled RAG dataset together with its source documents.
DATASET_NAME = "EvaluatingLlmSurveyPaperDataset"
DATASET_DIR = "./data_survey_paper"
rag_dataset, documents = download_llama_dataset(DATASET_NAME, DATASET_DIR)

In [None]:
# Number of source documents returned by the dataset download.
len(documents)

In [None]:
# Preview the first five rows of the dataset in tabular form.
rag_dataset.to_pandas().head(5)


In [None]:
# Build a vector index over the downloaded documents, then expose it as the
# query engine that will answer the benchmark questions.
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

In [None]:
# Number of leading benchmark examples to run predictions and evaluation on.
SUBSET_SIZE = 15

# Work on a copy so that reassigning .examples does not alter rag_dataset.
rag_dataset_subset = rag_dataset.model_copy()
rag_dataset_subset.examples = rag_dataset.examples[:SUBSET_SIZE]


# Run predictions on the subset
prediction_dataset_subset = rag_dataset_subset.make_predictions_with(
    predictor=query_engine, batch_size=10, show_progress=True
)

In [None]:
# Sanity check: count the examples held by rag_dataset_subset and report it.
size = len(rag_dataset_subset.examples)

print(f"Size of rag_dataset_subset: {size}")

In [None]:

#prediction_dataset = await rag_dataset.amake_predictions_with(
#    predictor=query_engine, batch_size=20, show_progress=True
#)

In [36]:
# instantiate the judges (both are backed by the local Ollama LLM, not GPT-4)
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import (
    AnswerRelevancyEvaluator,
    ContextRelevancyEvaluator,
)

# One evaluator per metric, keyed by metric name. Both use ollama_llm as the
# judging model.
judges = {
    "answer_relevancy": AnswerRelevancyEvaluator(llm=ollama_llm),
    "context_relevancy": ContextRelevancyEvaluator(llm=ollama_llm),
}

In [None]:
eval_tasks = []
for example, prediction in zip(
    rag_dataset_subset.examples, prediction_dataset_subset.predictions
):
    eval_tasks.append(
        judges["answer_relevancy"].evaluate(
            query=example.query,
            response=prediction.response,
            sleep_time_in_seconds=1.0,
        )
    )
    eval_tasks.append(
        judges["context_relevancy"].evaluate(
            query=example.query,
            contexts=prediction.contexts,
            sleep_time_in_seconds=1.0,
        )
    )

In [None]:
eval_results1 = tqdm_asyncio.gather(*eval_tasks[:20])


In [None]:
eval_results2 = tqdm_asyncio.gather(*eval_tasks[20:])

In [43]:
eval_results = eval_results1 + eval_results2

In [None]:
# eval_results alternates answer/context results, so even positions belong to
# answer relevancy and odd positions to context relevancy.
evals = dict(
    answer_relevancy=eval_results[0::2],
    context_relevancy=eval_results[1::2],
)

In [20]:
from llama_index.core.evaluation.notebook_utils import get_eval_results_df
import pandas as pd

# For each metric keep two frames: the per-example results ("deep") and the
# aggregated mean. Every result row is labelled "baseline".
deep_dfs, mean_dfs = {}, {}
for metric, metric_results in evals.items():
    baseline_names = ["baseline"] * len(metric_results)
    deep_df, mean_df = get_eval_results_df(
        names=baseline_names,
        results_arr=metric_results,
        metric=metric,
    )
    deep_dfs[metric], mean_dfs[metric] = deep_df, mean_df

In [None]:
# Stack the per-metric mean frames into one table, indexed by the former
# "index" column, and label that index "metrics".
mean_scores_df = pd.concat(
    [frame.reset_index() for frame in mean_dfs.values()],
    axis=0,
    ignore_index=True,
).set_index("index")
mean_scores_df.index = mean_scores_df.index.set_names(["metrics"])
mean_scores_df

In [None]:
# How often each distinct value occurs in the answer-relevancy "scores" column.
deep_dfs["answer_relevancy"]["scores"].value_counts()

In [None]:
# How often each distinct value occurs in the context-relevancy "scores" column.
deep_dfs["context_relevancy"]["scores"].value_counts()

In [None]:
# Show the first two context-relevancy rows with wrapped cells for readability.
displayify_df(deep_dfs["context_relevancy"].head(2))

In [None]:
# Inspect up to five context-relevancy rows whose score is below 1.
context_scores_df = deep_dfs["context_relevancy"]
cond = context_scores_df["scores"] < 1
displayify_df(context_scores_df[cond].head(5))