In [None]:
!pip install llama-index



import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.ERROR)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

import nest_asyncio
nest_asyncio.apply()


In [3]:
import os
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding

# Chat model served from the Ollama endpoint on localhost:11434.
# (Alternative model kept for reference: "llama3.2:latest".)
ollama_llm = Ollama(
    base_url="http://localhost:11434",
    model="mistral:7b",
    temperature=0.1,
)

# Embedding model served from the same endpoint.
ollama_embedding = OllamaEmbedding(
    base_url="http://localhost:11434",
    model_name="nomic-embed-text:latest",
    ollama_additional_kwargs={"mirostat": 0},
)

# Register both as the LlamaIndex-wide defaults.
Settings.llm = ollama_llm
Settings.embed_model = ollama_embedding

In [4]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Response
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import PairwiseComparisonEvaluator
from llama_index.core.node_parser import SentenceSplitter
import pandas as pd

# Never truncate long cell text in DataFrame output. `None` is the value pandas
# documents for "unlimited"; the previous `0` only worked incidentally.
pd.set_option("display.max_colwidth", None)

In [5]:
# Pairwise judge backed by the Ollama LLM configured above; every other
# constructor option keeps its library default.
evaluator_pairwise = PairwiseComparisonEvaluator(
    llm=ollama_llm,
)

In [6]:
# Load the corpus from ./test_wiki_data/ (path is relative to the kernel's
# working directory).
wiki_reader = SimpleDirectoryReader("./test_wiki_data/")
documents = wiki_reader.load_data()

In [None]:
# Build two vector indexes over the same documents; they differ only in how
# the text is chunked before embedding.

# Index 1: chunk_size=512, chunk_overlap=64.
splitter_512 = SentenceSplitter(chunk_overlap=64, chunk_size=512)
vector_index1 = VectorStoreIndex.from_documents(
    documents,
    transformations=[splitter_512],
)

# Index 2: chunk_size=128, chunk_overlap=64.
splitter_128 = SentenceSplitter(chunk_overlap=64, chunk_size=128)
vector_index2 = VectorStoreIndex.from_documents(
    documents,
    transformations=[splitter_128],
)

In [9]:
# Engine 1 retrieves the top 2 chunks from the chunk_size=512 index; engine 2
# retrieves the top 8 from the chunk_size=128 index (2 x 512 == 8 x 128,
# presumably chosen so both see a comparable amount of context).
query_engine1 = vector_index1.as_query_engine(
    similarity_top_k=2,
)
query_engine2 = vector_index2.as_query_engine(
    similarity_top_k=8,
)

In [10]:
# define jupyter display function
def display_eval_df(query, response1, response2, eval_result) -> None:
    eval_df = pd.DataFrame(
        {
            "Query": query,
            "Reference Response (Answer 1)": response2,
            "Current Response (Answer 2)": response1,
            "Score": eval_result.score,
            "Reason": eval_result.feedback,
        },
        index=[0],
    )
    eval_df = eval_df.style.set_properties(
        **{
            "inline-size": "300px",
            "overflow-wrap": "break-word",
        },
        subset=["Current Response (Answer 2)", "Reference Response (Answer 1)"]
    )
    display(eval_df)

In [11]:
# Question under test. Other prompts kept for reference:
#   "How did New York City get its name?"
#   "Tell me about the arts and culture of NYC"
query_str = "What was the role of NYC during the American Revolution?"

# Query engine 1 first, then engine 2, keeping only the answer text.
response1, response2 = (
    str(engine.query(query_str)) for engine in (query_engine1, query_engine2)
)

In [None]:
eval_result = await evaluator_pairwise.aevaluate(
    query_str, response=response1, second_response=response2
)

In [None]:
# Show the verdict; arguments are in the same order they were evaluated.
display_eval_df(
    query=query_str,
    response1=response1,
    response2=response2,
    eval_result=eval_result,
)

In [15]:
# Second judge on the same LLM, with consensus enforcement switched off.
evaluator_pairwise_nc = PairwiseComparisonEvaluator(
    enforce_consensus=False,
    llm=ollama_llm,
)

In [16]:
eval_result = await evaluator_pairwise_nc.aevaluate(
    query_str, response=response1, second_response=response2
)

In [None]:
# Show the no-consensus verdict; arguments are in the order they were evaluated.
display_eval_df(
    query=query_str,
    response1=response1,
    response2=response2,
    eval_result=eval_result,
)

In [None]:
eval_result = await evaluator_pairwise_nc.aevaluate(
    query_str, response=response2, second_response=response1
)

In [None]:
# Show the flipped-order verdict; the answers are passed in the same swapped
# order that was used for the evaluation.
display_eval_df(
    query=query_str,
    response1=response2,
    response2=response1,
    eval_result=eval_result,
)

In [21]:
# Switch to a second question and re-query both engines (engine 1 first),
# keeping only the answer text.
query_str = "Tell me about the arts and culture of NYC"
response1, response2 = (
    str(engine.query(query_str)) for engine in (query_engine1, query_engine2)
)

In [None]:
eval_result = await evaluator_pairwise.aevaluate(
    query_str, response=response1, second_response=response2
)

In [None]:
# Show the verdict for the arts-and-culture question.
display_eval_df(
    query=query_str,
    response1=response1,
    response2=response2,
    eval_result=eval_result,
)