# Rag Lab

The aim of this notebook is to evaluate the retrieval and RAG model
- Retrieval evaluation
- RAG evaluation

In [1]:
from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd
from tqdm import tqdm

In [2]:
# Create previously the environment variable OPENAI_API_KEY with your personal OpenAI API key
# in the command line or in a .env file
load_dotenv()

client = OpenAI()

### Data indexing: Elasticsearch

In [3]:
df = pd.read_csv('../../data/data.csv')

In [4]:
documents = df[['id', 'title', 'tags', 'description']].to_dict('records')

In [5]:
# Run ElasticSearch previously in docker
# docker run -it \
#     --rm \
#     --name elasticsearch \
#     -m 4GB \
#     -p 9200:9200 \
#     -p 9300:9300 \
#     -e "discovery.type=single-node" \
#     -e "xpack.security.enabled=false" \
#     docker.elastic.co/elasticsearch/elasticsearch:8.4.3

from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 
es_client.info()

ObjectApiResponse({'name': '52a926cb09df', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'lGJGYGlNREaNO19N8Hgfzw', 'version': {'number': '8.5.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'c94b4700cda13820dad5aa74fae6db185ca5c304', 'build_date': '2022-10-24T16:54:16.433628434Z', 'build_snapshot': False, 'lucene_version': '9.4.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [6]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "description": {"type": "text"},
            "tags": {"type": "text"},
            "id": {"type" : "keyword"},
        }
    }
}

index_name = "youtube-titles"

try:
    es_client.indices.create(index=index_name, body=index_settings)
except:
    es_client.options(ignore_status=[400,404]).indices.delete(index=index_name)
    es_client.indices.create(index=index_name, body=index_settings)

In [7]:
from tqdm import tqdm 
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 200/200 [00:10<00:00, 20.00it/s]


## Retrieval evaluation

In [8]:
df_queries = pd.read_csv('../../data/ground-truth-retrieval.csv')

In [9]:
df_queries.shape

(1000, 2)

In [10]:
df_queries.head()

Unnamed: 0,id,query
0,39,best Pokémon Let's Go Pikachu trailer reaction
1,39,Pokémon Let's Go Eevee gameplay review
2,39,Pokémon Let's Go features explained
3,39,how to catch rare Pokémon in Let's Go
4,39,ultimate guide to Pokémon Let's Go on Nintendo...


In [11]:
ground_truth = df_queries.to_dict(orient='records')

In [12]:
ground_truth[0]

{'id': 39, 'query': "best Pokémon Let's Go Pikachu trailer reaction"}

In [13]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [14]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [15]:
def search_evaluation(query):
    
    query = query['query']

    search_query = {
        "size": 10,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["description", "text", "tags"],
                    }
                },
            }
        }
    }

    search_results = es_client.search(index=index_name, body=search_query)
    return [r['_source'] for r in search_results['hits']['hits']]

In [16]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [17]:
evaluate(ground_truth=ground_truth, search_function=search_evaluation)

100%|██████████| 1000/1000 [00:50<00:00, 19.93it/s]


{'hit_rate': 0.941, 'mrr': 0.8764996031746036}

## RAG evaluation

In [18]:
prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given query.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

query: {query}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [19]:
prompt_template = """
You're a professional youtuber. Answer with a youtube video title to the QUERY which is based on the CONTEXT from the video database.
Use only the facts from the CONTEXT when answering the QUERY

QUERY:
{query}

CONTEXT:
{context}
""".strip()

entry_template = """
video_title: {title},
video_description: {description},
video_tags: {tags}
""".strip()

def build_prompt(query, search_results, prompt_template, entry_template):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(id=doc["id"],
                                                  title=doc["title"],
                                                  description=doc["description"],
                                                  tags=doc["tags"]) + "\n\n"
        
    prompt = prompt_template.format(query=query, context=context).strip()
    return prompt

def search(query):
    search_query = {
        "size": 10,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["description", "text", "tags"],
                        "type": "best_fields"
                    }
                },
            }
        }
    }

    search_results = es_client.search(index=index_name, body=search_query)
    return [r['_source'] for r in search_results['hits']['hits']]

def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

def rag(query, model='gpt-4o-mini'):
    search_results = search(query=query)
    prompt = build_prompt(query=query,
                        search_results=search_results,
                        entry_template=entry_template,
                        prompt_template=prompt_template)
    answer = llm(prompt=prompt, model=model)
    return answer

### gpt-4o-mini

In [20]:
import json

evaluations = []

for record in tqdm(ground_truth):
    query = record['query']
    answer_llm = rag(query, model='gpt-4o-mini')
    
    prompt = prompt2_template.format(
        query=query,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt, model='gpt-4o-mini')
    evaluation = json.dumps(evaluation)
    evaluation = json.loads(fr"{evaluation}")

    evaluations.append((record, answer_llm, evaluation))

100%|██████████| 1000/1000 [34:29<00:00,  2.07s/it]


In [22]:
df_eval.head()

Unnamed: 0,record,answer,evaluation,id,query
0,"{'id': 39, 'query': 'best Pokémon Let's Go Pik...","**""Epic Reaction to Pokémon: Let's Go, Pikachu...","{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",39,best Pokémon Let's Go Pikachu trailer reaction
1,"{'id': 39, 'query': 'Pokémon Let's Go Eevee ga...","""Pokémon Let's Go Eevee: A Fresh Take on a Cla...","{\n ""Relevance"": ""PARTLY_RELEVANT"",\n ""Expla...",39,Pokémon Let's Go Eevee gameplay review
2,"{'id': 39, 'query': 'Pokémon Let's Go features...",**Title:** Discover the Magic of Pokémon Let's...,"{\n ""Relevance"": ""PARTLY_RELEVANT"",\n ""Expla...",39,Pokémon Let's Go features explained
3,"{'id': 39, 'query': 'how to catch rare Pokémon...","""Ultimate Guide to Catching Rare Pokémon in Le...","{\n ""Relevance"": ""PARTLY_RELEVANT"",\n ""Expla...",39,how to catch rare Pokémon in Let's Go
4,"{'id': 39, 'query': 'ultimate guide to Pokémon...","""Ultimate Guide to Pokémon: Let's Go, Pikachu!...","{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",39,ultimate guide to Pokémon Let's Go on Nintendo...


In [25]:
import ast

df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])
df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['query'] = df_eval.record.apply(lambda d: d['query'])
df_eval['relevance'] = df_eval.evaluation.apply(lambda d: ast.literal_eval(d)['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: ast.literal_eval(d)['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [26]:
df_eval.head()

Unnamed: 0,answer,id,query,relevance,explanation
0,"**""Epic Reaction to Pokémon: Let's Go, Pikachu...",39,best Pokémon Let's Go Pikachu trailer reaction,RELEVANT,The generated answer succinctly captures a pos...
1,"""Pokémon Let's Go Eevee: A Fresh Take on a Cla...",39,Pokémon Let's Go Eevee gameplay review,PARTLY_RELEVANT,The generated answer includes the title 'Pokém...
2,**Title:** Discover the Magic of Pokémon Let's...,39,Pokémon Let's Go features explained,PARTLY_RELEVANT,The generated answer includes a title suggesti...
3,"""Ultimate Guide to Catching Rare Pokémon in Le...",39,how to catch rare Pokémon in Let's Go,PARTLY_RELEVANT,The generated answer provides a title that sug...
4,"""Ultimate Guide to Pokémon: Let's Go, Pikachu!...",39,ultimate guide to Pokémon Let's Go on Nintendo...,RELEVANT,The generated answer directly addresses the qu...


In [27]:
df_eval.relevance.value_counts(normalize=True)

relevance
PARTLY_RELEVANT    0.599
RELEVANT           0.330
NON_RELEVANT       0.071
Name: proportion, dtype: float64

In [28]:
df_eval.relevance.value_counts()

relevance
PARTLY_RELEVANT    599
RELEVANT           330
NON_RELEVANT        71
Name: count, dtype: int64