In [3]:
import json
from elasticsearch import Elasticsearch
import pandas as pd
from tqdm.auto  import tqdm
from sentence_transformers import SentenceTransformer
from openai import OpenAI

In [4]:
# No key is stored in the notebook: when `api_key` is omitted the OpenAI client
# reads it from the OPENAI_API_KEY environment variable.
open_client = OpenAI()

In [5]:
# Sentence-embedding model; `search` calls its `encode` method on the question text.
embedded_model = SentenceTransformer("all-mpnet-base-v2")

In [6]:
# Elasticsearch client pointed at the node on localhost:9200.
e_client = Elasticsearch('http://localhost:9200')

# Shown as the cell output: node/cluster/version details returned by the server.
e_client.info()

ObjectApiResponse({'name': '699e94d444a1', 'cluster_name': 'docker-cluster', 'cluster_uuid': '4No4U7IcRFSxM5CuiGnr_g', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [7]:
# Name of the Elasticsearch index that the search function in this notebook queries.
index_name = "comparative-guide-vector"

In [8]:
def compute_rrf(rank, k=60):
    """Return the reciprocal-rank-fusion contribution of one ranked hit.

    rank: position of the hit in a result list (callers in this notebook pass
          1-based positions).
    k:    constant added to the rank before taking the reciprocal.
    """
    denominator = k + rank
    return 1 / denominator

def elastic_search_hybrid_rrf(field, query, vector, k=60):
    """Hybrid (kNN + keyword) search whose results are fused with RRF.

    Two independent searches are run against `index_name`: a kNN search on
    `field` using `vector`, and a `multi_match` keyword search using `query`.
    Every returned document is scored by summing `compute_rrf(rank, k)` over
    the result lists it appears in, where `rank` is its 1-based position in
    that list.

    Args:
        field: name of the vector field the kNN search runs on.
        query: text used for the keyword search.
        vector: query embedding used for the kNN search.
        k: RRF constant forwarded to `compute_rrf`.

    Returns:
        A list with the `_source` dicts of (at most) the five highest-scoring
        documents, best first.
    """
    # NOTE: each query is executed on its own and only the resulting rank feeds
    # the fusion, so the `boost` values should not affect the fused ordering;
    # they are kept so the requests sent to Elasticsearch are unchanged.
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": 10,
        "num_candidates": 10000,
        "boost": 0.5,
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["Google_Cloud_Product^7","Google_Cloud_Product_Description^3","Service_Type"],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            }
        }
    }

    knn_results = e_client.search(
        index=index_name,
        body={
            "knn": knn_query,
            "size": 5
        }
    )['hits']['hits']

    keyword_results = e_client.search(
        index=index_name,
        body={
            "query": keyword_query,
            "size": 5
        }
    )['hits']['hits']

    rrf_scores = {}
    # The search hits already carry each document's `_source`; remember it here
    # instead of issuing one extra GET request per document afterwards.
    sources = {}

    # Vector results first, then keyword results, so documents found by both
    # accumulate the contributions of both rankings.
    for hits in (knn_results, keyword_results):
        for rank, hit in enumerate(hits, start=1):
            doc_id = hit['_id']
            rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + compute_rrf(rank, k)
            sources.setdefault(doc_id, hit['_source'])

    # Highest fused score first; `sorted` is stable, so ties keep insertion order.
    reranked_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)

    return [sources[doc_id] for doc_id, _score in reranked_docs[:5]]


In [14]:
def search(q):
    """Embed the question `q` and run the hybrid RRF search on 'General_Vector'."""
    query_vector = embedded_model.encode(q)
    return elastic_search_hybrid_rrf('General_Vector', q, query_vector)

In [10]:
# Answer-generation prompt; `build_prompt` fills the {question} and {context} placeholders.
prompt_template = """
You're a multi cloud architect i.e google cloud, AWS, Azure. 
Answer the QUESTION based on the CONTEXT from our cloud comparative guide database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()


# Rendering of a single retrieved document inside the prompt's CONTEXT section;
# `build_prompt` formats it with the document's fields as keyword arguments.
entry_template = """
Service category: {Service_Category}
Service type: {Service_Type}
Link to Documentation: {Link_to_Documentation}
Google Cloud product: {Google_Cloud_Product}
Google Cloud product description: {Google_Cloud_Product_Description}
AWS offering: {AWS_Offering}
Azure offering: {Azure_Offering}
""".strip()


def build_prompt(query, search_results):
    """Assemble the LLM prompt for `query` from the retrieved documents.

    Each document in `search_results` is rendered with `entry_template`, the
    rendered entries are separated by blank lines, and the result is placed in
    `prompt_template` together with the question.
    """
    rendered_entries = [
        entry_template.format(**doc) + "\n\n"
        for doc in search_results
    ]
    context = "".join(rendered_entries)

    filled = prompt_template.format(question=query, context=context)
    return filled.strip()

In [11]:
def llm(prompt,model):
    """Send `prompt` as a single user message to `model` and return the reply text."""
    user_message = {"role": "user", "content": prompt}
    completion = open_client.chat.completions.create(
        model=model,
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [12]:
def rag(query,model='gpt-4o-mini'):
    """Answer `query`: retrieve documents, build the prompt, and ask `model`."""
    retrieved = search(query)
    return llm(build_prompt(query, retrieved), model)

In [17]:
# Ground-truth questions used for the evaluation; the loop further down reads each row's 'question'.
df_ground_truth = pd.read_csv('../data/ground-truth-data.csv')


In [23]:
# Prompt for the LLM-as-judge step. It is filled via `str.format` with `question`
# and `answer_llm`; the doubled braces `{{ }}` come out as literal braces.
rag_eval_promt_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [18]:
# Draw 200 ground-truth rows; the fixed random_state makes the draw repeatable.
data_sample = df_ground_truth.sample(n=200,random_state=1)

In [19]:
# One dict per sampled row, keyed by column name.
sample = data_sample.to_dict(orient='records')

In [20]:
# Display the first sampled record to check its fields.
sample[0]

{'question': 'Which service provides the best support for detecting emotion in images and how do their performance metrics compare?',
 'Google_Cloud_Product': 'Vision AI',
 'document_id': '1b0928da8b53602b5d854bed08641790'}

In [21]:
# Collects one (record, answer, evaluation) tuple per evaluated question.
evaluations = []

In [24]:
# Start from an empty list so that re-running this cell does not append
# duplicate rows to results left over from a previous run.
evaluations = []

# For every sampled question: generate an answer with the RAG pipeline, then ask
# the judge model to rate that answer's relevance.
for doc in tqdm(sample):
    question = doc['question']
    answer_llm = rag(question)

    prompt = rag_eval_promt_template.format(
        question=question,
        answer_llm=answer_llm
    )

    raw_evaluation = llm(prompt,'gpt-4o-mini')
    try:
        evaluation = json.loads(raw_evaluation)
    except json.JSONDecodeError as exc:
        # The judge is asked for JSON but its reply is free text and may not
        # parse. Keep the row, using the same keys the later cells read, rather
        # than aborting the whole (long) run on one malformed reply.
        evaluation = {
            "Relevance": "PARSE_ERROR",
            "Explanation": f"Could not parse judge output ({exc}): {raw_evaluation}",
        }

    evaluations.append((doc,answer_llm,evaluation))

100%|██████████| 200/200 [24:11<00:00,  7.26s/it]  


In [25]:
# One row per evaluated question; 'record' and 'evaluation' still hold dicts at this point.
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

In [32]:
# Display the first row's ground-truth record.
df_eval.record[0]

{'question': 'Which service provides the best support for detecting emotion in images and how do their performance metrics compare?',
 'Google_Cloud_Product': 'Vision AI',
 'document_id': '1b0928da8b53602b5d854bed08641790'}

In [33]:
# Build the evaluation table from the (record, answer, evaluation) tuples, pull
# the needed fields out of the two dict-valued columns, then drop those columns.
df_eval = (
    pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])
    .assign(
        document_id=lambda frame: frame['record'].apply(lambda d: d['document_id']),
        question=lambda frame: frame['record'].apply(lambda d: d['question']),
        relevance=lambda frame: frame['evaluation'].apply(lambda d: d['Relevance']),
        explanation=lambda frame: frame['evaluation'].apply(lambda d: d['Explanation']),
    )
    .drop(columns=['record', 'evaluation'])
)

In [34]:
# Share of answers per relevance label (normalize=True yields proportions rather than counts).
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.64
PARTLY_RELEVANT    0.31
NON_RELEVANT       0.05
Name: proportion, dtype: float64

In [35]:
# Save the evaluation table to CSV, without the DataFrame index column.
df_eval.to_csv('../data/rag-eval-gpt-4o-mini.csv', index=False)