In [2]:
import json
from elasticsearch import Elasticsearch
import pandas as pd
from tqdm.auto  import tqdm
from sentence_transformers import SentenceTransformer

In [3]:
# Sentence-embedding model; question_hybird() calls its encode() to embed
# the query text before the kNN search.
MODEL_NAME = "all-mpnet-base-v2"
embedded_model = SentenceTransformer(MODEL_NAME)

In [4]:
# Client for the Elasticsearch node that every search in this notebook uses.
ES_URL = 'http://localhost:9200'
e_client = Elasticsearch(ES_URL)

# Bare last expression: the cell displays the cluster info, which doubles
# as a connectivity check.
e_client.info()

ObjectApiResponse({'name': '699e94d444a1', 'cluster_name': 'docker-cluster', 'cluster_uuid': '4No4U7IcRFSxM5CuiGnr_g', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [5]:
# Name of the Elasticsearch index that elastic_search_hybrid_rrf() searches
# and fetches documents from.
index_name = "comparative-guide-vector"

In [6]:
def compute_rrf(rank, k=60):
    """Return the Reciprocal Rank Fusion contribution of one ranking.

    Args:
        rank: Position of a document in a result list (the callers in this
            notebook pass 1-based positions).
        k: Smoothing constant added to the rank. Defaults to 60.

    Returns:
        The value ``1 / (k + rank)``.
    """
    smoothed_rank = k + rank
    return 1 / smoothed_rank

def elastic_search_hybrid_rrf(field, query, vector, k=60, size=5):
    """Hybrid (kNN + keyword) search fused with Reciprocal Rank Fusion.

    Runs a kNN search and a keyword ``multi_match`` search against
    ``index_name`` as two separate requests, scores every returned document
    by summing ``compute_rrf(rank, k)`` over the result lists it appears in,
    and returns the sources of the best-scoring documents.

    The ``_source`` of each document is taken directly from the search hits,
    so no additional per-document ``get`` request is issued.

    Args:
        field: Name of the vector field targeted by the kNN query.
        query: Text matched against the keyword fields.
        vector: Query embedding sent as ``query_vector``.
        k: RRF smoothing constant forwarded to ``compute_rrf`` (this is not
            the kNN ``k``).
        size: Number of hits requested from each search and maximum number
            of documents returned. Defaults to 5.

    Returns:
        A list of at most ``size`` ``_source`` dicts, highest RRF score
        first. Ties keep first-seen order (kNN hits before keyword hits).
    """
    # The "boost" values below are sent to Elasticsearch unchanged; the
    # fusion step further down only looks at positions, never at `_score`.
    knn_query = {
        "field": field,
        "query_vector": vector,
        # Never ask for fewer neighbours than we intend to keep.
        "k": max(10, size),
        "num_candidates": 10000,
        "boost": 0.5,
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["Google_Cloud_Product^7","Google_Cloud_Product_Description^3","Service_Type"],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            }
        }
    }

    knn_results = e_client.search(
        index=index_name,
        body={
            "knn": knn_query,
            "size": size
        }
    )['hits']['hits']

    keyword_results = e_client.search(
        index=index_name,
        body={
            "query": keyword_query,
            "size": size
        }
    )['hits']['hits']

    rrf_scores = {}
    sources = {}
    # Accumulate the RRF score per document across both rankings, and keep
    # the `_source` that came back with the hit so it can be returned
    # without another round trip.
    for hits in (knn_results, keyword_results):
        for rank, hit in enumerate(hits, start=1):
            doc_id = hit['_id']
            rrf_scores[doc_id] = rrf_scores.get(doc_id, 0.0) + compute_rrf(rank, k)
            sources.setdefault(doc_id, hit['_source'])

    # Highest fused score first; sorted() is stable, so ties keep insertion
    # order.
    reranked_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)

    return [sources[doc_id] for doc_id, _score in reranked_docs[:size]]


In [13]:
def question_hybird(q):
    """Run the hybrid RRF search for a single ground-truth record.

    Args:
        q: Mapping whose 'question' entry holds the query text.

    Returns:
        The result of ``elastic_search_hybrid_rrf`` for that question,
        searching the 'General_Vector' field with the question's embedding.
    """
    text = q['question']
    return elastic_search_hybrid_rrf(
        'General_Vector',
        text,
        embedded_model.encode(text),
    )
 

In [8]:
def hit_rate(relevance_total):
    """Fraction of queries that have at least one relevant result.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            sequence of relevance flags, one per returned document.

    Returns:
        Number of queries with any truthy flag divided by the number of
        queries, or 0.0 when ``relevance_total`` is empty (instead of
        raising ``ZeroDivisionError``).
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))
    return hits / len(relevance_total)

In [9]:
def mrr(relevance_total):
    """Mean Reciprocal Rank over a set of queries.

    For each query only the first relevant result counts: it contributes
    ``1 / rank`` (rank is 1-based), and a query with no relevant result
    contributes 0. Stopping at the first hit keeps each query's
    contribution within [0, 1] even if several returned documents are
    flagged relevant.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            sequence of relevance flags ordered by result rank.

    Returns:
        The mean of the per-query reciprocal ranks, or 0.0 when
        ``relevance_total`` is empty (instead of raising
        ``ZeroDivisionError``).
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                break  # only the first relevant result counts

    return total_score / len(relevance_total)

In [10]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Args:
        ground_truth: Iterable of records; each must provide 'document_id'
            and whatever ``search_function`` reads from it.
        search_function: Callable taking one record and returning an
            iterable of documents that each expose an 'Id' entry.

    Returns:
        Dict with the keys 'hit_rate' and 'mrr', computed by ``hit_rate``
        and ``mrr`` over the per-query relevance flags.
    """
    per_query_flags = []

    for record in tqdm(ground_truth):
        expected_id = record['document_id']
        retrieved = search_function(record)
        # One flag per returned document: does it match the expected id?
        per_query_flags.append([doc['Id'] == expected_id for doc in retrieved])

    metrics = {}
    metrics['hit_rate'] = hit_rate(per_query_flags)
    metrics['mrr'] = mrr(per_query_flags)
    return metrics

In [11]:
# Load the ground-truth set and turn it into a list of per-row dicts;
# evaluate() reads 'document_id' from each record and question_hybird()
# reads 'question'.
GROUND_TRUTH_PATH = '../data/ground-truth-data.csv'
df_ground_truth = pd.read_csv(GROUND_TRUTH_PATH)
ground_truth = df_ground_truth.to_dict(orient='records')

In [14]:
# Evaluate the hybrid RRF search on every ground-truth record; the returned
# metrics dict is displayed as the cell output.
evaluate(ground_truth,question_hybird)

100%|██████████| 1101/1101 [00:53<00:00, 20.72it/s]


{'hit_rate': 0.9518619436875567, 'mrr': 0.8482742960944603}