In [1]:
import json
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Name of the sentence-transformers checkpoint; the loaded model is used via
# model.encode(...) to embed both the documents and the evaluation questions.
model_name = "multi-qa-MiniLM-L6-cos-v1"
model = SentenceTransformer(model_name)

# Client for a local Elasticsearch node. Note the port is 9201, not the
# Elasticsearch default of 9200.
e_client = Elasticsearch('http://localhost:9201')

In [3]:
# Load the FAQ documents (each record carries an 'id') and the ground-truth
# question -> document mapping used for evaluation.
# The FAQ text contains non-ASCII punctuation (curly quotes), so decode it
# explicitly as UTF-8 instead of relying on the platform's locale default.
with open("../03-vector-search/documents_id.json", "rt", encoding="utf-8") as id_in:
    documents = json.load(id_in)

# One dict per row, with the keys 'question', 'course' and 'document'.
ground_truth = (pd.read_csv('../03-vector-search/ground-truth-data.csv')).to_dict(orient='records')

In [4]:
# Peek at the first ground-truth record to see its fields.
ground_truth[0]

{'question': 'On what date and time does the course commence?',
 'course': 'data-engineering-zoomcamp',
 'document': '23cb47db'}

In [5]:
# Peek at the first FAQ document to see its fields.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': '23cb47db'}

In [6]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: one sequence per query; each element is a boolean
            saying whether the result at that position is the expected
            document.

    Returns:
        A float in [0, 1]. An empty ``relevance_total`` yields 0.0 rather
        than raising ``ZeroDivisionError``.
    """
    query_count = len(relevance_total)
    if query_count == 0:
        return 0.0

    # A query counts as a hit as soon as any returned position is relevant.
    hits = sum(1 for line in relevance_total if True in line)

    return hits / query_count

In [7]:
def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: one sequence per query; each element is a boolean
            saying whether the result at that position (0-based) is the
            expected document.

    Returns:
        The mean over all queries of ``1 / rank`` of the first relevant
        result (rank is 1-based). A query with no relevant result
        contributes 0. An empty ``relevance_total`` yields 0.0 rather than
        raising ``ZeroDivisionError``.
    """
    query_count = len(relevance_total)
    if query_count == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant position counts. Without this
                # break, a result list containing the expected id more than
                # once would push the per-query score above 1.
                break

    return total_score / query_count

In [8]:
# Name of the Elasticsearch index that holds the FAQ documents and embeddings.
index_name = "general-questions-vector"

# Single-shard, no-replica index. The FAQ fields are plain text/keyword
# fields; each embedding field is a 384-dim dense vector indexed for
# cosine-similarity kNN search.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            # All three vector fields share the same mapping.
            **{
                vector_field: {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity": "cosine",
                }
                for vector_field in (
                    "text_vector",
                    "question_vector",
                    "question_text_vector",
                )
            },
        }
    },
}

# Drop any previous copy of the index so the cell can be re-run, then create it.
e_client.indices.delete(index=index_name, ignore_unavailable=True)
e_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'general-questions-vector'})

In [9]:
# Attach three embeddings to every document: the question alone, the answer
# text alone, and the two joined by a single space.
for doc in tqdm(documents):
    question, text = doc['question'], doc['text']
    qt = question + ' ' + text

    doc.update(
        question_vector=model.encode(question),
        text_vector=model.encode(text),
        question_text_vector=model.encode(qt),
    )

100%|██████████| 948/948 [02:41<00:00,  5.88it/s]


In [10]:
# Spot-check one document after the embedding step. This displays the full
# vectors that were attached, so the output is long.
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp',
 'id': 'bc44dd09',
 'question_vector': array([ 3.03588388e-03, -2.38723564e-03,  3.58816721e-02,  2.09988300e-02,
        -1.82823837e-02,  6.71509132e-02, -1.02773197e-01, -1.15095422e-01,
        -6.60675466e-02, -4.97332914e-03, -2.86176545e-03,  1.05431505e-01,
        -8.14314582e-04,  8.41836780e-02,  2.70471442e-02, -3.13537605e-02,
        -5.15432134e-02, -4.94899228e-02,  5.34984693e-02,  4.74149734e-03,
        -1.36108533e-01,  1.65415760e-02, -7.78471828e-02,  6.46223873e-02,
         3.81476022e-02, -4.09362204e-02,  3.23658325e-02, -1.70550421e-02,
         5.00197038e-02, -3.75342462e-03, -4

In [11]:
# Index every document (original fields plus the embedding fields added
# earlier) into the Elasticsearch index, one request per document.
for doc in tqdm(documents):
    e_client.index(index=index_name, document=doc)

# Elasticsearch only makes newly indexed documents searchable after a refresh.
# Force one here so that searches issued right after this cell see the full
# corpus. The trailing semicolon suppresses the response repr in the cell output.
e_client.indices.refresh(index=index_name);

100%|██████████| 948/948 [00:04<00:00, 228.86it/s]


In [13]:
def elastic_search_hybird(field, query, vector, course):
    """Run a single hybrid (kNN + keyword) search and return the hit sources.

    The request combines a kNN clause on ``field`` with a ``multi_match``
    keyword clause over the question, text and section fields. Both clauses
    carry a boost of 0.5 and are restricted to ``course``.

    Args:
        field: name of the dense-vector field to search with kNN.
        query: raw question text for the keyword clause.
        vector: query embedding for the kNN clause.
        course: course identifier used as a term filter in both clauses.

    Returns:
        A list with the ``_source`` of each returned hit (the request asks
        for 5), limited to the text, section, question, course and id fields.
    """
    def course_filter():
        # Build a fresh filter clause for each place it is used.
        return {"term": {"course": course}}

    request_body = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "boost": 0.5,
            "filter": course_filter(),
        },
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question", "text", "section"],
                        "type": "best_fields",
                        "boost": 0.5,
                    }
                },
                "filter": course_filter(),
            }
        },
        "size": 5,
        "_source": ["text", "section", "question", "course", "id"],
    }

    response = e_client.search(index=index_name, body=request_body)

    return [hit['_source'] for hit in response['hits']['hits']]

### With Reranking

In [14]:
def compute_rrf(rank, k=60):
    """Reciprocal-rank-fusion contribution of a single ranked result.

    Args:
        rank: position of the document in one result list.
        k: smoothing constant added to the rank (defaults to 60).

    Returns:
        ``1 / (k + rank)``.
    """
    denominator = k + rank
    return 1 / denominator

def elastic_search_hybrid_rrf(field, query, vector, course, k=60):
    """Hybrid search that fuses kNN and keyword results with RRF.

    Two separate searches are issued (kNN on ``field`` and a ``multi_match``
    keyword query), each returning up to 10 hits restricted to ``course``.
    Every hit contributes ``compute_rrf(rank, k)`` (rank is 1-based) to its
    document's score; documents found by both searches get both
    contributions. The 5 best-scoring documents are returned.

    Args:
        field: name of the dense-vector field to search with kNN.
        query: raw question text for the keyword search.
        vector: query embedding for the kNN search.
        course: course identifier used as a term filter in both searches.
        k: RRF smoothing constant passed to ``compute_rrf``.

    Returns:
        A list of up to 5 document ``_source`` dicts, best score first. Ties
        keep first-seen order (kNN hits before keyword-only hits).
    """
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": 10,
        "num_candidates": 10000,
        "boost": 0.5,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question", "text", "section"],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            },
            "filter": {
                "term": {
                    "course": course
                }
            }
        }
    }

    knn_results = e_client.search(
        index=index_name,
        body={
            "knn": knn_query,
            "size": 10
        }
    )['hits']['hits']

    keyword_results = e_client.search(
        index=index_name,
        body={
            "query": keyword_query,
            "size": 10
        }
    )['hits']['hits']

    rrf_scores = {}
    # Every search hit already carries the document's _source. Keep it so the
    # final results need no extra per-document GET round trips.
    sources = {}

    # Accumulate RRF contributions from the kNN list first, then the keyword
    # list. Insertion order is what breaks ties in the stable sort below.
    for hits in (knn_results, keyword_results):
        for rank, hit in enumerate(hits, start=1):
            doc_id = hit['_id']
            rrf_scores[doc_id] = rrf_scores.get(doc_id, 0.0) + compute_rrf(rank, k)
            sources.setdefault(doc_id, hit['_source'])

    # Sort RRF scores in descending order and keep the top 5 documents.
    reranked_docs = sorted(rrf_scores.items(), key=lambda item: item[1], reverse=True)

    return [sources[doc_id] for doc_id, _score in reranked_docs[:5]]


In [15]:
def question_hybird(q, vector_type, rerank):
    """Embed a ground-truth question and run the selected hybrid search.

    Args:
        q: ground-truth record; its 'question' and 'course' entries are used.
        vector_type: name of the dense-vector field to search against.
        rerank: when truthy use the RRF variant, otherwise the single-request
            hybrid search.

    Returns:
        Whatever the selected search function returns for this question.
    """
    question_text, course_name = q['question'], q['course']
    query_embedding = model.encode(question_text)

    search = elastic_search_hybrid_rrf if rerank else elastic_search_hybird
    return search(vector_type, question_text, query_embedding, course_name)

In [16]:
def evaluate(ground_truth, search_function, vector_type, rerank=False):
    """Score a search function against the ground-truth set.

    Args:
        ground_truth: records with a 'document' entry holding the expected
            document id; each record is passed to ``search_function`` as-is.
        search_function: callable invoked as
            ``search_function(record, vector_type, rerank)`` that returns a
            list of documents, each with an 'id' entry.
        vector_type: name of the dense-vector field being evaluated.
        rerank: forwarded unchanged to ``search_function``.

    Returns:
        A dict with 'vector_type', 'hit_rate' and 'mrr'.
    """
    def relevance_for(record):
        # One boolean per returned document: is it the expected one?
        expected_id = record['document']
        retrieved = search_function(record, vector_type, rerank)
        return [candidate['id'] == expected_id for candidate in retrieved]

    relevance_total = [relevance_for(record) for record in tqdm(ground_truth)]

    return {
        'vector_type': vector_type,
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [17]:
# Evaluate the RRF-reranked hybrid search once per embedding field.
vector_types = [
    "text_vector",
    "question_vector",
    "question_text_vector",
]

for vector_type in vector_types:
    result = evaluate(ground_truth, question_hybird, vector_type, rerank=True)
    print(result)

100%|██████████| 4664/4664 [04:18<00:00, 18.04it/s]


{'vector_type': 'text_vector', 'hit_rate': 0.9337478559176673, 'mrr': 0.8046919668381937}


100%|██████████| 4664/4664 [04:25<00:00, 17.56it/s]


{'vector_type': 'question_vector', 'hit_rate': 0.9339622641509434, 'mrr': 0.7852737278444839}


100%|██████████| 4664/4664 [04:24<00:00, 17.65it/s]

{'vector_type': 'question_text_vector', 'hit_rate': 0.9626929674099486, 'mrr': 0.8678173241852496}





In [18]:
# Evaluate the single-request hybrid search (no RRF reranking) once per
# embedding field.
vector_types = [
    "text_vector",
    "question_vector",
    "question_text_vector",
]

for vector_type in vector_types:
    result = evaluate(ground_truth, question_hybird, vector_type, rerank=False)
    print(result)

100%|██████████| 4664/4664 [02:55<00:00, 26.53it/s]


{'vector_type': 'text_vector', 'hit_rate': 0.9187392795883362, 'mrr': 0.8196433676386513}


100%|██████████| 4664/4664 [02:56<00:00, 26.44it/s]


{'vector_type': 'question_vector', 'hit_rate': 0.9195969125214408, 'mrr': 0.8249285305889086}


100%|██████████| 4664/4664 [02:55<00:00, 26.53it/s]

{'vector_type': 'question_text_vector', 'hit_rate': 0.9223842195540308, 'mrr': 0.8255253001715269}



