In [2]:
# Authenticate against the Hugging Face Hub; notebook_login() renders an
# interactive login widget as the cell output.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import json
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

In [4]:
# Load the FAQ documents. Later cells read the 'question' and 'text' keys of
# every record and index the records into Elasticsearch.
# The encoding is given explicitly: JSON is UTF-8, and relying on the platform
# default can mis-decode non-ASCII characters (e.g. on Windows).
with open('documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [5]:
# Sentence-embedding model used to encode both the documents and the queries.
# NOTE: the Elasticsearch mapping defined further down declares dims=384 for
# every vector field, so the model chosen here must produce 384-dimensional
# embeddings; update the mapping if the model changes.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Attach three embeddings to every document (in place): one for the question,
# one for the answer text, and one for the two joined by a single space.
for doc in tqdm(documents):
    question, text = doc['question'], doc['text']
    qt = question + ' ' + text

    # Keyword arguments are evaluated left to right, so the encode order is
    # question, text, question+text.
    doc.update(
        question_vector=model.encode(question),
        text_vector=model.encode(text),
        question_text_vector=model.encode(qt),
    )

  0%|          | 0/948 [00:00<?, ?it/s]

In [7]:
# Client for the local Elasticsearch node.
es_client = Elasticsearch('http://localhost:9200')

index_name = "course-questions"

# Index definition: plain text/keyword fields plus one dense_vector field per
# embedding stored on each document. All three vector fields share the same
# configuration (384 dims, indexed, cosine similarity).
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            **{
                vector_field: {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity": "cosine",
                }
                for vector_field in (
                    "question_vector",
                    "text_vector",
                    "question_text_vector",
                )
            },
        }
    },
}

# Recreate the index from scratch so the cell can be re-run safely;
# ignore_unavailable avoids an error when the index does not exist yet.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [8]:
# Index every document (including the three embedding vectors added above),
# one request per document.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

### Hybrid Search: combining keyword and vector (kNN) queries

In [13]:
# Example user question and the course the search is restricted to.
query = 'I just discovered the course. Can I still join it?'
course = "data-engineering-zoomcamp"

In [14]:
# Embed the query with the same model that encoded the documents.
v_q = model.encode(query)

In [16]:
# Vector (kNN) part of the search: nearest neighbours of the query embedding in
# the text_vector field, restricted to the selected course.
knn_query = dict(
    field="text_vector",
    query_vector=v_q,
    k=5,
    num_candidates=10000,
    boost=0.5,
    filter={"term": {"course": course}},
)

In [18]:
# Keyword part of the search: best_fields multi_match over the textual fields,
# restricted to the selected course.
keyword_query = {
    "bool": {
        "must": {
            "multi_match": {
                "query": query,
                "fields": ["question", "text", "section"],
                "type": "best_fields",
                "boost": 0.5,
            }
        },
        "filter": {"term": {"course": course}},
    }
}
               

In [19]:
# Send the keyword query and the kNN query in a single search request and keep
# the top 5 hits.
response = es_client.search(
    index=index_name,
    query = keyword_query,
    knn=knn_query,
    size=5
)

In [21]:
# Inspect the raw hits.
# NOTE(review): each hit's _source includes the stored embedding vectors, which
# makes this output very long; consider restricting the returned fields.
response["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': '6UgMGpIBhwKo4aCqWRmq',
  '_score': 12.141544,
  '_source': {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
   'section': 'General course-related questions',
   'question': 'Course - Can I still join the course after the start date?',
   'course': 'data-engineering-zoomcamp',
   'id': '7842b56a',
   'question_vector': [0.003035864559933543,
    -0.0023871962912380695,
    0.03588166832923889,
    0.020998835563659668,
    -0.018282348290085793,
    0.06715094298124313,
    -0.10277319699525833,
    -0.11509545892477036,
    -0.06606756895780563,
    -0.004973354283720255,
    -0.002861752174794674,
    0.10543151944875717,
    -0.0008143138256855309,
    0.08418368548154831,
    0.02704714611172676,
    -0.03135378286242485,
    -0.0515432171523571,
    -0.04948994889855385

### Evaluating Hybrid Search

In [23]:
# Ground-truth evaluation set; the sample record shown below has the columns
# 'question', 'course' and 'document'.
df_ground_truth = pd.read_csv('ground-truth-data.csv')

In [24]:
# One dict per CSV row, convenient for iterating in the evaluation loop.
ground_truth = df_ground_truth.to_dict(orient='records')

In [25]:
# Peek at the first ground-truth record.
ground_truth[0]

{'question': 'When does the course begin?',
 'course': 'data-engineering-zoomcamp',
 'document': 'c02e79ef'}

In [26]:
def hit_rate(relevance_total):
    """Return the fraction of queries with at least one relevant result.

    Args:
        relevance_total: sequence with one entry per query; each entry is a
            sequence of booleans, True where the result at that position is
            the expected document.

    Returns:
        Number of entries containing True divided by the number of entries,
        or 0.0 when ``relevance_total`` is empty (instead of raising
        ZeroDivisionError).
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

In [27]:
def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For each query only the FIRST relevant result counts: it contributes
    1 / rank (rank starting at 1). Queries with no relevant result contribute
    0. Stopping at the first hit keeps the score within [0, 1] even if a
    relevance line contains several True values.

    Args:
        relevance_total: sequence with one entry per query; each entry is a
            sequence of booleans ordered by result rank.

    Returns:
        Sum of the reciprocal ranks divided by the number of queries, or 0.0
        when ``relevance_total`` is empty (instead of raising
        ZeroDivisionError).
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant result defines the reciprocal rank.
                break

    return total_score / total

In [44]:
def elastic_search_hybrid(field, query, vector, course, num_results=5,
                          num_candidates=10000, knn_boost=0.5,
                          keyword_boost=0.5):
    """Run a hybrid (keyword + kNN) search restricted to one course.

    A single search request is sent to the module-level ``es_client`` against
    ``index_name``, containing both a kNN clause on ``field`` and a
    ``multi_match`` keyword clause over the question/text/section fields.
    Both clauses are filtered by ``course``.

    Args:
        field: name of the dense_vector field to search with kNN.
        query: query text for the keyword clause.
        vector: query embedding for the kNN clause.
        course: value the ``course`` field must match.
        num_results: kNN ``k`` and the number of hits returned (default 5).
        num_candidates: kNN ``num_candidates`` (default 10000).
        knn_boost: boost applied to the kNN clause (default 0.5).
        keyword_boost: boost applied to the keyword clause (default 0.5).

    Returns:
        List of the hits' ``_source`` dicts, limited to the fields
        ``text``, ``section``, ``question`` and ``id``.
    """
    # The same course restriction is applied to both clauses.
    course_filter = {"term": {"course": course}}

    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": num_results,
        "num_candidates": num_candidates,
        "boost": knn_boost,
        "filter": course_filter,
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question", "text", "section"],
                    "type": "best_fields",
                    "boost": keyword_boost,
                }
            },
            "filter": course_filter,
        }
    }

    search_query = {
        "knn": knn_query,
        "query": keyword_query,
        "size": num_results,
        # Leave the stored vectors out of the response; only these fields are needed.
        "_source": ["text", "section", "question", "id"],
    }

    es_response = es_client.search(index=index_name, body=search_query)

    return [hit["_source"] for hit in es_response["hits"]["hits"]]

In [39]:
def question_text_hybrid(q):
    """Hybrid search for a ground-truth record using the question+text vectors.

    Reads ``q['question']`` and ``q['course']``, embeds the question with the
    module-level ``model`` and searches the ``question_text_vector`` field.
    """
    query_text, course_name = q['question'], q['course']
    embedding = model.encode(query_text)
    return elastic_search_hybrid(
        'question_text_vector', query_text, embedding, course_name
    )

In [40]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth set.

    Args:
        ground_truth: iterable of records; each record has a 'document' key
            holding the expected document id and is passed unchanged to
            ``search_function``.
        search_function: callable taking one record and returning a list of
            result dicts, each with an 'id' key.

    Returns:
        Dict with the 'hit_rate' and 'mrr' computed over all records.
    """
    per_query_relevance = []

    for record in tqdm(ground_truth):
        expected_id = record['document']
        hits = search_function(record)
        # One boolean per returned result, in rank order.
        per_query_relevance.append([hit['id'] == expected_id for hit in hits])

    metrics = {
        'hit_rate': hit_rate(per_query_relevance),
        'mrr': mrr(per_query_relevance),
    }
    return metrics

In [45]:
# Hybrid search using the combined question+text embedding for the kNN part.
evaluate(ground_truth, question_text_hybrid)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.9250054030689432, 'mrr': 0.8506231539514445}

In [46]:
def text_hybrid(q):
    """Hybrid search for a ground-truth record using the text vectors.

    Reads ``q['question']`` and ``q['course']``, embeds the question with the
    module-level ``model`` and searches the ``text_vector`` field.
    """
    query_text, course_name = q['question'], q['course']
    embedding = model.encode(query_text)
    return elastic_search_hybrid(
        'text_vector', query_text, embedding, course_name
    )

In [47]:
# Hybrid search using the answer-text embedding for the kNN part.
evaluate(ground_truth, text_hybrid)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.9234925437648585, 'mrr': 0.8461710251422809}

In [48]:
def question_hybrid(q):
    """Hybrid search for a ground-truth record using the question vectors.

    Reads ``q['question']`` and ``q['course']``, embeds the question with the
    module-level ``model`` and searches the ``question_vector`` field.
    """
    query_text, course_name = q['question'], q['course']
    embedding = model.encode(query_text)
    return elastic_search_hybrid(
        'question_vector', query_text, embedding, course_name
    )

In [49]:
# Hybrid search using the question embedding for the kNN part.
evaluate(ground_truth, question_hybrid)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.9234925437648585, 'mrr': 0.8481665586052878}