In [2]:
import json


# Load the FAQ records that the rest of the notebook indexes and searches.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the read
# does not depend on the platform's locale default.
with open("documents_id.json", "rt", encoding="utf-8") as id_in:
    documents = json.load(id_in)

In [3]:
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp',
 'id': '162ef410'}

In [4]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node at http://localhost:9200; every indexing
# and search call in this notebook goes through it.
e_client = Elasticsearch('http://localhost:9200')

# Last expression of the cell, so the info() response is displayed as output.
e_client.info()

ObjectApiResponse({'name': '5be13035b0f9', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'sGmOkZVAT3GhdEu0nrOzMg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [5]:
# Name of the index used by both the indexing loop and elastic_search().
index_name = "course-questions"

# Index definition passed to indices.create(): one shard, no replicas, and an
# explicit mapping for the five fields each document carries.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            # Fields queried with multi_match in elastic_search().
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # "course" is the field elastic_search() filters on with a term query.
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},


        }
    }
}

# Drop and re-create the index so re-running the cell starts from an empty
# index. NOTE(review): ignore_unavailable=True presumably keeps the delete from
# failing when the index does not exist yet - confirm against the client docs.
e_client.indices.delete(index=index_name,ignore_unavailable=True)
e_client.indices.create(index=index_name,body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [6]:
from tqdm.auto import tqdm


# Index the FAQ records one request at a time, with a progress bar.
# Indexing is best-effort: a failing record is reported together with its id
# (so it can be retried) and the loop carries on. Any failure leaves the index
# incomplete, which affects every search run against it afterwards.
for doc in tqdm(documents, desc="indexing"):
    try:
        e_client.index(index=index_name, document=doc)
    except Exception as e:
        print(f"failed to index document {doc.get('id')!r}: {e}")

In [7]:
def elastic_search(query, course):
    """Search the Elasticsearch index for documents matching a question.

    Args:
        query: text matched against the "question", "text" and "section"
            fields ("question" carries a ^3 boost) with a best_fields
            multi_match.
        course: value the "course" field must equal (term filter).

    Returns:
        A list with the ``_source`` of each hit, in the order the response
        lists them; the request asks for at most 5 hits.
    """
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text", "section"],
            "type": "best_fields",
        }
    }
    course_filter = {"term": {"course": course}}
    request_body = {
        "size": 5,
        "query": {"bool": {"must": text_match, "filter": course_filter}},
    }

    response = e_client.search(index=index_name, body=request_body)
    return [hit["_source"] for hit in response["hits"]["hits"]]

In [8]:
elastic_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp"
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': 'bc44dd09'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': 'ff75b05e'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

In [9]:
import pandas as pd

In [10]:
# Read the ground-truth set from CSV.
df_ground_truth = pd.read_csv('ground-truth-data.csv')


# Convert to a list of dicts (orient='records'); the evaluation loops read the
# 'question', 'course' and 'document' keys of each record.
ground_truth = df_ground_truth.to_dict(orient='records')

In [12]:
ground_truth[0]

{'question': 'On what date and time does the course commence?',
 'course': 'data-engineering-zoomcamp',
 'document': '23cb47db'}

In [17]:
# One entry per ground-truth record: a list of booleans, one per document
# returned by elastic_search (in returned order), True where the returned
# document's id equals the record's expected 'document' id.
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = elastic_search(query=q['question'],course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)


100%|██████████| 4664/4664 [00:07<00:00, 642.73it/s] 


In [20]:
# Hand-written relevance matrix with the same shape as the evaluation loops
# produce: 13 rows of 5 booleans each.
# NOTE(review): presumably a small fixture for sanity-checking hit_rate / mrr;
# no visible cell passes it to either function - confirm it is still needed.
test = [[False, False, False, False, True],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, True, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, True],
 [True, False, False, False, False],
 [False, True, False, False, False]]

In [21]:
len(test)

13

In [22]:
def hit_rate(relevance_total):
    """Return the fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: sequence with one entry per query; each entry is a
            sequence of booleans, True where the returned document was the
            expected one.

    Returns:
        The number of entries containing at least one True divided by the
        number of entries, or 0.0 when ``relevance_total`` is empty.
    """
    if len(relevance_total) == 0:
        # No queries were evaluated: report 0.0 instead of dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)


In [25]:
def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    For each query only the first relevant result counts: it contributes
    1 / rank (rank starting at 1). A query with no relevant result
    contributes 0.

    Args:
        relevance_total: sequence with one entry per query; each entry is a
            sequence of booleans in result order, True where the returned
            document was the expected one.

    Returns:
        The sum of the per-query reciprocal ranks divided by the number of
        queries, or 0.0 when ``relevance_total`` is empty.
    """
    if len(relevance_total) == 0:
        # No queries were evaluated: report 0.0 instead of dividing by zero.
        return 0.0

    score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                score += 1 / rank
                # Stop at the first relevant hit; counting later ones as well
                # would let a single query contribute more than 1.
                break

    return score / len(relevance_total)

In [24]:
hit_rate(relevance_total)

0.7420668953687821

In [26]:
mrr(relevance_total)

0.599356775300172

In [28]:
import minsearch

# In-memory minsearch index over the same documents, declaring the same three
# text fields and two keyword fields as the Elasticsearch mapping.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

# Fit the index on the loaded documents; as the last expression of the cell,
# the call's return value is displayed as output.
index.fit(documents)

<minsearch.Index at 0x106bc27d0>

In [29]:
def minsearch_search(query, course):
    """Search the minsearch index for documents matching a question.

    Args:
        query: the question text to search for.
        course: value passed as the 'course' entry of the filter dict.

    Returns:
        Whatever ``index.search`` returns for this query, requested with
        ``num_results=5`` and boosts of 3.0 for 'question' and 0.5 for
        'section' (no explicit boost is given for 'text').
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5,
    )

In [30]:
# Same relevance computation as for Elasticsearch, but using minsearch_search:
# one list of booleans per ground-truth record, True where a returned
# document's id equals the record's expected 'document' id.
relevance_total_m = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total_m.append(relevance)

100%|██████████| 4664/4664 [00:05<00:00, 868.06it/s]


In [31]:
hit_rate(relevance_total_m)

0.7800171526586621

In [32]:
mrr(relevance_total_m)

0.664461835334477

In [34]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Args:
        ground_truth: iterable of records; each record's 'document' entry is
            the id of the document the search is expected to return.
        search_function: callable that takes one record and returns an
            iterable of documents, each with an 'id' entry.

    Returns:
        A dict with 'hit_rate' and 'mrr', both computed over the per-record
        relevance lists.
    """
    def relevance_for(record):
        # Read the expected id before running the search, then flag each
        # returned document that matches it.
        expected_id = record['document']
        return [found['id'] == expected_id for found in search_function(record)]

    relevance_total = [relevance_for(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [35]:
evaluate(ground_truth,lambda q: elastic_search(query=q['question'], course=q['course']))

100%|██████████| 4664/4664 [00:06<00:00, 721.21it/s] 


{'hit_rate': 0.7420668953687821, 'mrr': 0.599356775300172}

In [36]:
evaluate(ground_truth,lambda q: minsearch_search(query=q['question'], course=q['course']))

100%|██████████| 4664/4664 [00:05<00:00, 816.13it/s]


{'hit_rate': 0.7800171526586621, 'mrr': 0.664461835334477}