In [35]:
# Import Libraries

import json
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch, exceptions

import warnings
warnings.filterwarnings("ignore")

#### **Step 1. Prepare Documents**

In [2]:
# Read JSON file and prepare documents

with open ('./documents.json', 'rt') as f_in:
    doc_file = json.load(f_in)

In [3]:
documents = []

for course in doc_file:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

documents[-1]

{'text': 'Problem description\nInfrastructure created in AWS with CD-Deploy Action needs to be destroyed\nSolution description\nFrom local:\nterraform init -backend-config="key=mlops-zoomcamp-prod.tfstate" --reconfigure\nterraform destroy --var-file vars/prod.tfvars\nAdded by Erick Calderin',
 'section': 'Module 6: Best practices',
 'question': 'How to destroy infrastructure created via GitHub Actions',
 'course': 'mlops-zoomcamp'}

#### **Step 2. Create Embeddings using Pretrained Models**

In [4]:
# Load a pretrained Sentence Transformer model

model = SentenceTransformer("all-mpnet-base-v2")

In [5]:
len(model.encode('Hello there, how are you?')) # No. of dimensions

768

In [17]:
# Create the dense vector using the pre-trained model

vectors = []
for doc in documents:
    doc['vector'] = model.encode(doc['text']).tolist()
    vectors.append(doc)

In [34]:
# vectors[1]

#### **Step 3. Initiate the ElasticSearch Connection**

In [8]:
es = Elasticsearch("http://localhost:9200")
es.info()

ObjectApiResponse({'name': '53a3780b3782', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'IpunYg9bR72IX4hI3cLCkg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

#### **Step 4. Create Mappings and Index**

In [19]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} ,
            "vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"},
        }
    }
}

In [20]:
index_name = "course-questions"

es.indices.delete(index=index_name, ignore_unavailable=True)
es.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

#### **Step 5. Add documents into index**

In [21]:
for doc in vectors:
    try:
        es.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

#### **Step 6. Create End User Query**

In [25]:
search_term = "which os is recommended for this course"
vector_search_term = model.encode(search_term)

In [26]:
query = {
    "field": "vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000
}

In [30]:
results = es.search(index=index_name,
                    knn=query,
                    source=["text", "section", "question", "course"])
results["hits"]

{'total': {'value': 5, 'relation': 'eq'},
 'max_score': 0.8581744,
 'hits': [{'_index': 'course-questions',
   '_id': 'pF2LppABJ3hVyfZqZVRc',
   '_score': 0.8581744,
   '_source': {'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
    'course': 'data-engineering-zoomcamp',
    'section': 'General course-related questions',
    'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'}},
  {'_index': 'course-questions',
   '_id': 'l12LppABJ3hVyfZqZFQQ',
   '_score': 0.7520733,
   '_source': {'question': 'Environment - Do we really have to use GitHub codespaces? I already have PostgreSQL & Docker installed.',
    'course': 'data-engineering-zoomcamp',
    'section': 'General course-related questions',
    'text': "It's up to you which platform and environment you use for the course.\nGithub codespaces or GCP VM are just possible options, but you can do the entire course from your laptop."}},
  {'_index':

##### *Keyword Searches*

In [37]:
# Specified matches

knn_query = {
    "field": "vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000
}

response = es.search(
    index=index_name,
    query={
        "match": {"course": "data-engineering-zoomcamp"},
    },
    knn=knn_query,
    size=5,
    explain=True)  # add score explanation

response["hits"]

{'total': {'value': 436, 'relation': 'eq'},
 'max_score': 1.6370883,
 'hits': [{'_shard': '[course-questions][0]',
   '_node': '4JW8LGyWSV2u1-rIZRezrg',
   '_index': 'course-questions',
   '_id': 'pF2LppABJ3hVyfZqZVRc',
   '_score': 1.6370883,
   '_source': {'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully',
    'section': 'General course-related questions',
    'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
    'course': 'data-engineering-zoomcamp',
    'vector': [-0.026965461671352386,
     -0.000626126304268837,
     -0.01662949100136757,
     0.05285150930285454,
     0.05476527288556099,
     -0.03133990615606308,
     0.029942581430077553,
     -0.04808562621474266,
     0.04467551037669182,
     0.005839474033564329,
     0.016233040019869804,
     0.012001154012978077,
     -0.031222281977534294,
     0.016600528731942177,
     -0.04886901378631592,
     -0.06496307998895645,
     0.