In [8]:
# Import Libraries

import json
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch, exceptions

import warnings
warnings.filterwarnings("ignore")

#### **Step 1. Prepare Documents**

In [2]:
# Read JSON file and prepare documents

with open ('./documents.json', 'rt') as f_in:
    doc_file = json.load(f_in)

In [3]:
documents = []

for course in doc_file:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

documents[-1]

{'text': 'Problem description\nInfrastructure created in AWS with CD-Deploy Action needs to be destroyed\nSolution description\nFrom local:\nterraform init -backend-config="key=mlops-zoomcamp-prod.tfstate" --reconfigure\nterraform destroy --var-file vars/prod.tfvars\nAdded by Erick Calderin',
 'section': 'Module 6: Best practices',
 'question': 'How to destroy infrastructure created via GitHub Actions',
 'course': 'mlops-zoomcamp'}

#### **Step 2. Create Embeddings using Pretrained Models**

In [4]:
# Load a pretrained Sentence Transformer model

model = SentenceTransformer("all-MiniLM-L6-v2")

In [11]:
len(model.encode('Hello there, how are you?'))

384

In [5]:
# Create the dense vector using the pre-trained model

vectors = []

for doc in documents:
    doc['vector'] = model.encode(doc['text']).tolist()
    vectors.append(doc)

In [6]:
vectors[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'vector': [-0.04736119881272316,
  -0.09438014775514603,
  0.0618455670773983,
  -0.008167649619281292,
  -0.01798100583255291,
  -0.08878041058778763,
  -0.006855203304439783,
  -0.03232594206929207,
  -0.11949394643306732,
  0.054699961096048355,
  0.028057672083377838,
  -0.017750833183526993,
  0.018729494884610176,
  -0.05629878118634224,
  -0.0064017451368272305,
  0.0615239255130291,
  0.047158777713775635,
  -0.0756465345621109,
  -0.007995684631168842,
  -0.00015650922432541847,
  0.03369096294045448,
  -0.040679242461919785,
  -0.02791259065270424,
  -0.03343534469604492,
  0.05897246301174164,
  0.03704631328582764,
  0.05647413432598114,
  0.03762651979923248,
  0.04742882028222084,
  0.01496149878948927,
  -0.03078882396221161,
  0.088440887629

#### **Step 3. Initiate the ElasticSearch Connection**

In [7]:
es = Elasticsearch("http://localhost:9200")
es.info()

ObjectApiResponse({'name': '0f65040b75bb', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'UfszSCGvSwOX9M_PNJzdiw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

#### **Step 4. Create Mappings and Index**

In [12]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} ,
            "text_vector": {"type": "dense_vector", "dims": 384, "index": True, "similarity": "cosine"},
        }
    }
}

In [14]:
index_name = "course-questions"

es.indices.delete(index=index_name, ignore_unavailable=True)
es.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

#### **Step 5. Add documents into index**

In [15]:
for doc in vectors:
    try:
        es.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

#### **Step 6. Create End User Query**

In [29]:
search_term = "windows or mac?"
vector_search_term = model.encode(search_term)

In [30]:
knn_query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000
}

In [33]:
response = es.search(
    index=index_name,
    knn=knn_query,
    size=5
)

In [34]:

response["hits"]["hits"]

[]