In [4]:
import json
from elasticsearch import Elasticsearch
from openai import OpenAI
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from pymilvus import connections, db
from dotenv import load_dotenv
import os
import time
import json
from tqdm import tqdm
# Load settings from a local .env file into the process environment so that
# os.getenv(...) calls further down can read them (recorded output: True).
load_dotenv()

True

## STEP 1 - Prepare Documents

In [5]:
# Read the raw FAQ export. The texts contain non-ASCII characters (curly
# quotes, etc.), so decode explicitly as UTF-8 instead of relying on the
# platform's default encoding, which differs between operating systems.
with open('../documents.json', encoding='utf-8') as file:
    data = json.load(file)

In [141]:
# Tag every document of the first course entry with a 1-based sequential
# `document_id` and with the name of the course it belongs to.
# The documents are modified in place, so `data` itself carries the new keys.
course_name = data[0]['course']
for i, document in enumerate(data[0]['documents'], start=1):
    document['document_id'] = i
    document['course'] = course_name

In [8]:
# Work only with the document list of the first course entry.
json_data = data[0]['documents']

In [9]:
# Peek at the first document.
# NOTE(review): the recorded output shows no `document_id`/`course` keys; the
# execution counts suggest this cell last ran before the tagging cell.
json_data[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?'}

In [145]:
# Number of documents in the selected course (435 in the recorded run).
len(json_data)

435

In [146]:
# Persist the tagged documents so later steps can reload them from disk.
with open('data.json', 'w') as out_file:
    json.dump(json_data, out_file)

### Creating the Ground Truth Dataset

In [16]:
# Read the OpenAI API key from the environment; os.getenv returns None when
# the variable is not set.
OpenAI_KEY = os.getenv("OPEN_AI_API_KEY")

In [147]:
# OpenAI client used for question generation.
# NOTE(review): the name `client` is rebound to a MilvusClient in a later cell.
client = OpenAI(
    api_key = OpenAI_KEY,
)

In [148]:
def create_groundtruth(data, client):
    """Ask the chat model for questions about each document and collect them.

    NOTE: a later cell in this notebook defines a function with the same name;
    once that cell has run, it replaces this definition.

    Args:
        data: iterable of dicts with the keys "text", "section",
            "document_id" and "course".
        client: object exposing ``chat.completions.create(messages=..., model=...)``.

    Returns:
        A list with one dict per generated question, holding the question and
        the originating document's text, section, document_id and course.
        A document whose processing raises any exception is printed and skipped.
    """
    prompt = """
        You are a course assistant. Given the following text, generate 5 potential questions that could be asked from the content. Return your response as a JSON array of strings, where each string is a question. Do not include any explanations or additional text outside the JSON structure.

        Text: {text_field}

        Response format:
        ["Question 1", "Question 2", "Question 3", "Question 4", "Question 5"]
        """
    groundtruth_dataset = []
    for json_document in tqdm(data):
        try:
            text = json_document["text"]
            section = json_document["section"]
            document_id = json_document["document_id"]
            course = json_document["course"]

            user_message = {"role": "user", "content": prompt.format(text_field=text)}
            completion = client.chat.completions.create(
                messages=[user_message],
                model='gpt-4o-mini',
            )

            # The reply is expected to be a bare JSON array of question strings.
            questions = json.loads(completion.choices[0].message.content)
            groundtruth_dataset.extend(
                {
                    "text": text,
                    "question": question,
                    "section": section,
                    "document_id": document_id,
                    "course": course,
                }
                for question in questions
            )
        except Exception as e:
            # Best effort: report the problem and move on to the next document.
            print(e)
            continue
    return groundtruth_dataset
        

In [149]:



def _parse_question_list(response):
    """Extract the JSON array of questions from a raw model reply.

    The reply is parsed as-is first. If that fails, the text between the first
    ``[`` and the last ``]`` is parsed instead, which tolerates replies wrapped
    in a Markdown code fence or surrounded by extra prose.

    Args:
        response: reply text from the model (``None`` is treated as empty).

    Returns:
        The parsed list.

    Raises:
        json.JSONDecodeError: no parseable JSON array could be found.
        ValueError: the parsed JSON value is not a list.
    """
    raw = (response or "").strip()
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        start, end = raw.find("["), raw.rfind("]")
        if start == -1 or end < start:
            raise
        parsed = json.loads(raw[start:end + 1])
    if not isinstance(parsed, list):
        raise ValueError(f"expected a JSON array of questions, got {type(parsed).__name__}")
    return parsed


def create_groundtruth(data, client, batch_size=10, batch_pause_seconds=75, max_tries=5):
    """Generate ground-truth questions for every document with a chat model.

    Each document is sent to the model, which is asked for a JSON array of
    questions. One output record is produced per question. A document that
    fails (API error after all retries, unparseable reply, missing key) is
    reported on stdout and skipped; it never aborts the run.

    Args:
        data: sequence of dicts with the keys "text", "section",
            "document_id" and "course".
        client: object exposing ``chat.completions.create(messages=..., model=...)``.
        batch_size: print progress and pause after this many documents.
        batch_pause_seconds: length of the pause between batches (rate limiting).
        max_tries: maximum number of attempts per API call.

    Returns:
        A list of dicts with the keys "text", "question", "section",
        "document_id" and "course".
    """
    groundtruth_dataset = []
    prompt = """
        You are a course assistant. Given the following text, generate 5 potential questions that could be asked from the content. Return your response as a JSON array of strings, where each string is a question. Do not include any explanations or additional text outside the JSON structure.

        Text: {text_field}

        Response format:
        ["Question 1", "Question 2", "Question 3", "Question 4", "Question 5"]
        """

    def make_api_call(prompt_injected):
        """Call the chat completion API, retrying with exponential backoff."""
        attempts = max(1, max_tries)
        for attempt in range(1, attempts + 1):
            try:
                return client.chat.completions.create(
                    messages=[
                        {
                            "role": "user",
                            "content": prompt_injected,
                        }
                    ],
                    model='gpt-4o-mini',
                )
            except Exception:
                if attempt == attempts:
                    raise
                # 1s, 2s, 4s, ... capped at one minute between attempts.
                time.sleep(min(2 ** (attempt - 1), 60))

    total_documents = len(data)
    successful_documents = 0

    for i, json_document in enumerate(tqdm(data, desc="Processing documents")):
        try:
            text = json_document["text"]
            section = json_document["section"]
            document_id = json_document["document_id"]
            course = json_document["course"]
            prompt_injected = prompt.format(text_field=text)

            completion = make_api_call(prompt_injected)
            response = completion.choices[0].message.content
            questions = _parse_question_list(response)

            groundtruth_dataset.extend(
                {
                    "text": text,
                    "question": question,
                    "section": section,
                    "document_id": document_id,
                    "course": course,
                }
                for question in questions
            )
            successful_documents += 1

        except json.JSONDecodeError as e:
            # `i` is the 0-based position in `data`, not the document_id.
            print(f"JSON parsing error in document {i}: {e}")
        except Exception as e:
            print(f"Error processing document {i}: {e}")

        processed = i + 1
        if batch_size > 0 and processed % batch_size == 0:
            print(f"Processed {processed}/{total_documents} documents. Successful: {successful_documents}")
            # No point in waiting once the last document has been handled.
            if processed < total_documents:
                time.sleep(batch_pause_seconds)
    print(f"Finished processing. Total documents: {total_documents}, Successful: {successful_documents}")
    return groundtruth_dataset

In [150]:
# Generate questions for every document. In the recorded run this took about
# 65 minutes for 435 documents, 425 of which were processed successfully.
groundtruth_dataset = create_groundtruth(json_data, client)

Processing documents:   2%|▏         | 9/435 [00:18<15:20,  2.16s/it]

Processed 10/435 documents. Successful: 10


Processing documents:   4%|▍         | 19/435 [01:59<19:06,  2.75s/it]  

Processed 20/435 documents. Successful: 20


Processing documents:   7%|▋         | 29/435 [03:30<16:51,  2.49s/it]  

Processed 30/435 documents. Successful: 30


Processing documents:   9%|▉         | 39/435 [05:01<16:37,  2.52s/it]  

Processed 40/435 documents. Successful: 40


Processing documents:  11%|█▏        | 49/435 [06:36<18:28,  2.87s/it]  

Processed 50/435 documents. Successful: 50


Processing documents:  14%|█▎        | 59/435 [08:11<17:31,  2.80s/it]  

Processed 60/435 documents. Successful: 60


Processing documents:  15%|█▍        | 64/435 [09:34<45:34,  7.37s/it]  

JSON parsing error in document 63: Expecting value: line 1 column 1 (char 0)


Processing documents:  15%|█▍        | 65/435 [09:37<36:14,  5.88s/it]

JSON parsing error in document 64: Expecting value: line 1 column 1 (char 0)


Processing documents:  16%|█▌        | 69/435 [09:44<17:06,  2.80s/it]

Processed 70/435 documents. Successful: 68


Processing documents:  18%|█▊        | 79/435 [11:15<14:54,  2.51s/it]  

Processed 80/435 documents. Successful: 78


Processing documents:  20%|██        | 89/435 [12:46<14:25,  2.50s/it]  

Processed 90/435 documents. Successful: 88


Processing documents:  21%|██        | 91/435 [14:04<1:42:13, 17.83s/it]

JSON parsing error in document 90: Expecting value: line 1 column 1 (char 0)


Processing documents:  23%|██▎       | 99/435 [14:18<15:05,  2.69s/it]  

Processed 100/435 documents. Successful: 97


Processing documents:  25%|██▌       | 109/435 [15:48<12:46,  2.35s/it]  

Processed 110/435 documents. Successful: 107


Processing documents:  27%|██▋       | 119/435 [17:19<13:36,  2.58s/it]  

Processed 120/435 documents. Successful: 117


Processing documents:  30%|██▉       | 129/435 [18:49<12:44,  2.50s/it]  

Processed 130/435 documents. Successful: 127


Processing documents:  32%|███▏      | 139/435 [20:19<11:24,  2.31s/it]  

Processed 140/435 documents. Successful: 137


Processing documents:  34%|███▎      | 146/435 [21:44<19:36,  4.07s/it]  

JSON parsing error in document 145: Expecting value: line 1 column 1 (char 0)


Processing documents:  34%|███▍      | 149/435 [21:48<10:42,  2.25s/it]

Processed 150/435 documents. Successful: 146


Processing documents:  37%|███▋      | 159/435 [23:20<11:29,  2.50s/it]  

Processed 160/435 documents. Successful: 156


Processing documents:  39%|███▉      | 169/435 [24:49<11:05,  2.50s/it]  

Processed 170/435 documents. Successful: 166


Processing documents:  41%|████      | 179/435 [26:26<12:36,  2.96s/it]  

Processed 180/435 documents. Successful: 176


Processing documents:  43%|████▎     | 189/435 [27:56<09:34,  2.34s/it]  

JSON parsing error in document 189: Expecting value: line 1 column 1 (char 0)
Processed 190/435 documents. Successful: 185


Processing documents:  46%|████▌     | 199/435 [29:27<09:35,  2.44s/it]  

Processed 200/435 documents. Successful: 195


Processing documents:  48%|████▊     | 209/435 [30:56<08:56,  2.38s/it]  

Processed 210/435 documents. Successful: 205


Processing documents:  50%|█████     | 219/435 [32:27<08:56,  2.48s/it]  

Processed 220/435 documents. Successful: 215


Processing documents:  52%|█████▏    | 226/435 [33:53<14:27,  4.15s/it]  

JSON parsing error in document 225: Expecting value: line 1 column 1 (char 0)


Processing documents:  53%|█████▎    | 229/435 [33:58<08:18,  2.42s/it]

Processed 230/435 documents. Successful: 224


Processing documents:  55%|█████▍    | 239/435 [35:26<07:17,  2.23s/it]  

Processed 240/435 documents. Successful: 234


Processing documents:  57%|█████▋    | 249/435 [36:53<07:24,  2.39s/it]  

Processed 250/435 documents. Successful: 244


Processing documents:  60%|█████▉    | 259/435 [38:21<05:56,  2.02s/it]  

Processed 260/435 documents. Successful: 254


Processing documents:  62%|██████▏   | 269/435 [39:54<07:46,  2.81s/it]  

Processed 270/435 documents. Successful: 264


Processing documents:  64%|██████▍   | 279/435 [41:23<05:32,  2.13s/it]  

Processed 280/435 documents. Successful: 274


Processing documents:  66%|██████▋   | 289/435 [42:53<05:46,  2.37s/it]  

Processed 290/435 documents. Successful: 284


Processing documents:  69%|██████▊   | 299/435 [44:21<04:58,  2.19s/it]

Processed 300/435 documents. Successful: 294


Processing documents:  71%|███████   | 309/435 [45:50<04:48,  2.29s/it]

Processed 310/435 documents. Successful: 304


Processing documents:  73%|███████▎  | 319/435 [47:16<04:13,  2.18s/it]

Processed 320/435 documents. Successful: 314


Processing documents:  74%|███████▍  | 321/435 [48:34<33:09, 17.45s/it]

JSON parsing error in document 320: Expecting value: line 1 column 1 (char 0)


Processing documents:  76%|███████▌  | 329/435 [48:44<03:45,  2.12s/it]

Processed 330/435 documents. Successful: 323


Processing documents:  78%|███████▊  | 339/435 [50:12<03:36,  2.25s/it]

Processed 340/435 documents. Successful: 333


Processing documents:  80%|████████  | 349/435 [51:41<03:27,  2.41s/it]

Processed 350/435 documents. Successful: 343


Processing documents:  81%|████████▏ | 354/435 [53:03<09:20,  6.92s/it]

JSON parsing error in document 353: Expecting value: line 1 column 1 (char 0)


Processing documents:  83%|████████▎ | 359/435 [53:09<02:49,  2.23s/it]

Processed 360/435 documents. Successful: 352


Processing documents:  85%|████████▍ | 369/435 [54:41<02:52,  2.62s/it]

Processed 370/435 documents. Successful: 362


Processing documents:  87%|████████▋ | 379/435 [56:16<02:36,  2.80s/it]

Processed 380/435 documents. Successful: 372


Processing documents:  89%|████████▉ | 389/435 [57:48<02:00,  2.61s/it]

Processed 390/435 documents. Successful: 382


Processing documents:  92%|█████████▏| 399/435 [59:24<01:35,  2.65s/it]

Processed 400/435 documents. Successful: 392


Processing documents:  94%|█████████▍| 409/435 [1:00:59<01:17,  2.96s/it]

Processed 410/435 documents. Successful: 402


Processing documents:  96%|█████████▌| 416/435 [1:02:25<01:24,  4.44s/it]

JSON parsing error in document 415: Expecting value: line 1 column 1 (char 0)


Processing documents:  96%|█████████▋| 419/435 [1:02:30<00:42,  2.65s/it]

Processed 420/435 documents. Successful: 411


Processing documents:  99%|█████████▊| 429/435 [1:04:05<00:17,  2.87s/it]

Processed 430/435 documents. Successful: 421


Processing documents: 100%|█████████▉| 434/435 [1:05:30<00:07,  7.66s/it]

JSON parsing error in document 433: Expecting value: line 1 column 1 (char 0)


Processing documents: 100%|██████████| 435/435 [1:05:32<00:00,  9.04s/it]

Finished processing. Total documents: 435, Successful: 425





In [152]:
# Save the generated dataset. ensure_ascii=False writes non-ASCII characters
# unescaped, which is why the file is opened with an explicit UTF-8 encoding.
with open('groundtruth_dataset.json', 'w', encoding='utf-8') as f:
    json.dump(groundtruth_dataset, f, ensure_ascii=False, indent=4)

## STEP 2 - Create Embeddings using Pretrained Models

In [163]:
# Load the pretrained embedding model.
# NOTE(review): the recorded warning says the model was created with version
# 3.0.1 while the installed version is 2.7.0 — consider upgrading.
model = SentenceTransformer("dunzhang/stella_en_1.5B_v5")

You try to use a model that was created with version 3.0.1, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [165]:
# Sanity check of the embedding size (1024 in the recorded output); the
# Elasticsearch mapping and the Milvus schema below are declared with the same
# dimension.
vector = model.encode("This is a test")
len(vector)

1024

In [181]:
# Embed the text of every document and attach the vector as a plain list.
# The dicts in `json_data` are updated in place; `operations` collects those
# same dict objects in their original order.
operations = []
for doc in tqdm(json_data):
    embedding = model.encode(doc["text"])
    doc["text_vector"] = embedding.tolist()
    operations.append(doc)

  0%|          | 0/435 [00:00<?, ?it/s]

100%|██████████| 435/435 [2:56:12<00:00, 24.31s/it]  


In [195]:
# Save the documents together with their embeddings (UTF-8, non-ASCII
# characters kept unescaped) so the embedding step need not be repeated.
with open('operations.json', 'w', encoding='utf-8') as f:
    json.dump(operations, f, ensure_ascii=False, indent=4)

### Implementing Elasticsearch Retrieval

In [184]:
# Client for the Elasticsearch instance listening on localhost:9200.
es = Elasticsearch('http://localhost:9200')

In [185]:
# Connectivity check; the recorded output reports server version 8.4.3.
es.info()

ObjectApiResponse({'name': 'baae846d5e1f', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'h2iYJMQHQouhleDTcCFGtA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [186]:
# Index definition: one shard, no replicas, and explicit mappings for the
# document fields plus the embedding vector.
index_settings = {
    "settings" : {
        "number_of_shards" : 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            # dims matches the 1024-long vectors returned by model.encode.
            "text_vector": {"type": "dense_vector", "dims": 1024, "index" :True, "similarity": "cosine"},
        }
    }
}

In [187]:
index_name = "course-questions"

# Drop any previous copy of the index first (ignore_unavailable=True so a
# missing index is not an error), then create it with the settings above.
es.indices.delete(index= index_name, ignore_unavailable=True)
es.indices.create(index= index_name, body= index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [188]:
# Index the documents one request at a time. A failing document is reported
# on stdout and the loop continues with the next one.
for doc in operations:
    try:
        es.index(index= index_name, body= doc)
    except Exception as e:
        print(f"Error is {e}")

In [189]:
# Embed the free-text query with the same model that embedded the documents.
search_query = "windows or mac?"
vector_query = model.encode(search_query)

In [190]:
# kNN clause handed to es.search(knn=...): search the `text_vector` field
# with the query embedding.
ques = {
    "field" : "text_vector",
    "query_vector": vector_query,
    "k": 5,
    "num_candidates": 1000,
}

In [191]:
# Run the kNN search; `source` limits the returned fields, leaving out the
# large `text_vector`.
res = es.search(index=index_name, knn=ques, source=["text", "section", "question", "course"])


In [192]:
# First hit of the response.
a = res["hits"]["hits"][0]

In [193]:
# Inspect the first hit.
# NOTE(review): the recorded top hit (a dbt-postgres profiles.yml question,
# score ~0.92) looks unrelated to the query "windows or mac?" — retrieval
# quality is worth checking.
a.items()

dict_items([('_index', 'course-questions'), ('_id', 'BENg8ZABRJ5_F5RWVrGj'), ('_score', 0.9188564), ('_source', {'question': 'When configuring the profiles.yml file for dbt-postgres with jinja templates with environment variables, I\'m getting "Credentials in profile "PROFILE_NAME", target: \'dev\', invalid: \'5432\'is not of type \'integer\'', 'course': 'data-engineering-zoomcamp', 'section': 'Module 5: pyspark', 'text': 'Update the line:\nWith:'})])

### Milvus Implementation

In [196]:
# Open the default pymilvus connection to the local Milvus server.
conn = connections.connect(host="127.0.0.1", port=19530)

# Create the database only when it does not exist yet, so that re-running this
# cell (or the whole notebook) does not fail on the already existing database.
database = None
if "my_database" not in db.list_database():
    database = db.create_database("my_database")


In [197]:
# Make "my_database" the active database for subsequent pymilvus calls.
db.using_database("my_database")

In [200]:
from pymilvus import FieldSchema, CollectionSchema, DataType

# Milvus requires an explicit max_length on every VARCHAR field; without it the
# collection cannot be created from this schema. 65535 is the largest value
# Milvus accepts and is used for the free-text fields; course names are short.
id_field = FieldSchema(name="document_id", dtype=DataType.INT64, is_primary=True, description="primary id")
text_field = FieldSchema(name='text', dtype=DataType.VARCHAR, max_length=65535)
question_field = FieldSchema(name='question', dtype=DataType.VARCHAR, max_length=65535)
course_field = FieldSchema(name='course', dtype=DataType.VARCHAR, max_length=256)

# dim matches the 1024-long vectors returned by model.encode.
embedding_field = FieldSchema(name="text_vector", dtype=DataType.FLOAT_VECTOR, dim=1024, description="vector")


# auto_id=False: the primary key comes from each document's `document_id`.
# enable_dynamic_field=True: keys not declared here (e.g. `section`) are still accepted.
schema = CollectionSchema(fields=[id_field, text_field, question_field, course_field, embedding_field], auto_id=False, enable_dynamic_field=True, description="desc of a collection")


In [201]:
from pymilvus import MilvusClient, DataType

# 1. Set up a Milvus client
# NOTE(review): this rebinds the global name `client`, which earlier in the
# notebook held the OpenAI client; re-running
# create_groundtruth(json_data, client) after this cell would pass the Milvus
# client instead.
client = MilvusClient(
    uri="http://localhost:19530"
)