# vector_db.py

In [61]:
import os
import sys
import uuid
import chromadb
from dotenv import find_dotenv, load_dotenv
from data_preprocessing.text_preprocessing import topic_content
import chromadb.utils.embedding_functions as embedding_functions

In [44]:
# Load variables from the .env file located by find_dotenv() into the
# process environment.
load_dotenv(find_dotenv())

# 'sys_path' must be present in the environment (KeyError otherwise); its
# value is appended to Python's module search path.
# NOTE(review): presumably the project root, so that the data_preprocessing
# package is importable -- confirm.
sys_path = os.environ['sys_path']
sys.path.append(sys_path)

# Hugging Face API key; required (KeyError if the variable is missing).
HF_KEY = os.environ['HUGGINGFACE_API_KEY']

In [35]:
# Sanity check: dump the imported topic/content records (a list of dicts with
# 'topic' and 'content' keys, as the output shows).
print(topic_content)

[{'topic': 'FOREWORD', 'content': 'FOREWORD\nThe pandemic of COVID-19 had detrimental impact on Nepal’s school education. A nation-wide \nlockdown of public movement and services was initiated by the Government upon the confirmation of \nthe first cases of COVID-19 in Nepal in March 2020. In line with this, the Ministry of Education, Science \nand Technology (MOEST) was forced to shut-down schools. Schools remained closed for almost eight \nmonths until the end of October 2020. Due to surge in COVID-19 cases as a result of the emerge of the \nDelta variant, schools were re-closed in April 2021, only to re-open gradually in September that year. \nAnother surge in cases a result of the Omicron variant caused schools to close yet again for a month \nat the beginning of 2022. To reduce the effect of the disruptions of services, the MoEST approved the \nnational School reopening Framework in November 2020, providing guidance for local governments \nand schools on the safe reopening of their

In [30]:
# Connecting to Chroma DB server through HTTP client; a server is expected
# to be reachable at localhost:8000.
chroma_client = chromadb.HttpClient(host="localhost", port=8000)
# Bare expression so the notebook displays the collections on the server.
chroma_client.list_collections()

[]

In [51]:
# Embedding function backed by Hugging Face: authenticates with HF_KEY and
# uses the sentence-transformers/all-mpnet-base-v2 model to embed text.
hf_ef = embedding_functions.HuggingFaceEmbeddingFunction(
    api_key=HF_KEY,
    model_name="sentence-transformers/all-mpnet-base-v2"
)
# Bare expression: echoes the object in the notebook output.
hf_ef

<chromadb.utils.embedding_functions.HuggingFaceEmbeddingFunction at 0x22986c5e8a0>

In [52]:
# Fetch the "pdf_embedding_collection" collection (creating it if it does not
# exist yet) and attach hf_ef as its embedding function.
pdf_coll = chroma_client.get_or_create_collection(name="pdf_embedding_collection",
                                                  embedding_function=hf_ef)
# Bare expression: preview the collection's current contents in the notebook.
pdf_coll.peek()

{'ids': [],
 'embeddings': [],
 'metadatas': [],
 'documents': [],
 'data': None,
 'uris': None}

In [53]:
# Adding embeddings to collection
def embeddings(pdf_coll, topic_content, batch_size=100):
    """Add every topic/content record to a collection.

    Each record becomes one document: its 'content' is the document text, its
    'topic' is stored as metadata, and it receives a freshly generated UUID
    (version 1) string as its id. Records are sent in batches so that the
    collection is called once per batch instead of once per record.

    Args:
        pdf_coll: Collection-like object exposing
            ``add(ids=..., documents=..., metadatas=...)``.
        topic_content: Iterable of mappings with 'topic' and 'content' keys.
        batch_size: Maximum number of records passed to a single ``add``
            call. Must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If a record lacks the 'topic' or 'content' key.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialise once so generators and other one-shot iterables can be sliced.
    records = list(topic_content)

    # An empty input yields an empty range, so ``add`` is never called with
    # empty lists.
    for start in range(0, len(records), batch_size):
        batch = records[start:start + batch_size]
        pdf_coll.add(
            ids=[str(uuid.uuid1()) for _ in batch],
            documents=[record['content'] for record in batch],
            metadatas=[{'topic': record['topic']} for record in batch],
        )
    
# Populate the collection with every topic/content record.
# NOTE(review): ids are freshly generated UUIDs on every run, so re-running
# this cell presumably stores the same documents again under new ids.
embeddings(pdf_coll, topic_content)

In [60]:
# Displaying ids and metadata of the records in the collection; only
# "metadatas" is requested, so documents and embeddings come back as None
# (see the output below).
pdf_coll.get(include=["metadatas"])

{'ids': ['1e7a6166-12da-11ef-ae8c-9c2f9d50747a',
  '287d29e4-12da-11ef-be94-9c2f9d50747a',
  '28c19ba0-12da-11ef-b549-9c2f9d50747a',
  '291278b7-12da-11ef-8670-9c2f9d50747a'],
 'embeddings': None,
 'metadatas': [{'topic': 'FOREWORD'},
  {'topic': 'ABBREVIATIONS'},
  {'topic': 'Recovery and Accelerated Learning (ReAL)'},
  {'topic': 'The Road Map for Recovery and Accelerated Learning'}],
 'documents': None,
 'data': None,
 'uris': None}

In [None]:
# Summary of the PDF embedding collection: the collection's name and the
# embedding model name. These repeat the literals used when the collection and
# the embedding function were created, so keep them in sync.
_PDF_COLLECTION_NAME = "pdf_embedding_collection"
_PDF_EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"

pdf_coll_dict = {
    "collection_name": _PDF_COLLECTION_NAME,
    "LLM_model": _PDF_EMBEDDING_MODEL,
}