## Embedding

In [None]:
# Name of the SentenceTransformer model used to embed the manual chunks.
model_name = 'all-mpnet-base-v2'

In [None]:
from sentence_transformers import SentenceTransformer

# Load the embedding model named by `model_name`.
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# model repository to run at load time — confirm whether this model actually
# needs it, and drop the flag if it does not.
model = SentenceTransformer(
    model_name, 
    trust_remote_code=True,
    cache_folder="./models"   # explicitly setting cache location
)
# Embedding vector size reported by the model; used below as the vector size
# when the Qdrant collection is created.
emb_dimensions = model.get_sentence_embedding_dimension()

In [None]:
from qdrant_client import QdrantClient, models

In [None]:
# Client for the Qdrant instance expected at localhost, port 6333.
client = QdrantClient("http://localhost:6333")  # local Qdrant instance

In [None]:
import os

# Source PDF and the artefacts derived from it all live under `data_folder`.
data_folder = "data/"
filename = data_folder + "bfp-a3447q.pdf"

# Strip only the final extension. Splitting on the first '.' would truncate the
# path as soon as a directory name or the file stem contained a dot
# (e.g. "./data/..." or "manual.v2.pdf").
base_path = os.path.splitext(filename)[0]
content_path = base_path + '_v2_chunked.txt'   # chunked chapter text (JSON)
context_path = base_path + '_context.txt'      # per-chunk context (JSON)
image_path = data_folder + "images"

In [None]:
import pathlib, json
# Both artefacts are JSON documents stored as text files.
data_content = json.loads(pathlib.Path(content_path).read_text())

# JSON object keys are always strings; convert them to ints
# (presumably so entries can be looked up by chunk index — confirm).
raw_context = json.loads(pathlib.Path(context_path).read_text())
data_context = {int(key): value for key, value in raw_context.items()}

In [None]:
collection_name = "bfp-a3447q_v2"

# Reuse the collection when it already exists; otherwise create it with a
# vector size matching the embedding model's output.
if client.collection_exists(collection_name):
    # Nothing has to be loaded client-side: the existing collection is used as-is.
    print(f"Collection {collection_name} restored")
else:
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=emb_dimensions,  # Dimensionality of the vectors
            distance=models.Distance.COSINE,  # Distance metric for similarity search
        ),
    )
    print(f"New collection {collection_name} created")

points = []
# Name of the most recent level-1 chapter. Initialised to an empty string so
# that chunks appearing before the first level-1 entry do not raise NameError.
root_chapter = ""
title = 'RH-3CH-Sxx/RH-6CH-Sxx Special Specifications Manual'  # can be obtained from doc metadata
for index, chapter in enumerate(data_content):
    # elements of each data_content entry:
    #   0 - chapter level
    #   1 - chapter name
    #   2 - page number (1-based)
    #   3 - chunk of text (read below as the last element, chapter[-1])
    if chapter[0] == 1:
        root_chapter = chapter[1]

    # Heuristic: a chunk shorter than three times the length of its chapter
    # name is treated as having no real paragraph text and is not embedded.
    if 3 * len(chapter[1]) > len(chapter[-1]):
        print(f'{index}) Paragraphs not generated for chapter: {chapter[1]}')
        continue

    # Context enrichment is currently disabled, so only the chunk text itself
    # is embedded. To re-enable it, set `context = data_context[index]` and
    # skip entries whose index is missing from data_context.
    context = ""
    text = context + chapter[-1]
    print(f'\n\tChapter "{chapter[1]}" ', end='')
    point = models.PointStruct(
        # Sequential ids 0..n-1 over the embedded chunks. Using len(points)
        # avoids a counter variable named `id`, which shadowed the builtin.
        id=len(points),
        vector=model.encode(text).tolist(),
        payload={
            "content": chapter[-1],
            "main_chapter": root_chapter,
            "chapter": chapter[1],
            "manual": title,
            "page": chapter[2],
        },  # save all needed metadata fields
    )
    print("encoded... ", end='')
    points.append(point)
print("Collection points gathered")

# Only contact the server when there is something to store.
if points:
    client.upsert(
        collection_name=collection_name,
        points=points,
    )
    print(f"Collection {collection_name} upserted.")
else:
    print(f"No points to upsert into collection {collection_name}.")
