## Embedding

In [8]:
# Identifier of the embedding model handed to SentenceTransformer.
model_name = "Snowflake/snowflake-arctic-embed-m"

In [9]:
from sentence_transformers import SentenceTransformer

# Options for loading the embedding model.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally — confirm this model actually requires it.
loader_options = {
    "trust_remote_code": True,
    "cache_folder": "./models",  # explicit cache directory
}
model = SentenceTransformer(model_name, **loader_options)

# Vector size produced by the model; used when creating the Qdrant collection.
emb_dimensions = model.get_sentence_embedding_dimension()

In [11]:
from qdrant_client import QdrantClient, models

In [12]:
# Connect to the local Qdrant instance.
client = QdrantClient(url="http://localhost:6333")

In [14]:
import os

# Locations of the source PDF and of the artefacts derived from it.
data_folder = "data/"
filename = data_folder + "bfp-a3447q.pdf"

# Strip only the file extension. Splitting on the first '.' would truncate
# the path whenever a directory name contains a dot (e.g. "./data/").
_doc_stem = os.path.splitext(filename)[0]
content_path = _doc_stem + '_chunked.txt'   # JSON file with the chunked chapters
context_path = _doc_stem + '_context.txt'   # JSON file with the per-chunk context
image_path = data_folder + "images"

In [15]:
import pathlib, json
def _read_json(path):
    """Read the text file at `path` and parse its content as JSON."""
    return json.loads(pathlib.Path(path).read_text())


# Chunked chapter data, as stored in the content file.
data_content = _read_json(content_path)

# JSON object keys are always strings; convert them back to integers so they
# can be matched against positions in data_content.
data_context = {int(key): value for key, value in _read_json(context_path).items()}

In [21]:
collection_name = "bfp-a3447q_context"

# Reuse the collection when it already exists; otherwise create it with a
# vector size matching the embedding model. (The previous get_collection()
# call was dropped: its result was never used.)
if client.collection_exists(collection_name):
    print(f"Collection {collection_name} restored")
else:
    # Create the collection with specified vector parameters
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=emb_dimensions,  # Dimensionality of the vectors
            distance=models.Distance.COSINE  # Distance metric for similarity search
        )
    )
    print(f"New collection {collection_name} created")

points = []
# Name of the most recent level-1 chapter. Initialised up front so that an
# entry appearing before the first level-1 chapter does not raise NameError.
root_chapter = ""
title = 'RH-3CH-Sxx/RH-6CH-Sxx Special Specifications Manual' # can be obtained from doc metadata
for index, chapter in enumerate(data_content):
    # elements of each data_content entry:
        # 0 - chapter level
        # 1 - chapter name
        # 2 - page number (1-based)
        # 3 - list of chunks of text
    # NOTE(review): the last element is concatenated with a string below, so
    # it must be a single string here rather than a list — confirm the format.
    if chapter[0] == 1:
        root_chapter = chapter[1]

    if index not in data_context:  # if context not created, skip embedding
        print(f'\nChapter {chapter[1]} skipped', end='')
        continue

    # Embed the generated context together with the chunk text, but store
    # only the chunk text itself in the payload.
    context = data_context[index]
    text = context + '\n' + chapter[-1]
    point = models.PointStruct(
        # Sequential ids (0, 1, 2, ...) counted over embedded entries only;
        # avoids a counter variable that shadows the builtin `id`.
        id=len(points),
        vector=model.encode(text).tolist(),
        payload={
            "content": chapter[-1],
            "main_chapter": root_chapter,
            "chapter": chapter[1],
            "manual": title,
            "page": chapter[2]
        }  # save all needed metadata fields
    )
    points.append(point)
print("Collection points gathered")

# Do not send an empty update request when nothing was embedded.
if points:
    client.upsert(
        collection_name=collection_name,
        points=points
    )
    print(f"Collection {collection_name} upserted.")
else:
    print(f"No points to upsert into {collection_name}.")


Collection bfp-a3447q_context restored

Chapter 1 General configuration skipped
Chapter 1.1 Structural equipment encoded... 
Chapter 1.1.1 Standard structural equipment encoded... 
Chapter 1.1.2 Special specifications encoded... 
Chapter 1.1.3 Options encoded... 
Chapter 1.1.4 Maintenance parts encoded... 
Chapter 1.2 Model type name of robot encoded... 
Chapter 1.2.1 How to identify the robot model encoded... 
Chapter 1.2.2 Combination of the robot arm and the controller encoded... 
Chapter 1.2.2 Combination of the robot arm and the controller encoded... 
Chapter 1.2.2 Combination of the robot arm and the controller encoded... 
Chapter 1.2.2 Combination of the robot arm and the controller encoded... 
Chapter 1.2.2 Combination of the robot arm and the controller encoded... 
Chapter 1.3 CE marking specifications encoded... 
Chapter 1.3 CE marking specifications encoded... 
Chapter 1.3 CE marking specifications encoded... 
Chapter 1.3 CE marking specifications encoded... 
Chapter 1.3 CE 