## Installing dependencies

In [None]:
%pip install -q weaviate-client sentence-transformers pyvi pymupdf langchain

## Connecting to Weaviate

In [None]:
import weaviate

# Connect to the Weaviate instance on "localhost". `client` is a notebook-wide
# global that the cells and helper functions below rely on.
client = weaviate.connect_to_local("localhost")

### Create Document Collection

In [None]:
from weaviate.classes.config import Configure, Property, DataType
# Create the "Document" collection: a single TEXT property named "text" and no
# server-side vectorizer (vectors are supplied by the caller on insert).
# Any failure is reported on stdout instead of aborting the notebook.
try:
    document_schema = dict(
        name="Document",
        vectorizer_config=Configure.Vectorizer.none(),
        properties=[Property(name="text", data_type=DataType.TEXT)],
    )
    client.collections.create(**document_schema)
except Exception as exc:
    print(f"Error creating collection: {exc}")

### Closing the connection (if needed)

In [None]:
# Close the Weaviate connection held by `client`. Presumably the connection
# cell has to be re-run before any later cell can use `client` again.
client.close()

## Embedding

### Compare embeddings of raw vs. word-segmented text using cosine similarity (optional)

In [None]:
from numpy import dot
from numpy.linalg import norm
import time
from sentence_transformers import SentenceTransformer
from pyvi.ViTokenizer import tokenize


# Alternative embedding model, kept for reference:
# model = SentenceTransformer('dangvantuan/vietnamese-embedding')
# `model` is a notebook-wide global: embed() and the query cells below use it.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# The same sentence twice: as plain text, and with multi-syllable words joined
# by underscores (written by hand here, not produced by calling tokenize()).
raw = "Hà Nội là thủ đô của Việt Nam"
tokenized = "Hà_Nội là thủ_đô của Việt_Nam"

vec1 = model.encode(raw)
vec2 = model.encode(tokenized)

# Cosine similarity = dot product divided by the product of the vector norms.
cos_sim = dot(vec1, vec2) / (norm(vec1) * norm(vec2))
print(f"Cosine similarity between raw and tokenized: {cos_sim:.4f}")


### Embedding function

In [None]:
from typing import List

def embed(text) -> List[List[float]]:
    """Encode ``text`` with the notebook-level ``model`` and return plain lists.

    Args:
        text: Whatever ``model.encode`` accepts. The notebook passes a list of
            strings.

    Returns:
        The result of ``model.encode(text)`` converted with ``.tolist()``.
        Presumably one vector per input string when a list is passed; a single
        string would yield one flat vector instead (TODO confirm).
    """
    encoded = model.encode(text)
    return encoded.tolist()

def import_texts_and_embeds_to_db(texts: List[str], embeddings: List[List], collection_name="Document"):
    """Insert each text together with its embedding vector into a Weaviate collection.

    Args:
        texts: Strings stored under the ``"text"`` property, one object per string.
        embeddings: Vectors paired position-by-position with ``texts``.
        collection_name: Name of the target collection (default ``"Document"``).

    Raises:
        ValueError: If ``texts`` and ``embeddings`` differ in length. Without
            this check ``zip`` would silently drop the unmatched tail.
    """
    if len(texts) != len(embeddings):
        raise ValueError(
            f"texts and embeddings must have the same length "
            f"(got {len(texts)} and {len(embeddings)})"
        )

    # Resolve the collection handle once instead of once per inserted object.
    collection = client.collections.get(collection_name)
    for text, embedding in zip(texts, embeddings):
        collection.data.insert(properties={"text": text}, vector=embedding)

### Clear Document Collection

In [None]:
def clear_document_collection(collection_name="Document"):
    """Delete a Weaviate collection through the notebook-level ``client``.

    This calls ``client.collections.delete``, i.e. it removes the collection
    itself rather than only emptying it. Presumably the collection has to be
    created again before new objects are imported (TODO confirm).

    Args:
        collection_name: Name of the collection to delete. Defaults to
            ``"Document"``, so existing zero-argument calls behave as before.
    """
    client.collections.delete(collection_name)

In [None]:

# Delete the "Document" collection (destructive: run only when a reset is wanted).
clear_document_collection()

### Query

In [None]:
from weaviate.classes.query import MetadataQuery

# Embed a sample query and print the 5 nearest objects of the "Document"
# collection together with their vector distance.
query = "Tinh thể?"
# NOTE(review): the query is word-segmented with pyvi before encoding, while
# embed() encodes the document texts as-is. Confirm this asymmetry is intended.
query_tokenized = tokenize(query)
query_vector = model.encode(query_tokenized).tolist()

document_collection = client.collections.get("Document")
result = document_collection.query.near_vector(
    near_vector=query_vector,
    limit=5,
    return_metadata=MetadataQuery(distance=True),
)

retrieved_objects = list(result.objects)

print("Query results:")
for rank, hit in enumerate(retrieved_objects, start=1):
    # Only the first 140 characters of the stored text are shown.
    print(f"{rank}. Dist: {hit.metadata.distance} - {hit.properties['text'][:140]}...")
    

## PDF to vector

In [None]:
# Path of the PDF that the following cells read, chunk and embed.
file_name = "test-pdf/OS_C4_File and Disk management.pdf"

In [None]:
import json
import pymupdf
from typing import List
from langchain.schema import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter


def pdf_to_raw_doc(file_name) -> List[Document]:
    """Extract the plain text of every page of a PDF, one Document per page.

    Args:
        file_name: Path of the PDF, passed to ``pymupdf.open``.

    Returns:
        One ``Document`` per page, in page order. ``page_content`` is the
        result of ``page.get_text("text")``; ``metadata`` holds ``"source"``
        (the given ``file_name``) and ``"page"`` (page number starting at 1).
    """
    # Open the PDF as a context manager so it is closed again even when text
    # extraction fails part-way through (previously it was never closed).
    with pymupdf.open(file_name) as pdf:
        return [
            Document(
                page_content=page.get_text("text"),
                metadata={"source": file_name, "page": pg_num},
            )
            for pg_num, page in enumerate(pdf, start=1)
        ]


def split_doc(doc: Document, chunk_size: int, chunk_overlap: int) -> List[Document]:
    """Split one Document into smaller chunk Documents.

    The text is split with ``RecursiveCharacterTextSplitter`` using the
    separators ``"\\n\\n"``, ``"\\n"``, ``" "`` and ``""``.

    Args:
        doc: Document whose ``page_content`` is split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        One ``Document`` per chunk, in splitter order. Each chunk carries the
        parent's ``"source"`` (default ``""``) and ``"page"`` (default ``1``)
        plus a ``"chunk_index"`` giving its position among the chunks.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""],
    )

    # The parent metadata is the same for every chunk, so read it once.
    source = doc.metadata.get("source", "")
    page = doc.metadata.get("page", 1)

    # Build one Document per chunk. The previous code used
    # ``list.extend(Document(...))``, which iterates over the Document itself
    # instead of adding it as a single element, so the result was not a list
    # of Documents.
    return [
        Document(
            page_content=chunk_text,
            metadata={"source": source, "page": page, "chunk_index": i},
        )
        for i, chunk_text in enumerate(splitter.split_text(doc.page_content))
    ]


def save_to_json(data, output_file):
    """Write ``data`` to ``output_file`` as UTF-8 JSON.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) and the
    output is indented by 4 spaces. Missing parent directories of
    ``output_file`` are created first, so a target such as
    ``"json/raw_docs.json"`` works on a fresh checkout.

    Args:
        data: Any JSON-serialisable object.
        output_file: Destination path; an existing file is overwritten.
    """
    import os  # local import: `os` is not imported at file level

    parent_dir = os.path.dirname(output_file)
    if parent_dir:  # empty when the target is in the current directory
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


def docs_to_json(docs: List[Document]) -> List[dict]:
    """Convert Documents into a list of JSON-serialisable dictionaries.

    No preprocessing or chunking happens here; the Documents are only
    flattened into plain dictionaries.

    Args:
        docs: Documents to convert.

    Returns:
        One dict per Document, in input order, with the keys ``"source"``
        (default ``""``), ``"page"`` (default ``1``), ``"chunk_index"``
        (default ``0``) and ``"content"`` (the Document's ``page_content``).
    """
    return [
        {
            "source": doc.metadata.get("source", ""),
            "page": doc.metadata.get("page", 1),
            "chunk_index": doc.metadata.get("chunk_index", 0),
            "content": doc.page_content,
        }
        for doc in docs
    ]


def json_to_docs(file_name: str) -> List[Document]:
    """Rebuild Documents from a JSON file holding a sequence of records.

    Args:
        file_name: Path of a UTF-8 JSON file. Each record must have a
            ``"content"`` key; ``"source"``, ``"page"`` and ``"chunk_index"``
            are optional.

    Returns:
        One ``Document`` per record, in file order. Missing metadata falls
        back to ``""`` for source, ``1`` for page and ``None`` for chunk_index.
    """
    with open(file_name, "r", encoding="utf-8") as fh:
        records = json.load(fh)

    docs: List[Document] = []
    for record in records:
        content = record["content"]
        metadata = {
            "source": record.get("source", ""),
            "page": record.get("page", 1),
            "chunk_index": record.get("chunk_index", None),
        }
        docs.append(Document(page_content=content, metadata=metadata))
    return docs


def docs_to_strings(docs: List[Document]) -> List[str]:
    """Return the text content of each Document as a plain string.

    The text is taken from ``page_content`` rather than ``str(doc)``:
    ``str()`` of a LangChain Document renders a
    ``page_content='...' metadata={...}`` wrapper, which would otherwise be
    embedded and stored in the database along with the real text.

    Args:
        docs: Documents to convert.

    Returns:
        The ``page_content`` of every Document, in input order.
    """
    return [doc.page_content for doc in docs]


# Read the PDF page by page, split every page longer than 800 characters into
# overlapping chunks, and save the resulting documents as JSON.
raw_docs = pdf_to_raw_doc(file_name)
processed_docs: List[Document] = []
for page_doc in raw_docs:
    # Short pages are kept whole.
    if len(page_doc.page_content) <= 800:
        processed_docs.append(page_doc)
        continue

    print("Split")
    processed_docs.extend(split_doc(page_doc, chunk_size=800, chunk_overlap=100))

json_docs = docs_to_json(processed_docs)
save_to_json(json_docs, "json/raw_docs.json")

In [None]:
# Convert the documents to strings once, so that exactly the same list is
# embedded and stored (previously the conversion ran twice).
doc_texts = docs_to_strings(processed_docs)
embeddings = embed(doc_texts)
import_texts_and_embeds_to_db(doc_texts, embeddings, collection_name="Document")

In [None]:
from weaviate.classes.query import MetadataQuery

# Embed a sample query and print the 5 nearest objects of the "Document"
# collection together with their vector distance.
query = "Scheduling?"
# NOTE(review): query_tokenized is computed but not used; the raw `query` is
# what gets encoded below, whereas the earlier query cell encodes the
# tokenized form. Confirm which one is intended.
query_tokenized = tokenize(query)
query_vector = model.encode(query).tolist()

document_collection = client.collections.get("Document")
result = document_collection.query.near_vector(
    near_vector=query_vector,
    limit=5,
    return_metadata=MetadataQuery(distance=True),
)

retrieved_objects = list(result.objects)

print("Query results:")
for rank, hit in enumerate(retrieved_objects, start=1):
    # Only the first 240 characters of the stored text are shown.
    print(f"{rank}. Dist: {hit.metadata.distance}:\n {hit.properties['text'][:240]}...")
    

### Check Document Collection size

In [None]:
# clear_document_collection()
# Aggregate over the whole "Document" collection, requesting the total object
# count; as the last expression of the cell, the result is displayed.
client.collections.get("Document").aggregate.over_all(total_count=True)