## Installing dependencies

In [None]:
%pip install -q weaviate-client sentence-transformers pyvi pymupdf langchain

## Connecting to Weaviate

In [1]:
import weaviate

# Connect to the Weaviate instance at "localhost". `client` is the shared
# handle that the cells below use for every collection operation.
client = weaviate.connect_to_local("localhost")

### Create Document Collection

In [None]:
from weaviate.classes.config import Configure, Property, DataType
# Create the "Document" collection only when it is missing, so re-running this
# cell is a quiet no-op instead of going through the error path.
# Vectors are supplied by this notebook (vectorizer "none"); every object
# stores a single TEXT property called "text".
try:
    if not client.collections.exists("Document"):
        client.collections.create(
            name="Document",
            vectorizer_config=Configure.Vectorizer.none(),
            properties=[Property(name="text", data_type=DataType.TEXT)],
        )
except Exception as e:
    # Best-effort: report the failure and let the notebook continue.
    print(f"Error creating collection: {e}")

### Closing the connection (if needed)

In [None]:
# Closes the connection held by `client`. Cells further down still use
# `client`, so re-run the connection cell before continuing if this was executed.
client.close()

## Embedding

### Check the cosine similarity between tokenized and non-tokenized text (optional)

In [None]:
from numpy import dot
from numpy.linalg import norm
import time
from sentence_transformers import SentenceTransformer
from pyvi.ViTokenizer import tokenize


# Alternative model, kept for reference:
# model = SentenceTransformer('dangvantuan/vietnamese-embedding')
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# The same sentence twice: plain text, and a hand-written variant in which
# multi-syllable words are joined with underscores (`tokenize` is not called here).
raw = "Hà Nội là thủ đô của Việt Nam"
tokenized = "Hà_Nội là thủ_đô của Việt_Nam"

vec1 = model.encode(raw)
vec2 = model.encode(tokenized)

# Cosine similarity: dot product divided by the product of the two vector norms.
cos_sim = dot(vec1, vec2) / (norm(vec1) * norm(vec2))
print(f"Cosine similarity between raw and tokenized: {cos_sim:.4f}")


### Embedding function

In [None]:
from sentence_transformers import SentenceTransformer

# Embedding model used by `embed` and by the query cells below.
# Alternative model, kept for reference:
# model = SentenceTransformer('dangvantuan/vietnamese-embedding')
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

In [4]:
from typing import List

def embed(text) -> List[List[float]]:
    """Encode `text` with the notebook-level `model` and return plain Python lists.

    Args:
        text: Input handed straight to `model.encode`. The callers in this
            notebook always pass a list of strings.

    Returns:
        The result of `model.encode(text)` converted with `.tolist()`.
        NOTE(review): the return annotation describes the list-of-strings
        case; a single string presumably yields one flat vector instead --
        confirm before calling it that way.
    """
    return model.encode(text).tolist()

def import_texts_and_embeds_to_db(texts: List[str], embeddings: List[List], collection_name="Document"):
    """Insert every text together with its precomputed vector into a collection.

    Args:
        texts: Strings stored in the "text" property, one object per string.
        embeddings: Vectors aligned with `texts` (same length, same order).
        collection_name: Name of the target collection (default "Document").

    Raises:
        ValueError: If `texts` and `embeddings` differ in length. Pairing them
            with `zip` alone would silently drop the surplus items.
    """
    if len(texts) != len(embeddings):
        raise ValueError(
            f"texts and embeddings must have the same length "
            f"(got {len(texts)} and {len(embeddings)})"
        )
    if not texts:
        # Nothing to insert; avoid touching the database at all.
        return

    # Resolve the collection handle once instead of once per inserted object.
    collection = client.collections.get(collection_name)
    for text, embedding in zip(texts, embeddings):
        collection.data.insert(properties={"text": text}, vector=embedding)

### Clear Document Collection

In [24]:
def clear_document_collection(collection_name="Document"):
    """Delete a collection from the connected Weaviate instance.

    Args:
        collection_name: Name of the collection to delete. Defaults to
            "Document", the collection used throughout this notebook.

    Note:
        This calls `client.collections.delete`, i.e. it removes the collection
        itself rather than only its objects.
        NOTE(review): presumably the collection has to be created again before
        new data is imported -- confirm against the Weaviate configuration.
    """
    client.collections.delete(collection_name)

In [None]:

# Deletes the "Document" collection. Run this only when a reset is intended:
# the query cells below read from that collection.
clear_document_collection()

### Query

In [None]:
from weaviate.classes.query import MetadataQuery

query = "Tinh thể?"
# Tokenize the query with pyvi before encoding. `tokenize` is imported in the
# optional similarity-test cell, so that cell must have been run first.
query_tokenized = tokenize(query)
query_vector = model.encode(query_tokenized).tolist()

# Vector search against the "Document" collection: top 5 hits, each with its distance.
result = client.collections.get("Document").query.near_vector(
    near_vector=query_vector,
    limit=5,
    return_metadata=MetadataQuery(distance=True)
)

retrieved_objects = [obj for obj in result.objects]

print("Query results:")
for i, obj in enumerate(retrieved_objects, 1):
    # Only the first 140 characters of each stored text are shown.
    print(f"{i}. Dist: {obj.metadata.distance} - {obj.properties['text'][:140]}...")
    

## PDF to vector

In [5]:
# Path of the PDF that the cells below convert into chunks and vectors.
file_name = "test-pdf/OS_C4_File and Disk management.pdf"

In [None]:
import json
import pymupdf
from typing import List
from langchain.schema import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter


def pdf_to_raw_doc(file_name) -> List[Document]:
    """Read a PDF and return one Document per page.

    Args:
        file_name: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list of Documents in page order. Each one holds the page's plain
        text as `page_content` and metadata {"source": file_name, "page": n},
        where page numbers start at 1.
    """
    # The context manager closes the PDF once the text has been extracted;
    # the file handle used to stay open for the lifetime of the kernel.
    with pymupdf.open(file_name) as pdf:
        return [
            Document(
                page_content=page.get_text("text"),
                metadata={"source": file_name, "page": pg_num},
            )
            for pg_num, page in enumerate(pdf, start=1)
        ]


def split_doc(doc: Document, chunk_size: int, chunk_overlap: int) -> List[Document]:
    """
    Split one Document into smaller chunk Documents.

    Args:
        doc: The Document whose `page_content` is split.
        chunk_size: Forwarded to RecursiveCharacterTextSplitter as `chunk_size`.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter as `chunk_overlap`.

    Returns:
        One Document per chunk, in order. Each chunk carries the parent's
        "source" and "page" metadata (defaulting to "" and 1 when absent)
        plus a zero-based "chunk_index".
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""],
    )

    source = doc.metadata.get("source", "")
    page = doc.metadata.get("page", 1)

    # Build the result one Document per chunk. The earlier version passed a
    # single Document to list.extend(), which iterates over the object instead
    # of storing it, so the returned list did not contain Documents.
    return [
        Document(
            page_content=chunk_text,
            metadata={"source": source, "page": page, "chunk_index": chunk_index},
        )
        for chunk_index, chunk_text in enumerate(splitter.split_text(doc.page_content))
    ]


def save_to_json(data, output_file):
    """Save the processed data to a JSON file.

    Args:
        data: Any JSON-serializable object.
        output_file: Destination path. Missing parent directories are created,
            so a path such as "json/raw_docs.json" works on a fresh checkout.

    The file is written as UTF-8 with non-ASCII characters kept as-is
    (`ensure_ascii=False`) and a 4-space indent.
    """
    import os  # local import keeps this helper independent of other cells

    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        # open(..., "w") does not create directories; do it explicitly.
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


def docs_to_json(docs: List[Document]) -> List[dict]:
    """
    Convert Document objects into JSON-serializable records.

    Each record holds "source", "page" and "chunk_index" taken from the
    document metadata (defaulting to "", 1 and 0 when a key is absent) and
    "content" set to the document's `page_content`. No preprocessing or
    chunking happens here; the input documents are used as given.
    """
    return [
        {
            "source": doc.metadata.get("source", ""),
            "page": doc.metadata.get("page", 1),
            "chunk_index": doc.metadata.get("chunk_index", 0),
            "content": doc.page_content,
        }
        for doc in docs
    ]


def json_to_docs(file_name: str) -> List[Document]:
    """
    Rebuild Document objects from a JSON file of records.

    Every record must provide "content"; "source", "page" and "chunk_index"
    are optional and fall back to "", 1 and None respectively.
    """
    with open(file_name, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    loaded: List[Document] = []
    for record in records:
        content = record["content"]
        metadata = {
            "source": record.get("source", ""),
            "page": record.get("page", 1),
            "chunk_index": record.get("chunk_index", None),
        }
        loaded.append(Document(page_content=content, metadata=metadata))
    return loaded


def docs_to_strings(docs: List[Document]) -> List[str]:
    """
    Return `str(doc)` for every Document, preserving order.

    NOTE(review): judging by the stored query results in this notebook,
    `str(doc)` yields the "page_content='...' metadata={...}" form, so the
    metadata ends up inside the embedded text -- confirm this is intended.
    """
    return list(map(str, docs))


# Extract one Document per PDF page, then split every page whose text is
# longer than 800 characters (chunk_size=800, chunk_overlap=100).
raw_docs = pdf_to_raw_doc(file_name)
processed_docs: List[Document] = []
for doc in raw_docs:
    if len(doc.page_content) > 800:
        print("Split")
        sub_docs = split_doc(doc, chunk_size=800, chunk_overlap=100)
        processed_docs.extend(sub_docs)
    else:
        # Short pages are kept whole, so their metadata has no "chunk_index".
        processed_docs.append(doc)

# Persist the processed documents as JSON records.
json_docs = docs_to_json(processed_docs)
save_to_json(json_docs, "json/raw_docs.json")

In [29]:
# Build the strings once and reuse them for both embedding and storage, so the
# stored text is exactly what was embedded and the conversion is not repeated.
doc_texts = docs_to_strings(processed_docs)
embeddings = embed(doc_texts)
import_texts_and_embeds_to_db(doc_texts, embeddings, collection_name="Document")

In [8]:
from weaviate.classes.query import MetadataQuery

query = "Scheduling?"
# Tokenization is disabled here; the raw query string is encoded directly.
# query_tokenized = tokenize(query)
query_vector = model.encode(query).tolist()

# Vector search against the "Document" collection: top 5 hits, each with its distance.
result = client.collections.get("Document").query.near_vector(
    near_vector=query_vector,
    limit=5,
    return_metadata=MetadataQuery(distance=True)
)

# Kept in a separate variable so the next cell can print the hits.
retrieved_objects = [obj for obj in result.objects]


In [9]:

# Print every hit from the previous cell: 1-based rank, distance, full stored text.
print("Query results:")
for i, obj in enumerate(retrieved_objects, 1):
    print(f"{i}. Dist: {obj.metadata.distance}:\n {obj.properties['text'][:]}...")
    

Query results:
1. Dist: 0.5142076015472412:
 page_content='Disk Scheduling
Disk Storage |  Disk Scheduling Algorithms
5 3
• Disk Scheduling Algorithms
ü FCFS (First-Come First-Served)
ü SSTF (Shortest Seek Time First)
ü SCAN (Elevator Algorithm)
ü C-SCAN (Circular SCAN)
ü LOOK (an optimized version of SCAN)
ü C-LOOK (Circular LOOK)
' metadata={'source': 'test-pdf/OS_C4_File and Disk management.pdf', 'page': 53}...
2. Dist: 0.5761232972145081:
 page_content='Disk Access Time
Disk Storage |  Disk Scheduling Algorithms
5 2
Disk Access Time = Seek Time + Rotational Time + Data Transfer Time
Time to move 
Read/Write Head to the 
desired track/cylinder
Time to rotate the 
desired sector to the 
Read/Write Head
Time to transfer data from the disk
Most dominant
= Transferred Data/Transfer Rate
' metadata={'source': 'test-pdf/OS_C4_File and Disk management.pdf', 'page': 52}...
3. Dist: 0.5768440365791321:
 page_content='SSTF (Shortest Seek Time First)
Disk Storage |  Disk Scheduling Algorithms


### Check Document Collection size

In [None]:
# Uncomment to delete the collection before counting:
# clear_document_collection()
# Aggregate over the whole "Document" collection, requesting the total object count.
client.collections.get("Document").aggregate.over_all(total_count=True)

## LLMs

In [None]:
%pip install -q openai python-dotenv

### The Thinker - Reasoning model

In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv
# Load environment variables (including the API key) from a .env file.
load_dotenv()

api_key = os.getenv("OPENROUTER_DEEPSEEK_KEY")

# OpenAI-compatible client pointed at the OpenRouter endpoint.
llm_client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=api_key,
)

# Streamed chat completion; `include_reasoning` is sent as an extra body field.
stream = llm_client.chat.completions.create(
    model="deepseek/deepseek-r1:free",
    messages=[{"role": "user", "content": "What is the meaning of life? Make it short."}],
    extra_body={"include_reasoning": True},
    stream=True,
)

# Print reasoning and answer text as the chunks arrive. `reasoning` is read with
# getattr because the delta object may not carry that attribute.
for chunk in stream:
    if chunk.choices and getattr(chunk.choices[0].delta, "reasoning", None):
        print(chunk.choices[0].delta.reasoning, end="", flush=True)
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)


### The Finder - Retrieval Augmented Generation (RAG) model

In [15]:
import os
from openai import OpenAI
from dotenv import load_dotenv
# Load environment variables (including the API key) from a .env file.
load_dotenv()

# NOTE(review): the variable name is spelled "OPENROTER" (no "U"); the limits
# cell uses the same spelling -- confirm it matches the entry in .env.
api_key = os.getenv("OPENROTER_MISTRAL_7B_KEY")

# Rebinds `llm_client`; later cells use whichever client was created last.
llm_client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=api_key,
)

stream = llm_client.chat.completions.create(
    model="mistralai/mistral-7b-instruct:free",
    messages=[{"role": "user", "content": "What is the meaning of life? Make it short."}],
    # extra_body={"include_reasoning": True}, # No reason to add it because it is not supported
    stream=True,
)

# Same streaming loop as for the reasoning model; the `reasoning` branch only
# prints when a chunk actually carries that attribute.
for chunk in stream:
    if chunk.choices and getattr(chunk.choices[0].delta, "reasoning", None):
        print(chunk.choices[0].delta.reasoning, end="", flush=True)
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)


 The meaning of life is subjective and can vary greatly among individuals, cultures, and beliefs. However, one common interpretation is that life's meaning emerges through personal growth, relationships, love, discovery, and contributions to the betterment of oneself and the world around you. Ultimately, your unique journey helps shape your perspective on the meaning of your life.

### Check LLM API limits

In [None]:
import requests

# Report the OpenRouter key status for every key used in this notebook.
# The first key used to be read and then immediately overwritten, so only the
# DeepSeek key was ever checked. The DeepSeek key is still processed last, so
# `api_key` ends up with the same value as before.
for key_env_name in ("OPENROTER_MISTRAL_7B_KEY", "OPENROUTER_DEEPSEEK_KEY"):
    api_key = os.getenv(key_env_name)
    if not api_key:
        print(f"{key_env_name} is not set; skipping.")
        continue

    response = requests.get(
        url="https://openrouter.ai/api/v1/auth/key",
        headers={
            "Authorization": f"Bearer {api_key}"
        },
        timeout=30,  # never let the cell hang on a stalled connection
    )

    print(f"{key_env_name}:")
    print(json.dumps(response.json(), indent=2))


## RAG pipeline integration with LLM

In [60]:
def query_with_rag(
    query: str,
    llm_client: OpenAI = None,
    model_name: str = "deepseek/deepseek-r1:free",
    limit: int = 5,
):
    """Answer `query` with an LLM, using the nearest stored chunks as context.

    The query is embedded with `embed`, the closest objects are fetched from
    the "Document" collection, and the LLM response is streamed to the cell
    output (reasoning first, then the final answer, then timing figures).

    Args:
        query: The user question.
        llm_client: OpenAI-compatible client used for the chat completion.
        model_name: Model identifier sent to the chat completion endpoint.
        limit: Number of chunks to retrieve as context.

    Returns:
        The final answer text (reasoning excluded), or the string
        "LLM client is not provided." when `llm_client` is None.
    """
    # Imported locally so the function does not depend on which earlier cells
    # happened to import these names.
    import time
    from weaviate.classes.query import MetadataQuery

    if llm_client is None:
        return "LLM client is not provided."

    query_embed = embed([query])[0]
    result = client.collections.get("Document").query.near_vector(
        near_vector=query_embed, limit=limit, return_metadata=MetadataQuery(distance=True)
    )

    retrieved_texts = [
        f"Rank {i}. Dist: {obj.metadata.distance}:\n {obj.properties['text']}..."
        for i, obj in enumerate(result.objects, 1)
    ]
    processed_retrieved_text = "\n--------------\n".join(retrieved_texts)

    user_query = f"""
    Questions: {query}
    Context: {processed_retrieved_text}
    Answer the question based on the context provided.
    """

    system_prompt = """
    You are a helpful assistant that answers questions based on the provided context.
    If the context does not provide enough information, you should say "I don't know".
    """

    response = llm_client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_query},
        ],
        extra_body={"include_reasoning": True},
        stream=True,
    )

    answer_parts = []
    reasoning_dur = None  # set when the first answer chunk arrives
    start = time.perf_counter()
    for chunk in response:
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta

        # Not every delta carries a `reasoning` attribute, hence getattr.
        reasoning = getattr(delta, "reasoning", None)
        if reasoning:
            print(reasoning, end="", flush=True)

        if delta.content:
            if reasoning_dur is None:
                reasoning_dur = time.perf_counter() - start
                print("\n----------------\n# Final answer: ", flush=True)
            print(delta.content, end="", flush=True)
            answer_parts.append(delta.content)

    total_dur = time.perf_counter() - start
    if reasoning_dur is None:
        # No answer chunk ever arrived: the whole stream was reasoning (or empty).
        reasoning_dur = total_dur
    final_answer_dur = total_dur - reasoning_dur

    print("\n----------------\n", flush=True)
    print(f"\n# Reasoning taken: {reasoning_dur:.2f} seconds", flush=True)
    print(f"\n# Final answer taken: {final_answer_dur:.2f} seconds", flush=True)

    # Return the answer so callers (result1, result2, ...) receive it.
    return "".join(answer_parts)

## Test the RAG system

In [59]:
# Test 1: Basic question about the PDF content
# The model output is streamed to the cell output while it is generated.
result1 = query_with_rag(
    "What is file scheduling in operating systems?", llm_client=llm_client
)

Okay, let's see. The user is asking about file scheduling in operating systems. Hmm, but the context provided is all about disk scheduling algorithms like FCFS, SSTF, SCAN, etc. Wait, are file scheduling and disk scheduling the same thing here? Maybe the user meant disk scheduling. The context mentions Disk Scheduling Algorithms as part of Disk Storage, so it's about how the OS manages disk access requests.

Looking at the context, Rank 2 lists several disk scheduling algorithms. The other ranks talk about disk access time components and examples like FCFS. There's no mention of "file scheduling" directly, just disk scheduling. Since the question is phrased as "file scheduling," but the context doesn't address file scheduling specifically, only disk scheduling, I should check if the terms are being used interchangeably or if there's a misunderstanding. However, in standard OS terminology, disk scheduling and file scheduling aren't the same. File management would involve organization an

In [61]:
# Test 2: More complex question
# The model output is streamed to the cell output while it is generated.
result2 = query_with_rag(
    "How do disk allocation algorithms work and what are their trade-offs?",
    llm_client=llm_client,
)

Okay, the user is asking about disk allocation algorithms and their trade-offs. Let me start by looking at the provided context. The context mentions different allocation methods like contiguous, linked list, indexed, and specific cases like Windows and Unix-based systems. 

First, contiguous allocation assigns blocks consecutively. That should be fast for access but might lead to fragmentation, right? Then linked lists use pointers, which avoids fragmentation but could have overhead from storing pointers and slower access since you have to traverse the list. 

Indexed allocation, especially with Unix's inodes, uses a multilevel approach. Direct blocks for small files and indirect blocks for larger ones. That offers flexibility but with added complexity. Oh, and Windows uses a file-table linked list, which is like a hybrid maybe?

Wait, the user mentioned disk allocation algorithms, but some of the context refers to disk scheduling algorithms like FCFS, SSTF, SCAN, etc. But the questio

In [62]:
# Test 3: Vietnamese question (to test multilingual capabilities)
# The question text is Vietnamese; it is embedded and sent to the LLM unchanged.
result3 = query_with_rag(
    "Các phương pháp quản lý tệp trong hệ điều hành là gì?",
    llm_client=llm_client,
)

Okay, let's see. The user is asking about the methods of file management in operating systems. I need to base my answer on the provided context. 

Looking through the context snippets, I notice that there are mentions of "File System Interface" and "File System Implementation". Rank 2 context talks about the user's perspective (naming, structure, manipulation) and the system's perspective (storing files on disk blocks). Also, there's mention of disk management which might include allocation/deallocation, read/write operations, protection, and sharing. 

However, the specific methods like contiguous allocation, linked lists, file allocation tables (FAT), inode structures, or journaling aren't explicitly listed. The context provided seems to outline the high-level aspects rather than enumerating specific techniques. The answer should probably stick to what's mentioned in the context without adding outside knowledge, since the user wants the answer based solely on the provided material.

