This notebook is a collection of functions and code snippets for extracting text from PDF files and building the knowledge base.

In [1]:
def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[str]:
    """
    Splits a long text into smaller, overlapping chunks.

    Consecutive chunks start ``chunk_size - chunk_overlap`` characters apart,
    so the last ``chunk_overlap`` characters of one chunk are repeated at the
    start of the next. The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The input text to be chunked.
        chunk_size: The desired size of each chunk, in characters. Must be positive.
        chunk_overlap: The number of characters to overlap between chunks.
            Must be smaller than ``chunk_size``.

    Returns:
        A list of text chunks (empty if ``text`` is empty).

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``chunk_overlap``
            is not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if chunk_overlap >= chunk_size:
        # A step of zero or less would never advance through the text.
        raise ValueError(
            f"chunk_overlap ({chunk_overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    step = chunk_size - chunk_overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

In [2]:
# Example Usage
# The sample text below is chunked into 20-character pieces that overlap by
# 5 characters; only the first five chunks are displayed.
text ="""
    Splits a long text into smaller, overlapping chunks.

    Args:
        text: The input text to be chunked.
        chunk_size: The desired size of each chunk.
        chunk_overlap: The number of characters to overlap between chunks.

    Returns:
        A list of text chunks.
    """

chunk_text(text,20,5)[:5]

['\n    Splits a long t',
 'ong text into smalle',
 'maller, overlapping ',
 'ping chunks.\n\n    Ar',
 '   Args:\n        tex']

Next, extract the text from the PDF files and build a knowledge base.

In [4]:
import fitz, os
# Build the paths with os.path.join so they resolve on every OS: a literal
# backslash is a path separator only on Windows.
data_dir = os.path.join("..", "data", "Papers")
knowledge_file = os.path.join("..", "data", "knowledge.txt")

In [None]:
def extract_text(papers_dir: str, out_path: str) -> str:
    """
    Extracts the text of every PDF in a directory and saves it to one text file.

    Files whose name ends with ``.pdf`` are read in sorted filename order, the
    text of all their pages is concatenated, and the result is written to
    ``out_path`` as UTF-8.

    Args:
        papers_dir: Directory containing the PDF files.
        out_path: Destination file; must end with ``.txt``.

    Returns:
        The concatenated text that was written to ``out_path``. In a notebook,
        assign the result or end the call with ``;`` to avoid displaying it.

    Raises:
        AssertionError: If ``out_path`` does not end with ``.txt``.
    """
    # Check the destination first so a bad path fails before any PDF is parsed.
    assert out_path.endswith(".txt"), "Can only save to a text file"

    page_texts = []
    # os.listdir returns names in arbitrary order; sorting keeps the output
    # (and everything derived from it) reproducible between runs.
    for file in sorted(os.listdir(papers_dir)):
        if file.endswith(".pdf"):
            filepath = os.path.join(papers_dir, file)
            # The context manager closes the document even if a page fails.
            with fitz.open(filepath) as doc:
                page_texts.extend(page.get_text() for page in doc)

    # Join once instead of growing one large string page by page.
    all_text = "".join(page_texts)
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(all_text)
    return all_text

In [None]:
# extract_text(papers_dir=data_dir, out_path=knowledge_file)

After the text is extracted from the PDF files into `knowledge_file`, it can be chunked into smaller pieces and stored in a vector database such as ChromaDB.

In [5]:
# Read the extracted knowledge base; the context manager closes the file
# handle as soon as the text has been read.
with open(knowledge_file, mode='r', encoding='utf-8') as f:
    text = f.read()
len(text)

1770730

In [6]:
# Split the knowledge-base text into 3500-character chunks, each sharing
# 300 characters with its neighbour, then show how many chunks were produced.
chunks = chunk_text(text, chunk_size=3500, chunk_overlap=300)
len(chunks)

554

In [7]:
from openai import OpenAI
import numpy as np
# The API key is read from the API_KEY environment variable rather than being hardcoded.
oclient = OpenAI(api_key=os.getenv("API_KEY"))

In [8]:
def chunks2embs(
    chunks: list[str],
    max_tokens_per_request: int = 300_000,
    chars_per_token: float = 3.5,
    model: str = "text-embedding-3-small",
) -> np.ndarray:
    """
    Embeds text chunks with the OpenAI embeddings endpoint.

    The total token count is estimated as ``characters / chars_per_token``.
    The chunks are then split into the smallest number of near-equal batches
    whose estimated size fits ``max_tokens_per_request``, and one request is
    sent per batch. Batches are balanced by chunk count, so the estimate only
    holds when chunks have similar lengths.

    Args:
        chunks: The text chunks to embed.
        max_tokens_per_request: Estimated token budget for a single request.
            NOTE(review): presumably mirrors the endpoint's per-request cap —
            confirm against the current API limits.
        chars_per_token: Average characters per token used for the estimate.
        model: Name of the embedding model.

    Returns:
        An array with one row per chunk, in the same order as ``chunks``
        (an empty array when ``chunks`` is empty).

    Raises:
        ValueError: If ``max_tokens_per_request`` or ``chars_per_token`` is
            not positive.
    """
    if max_tokens_per_request <= 0 or chars_per_token <= 0:
        raise ValueError("max_tokens_per_request and chars_per_token must be positive")
    if not chunks:
        # Nothing to embed; avoid sending an empty request.
        return np.array([])

    est_tokens = sum(len(chunk) for chunk in chunks) / chars_per_token
    n_batches = int(np.ceil(est_tokens / max_tokens_per_request))
    # Always at least one batch (small inputs), never more batches than chunks
    # (which would produce empty requests).
    n_batches = min(max(n_batches, 1), len(chunks))

    # Split into n_batches contiguous slices whose lengths differ by at most one.
    k, m = divmod(len(chunks), n_batches)
    batches = [chunks[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n_batches)]

    embds = []
    for batch in batches:
        resp = oclient.embeddings.create(input=batch, model=model)
        embds.extend(item.embedding for item in resp.data)

    return np.array(embds)

In [10]:
# Embedding the whole corpus calls the embeddings API, so the result is cached
# on disk and only recomputed when the cache file is missing.
embds_path = r"../data/embds.npy"
if os.path.exists(embds_path):
    # mmap_mode='r' maps the file read-only instead of loading it into memory.
    embds = np.load(embds_path, mmap_mode='r')
else:
    embds = chunks2embs(chunks)
    np.save(embds_path, embds)

Now that we have the chunks and their embeddings, persist them in a vector database such as ChromaDB.


In [None]:
import chromadb

db_dir = r"../chromadb"
os.makedirs(db_dir, exist_ok=True)

chroma_client = chromadb.PersistentClient(path=db_dir)
# The store is persisted under db_dir, so the collection already exists on
# every run after the first. create_collection raises in that case;
# get_or_create_collection reuses the existing collection instead.
collection = chroma_client.get_or_create_collection(name="my_collection")

In [17]:
# Each chunk must be paired with its own embedding; fail loudly if the two
# were produced from different versions of the text.
assert len(chunks) == len(embds), "chunks and embeddings are out of sync; regenerate the embeddings"

# upsert rather than add: ids that are already stored are overwritten, so
# re-running this cell refreshes the collection instead of keeping stale rows.
collection.upsert(
    ids=[f'chunk{i}' for i in range(len(chunks))],
    documents=chunks,
    embeddings=embds,
)

In [22]:
# Embed the question with the same model that is used for the chunks, so the
# query vector can be compared against the stored chunk vectors.
query = "What is deepseek V3?"
query_emb = oclient.embeddings.create(input=query, model="text-embedding-3-small").data[0].embedding

In [23]:
# Ask the collection for the 3 stored entries nearest to the query embedding.
y = collection.query(
    query_embeddings=query_emb,
    n_results=3)