# Dependencies

In [None]:
# Install the third-party packages this notebook needs.
# NOTE(review): only chromadb is installed here, but the imports cell also
# imports `fitz` (PyMuPDF) and `pdfplumber` - presumably already present in
# this runtime; confirm, or uncomment the matching lines on a fresh environment.
#!pip install pymupdf
#!pip install pdfplumber
!pip install chromadb
#!pip install bijoy2unicode
#!pip install bijoy-to-unicode-file-converter

Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.35.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.35.0-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelemetry-sdk>=1.2.0 (from chromadb)
  Downloading opentelemetry_sdk-1.35.0-py3-none-any.whl.metadata (1.5 k

# Imports

In [None]:
import os
import importlib.util
import sys
import re
import unicodedata
import pdfplumber
import fitz
import numpy as np
from typing import List
import chromadb
from chromadb.config import Settings

# Cleaning the document

In [None]:
def extract_text_fitz(pdf_path):
    """
    Extract the text of every page of a PDF with PyMuPDF and clean it up.

    Each page that yields text is NFC-normalized (canonical Unicode
    composition) and every run of whitespace, line breaks included, is
    collapsed into a single space.

    Parameters:
        pdf_path (str): Path of the PDF file to read.

    Returns:
        str: The cleaned page texts in page order, each one followed by a
             blank-line separator (two newline characters). Pages for which
             PyMuPDF returns no text are skipped.
    """
    # TODO: check whether this preprocessing is needed at all for a clean PDF,
    # i.e. what the downstream steps produce from the raw text.
    page_texts = []

    # Open the document as a context manager so the file handle is released
    # even if extraction of some page raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text()
            if not text:
                continue

            # Normalize Bangla unicode
            text = unicodedata.normalize("NFC", text)

            # Collapse every whitespace run (spaces, tabs, single or repeated
            # newlines) into one space. A separate "replace newlines with a
            # space" pass is unnecessary: this substitution already covers it.
            text = re.sub(r'\s+', ' ', text)

            page_texts.append(text.strip() + "\n\n")

    # Join once instead of growing a string with += inside the loop.
    return "".join(page_texts)

In [None]:
# def extract_and_clean_bangla_text(pdf_path):
#     all_text = ""

#     with pdfplumber.open(pdf_path) as pdf:
#         for i, page in enumerate(pdf.pages):
#             text = page.extract_text()

#             if text:
#                 # Normalize Bangla unicode
#                 text = unicodedata.normalize("NFC", text)

#                 # Remove unwanted single newlines
#                 text = re.sub(r'(?<!\n)[\r\n]+(?!\n)', ' ', text)

#                 # Collapse multiple spaces
#                 text = re.sub(r'\s+', ' ', text)

#                 # Remove likely page number at start/end
#                 text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)

#                 all_text += text.strip() + "\n\n"

#     return all_text

# Chunking

In [None]:
def _overlap_tail(chunk: str, overlap: int) -> str:
    """Return up to `overlap` trailing characters of `chunk`, starting on a word boundary."""
    if overlap <= 0:
        return ""
    if overlap >= len(chunk):
        return chunk

    tail = chunk[-overlap:]
    # If the cut landed inside a word, drop the partial leading word so the
    # next chunk does not begin mid-word (for Bangla this could otherwise leave
    # an orphaned combining sign at the start of the chunk).
    if not chunk[-overlap - 1].isspace() and not tail[0].isspace():
        _, separator, remainder = tail.partition(" ")
        tail = remainder if separator else ""
    return tail.strip()


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 100) -> List[str]:
    """
    Chunks text into overlapping segments using sentence boundaries.

    The text is split after Bangla/English sentence endings (danda, "!", "?"),
    on blank lines, and on runs of 3+ whitespace characters. Sentences are then
    packed greedily into chunks of at most `chunk_size` characters.

    Parameters:
        text (str): Text to split.
        chunk_size (int): Target maximum chunk length in characters. A single
            sentence longer than this is kept whole, and a chunk that starts
            with overlap text may exceed it.
        overlap (int): Maximum number of trailing characters of a finished
            chunk that are repeated at the start of the next one. Values <= 0
            disable overlap.

    Returns:
        List[str]: Non-empty, whitespace-normalized chunks in document order.
                   Empty or whitespace-only input yields an empty list.
    """
    # Split BEFORE collapsing whitespace: once every whitespace run is a single
    # space, the blank-line and 3+ whitespace delimiters could never match.
    sentence_delimiters = re.compile(r'(?<=[।!?])\s+|[\n\r]{2,}|\s{3,}')

    sentences = []
    for piece in sentence_delimiters.split(text):
        # Collapse internal whitespace of each sentence and drop empty pieces
        piece = re.sub(r'\s+', ' ', piece).strip()
        if piece:
            sentences.append(piece)

    chunks: List[str] = []
    current_chunk = ""

    for sentence in sentences:
        if not current_chunk:
            # Start a chunk directly; never emit an empty chunk just because
            # the very first sentence is longer than chunk_size.
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) <= chunk_size:
            # The "+ 1" accounts for the joining space
            current_chunk = f"{current_chunk} {sentence}"
        else:
            chunks.append(current_chunk)

            # Optional overlap
            overlap_text = _overlap_tail(current_chunk, overlap)
            current_chunk = f"{overlap_text} {sentence}" if overlap_text else sentence

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

# Embedding

In [None]:
from sentence_transformers import SentenceTransformer

# Already-loaded models keyed by model name, so repeated calls (e.g. embedding
# a query after embedding the document) reuse the instance instead of reloading it.
_EMBEDDING_MODELS = {}


def _load_embedding_model(embedding_model_name):
    """Return the SentenceTransformer for `embedding_model_name`, loading it on first use only."""
    if embedding_model_name not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[embedding_model_name] = SentenceTransformer(embedding_model_name)
    return _EMBEDDING_MODELS[embedding_model_name]


def get_embedding(text_chunks, embedding_model_name = 'sentence-transformers/LaBSE') -> np.ndarray:
    """
    Returns embeddings for a list of text chunks using the specified multilingual model.

    Parameters:
        text_chunks (List[str]): List of text strings (chunks).
        embedding_model_name (str): Name of the SentenceTransformer model to load.
            The model is loaded once per name and reused on later calls.

    Returns:
        np.ndarray: Embeddings array of shape (num_chunks, embedding_dim)
    """
    model = _load_embedding_model(embedding_model_name)
    embeddings = model.encode(text_chunks, show_progress_bar = True)

    # asarray avoids copying when encode() already returned a NumPy array
    return np.asarray(embeddings)

# Vector Storage / Database

In [None]:
def store_in_vector_db(chunks, embeddings, collection_name = "Bangla_RAG_Chunks", persist_dir = "./chromadb"):
    """
    Stores precomputed embeddings + their corresponding text chunks into ChromaDB.

    Args:
        chunks (List[str]): List of text chunks.
        embeddings (np.ndarray): Embeddings for each chunk (shape: [n_chunks, embedding_dim]).
            Anything np.asarray can convert (e.g. a list of lists) is accepted.
        collection_name (str): Name of the ChromaDB collection.
        persist_dir (str): Directory for ChromaDB persistence.

    Returns:
        chromadb.Collection: The ChromaDB collection object.

    Raises:
        ValueError: If the number of embeddings differs from the number of chunks.
    """
    chunks = list(chunks)
    vectors = np.asarray(embeddings)

    # Fail early with a clear message instead of a mismatch error from inside ChromaDB
    if len(vectors) != len(chunks):
        raise ValueError(
            f"Got {len(chunks)} chunks but {len(vectors)} embeddings; they must match one-to-one."
        )

    # Step 1 - Setup ChromaDB client
    # Current Chroma releases reject the legacy Settings(chroma_db_impl=...)
    # configuration with a "deprecated configuration" ValueError;
    # PersistentClient is the supported way to get an on-disk store.
    client = chromadb.PersistentClient(path = persist_dir)

    # Step 2 - Create or get collection
    collection = client.get_or_create_collection(name = collection_name)

    # Step 3 - Add chunks and embeddings
    # Nothing to write for empty input. Otherwise upsert, so that re-running the
    # pipeline refreshes the entries stored under the same "chunk_<i>" ids.
    if chunks:
        collection.upsert(
            ids = [f"chunk_{i}" for i in range(len(chunks))],
            documents = chunks,
            embeddings = vectors.tolist(),   # Ensure list format
        )

    print(f"Stored {len(chunks)} chunks in ChromaDB collection '{collection_name}'.")

    return collection

# Final Pipeline

In [None]:
# End-to-end run: PDF -> cleaned text -> chunks -> embeddings -> ChromaDB collection.
# NOTE(review): the path is absolute ("/content/..."), so the PDF must exist at
# exactly this location before the cell is run.
pdf_path = "/content/Sherlock-Holmes-Bangla-Wikipedia.pdf"

# Each stage consumes the previous stage's output; all functions are called
# with their default parameters.
clean_text = extract_text_fitz(pdf_path)
chuncked_text = chunk_text(clean_text)
embedded = get_embedding(chuncked_text)
chormadb_collection = store_in_vector_db(chuncked_text, embedded)
print(chormadb_collection)
# Debug leftovers kept for reference (inspect the chunks / try the pdfplumber extractor):
#print(chuncked_text)
#clean_bangla_text = extract_and_clean_bangla_text(pdf_path)
#print(clean_bangla_text)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

ValueError: [91mYou are using a deprecated configuration of Chroma.

[94mIf you do not have data you wish to migrate, you only need to change how you construct
your Chroma client. Please see the "New Clients" section of https://docs.trychroma.com/deployment/migration.
________________________________________________________________________________________________

If you do have data you wish to migrate, we have a migration tool you can use in order to
migrate your data to the new Chroma architecture.
Please `pip install chroma-migrate` and run `chroma-migrate` to migrate your data and then
change how you construct your Chroma client.

See https://docs.trychroma.com/deployment/migration for more information or join our discord at https://discord.gg/MMeYNTmh3x for help![0m

In [None]:
# Sanity-check the embeddings: print the array shape and the first 10
# components of the first vector.
print("Shape:", embedded.shape)
print("First vector (truncated):", embedded[0][:10])

# A shape of (12, 768) means there are 12 text chunks and each chunk is embedded into a 768-dimensional vector

Shape: (12, 768)
First vector (truncated): [-0.04484368 -0.05162629 -0.01035499 -0.05596629 -0.0338166  -0.06088515
 -0.07117505 -0.04448484  0.00836606  0.01921627]
