In [1]:
#!pip install pdfplumber nltk sentence-transformers qdrant-client

import pdfplumber
import nltk
from sentence_transformers import SentenceTransformer
import numpy as np
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
from tqdm import tqdm
import re
import uuid

#nltk.download('punkt')

In [2]:
# Name of the Qdrant collection that holds the chunk vectors and their payloads.
COLLECTION_NAME = "document_chunks"
# Address of the Qdrant server; this value points at a local instance.
QDRANT_URL = "http://localhost:6333"
# Sentence-transformers model used for chunking, indexing and querying.
# NOTE: create_collection() hard-codes a vector size of 384, so that value must
# be revisited if this model is swapped for one with a different output size.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L12-v2"
# Upper bound on the whitespace-separated word count of a chunk (see semantic_chunking).
MAX_CHUNK_WORDS = 200

# Module-level Qdrant client shared by create_collection, store_chunks and search.
client = QdrantClient(url=QDRANT_URL)

In [3]:
# Extract text from PDF with page numbers
def extract_text_with_pages(pdf_path):
    """Read a PDF and return its text page by page.

    Args:
        pdf_path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        A list of ``(page_number, text)`` tuples with 1-based page numbers.
        Pages whose extracted text is empty or ``None`` are left out.
    """
    with pdfplumber.open(pdf_path) as document:
        # Pair every page's text with its 1-based position; the generator is
        # consumed before the ``with`` block closes the document.
        numbered = (
            (position, sheet.extract_text())
            for position, sheet in enumerate(document.pages, start=1)
        )
        return [(position, content) for position, content in numbered if content]


In [4]:
# Semantic dynamic chunking
def semantic_chunking(text_with_pages, threshold=0.75):
    """Group consecutive sentences of each page into semantically coherent chunks.

    Each page is split into sentences with ``nltk.sent_tokenize``. A sentence
    joins the running chunk when its cosine similarity to the chunk's embedding
    exceeds ``threshold`` and the chunk would stay within ``MAX_CHUNK_WORDS``
    whitespace-separated words; otherwise the running chunk is emitted and a new
    one starts with that sentence. Chunks never span pages.

    Args:
        text_with_pages: Iterable of ``(page_number, text)`` pairs.
        threshold: Minimum cosine similarity for a sentence to be appended to
            the running chunk.

    Returns:
        A list of dicts with keys ``"text"`` (sentences joined by a single
        space) and ``"page_number"``.
    """
    model = SentenceTransformer(EMBEDDING_MODEL)
    chunks = []

    for page_number, text in text_with_pages:
        sentences = nltk.sent_tokenize(text)
        if not sentences:
            # Nothing to embed on this page; avoid encoding an empty batch.
            continue
        sentence_embeddings = model.encode(sentences)
        current_chunk = []
        current_chunk_words = 0

        for sentence, embedding in zip(sentences, sentence_embeddings):
            sentence_words = len(sentence.split())
            if not current_chunk:
                current_chunk.append(sentence)
                current_chunk_words = sentence_words
                continue

            # Encode the running chunk once per comparison (it changes on every
            # iteration, so it cannot be cached across sentences).
            chunk_embedding = model.encode([' '.join(current_chunk)])[0]
            denominator = np.linalg.norm(embedding) * np.linalg.norm(chunk_embedding)
            # A zero-norm vector has no direction; treat it as dissimilar
            # instead of producing NaN.
            sim = np.dot(embedding, chunk_embedding) / denominator if denominator else 0.0

            if sim > threshold and current_chunk_words + sentence_words <= MAX_CHUNK_WORDS:
                current_chunk.append(sentence)
                current_chunk_words += sentence_words
            else:
                chunks.append({
                    "text": " ".join(current_chunk),
                    "page_number": page_number
                })
                current_chunk = [sentence]
                current_chunk_words = sentence_words

        # Flush whatever is left once the page is exhausted.
        if current_chunk:
            chunks.append({
                "text": " ".join(current_chunk),
                "page_number": page_number
            })

    return chunks



In [5]:
# Generate embeddings
def generate_embeddings(chunks):
    """Embed the ``"text"`` field of every chunk.

    Args:
        chunks: Sequence of dicts that each carry a ``"text"`` entry.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for the chunk texts
        when called with ``normalize_embeddings=True``, in input order.
    """
    encoder = SentenceTransformer(EMBEDDING_MODEL)
    passages = [entry['text'] for entry in chunks]
    return encoder.encode(passages, normalize_embeddings=True)

# Create collection if not exists
def create_collection():
    """Ensure the ``COLLECTION_NAME`` collection exists on the Qdrant server.

    Does nothing when a collection with that name is already listed; otherwise
    creates it with 384-dimensional vectors compared by cosine distance.
    """
    known_names = {entry.name for entry in client.get_collections().collections}
    if COLLECTION_NAME in known_names:
        return

    vector_settings = rest.VectorParams(size=384, distance=rest.Distance.COSINE)
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=vector_settings,
    )



In [6]:
# Store chunks and metadata in Qdrant
def store_chunks(chunks, embeddings):
    """Write chunks and their vectors to the Qdrant collection in one upsert.

    Every chunk gets a fresh random UUID as its point id. The payload travels
    inside the point itself, so vectors and metadata are written together
    instead of one upsert followed by a separate ``set_payload`` request per
    chunk.

    Args:
        chunks: Sequence of dicts with ``"text"`` and ``"page_number"`` keys.
        embeddings: Sequence of vectors aligned with ``chunks``; each item must
            provide ``.tolist()``.

    Returns:
        None.
    """
    points = []
    for chunk, embedding in zip(chunks, embeddings):
        chunk_id = str(uuid.uuid4())
        points.append(rest.PointStruct(
            id=chunk_id,
            vector=embedding.tolist(),
            payload={
                "chunk_id": chunk_id,
                "text": chunk['text'],
                "page_number": chunk['page_number']
            }
        ))

    if not points:
        # Nothing to store; skip the request rather than sending an empty batch.
        return

    client.upsert(collection_name=COLLECTION_NAME, points=points)



In [7]:
# Perform vector search and keyword overlap
def search(query, top_k=5):
    """Run a vector search for ``query`` and hand the hits to ``rerank_results``.

    The query is embedded with ``EMBEDDING_MODEL`` (normalized) and the
    ``top_k`` nearest points are fetched from ``COLLECTION_NAME``. For each hit
    the fraction of the query's distinct ``\\w+`` tokens (lower-cased) that also
    occur in the chunk text is recorded as ``keyword_overlap``.

    Args:
        query: Free-text search string.
        top_k: Maximum number of points requested from Qdrant.

    Returns:
        The value returned by ``rerank_results(query, results)``, where each
        entry of ``results`` has the keys ``"chunk"``, ``"page_number"``,
        ``"score"`` and ``"keyword_overlap"``.
    """
    model = SentenceTransformer(EMBEDDING_MODEL)
    query_embedding = model.encode([query], normalize_embeddings=True)[0]
    search_results = client.search(
        collection_name=COLLECTION_NAME,
        query_vector=query_embedding,
        limit=top_k
    )

    # The query's keyword set is identical for every hit, so build it once.
    keywords_query = set(re.findall(r'\w+', query.lower()))
    # max(..., 1) keeps the ratio defined for a query without any word tokens.
    query_keyword_count = max(len(keywords_query), 1)

    results = []
    for result in search_results:
        payload = result.payload
        chunk_text = payload["text"]
        keywords_chunk = set(re.findall(r'\w+', chunk_text.lower()))
        keyword_overlap = len(keywords_query & keywords_chunk) / query_keyword_count
        results.append({
            "chunk": chunk_text,
            "page_number": payload["page_number"],
            "score": result.score,
            "keyword_overlap": keyword_overlap
        })

    return rerank_results(query, results)



In [8]:
# Rerank results
def rerank_results(query, results):
    """Re-score search hits by blending embedding similarity with keyword overlap.

    The query and all chunk texts are embedded with ``EMBEDDING_MODEL``
    (normalized, so the dot product is the cosine similarity). Chunks are
    encoded in a single batched call rather than one call per hit. The final
    score is ``0.7 * cosine_similarity + 0.3 * keyword_overlap``.

    Args:
        query: The search string the hits were retrieved for.
        results: Sequence of dicts with at least the keys ``"chunk"``,
            ``"page_number"`` and ``"keyword_overlap"``.

    Returns:
        A new list of dicts with the keys ``"chunk"``, ``"page_number"`` and
        ``"final_score"``, sorted by ``"final_score"`` in descending order.
        An empty input yields an empty list without loading the model.
    """
    if not results:
        return []

    model = SentenceTransformer(EMBEDDING_MODEL)
    query_embedding = model.encode([query], normalize_embeddings=True)[0]
    # One batched encode for every chunk instead of a model call per result.
    chunk_embeddings = model.encode(
        [result["chunk"] for result in results],
        normalize_embeddings=True
    )

    reranked = []
    for result, chunk_embedding in zip(results, chunk_embeddings):
        cos_sim = np.dot(query_embedding, chunk_embedding)
        final_score = 0.7 * cos_sim + 0.3 * result["keyword_overlap"]

        reranked.append({
            "chunk": result["chunk"],
            "page_number": result["page_number"],
            "final_score": final_score
        })

    return sorted(reranked, key=lambda x: x["final_score"], reverse=True)



In [9]:
# Run the full process: extract page text, chunk it, embed the chunks, then index them.
# NOTE: store_chunks assigns a new uuid4 to every chunk, so re-running this cell
# against the same collection adds another copy of each chunk instead of
# overwriting the earlier ones.
pdf_path = "alice.pdf"
text_with_pages = extract_text_with_pages(pdf_path)
chunks = semantic_chunking(text_with_pages)
embeddings = generate_embeddings(chunks)

# The collection must exist before points are upserted into it.
create_collection()
store_chunks(chunks, embeddings)



In [10]:
# Search example: query the indexed chunks and print the reranked hits.
query = "What is the Cheshire Cat's role in the story?"
results = search(query)

# search() returns the hits ordered by final_score (highest first); number them from 1.
for i, res in enumerate(results, 1):
    print(f"Result {i}: (Page {res['page_number']})\n{res['chunk']}\nFinal Score: {res['final_score']:.4f}\n{'-'*50}")



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Result 1: (Page 70)
began, in rather a complaining tone, “and they “It’s a friend of mine—a Cheshire Cat,” said
all quarrel so dreadfully one can’t hear one’s-self Alice: “allow me to introduce it.”
speak—and they don’t seem to have any rules “I don’t like the look of it at all,” said the
in particular; at least, if there are, nobody King: “however, it may kiss my hand if it
attends to them—and you’ve no idea how con- likes.”
fusing it is all the things being alive; for in- “I’d rather not,” the Cat remarked.
Final Score: 0.5542
--------------------------------------------------
Result 2: (Page 49)
whether it was good manners for her to speak While she was trying to fix on one, the cook
first, “why your cat grins like that?” took the cauldron of soup off the fire, and at
“It’s a Cheshire cat,” said the Duchess, once set to work throwing everything within
“and that’s why.
Final Score: 0.5455
--------------------------------------------------
Result 3: (Page 52)
89
Alice was just beginni