In [24]:
!pip install sentence-transformers faiss-cpu PyPDF2


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [32]:
import os
import json
import numpy as np
import faiss
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer


In [33]:
# Sentence-embedding checkpoint used for both the chunks and the queries.
# The embedding cell's recorded output shows it yields 384-dimensional vectors.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

print("Loading embedding model...")
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("Model loaded successfully")


Loading embedding model...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Model loaded successfully


In [34]:
def extract_text_with_metadata(pdf_path):
    """Collect the extractable text of a PDF, one record per page.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``PdfReader``.

    Returns:
        A list of ``{"page_number": int, "text": str}`` dicts in document
        order. ``page_number`` is the zero-based position of the page in
        ``reader.pages``. Pages whose ``extract_text()`` result is empty
        (or otherwise falsy) are left out, so the numbers may have gaps.
    """
    # Lazily pair every page position with its extracted text ...
    numbered_text = (
        (position, pdf_page.extract_text())
        for position, pdf_page in enumerate(PdfReader(pdf_path).pages)
    )
    # ... and keep only the pages that actually produced some text.
    return [
        {"page_number": position, "text": content}
        for position, content in numbered_text
        if content
    ]


In [35]:
def chunk_pages(pages, pdf_name, chunk_size=300, overlap=50):
    """Split each page's text into overlapping, fixed-width character chunks.

    Chunks never span pages: every page is windowed independently, with
    windows starting every ``chunk_size - overlap`` characters. The last
    window of a page may be shorter than ``chunk_size`` and can consist
    solely of characters already present in the previous chunk.

    Args:
        pages: Iterable of dicts with ``"text"`` and ``"page_number"`` keys
            (the shape produced by ``extract_text_with_metadata``).
        pdf_name: Document name stored in each record and used as the prefix
            of its ``chunk_id``.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of dicts with keys ``chunk_id``, ``document_name``,
        ``page_number``, ``chunk_index`` (restarting at 0 on every page) and
        ``text``, in page order then chunk order.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # overlap >= chunk_size would make the window step zero or negative, so the
    # scan would never advance; a negative overlap would silently skip text.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    metadata_store = []

    for page in pages:
        text = page["text"]
        page_number = page["page_number"]

        # One window per start offset; an empty page yields no chunks.
        for chunk_index, start in enumerate(range(0, len(text), step)):
            metadata_store.append({
                "chunk_id": f"{pdf_name}_page_{page_number}_chunk_{chunk_index}",
                "document_name": pdf_name,
                "page_number": page_number,
                "chunk_index": chunk_index,
                "text": text[start:start + chunk_size]
            })

    return metadata_store


In [36]:
# Source document; the base name is what gets stored in every chunk's metadata.
pdf_path = "sample.pdf"
pdf_name = os.path.basename(pdf_path)

print("Extracting text...")
pages = extract_text_with_metadata(pdf_path)

print("Chunking text...")
metadata_store = chunk_pages(pages, pdf_name)

print("Total chunks created:", len(metadata_store))

# Fail here with a readable message rather than further down: the embedding
# and indexing cells assume at least one chunk (e.g. `embeddings.shape[1]`).
if not metadata_store:
    raise ValueError(
        f"No extractable text found in {pdf_path!r}; nothing to embed or index."
    )


Extracting text...
Chunking text...
Total chunks created: 6


In [37]:
print("Generating embeddings...")

# Embed the chunk texts in metadata_store order so that row i of `embeddings`
# belongs to metadata_store[i]; FAISS needs the values as float32.
chunk_texts = [chunk["text"] for chunk in metadata_store]
embeddings = np.array(model.encode(chunk_texts)).astype("float32")

print("Embedding shape:", embeddings.shape)


Generating embeddings...
Embedding shape: (6, 384)


In [38]:
# Scale every embedding to unit length so that the inner product computed by
# the index equals cosine similarity. normalize_L2 is called for its effect on
# `embeddings` itself (no return value is used), so the array is already
# normalized if this cell is re-run.
faiss.normalize_L2(embeddings)

# Flat inner-product index sized to the embedding width (second axis).
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)

# Rows are added in metadata_store order; search() relies on the positions
# returned by the index being valid indices into metadata_store.
index.add(embeddings)

print("FAISS index built successfully")
print("Total vectors:", index.ntotal)


FAISS index built successfully
Total vectors: 6


In [39]:
def search(query, k=3):
    """Print the chunks most similar to ``query``.

    The query is embedded with the notebook-level ``model``, L2-normalized,
    and matched against the notebook-level FAISS ``index``; each hit is
    resolved through ``metadata_store`` by its position in the index.

    Args:
        query: Free-text query string.
        k: Maximum number of results to show. Values larger than the number
            of indexed vectors are reduced to that number.

    Returns:
        None. Results are written to stdout.
    """
    print(f"\nQuery: {query}")

    # FAISS pads missing neighbours with label -1, which would silently index
    # the *last* chunk via metadata_store[-1]; never ask for more than exists.
    k = min(k, index.ntotal)
    if k < 1:
        print("\nNo results: the index is empty or k is less than 1.")
        return

    query_embedding = model.encode([query])
    query_embedding = np.array(query_embedding).astype("float32")

    # Normalize query embedding so the inner product is a cosine similarity
    faiss.normalize_L2(query_embedding)

    distances, indices = index.search(query_embedding, k)

    print("\nTop Results:\n")

    for rank, (score, idx) in enumerate(zip(distances[0], indices[0])):
        # Defensive: skip any padding label that still comes back.
        if idx < 0:
            continue

        result = metadata_store[idx]

        print(f"Result {rank+1}")
        print("Chunk ID:", result["chunk_id"])
        print("Document:", result["document_name"])
        print("Page:", result["page_number"])
        print("Chunk Index:", result["chunk_index"])
        print("Cosine Similarity:", score)
        print("Text Preview:", result["text"][:200])
        print("-" * 60)


In [40]:
# Exercise the retriever with a few representative queries.
for sample_query in (
    "machine learning applications",
    "recommendation systems",
    "neural networks",
):
    search(sample_query)



Query: machine learning applications

Top Results:

Result 1
Chunk ID: sample.pdf_page_0_chunk_2
Document: sample.pdf
Page: 0
Chunk Index: 2
Cosine Similarity: 0.63057566
Text Preview: eled data, and reinforcement learning
learns through rewards and penalties. Applications of machine learning include recommendation
systems, fraud detection, natural language processing, computer visi
------------------------------------------------------------
Result 2
Chunk ID: sample.pdf_page_0_chunk_0
Document: sample.pdf
Page: 0
Chunk Index: 0
Cosine Similarity: 0.5897626
Text Preview: Introduction to Machine Learning
Machine learning is a subset of artificial intelligence that focuses on building systems that learn from
data. Instead of being explicitly programmed with rules, machi
------------------------------------------------------------
Result 3
Chunk ID: sample.pdf_page_0_chunk_1
Document: sample.pdf
Page: 0
Chunk Index: 1
Cosine Similarity: 0.50739974
Text Preview: sions based on historical