In [1]:
import faiss
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from typing import List
import os


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the extractable text of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        The text of every page that yields non-empty text, in page order,
        joined with newlines. Pages for which extraction returns an empty
        or falsy result are skipped.
    """
    reader = PdfReader(pdf_path)
    page_texts = []
    for page in reader.pages:
        # Extract once and reuse the result; calling extract_text() in both the
        # condition and the append would parse every non-empty page twice.
        page_text = page.extract_text()
        if page_text:
            page_texts.append(page_text)
    return "\n".join(page_texts)


In [3]:
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: Text to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in order, each with its words joined by single spaces.
        An empty list when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A step of zero or less would never advance through the words.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # This chunk already reaches the last word; another iteration
            # would only repeat overlap words that are already covered.
            break
    return chunks


In [4]:
def create_faiss_index(chunks: List[str], model_name: str = "all-MiniLM-L6-v2"):
    """Embed text chunks and load them into a flat L2 FAISS index.

    Args:
        chunks: Texts to embed; the vector for ``chunks[i]`` is added to the
            index at position ``i``.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        A tuple ``(index, embeddings)``: the populated ``faiss.IndexFlatL2``
        and the float32 embedding matrix that was added to it.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Fail early, before loading the model, with a clear message.
    if len(chunks) == 0:
        raise ValueError("chunks must contain at least one text chunk to index")

    model = SentenceTransformer(model_name)
    embeddings = model.encode(
        chunks,
        show_progress_bar=True,
        convert_to_numpy=True,
    )
    # FAISS expects a C-contiguous float32 matrix; normalise the array so the
    # index does not depend on the encoder's output dtype or memory layout.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    return index, embeddings


In [5]:
def save_faiss_index(index, chunks, index_path="faiss_index"):
    """Write a FAISS index and its text chunks into a directory.

    Creates ``index_path`` if needed, then writes ``index.faiss`` (via
    ``faiss.write_index``) and ``chunks.txt`` with one chunk per line, in the
    order given.

    Args:
        index: FAISS index to serialise.
        chunks: Iterable of chunk strings; line ``i`` of ``chunks.txt`` holds
            ``chunks[i]``.
        index_path: Target directory.
    """
    os.makedirs(index_path, exist_ok=True)

    faiss.write_index(index, os.path.join(index_path, "index.faiss"))

    with open(os.path.join(index_path, "chunks.txt"), "w", encoding="utf-8") as f:
        for chunk in chunks:
            # Each chunk must occupy exactly one line so line numbers stay
            # aligned with the index positions. A text-mode reader treats
            # "\r" and "\r\n" as line breaks too, so flatten those as well.
            single_line = chunk.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
            f.write(single_line + "\n")


In [6]:

# Build pipeline: PDF -> raw text -> word chunks -> embeddings in a FAISS
# index -> files on disk.
pdf_path = "pdf.pdf"

text = extract_text_from_pdf(pdf_path)
chunks = chunk_text(text)
index, embeddings = create_faiss_index(chunks)

# Persist the index and the chunk texts together so they can be reloaded
# without re-embedding.
save_faiss_index(index, chunks)

print("FAISS index created and saved successfully.")


Batches: 100%|██████████| 8/8 [00:13<00:00,  1.69s/it]

FAISS index created and saved successfully.





In [7]:
def search_faiss(query: str, index, chunks, model, top_k=5):
    """Return the text chunks whose embeddings are nearest to a query.

    Args:
        query: Query text; it is embedded with ``model.encode``.
        index: Index object exposing ``search(vectors, k)``.
        chunks: Sequence of chunk texts, positioned to match the index ids.
        model: Encoder exposing ``encode(list_of_str, convert_to_numpy=True)``.
        top_k: Number of neighbours requested from the index.

    Returns:
        Up to ``top_k`` chunk strings in the order the index returned them.
        Ids that do not refer to a position in ``chunks`` are dropped.
    """
    query_embedding = model.encode([query], convert_to_numpy=True)
    _distances, indices = index.search(query_embedding, top_k)

    results = []
    for idx in indices[0]:
        position = int(idx)
        # FAISS fills unused result slots with -1 when it has fewer than
        # top_k vectors; as a Python index that would wrap around to the last
        # chunk. Ids beyond the chunk list are skipped for the same reason.
        if 0 <= position < len(chunks):
            results.append(chunks[position])

    return results


In [10]:
# Encoder used to embed queries.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Restore the index that was written to disk.
index = faiss.read_index("faiss_index/index.faiss")

# Read the chunk texts back, one per line, with surrounding whitespace removed.
with open("faiss_index/chunks.txt", "r", encoding="utf-8") as f:
    chunks = list(map(str.strip, f))


In [11]:
query = "How does FAISS help in semantic search?"

# Ask the index for the three nearest chunks to the query.
results = search_faiss(query=query, index=index, chunks=chunks, model=model, top_k=3)

# Show the hits numbered from 1.
for rank, passage in enumerate(results, start=1):
    print(f"\nResult {rank}:\n{passage}")



Result 1:
speaking, for stainers with cervical exposure, a dentifrice with a safety index as high as is consistent with prevention of stain accumulation is desirable. Reliable current information on denti- frice abrasion, determined by testing whole denti- frices on dentin, should be available (Kitchen and Robinson 1948 ). Later efforts, primarily in the 1950s and 1960s, led to the development of formal abrasiv- ity testing procedures, and these served as the basis for standards currently recommended by the American Dental Association (ADA) and the International Standards Organization (ISO). Given the growing consumer expectation for multi-beneﬁ t products, dentifrice formulations need to deliver a level of abrasiveness sufﬁ cient to control staining and plaque buildup, without risking the use of overly aggressive abrasive sys- tems that could be deleterious to hard tissues after long-term use. Some researchers have pro- posed the use of a Cleaning Efﬁ ciency Index as one way to asses

In [12]:
import google.generativeai as genai

# Take the key from the environment so the secret is never stored in the
# notebook or its saved outputs. Any key that has appeared in plain text in a
# shared file should be revoked and reissued.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")
genai.configure(api_key=api_key)

# The loop variable is `model_info`, not `model`, so it does not overwrite the
# SentenceTransformer held in `model` that search_faiss calls depend on.
for model_info in genai.list_models():
    # Only print models supporting content generation
    if "generateContent" in getattr(model_info, "supported_generation_methods", []):
        print(model_info.name)


models/gemini-2.5-flash
models/gemini-2.5-pro
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-exp-1206
models/gemini-2.5-flash-preview-tts
models/gemini-2.5-pro-preview-tts
models/gemma-3-1b-it
models/gemma-3-4b-it
models/gemma-3-12b-it
models/gemma-3-27b-it
models/gemma-3n-e4b-it
models/gemma-3n-e2b-it
models/gemini-flash-latest
models/gemini-flash-lite-latest
models/gemini-pro-latest
models/gemini-2.5-flash-lite
models/gemini-2.5-flash-image-preview
models/gemini-2.5-flash-image
models/gemini-2.5-flash-preview-09-2025
models/gemini-2.5-flash-lite-preview-09-2025
models/gemini-3-pro-preview
models/gemini-3-pro-image-preview
models/nano-banana-pro-preview
models/gemini-robotics-er-1.5-preview
models/gemini-2.5-computer-use-preview-10-2025
models/deep-resear

In [13]:
# Use one of the model names that supports generateContent,
# here models/gemini-2.5-flash.
model_gemini = genai.GenerativeModel("models/gemini-2.5-flash")

# Send a single prompt and print the text of the reply.
response = model_gemini.generate_content("Explain FAISS vector search in one paragraph")
print(response.text)


FAISS (Facebook AI Similarity Search) is an open-source library designed for efficient similarity search and clustering of dense vectors. It tackles the challenge of finding the *k* nearest neighbors to a query vector within massive datasets (millions or billions) of high-dimensional data, where brute-force comparison is computationally infeasible. Instead, FAISS offers a rich collection of optimized approximate nearest neighbor (ANN) algorithms that employ techniques like vector quantization, inverted file indexing, and graph-based approaches to significantly speed up the search while maintaining acceptable accuracy, often leveraging GPU acceleration for even greater performance.
