# PDF Text Embedding, FAISS Search, and Summarization


This notebook demonstrates how to:
1. Extract text from a PDF file.
2. Split text into manageable chunks for embedding.
3. Generate embeddings with SentenceTransformers.
4. Use FAISS for similarity-based text search.
5. Summarize retrieved content using a summarization model.


## Step 1: Extract Text from PDF

In [None]:
from PyPDF2 import PdfReader

def read_pdf(file_path):
    """Extract the text of every page in a PDF and return it as one string.

    Args:
        file_path: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        The text of all pages in document order, with a newline appended
        after each page's text. A page that yields no text contributes only
        its newline.
    """
    reader = PdfReader(file_path)
    # Defensive: if a page yields None instead of a string (reported for
    # some PDFs / library versions), ``or ''`` keeps the concatenation from
    # raising TypeError. Joining once also avoids rebuilding the whole
    # string for every page.
    return "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)

# Location of the PDF to process, relative to the notebook's working directory.
file_path = "documents/LLM.pdf"  # Replace with your PDF file path
# Full text of the PDF as a single string; this is the input to the chunking step.
pdf_content = read_pdf(file_path)

## Step 2: Split Text into Chunks

In [None]:
def split_text_into_chunks(text, chunk_size=300):
    """Split text into chunks of at most ``chunk_size`` whitespace-separated words.

    Args:
        text: The text to split. Any run of whitespace (spaces, newlines,
            tabs) counts as one separator and is not preserved.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of strings, each made of up to ``chunk_size`` words joined by
        single spaces. The last chunk may be shorter. Empty or
        whitespace-only text gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        # A negative step would make range() yield nothing, silently
        # discarding the whole text; fail loudly instead.
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

# Use 64-word chunks here instead of the function's default of 300 words.
chunks = split_text_into_chunks(pdf_content, chunk_size=64)

## Step 3: Embed Text Chunks Using SentenceTransformers

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

def embed_text_chunks(chunks, embedding_model_name="all-MiniLM-L6-v2"):
    """Encode text chunks into embedding vectors with a SentenceTransformer model.

    Args:
        chunks: The texts to embed; passed unchanged to ``encode``.
        embedding_model_name: Name of the SentenceTransformer model to load.

    Returns:
        Whatever ``encode`` returns when called with ``convert_to_numpy=True``.
    """
    # A fresh model instance is constructed on every call.
    encoder = SentenceTransformer(embedding_model_name)
    return encoder.encode(
        chunks,
        show_progress_bar=True,
        convert_to_numpy=True,
    )

# Model name passed explicitly, overriding the function's "all-MiniLM-L6-v2"
# default. The same name is reused later to embed the search query.
embedding_model_name = "all-mpnet-base-v2"
embeddings = embed_text_chunks(chunks, embedding_model_name)

## Step 4: Build a FAISS Index for Embeddings

In [None]:
import faiss

def build_faiss_index(embeddings):
    """Build a flat L2-distance FAISS index over a matrix of embeddings.

    Args:
        embeddings: 2-D array-like with one embedding vector per row.

    Returns:
        A ``faiss.IndexFlatL2`` holding every row of ``embeddings``, added in
        row order.

    Raises:
        ValueError: If ``embeddings`` is not a 2-D array with at least one
            column (for example, the result of embedding an empty list).
    """
    # FAISS operates on C-contiguous float32 data; coerce up front so inputs
    # such as float64 arrays or plain nested lists are accepted as well.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[1] == 0:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dimension); "
            f"got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

# Index over the chunk embeddings; search results from it are later used as
# positions into `chunks`.
faiss_index = build_faiss_index(embeddings)

## Step 5: Query the FAISS Index

In [None]:
query = "Foundation Language Models vs. Fine-Tuned Language Models"  # Your search query
# Embed the query with the same model name used for the chunks so the query
# vector is comparable to the indexed vectors.
query_embedding = SentenceTransformer(embedding_model_name).encode([query], convert_to_numpy=True)

# Retrieve top-3 closest chunks
distances, indices = faiss_index.search(query_embedding, k=3)
# When the index holds fewer than k vectors, FAISS fills the missing result
# slots with -1. Skip those: chunks[-1] would otherwise silently add the last
# chunk (possibly several times) to the results.
response_chunks = '\n'.join([chunks[i] for i in indices[0] if i >= 0])

print(response_chunks)

## Step 6: Summarize Retrieved Chunks

In [None]:
from transformers import pipeline

# Build a summarization pipeline.
# NOTE(review): "models/bart-large-cnn" looks like a local model directory
# rather than a hub identifier; confirm it exists where the notebook runs.
summarize_model = pipeline("summarization", model="models/bart-large-cnn")
# Summarize the retrieved chunks. max_length/min_length bound the generated
# summary's length and do_sample=False turns off sampling during generation.
summary = summarize_model(response_chunks, max_length=100, min_length=30, do_sample=False)
# Prints the pipeline's raw return value, not just the summary text.
print(summary)