In [3]:
pip install PyPDF2 faiss-cpu sentence-transformers transformers langchain


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [8]:
import os
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

# STEP 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text
    return text

# STEP 2: Split Text into Chunks
def split_text_into_chunks(text, chunk_size=500, chunk_overlap=100):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_text(text)
    print(f"Number of chunks: {len(chunks)}")
    return chunks

# STEP 3: Generate Embeddings and Store in FAISS
def embed_and_store(chunks, model_name):
    print("Generating embeddings...")
    model = SentenceTransformer(model_name)
    embeddings = model.encode(chunks, show_progress_bar=True)
    
    # Build FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings).astype('float32'))
    print(f"FAISS index size: {index.ntotal}")
    return index, model

# STEP 4: Search FAISS for Relevant Chunks
def search_similar_chunks(query, chunks, index, model, top_k=3):
    query_embedding = model.encode([query]).astype('float32')
    distances, indices = index.search(query_embedding, top_k)
    
    print(f"Search Results - Distances: {distances}, Indices: {indices}")
    results = [chunks[i] for i in indices[0] if i < len(chunks)]
    return results

# STEP 5: Use Hugging Face Model for Final Response Generation
def generate_response(context_chunks, query, hf_model_name="distilgpt2"):
    context = "\n".join(context_chunks)
    input_text = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
    print("Generating response...")

    hf_pipeline = pipeline("text-generation", model=hf_model_name, max_new_tokens=200)
    response = hf_pipeline(input_text)[0]['generated_text']
    return response

# MAIN PIPELINE
def main(pdf_path, user_query, embedding_model_name="all-MiniLM-L6-v2", hf_model_name="distilgpt2"):
    # Step 1: Extract text
    print("Extracting text from PDF...")
    pdf_text = extract_text_from_pdf(pdf_path)
    
    # Step 2: Split text
    print("Splitting text into chunks...")
    chunks = split_text_into_chunks(pdf_text)
    if not chunks:
        print("No text chunks found. Exiting.")
        return
    
    # Step 3: Embed and store in FAISS
    print("Embedding and storing chunks...")
    index, model = embed_and_store(chunks, embedding_model_name)
    
    # Step 4: Search for relevant chunks
    print("Searching relevant chunks...")
    similar_chunks = search_similar_chunks(user_query, chunks, index, model)
    if not similar_chunks:
        print("No relevant chunks found. Exiting.")
        return
    
    # Step 5: Generate response
    print("Generating response...")
    response = generate_response(similar_chunks, user_query, hf_model_name)
    print("\nFinal Response:")
    print(response)

# RUN THE PIPELINE
if __name__ == "__main__":
    pdf_path = "sample.pdf"  # Your uploaded file path
    user_query = "How does the Transformer compare to RNNs and LSTMs?"  # Example query
    main(pdf_path, user_query)


Extracting text from PDF...
Splitting text into chunks...
Number of chunks: 101
Embedding and storing chunks...
Generating embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

FAISS index size: 101
Searching relevant chunks...
Search Results - Distances: [[0.9001344  0.99176013 1.0147076 ]], Indices: [[16  4 68]]
Generating response...
Generating response...


Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Final Response:
Context: To the best of our knowledge, however, the Transformer is the first transduction model relying
entirely on self-attention to compute representations of its input and output without using sequence-
aligned RNNs or convolution. In the following sections, we will describe the Transformer, motivate
self-attention and discuss its advantages over models such as [17, 18] and [9].
3 Model Architecture
Most competitive neural sequence transduction models have an encoder-decoder structure [ 5,2,35].
best models from the literature. We show that the Transformer generalizes well to
other tasks by applying it successfully to English constituency parsing both with
large and limited training data.
∗Equal contribution. Listing order is random. Jakob proposed replacing RNNs with self-attention and started
the effort to evaluate this idea. Ashish, with Illia, designed and implemented the first Transformer models and
constraints and is significantly longer than the input. Furthe