In [2]:
# Cell 2
from pypdf import PdfReader

def load_pdf(file_path):
    """
    Loads text content from a PDF file.
    """
    try:
        reader = PdfReader(file_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text() + "\n"
        print(f"Loaded PDF from: {file_path}")
        print(f"Total characters loaded: {len(text)}")
        return text
    except Exception as e:
        print(f"Error loading PDF: {e}")
        return None

# Replace 'sample.pdf' with the actual path to your PDF file
pdf_path = 'note.pdf' # Make sure you have a sample.pdf in your directory or provide full path
pdf_text = load_pdf(pdf_path)

if pdf_text:
    print("\nFirst 500 characters of the loaded PDF text:")
    print(pdf_text[:500])
else:
    print("PDF loading failed. Please check the file path and ensure the PDF is valid.")

Ignoring wrong pointing object 98 0 (offset 0)


Loaded PDF from: note.pdf
Total characters loaded: 3490

First 500 characters of the loaded PDF text:
 
SHRIJITH  S  MENON  shrijithsmenon@gmail.com
 
|
 +91-8891874928  |  linkedin.com/in/shrijithsm  |  github.com/shrijithsm  |  shrijithsm.tech  
 
                                     
 
Skills 
 
Python  |  Problem  Solving  |  OOPs  |  Machine  Learning  |  NumPy  |  Pandas  |  MongoDB  |  MySQL|  SQLite  |  HTML/CSS  |  
Java
 
|
 
Git
 
|
 
Linux
 
  
Education 
 
Jain  (Deemed-to-be-university),  Bengaluru,  India       
   
 
 
                        2023  -  2027 ●  B.Tech  Computer  Sc


In [3]:
# Cell 3
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_text_into_chunks(text, chunk_size=1000, chunk_overlap=200):
    """
    Splits a given text into smaller chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.create_documents([text])
    print(f"\nText split into {len(chunks)} chunks.")
    print(f"Example chunk (first 200 chars):")
    print(chunks[0].page_content[:200])
    return chunks

if pdf_text:
    text_chunks = split_text_into_chunks(pdf_text)
else:
    text_chunks = []
    print("Cannot split text because PDF loading failed.")


Text split into 5 chunks.
Example chunk (first 200 chars):
SHRIJITH  S  MENON  shrijithsmenon@gmail.com
 
|
 +91-8891874928  |  linkedin.com/in/shrijithsm  |  github.com/shrijithsm  |  shrijithsm.tech  
 
                                     
 
Skills 
 
Pyth


In [4]:
# Cell 4
from sentence_transformers import SentenceTransformer
import torch

def get_embeddings_model():
    """
    Loads a pre-trained Sentence Transformer model for embeddings.
    """
    # Using 'all-MiniLM-L6-v2' as it's good for semantic similarity and relatively small.
    # For better performance, consider 'all-MiniLM-L12-v2' or 'multi-qa-mpnet-base-dot-v1'.
    model_name = 'all-MiniLM-L6-v2'
    try:
        model = SentenceTransformer(model_name)
        print(f"SentenceTransformer model '{model_name}' loaded successfully.")
        return model
    except Exception as e:
        print(f"Error loading SentenceTransformer model: {e}")
        print("Please ensure you have an active internet connection to download the model if it's not cached.")
        return None

embedding_model = get_embeddings_model()

if embedding_model and text_chunks:
    print("\nGenerating embeddings for text chunks...")
    # Extract just the page_content from the Document objects
    chunk_contents = [chunk.page_content for chunk in text_chunks]
    chunk_embeddings = embedding_model.encode(chunk_contents, show_progress_bar=True)
    print(f"Generated {len(chunk_embeddings)} embeddings, each with dimension {chunk_embeddings.shape[1]}.")
else:
    print("Cannot generate embeddings: Embedding model or text chunks not available.")

  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


SentenceTransformer model 'all-MiniLM-L6-v2' loaded successfully.

Generating embeddings for text chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.43it/s]

Generated 5 embeddings, each with dimension 384.





In [5]:
# Cell 5
import faiss
import numpy as np
import os

def create_faiss_index(embeddings, texts):
    """
    Creates a FAISS index from embeddings and stores the corresponding texts.
    """
    if embeddings is None or len(embeddings) == 0:
        print("No embeddings to create FAISS index.")
        return None, None

    # Ensure embeddings are float32 as required by FAISS
    embeddings = np.array(embeddings).astype('float32')
    dimension = embeddings.shape[1]

    # Create an IndexFlatL2 index (L2 for Euclidean distance, suitable for many embedding types)
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    print(f"FAISS index created with {index.ntotal} vectors.")
    return index, texts # Store original texts alongside the index

def save_faiss_index(index, texts, index_name="pdf_rag_faiss_index.bin", texts_name="pdf_rag_texts.npy"):
    """
    Saves the FAISS index and associated texts to disk.
    """
    if index is None or texts is None:
        print("Nothing to save: FAISS index or texts are missing.")
        return

    faiss.write_index(index, index_name)
    np.save(texts_name, texts)
    print(f"FAISS index saved to {index_name}")
    print(f"Associated texts saved to {texts_name}")

def load_faiss_index(index_name="pdf_rag_faiss_index.bin", texts_name="pdf_rag_texts.npy"):
    """
    Loads the FAISS index and associated texts from disk.
    """
    if not os.path.exists(index_name) or not os.path.exists(texts_name):
        print(f"FAISS index or texts not found. Please create them first. Missing: {index_name} or {texts_name}")
        return None, None
    try:
        index = faiss.read_index(index_name)
        texts = np.load(texts_name, allow_pickle=True).tolist() # Convert back to list of Documents
        print(f"FAISS index loaded from {index_name}")
        print(f"Associated texts loaded from {texts_name}")
        return index, texts
    except Exception as e:
        print(f"Error loading FAISS index or texts: {e}")
        return None, None

# Try to load existing index, otherwise create and save
faiss_index, indexed_texts = load_faiss_index()

if faiss_index is None:
    if embedding_model and text_chunks and chunk_embeddings is not None:
        faiss_index, indexed_texts = create_faiss_index(chunk_embeddings, text_chunks)
        if faiss_index: # Only save if creation was successful
            save_faiss_index(faiss_index, indexed_texts)
    else:
        print("Cannot create FAISS index: Missing embedding model, text chunks, or chunk embeddings.")

FAISS index or texts not found. Please create them first. Missing: pdf_rag_faiss_index.bin or pdf_rag_texts.npy
FAISS index created with 5 vectors.
FAISS index saved to pdf_rag_faiss_index.bin
Associated texts saved to pdf_rag_texts.npy


In [6]:
# Cell 6
def retrieve_chunks(query, embedding_model, faiss_index, indexed_texts, k=5):
    """
    Retrieves the top-k most similar text chunks to a given query.
    """
    if embedding_model is None or faiss_index is None or indexed_texts is None:
        print("Cannot retrieve chunks: Missing embedding model, FAISS index, or indexed texts.")
        return []

    query_embedding = embedding_model.encode([query]).astype('float32')

    # D, I are distances and indices
    distances, indices = faiss_index.search(query_embedding, k)

    retrieved_chunks = [indexed_texts[i].page_content for i in indices[0]]
    print(f"\nRetrieved {len(retrieved_chunks)} relevant chunks for the query.")
    # print("Retrieved chunks preview:")
    # for i, chunk in enumerate(retrieved_chunks):
    #     print(f"Chunk {i+1}:\n{chunk[:200]}...\n") # Print first 200 chars of each chunk
    return retrieved_chunks

# Example Retrieval:
# if embedding_model and faiss_index and indexed_texts:
#     sample_query = "What is Retrieval-Augmented Generation?"
#     retrieved_content = retrieve_chunks(sample_query, embedding_model, faiss_index, indexed_texts, k=3)
#     if retrieved_content:
#         print("\nCombined retrieved content for LLM:")
#         print("\n--- NEW CHUNK ---\n".join(retrieved_content))
# else:
#     print("Skipping example retrieval: Prerequisites not met.")

In [8]:
# Cell 7
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Initialize the LLM
llm_model_name = "google/flan-t5-small" # A relatively small model for local inference
# You might try 'google/flan-t5-base' or 'MBZUAI/LaMini-Flan-T5-77M' for slightly better performance if 'small' is too weak.
# For more advanced usage, consider quantized models or local models via ctransformers if performance is an issue.

try:
    print(f"\nLoading local LLM: {llm_model_name}...")
    # Using 'cuda' if GPU is available, otherwise 'cpu'
    device = 0 if torch.cuda.is_available() else -1
    print(f"Using device: {'GPU' if device == 0 else 'CPU'}")

    # For Flan-T5, AutoModelForSeq2SeqLM is appropriate
    tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name,
                                                  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)

    # Use pipeline for ease of generation
    generator = pipeline(
        "text2text-generation", # For Flan-T5 models, this is common
        model=model,
        tokenizer=tokenizer,
        device=device,
        max_new_tokens=200, # Max length of the generated answer
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95
    )
    print("Local LLM loaded successfully.")
except Exception as e:
    generator = None
    print(f"Error loading local LLM: {e}")
    print("Ensure you have enough memory (RAM/VRAM) and a stable internet connection for initial download.")
    print("If you encounter OOM errors, try a smaller model or run on CPU if GPU memory is insufficient.")

def generate_answer(query, retrieved_chunks, llm_generator):
    """
    Generates an answer using the local LLM, conditioned on retrieved chunks.
    """
    if llm_generator is None:
        print("LLM generator not available. Cannot generate answer.")
        return "Error: Language model not loaded."

    context = "\n".join(retrieved_chunks)

    # Construct the prompt for the LLM
    # The prompt engineering here is crucial. Experiment with different formats.
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
    # print(f"\n--- LLM Prompt --- \n{prompt[:1000]}...\n--- End Prompt ---\n") # Print a snippet of the prompt

    try:
        response = llm_generator(prompt)
        # The output format depends on the pipeline and model. For text2text-generation, it's usually a list of dicts.
        answer = response[0]['generated_text']
        return answer
    except Exception as e:
        print(f"Error generating answer with LLM: {e}")
        return "An error occurred while generating the answer."


Loading local LLM: google/flan-t5-small...
Using device: CPU


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu


Local LLM loaded successfully.


In [9]:
# Cell 8
def chat_with_pdf(query, embedding_model, faiss_index, indexed_texts, llm_generator, num_retrieved_chunks=3):
    """
    Main function to answer a query about the PDF.
    """
    if not (embedding_model and faiss_index and indexed_texts and llm_generator):
        return "Chatbot not fully initialized. Please ensure all components are loaded."

    print(f"\nUser Query: {query}")

    # 1. Retrieve relevant chunks
    retrieved_content = retrieve_chunks(query, embedding_model, faiss_index, indexed_texts, k=num_retrieved_chunks)

    if not retrieved_content:
        return "Could not retrieve relevant information from the PDF for your query."

    # 2. Generate answer using the LLM
    answer = generate_answer(query, retrieved_content, llm_generator)

    return answer

# Test the full chatbot
if __name__ == "__main__":
    # Ensure all components are loaded from previous cells
    if not (embedding_model and faiss_index and indexed_texts and generator):
        print("One or more required components (embedding_model, faiss_index, indexed_texts, generator) are missing or failed to load. Please re-run previous cells.")
    else:
        print("\n--- Starting Chatbot Test ---")
        while True:
            user_question = input("Ask a question about the PDF (type 'quit' to exit): ")
            if user_question.lower() == 'quit':
                break
            response = chat_with_pdf(user_question, embedding_model, faiss_index, indexed_texts, generator)
            print(f"\nChatbot: {response}")
            print("-" * 50)


--- Starting Chatbot Test ---

User Query: what is the users name

Retrieved 3 relevant chunks for the query.

Chatbot: Jupyter Notebooks
--------------------------------------------------


Token indices sequence length is longer than the specified maximum sequence length for this model (577 > 512). Running this sequence through the model will result in indexing errors



User Query: hello

Retrieved 3 relevant chunks for the query.

Chatbot: i m in this class i am in this class
--------------------------------------------------

User Query: what are the skills of the person

Retrieved 3 relevant chunks for the query.

Chatbot: demonstrating strong problem-solving and teamwork skills
--------------------------------------------------

User Query: what is the email id

Retrieved 3 relevant chunks for the query.

Chatbot: cnte
--------------------------------------------------

User Query: what is the name of the user

Retrieved 3 relevant chunks for the query.

Chatbot: shrijithsmenon@gmail.com
--------------------------------------------------

User Query: what is the phone number

Retrieved 3 relevant chunks for the query.

Chatbot: 437
--------------------------------------------------

User Query: 

Retrieved 3 relevant chunks for the query.

Chatbot: What were the best features of the product?
--------------------------------------------------
