<a href="https://colab.research.google.com/github/Sidharth-2592/LLM_RAG_Q-A/blob/main/LLM_RAG_Q%26A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# prompt: install faiss and pypdf2

!pip install faiss-cpu
!pip install pypdf2
!pip install transformers datasets sentence-transformers


Collecting faiss-cpu
  Using cached faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Using cached faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
Collecting pypdf2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: pypdf2
Successfully installed pypdf2-3.0.1


In [1]:
# prompt: mout drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import os
import re
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
import faiss
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import PyPDF2
from google.colab import files
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
from tqdm.notebook import tqdm
import io

In [6]:
# Download additional NLTK data
nltk.download('punkt_tab')
!pip install -q PyPDF2 faiss-cpu sentence-transformers transformers ipywidgets tqdm
nltk.download('punkt', quiet=True)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h

True

In [7]:

class PDFQuestionAnswerer:
    def __init__(self):
        """Initialize the PDF QA system"""
        print("Loading models...")
        # Initialize the embedding model
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Initialize the LLM for answer generation
        # self.model_name = "google/flan-t5-base"
        self.model_name="google/flan-t5-large"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)

        # Initialize document storage
        self.document_chunks = []
        self.document_sources = []
        self.document_embeddings = None
        self.faiss_index = None

        print("System ready for PDF upload.")

    def extract_text_from_pdf(self, pdf_bytes):
        """Extract text from a PDF file given its bytes"""
        text = ""
        try:
            pdf_file = io.BytesIO(pdf_bytes)
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            for page_num in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[page_num]
                page_text = page.extract_text()
                if page_text:
                    text += f"[Page {page_num+1}] " + page_text + "\n\n"
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
        return text

    def clean_text(self, text):
        """Clean extracted text by removing extra whitespace and unwanted characters"""
        text = re.sub(r'\s+', ' ', text).strip()
        text = re.sub(r'[^\w\s.,:;?!()-]', '', text)
        return text

    def process_pdf(self, pdf_bytes, filename, chunk_size=3):
        """Process a PDF file and prepare it for querying"""
        text = self.extract_text_from_pdf(pdf_bytes)
        if not text.strip():
            return "No text could be extracted from the PDF."

        text = self.clean_text(text)
        sentences = sent_tokenize(text)
        chunks = []
        sources = []
        for i in range(0, len(sentences), chunk_size):
            chunk = " ".join(sentences[i:i+chunk_size])
            chunks.append(chunk)
            page_match = re.search(r'\[Page (\d+)\]', chunk)
            page_info = f" (Page {page_match.group(1)})" if page_match else ""
            sources.append(f"{filename}{page_info}")

        self.document_chunks = chunks
        self.document_sources = sources

        print(f"Creating embeddings for {len(chunks)} chunks...")
        self.document_embeddings = self.embedding_model.encode(chunks, show_progress_bar=True)

        dimension = self.document_embeddings.shape[1]
        self.faiss_index = faiss.IndexFlatL2(dimension)
        self.faiss_index.add(self.document_embeddings.astype('float32'))

        return f"PDF processed successfully. {len(chunks)} chunks created. Ready for questions."

    def answer_question(self, question, top_k=5):
        """Answer a question based on the processed PDF"""
        if not self.document_chunks or not self.faiss_index:
            return "Please upload and process a PDF document first.", []

        question_embedding = self.embedding_model.encode([question])[0].reshape(1, -1)
        distances, indices = self.faiss_index.search(question_embedding.astype('float32'), top_k)
        relevant_chunks = [self.document_chunks[idx] for idx in indices[0]]
        sources = [self.document_sources[idx] for idx in indices[0]]

        context = " ".join(relevant_chunks)
        prompt = (
            f"Answer the question based on the following context. If the context doesn't contain "
            f"relevant information, say 'I don't have enough information to answer this question.'\n\n"
            f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
        )

        inputs = self.tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
        with torch.no_grad():
            outputs = self.model.generate(
                inputs.input_ids,
                max_length=1024,
                num_beams=4,
                early_stopping=True
            )
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        unique_sources = []
        for src in sources:
            if src not in unique_sources:
                unique_sources.append(src)
        source_refs = [f"{i+1}. {src}" for i, src in enumerate(unique_sources)]

        return answer, source_refs

def upload_and_process_pdf():
    qa_system = PDFQuestionAnswerer()

    print("Please upload your PDF file...")
    uploaded = files.upload()

    if not uploaded:
        print("No file was uploaded.")
        return

    filename = list(uploaded.keys())[0]
    pdf_bytes = uploaded[filename]

    print(f"Processing {filename}...")
    result = qa_system.process_pdf(pdf_bytes, filename)
    print(result)

    if "successfully" in result:
        def ask_question():
            while True:
                question = input("\nEnter your question (or type 'exit' to quit): ")
                if question.lower() == 'exit':
                    break

                print("\nGenerating answer...")
                answer, sources = qa_system.answer_question(question)

                print(f"\nQuestion: {question}")
                print(f"\nAnswer: {answer}\n")
                print("Sources:")
                for source in sources:
                    print(source)
        ask_question()


In [None]:
upload_and_process_pdf()

Loading models...
System ready for PDF upload.
Please upload your PDF file...


Saving ADEARUnravelingtheMystery12-033.pdf to ADEARUnravelingtheMystery12-033 (1).pdf
Processing ADEARUnravelingtheMystery12-033 (1).pdf...
Creating embeddings for 725 chunks...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

PDF processed successfully. 725 chunks created. Ready for questions.

Enter your question (or type 'exit' to quit): what are the different kinds of alzheimer's disease?

Generating answer...

Question: what are the different kinds of alzheimer's disease?

Answer: It's impossible to say

Sources:
1. ADEARUnravelingtheMystery12-033 (1).pdf

Enter your question (or type 'exit' to quit): What is the main topic of this pdf?

Generating answer...

Question: What is the main topic of this pdf?

Answer: A walking tourthrough the brain

Sources:
1. ADEARUnravelingtheMystery12-033 (1).pdf

Enter your question (or type 'exit' to quit): What is Alzheimer's disease?

Generating answer...

Question: What is Alzheimer's disease?

Answer: irreversible, progressive brain disease

Sources:
1. ADEARUnravelingtheMystery12-033 (1).pdf
