<a href="https://colab.research.google.com/github/Tanmay2008/Gen-AI/blob/main/QNA_Application.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
#Text Extraction (text_extraction.py):
import pdfplumber

def extract_text_from_pdf(pdf_path, start_page, end_page):
    """
    Extracts text from a given range of pages in a PDF file.

    Args:
        pdf_path (str): Path to the PDF file.
        start_page (int): Start page number.
        end_page (int): End page number.

    Returns:
        List[str]: List of text chunks from the PDF.
    """
    text_chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for i in range(start_page, end_page):
            page = pdf.pages[i]
            text_chunks.append(page.extract_text())
    return text_chunks

In [18]:
#Vectorizer (vectorizer.py):
from sentence_transformers import SentenceTransformer

def vectorize_text(text_chunks):
    """
    Converts text chunks into vectors using a pre-trained model.

    Args:
        text_chunks (List[str]): List of text chunks.

    Returns:
        List[np.array]: List of vectors corresponding to text chunks.
    """
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    embeddings = model.encode(text_chunks)
    return embeddings

  from tqdm.autonotebook import tqdm, trange


In [19]:
 #Retriever (retriever.py):
import faiss
import numpy as np

class Retriever:
    def __init__(self, embeddings, text_chunks):
        """
        Initializes the retriever with FAISS and text chunks.

        Args:
            embeddings (np.array): Array of embeddings.
            text_chunks (List[str]): List of text chunks.
        """
        self.index = faiss.IndexFlatL2(embeddings.shape[1])
        self.index.add(embeddings)
        self.text_chunks = text_chunks

    def retrieve(self, query_vector, top_k=3):
        """
        Retrieves the top-k most similar text chunks to the query.

        Args:
            query_vector (np.array): Vector for the query.
            top_k (int): Number of top results to return.

        Returns:
            List[str]: List of top-k text chunks.
        """
        distances, indices = self.index.search(np.array([query_vector]), top_k)
        return [self.text_chunks[idx] for idx in indices[0]]

In [22]:
#RAG Pipeline (rag_pipeline.py)
from transformers import T5ForConditionalGeneration, T5Tokenizer
#from vectorizer import vectorize_text
#from retriever import Retriever

class RAGPipeline:
    def __init__(self, text_chunks):
        """
        Initializes the RAG pipeline with retriever and generator.

        Args:
            text_chunks (List[str]): List of text chunks to use for retrieval.
        """
        self.tokenizer = T5Tokenizer.from_pretrained('t5-small')
        self.generator = T5ForConditionalGeneration.from_pretrained('t5-small')

        self.embeddings = vectorize_text(text_chunks)
        self.retriever = Retriever(self.embeddings, text_chunks)

    def generate_answer(self, question):
        """
        Generates an answer for the given question using retrieval-augmented generation.

        Args:
            question (str): The input question.

        Returns:
            str: The generated answer.
        """
        query_vector = vectorize_text([question])[0]
        retrieved_chunks = self.retriever.retrieve(query_vector)
        context = " ".join(retrieved_chunks)

        input_text = f"question: {question} context: {context}"
        input_ids = self.tokenizer.encode(input_text, return_tensors='pt')
        output = self.generator.generate(input_ids,max_new_tokens=400)
        return self.tokenizer.decode(output[0], skip_special_tokens=True)

In [24]:
#app.py
import os
def main():
    """
    Main function to execute the RAG pipeline for answering questions from the PDF content.
    """

    # Step 1: Extract Text from the PDF
    pdf_path = "/content/ConceptsofBiology-WEB.pdf"  # Replace with the actual path to the PDF
    start_page = 14  # Define the range of pages to extract (start page of Chapter 1)
    end_page = 16    # Define the range of pages to extract (end page of Chapter 2)

    print("Extracting text from PDF...")
    text_chunks = extract_text_from_pdf(pdf_path, start_page, end_page)

    # Step 2: Vectorize the Extracted Text
    print("Vectorizing text chunks...")
    embeddings = vectorize_text(text_chunks)

    # Step 3: Initialize Retriever
    print("Initializing retriever...")
    retriever = Retriever(embeddings, text_chunks)

    # Step 4: Initialize RAG Pipeline with the text chunks
    print("Initializing RAG pipeline...")
    rag_pipeline = RAGPipeline(text_chunks)

    # Step 5: Input loop for querying
    while True:
        question = input("\nEnter your question (or type 'exit' to quit): ").strip()

        if question.lower() == 'exit':
            print("Exiting the application...")
            break

        # Step 6: Get the generated answer using RAG pipeline
        print("Retrieving and generating answer...")
        answer = rag_pipeline.generate_answer(question)
        print(f"\nGenerated Answer: {answer}")

if __name__ == "__main__":
    main()

Extracting text from PDF...
Vectorizing text chunks...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Initializing retriever...
Initializing RAG pipeline...


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


Enter your question (or type 'exit' to quit): Pedagogical Foundation and Features
Retrieving and generating answer...

Generated Answer: Preface 1 2 Preface Access for free at openstax.org 2 Preface Access for free at openstax.org

Enter your question (or type 'exit' to quit): Concepts of Biology
Retrieving and generating answer...

Generated Answer: Preface 1 2 Preface Access for free at openstax.org 2 Preface Access for free at openstax.org

Enter your question (or type 'exit' to quit): The Cellular Foundation of Life
Retrieving and generating answer...

Generated Answer: Preface 1 2 Preface Access for free at openstax.org 2 Preface Access for free at openstax.org

Enter your question (or type 'exit' to quit): exit
Exiting the application...


In [25]:
import unittest
#from text_extraction import extract_text_from_pdf
#from vectorizer import vectorize_text
#from retriever import Retriever
#from rag_pipeline import RAGPipeline

class TestRAGPipeline(unittest.TestCase):

    def setUp(self):
        """
        Set up initial conditions before each test case.
        """
        # Simulate PDF text data (instead of reading from actual PDF)
        self.sample_text_chunks = [
            "OpenStax is part of Rice University, which is a 501(c)(3)nonprofit charitable corporation",
            "In Concepts of Biology, most art contains attribution to its creator within the caption.",
            "All OpenStax textbooks undergo a rigorous review process."
        ]

        # Simulate embeddings (mock vectors) for the sample chunks (could also mock with actual model)
        self.sample_embeddings = vectorize_text(self.sample_text_chunks)

        # Initialize retriever and RAG pipeline with sample data
        self.retriever = Retriever(self.sample_embeddings, self.sample_text_chunks)
        self.rag_pipeline = RAGPipeline(self.sample_text_chunks)

    def test_text_extraction(self):
        """
        Test text extraction functionality from a PDF.
        """
        # Assuming we have a small sample PDF for testing
        pdf_path = "/content/ConceptsofBiology-WEB.pdf"
        extracted_text = extract_text_from_pdf(pdf_path, 15, 16)
        # Extract from page 15 to 16

        self.assertIsInstance(extracted_text, list)
        self.assertGreater(len(extracted_text), 0)  # Ensure some text is extracted

    def test_vectorization(self):
        """
        Test the vectorization of text chunks.
        """
        sample_text = ["This is a test sentence for vectorization."]
        embeddings = vectorize_text(sample_text)

        self.assertIsInstance(embeddings, list)
        self.assertEqual(len(embeddings), 1)  # One embedding per text chunk
        self.assertEqual(len(embeddings[0]), 768)  # Assuming 768-dimensional embeddings (BERT-like)

    def test_retriever(self):
        """
        Test that the retriever retrieves the correct text based on similarity.
        """
        query = "What is Coverage and Scope?"
        query_vector = vectorize_text([query])[0]

        retrieved_text = self.retriever.retrieve(query_vector, top_k=1)

        self.assertEqual(len(retrieved_text), 1)
        self.assertIn("Concepts of Biology textbook adheres", retrieved_text[0])  # Check if correct chunk is retrieved

    def test_rag_pipeline(self):
        """
        Test that the RAG pipeline generates an answer based on the retrieved context.
        """
        question = "What is Coverage and Scope?"
        generated_answer = self.rag_pipeline.generate_answer(question)

        self.assertIsInstance(generated_answer, str)
        self.assertIn("Concepts of Biology textbook adheres", generated_answer)  # Check that answer contains expected info

if __name__ == '__main__':
    unittest.main()

E
ERROR: /root/ (unittest.loader._FailedTest)
----------------------------------------------------------------------
AttributeError: module '__main__' has no attribute '/root/'

----------------------------------------------------------------------
Ran 1 test in 0.002s

FAILED (errors=1)


SystemExit: True

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
