# Building Retrieval Augmented Generation (RAG)

RAG combines two steps:

- **Retrieval**: finding the information most relevant to a given query or prompt in a large corpus of text or documents.
- **Generation**: producing a new text response with a language model, incorporating the retrieved information to make the response more informative and accurate.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
!pip install gradio



In [3]:
!pip install torch transformers sentence-transformers chromadb pymupdf




In [4]:
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U bitsandbytes

In [5]:
!pip install langchain



In [6]:
!pip install sentence_transformers



In [7]:
# !shell
# !huggingface-cli login

In [8]:
# !git config --global credential.helper store


In [13]:
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from chromadb import Client
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import gradio as gr

# Sentence-embedding model; process_pdf uses it for both the document chunks and the question
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Lightweight question-answering checkpoint; the tokenizer and the model are loaded from the same name
QA_CHECKPOINT = "distilbert-base-uncased-distilled-squad"
qa_tokenizer = AutoTokenizer.from_pretrained(QA_CHECKPOINT)
qa_model = AutoModelForQuestionAnswering.from_pretrained(QA_CHECKPOINT)


def load_pdf_content(pdf_path):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list with one text string per page, in page order. If the file
        cannot be opened or read, the error is printed and an empty list is
        returned instead of raising.
    """
    try:
        # The context manager closes the document (and its underlying file
        # handle) even when text extraction fails part-way through.
        with fitz.open(pdf_path) as document:
            return [page.get_text("text") for page in document]
    except Exception as e:
        print(f"Error loading PDF: {e}")
        return []

def process_pdf(pdf_path, question, n_results=5):
    """Index a PDF in ChromaDB and retrieve the chunks most similar to a question.

    The PDF text is split into overlapping chunks, every chunk is embedded
    with ``embedding_model``, the chunks are stored in a rebuilt ChromaDB
    collection, and the collection is queried with the embedded question.

    Args:
        pdf_path: Path of the PDF file to read.
        question: Question text used as the similarity query.
        n_results: Maximum number of chunks to retrieve. It is capped at the
            number of chunks actually stored, so short documents do not ask
            the index for more entries than it holds.

    Returns:
        The ChromaDB query result on success, or a message string starting
        with "Error" when any step fails. Exceptions derived from
        ``Exception`` are converted to such a message rather than raised.
    """
    try:
        docs = load_pdf_content(pdf_path)

        if not docs:
            return "Error: No content found in the PDF."

        # Split the full document text into overlapping chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.split_text(" ".join(docs))

        if not splits:
            return "Error: Unable to split the document text."

        # One embedding per chunk
        embeddings = embedding_model.encode(splits)

        # ChromaDB interactions
        client = Client()
        collection_name = "my_collection"

        # Start from an empty collection so the ids "0".."n-1" added below do
        # not clash with chunks stored by an earlier call. A failed delete
        # (presumably because the collection does not exist yet) is only
        # reported, not treated as fatal.
        try:
            client.delete_collection(name=collection_name)
        except Exception as e:
            print(f"Error deleting collection: {e}")

        collection = client.get_or_create_collection(name=collection_name)

        # Chunk position doubles as its unique id
        ids = [str(i) for i in range(len(splits))]
        collection.add(documents=splits, embeddings=embeddings, ids=ids)

        # Never request more neighbours than there are stored chunks
        query_embedding = embedding_model.encode([question])
        results = collection.query(
            query_embeddings=query_embedding,
            n_results=min(n_results, len(splits)),
        )

        if not results or 'documents' not in results:
            return "Error: No relevant information found."

        return results

    except Exception as e:
        return f"Error processing PDF: {e}"

def format_docs(docs):
    """Flatten the nested ``docs['documents']`` lists into one flat list.

    Returns the flat list of retrieved chunks (not a joined string), or an
    "Error formatting documents: ..." message string if flattening fails.
    """
    flattened = []
    try:
        for group in docs['documents']:
            flattened.extend(group)
    except Exception as err:
        return f"Error formatting documents: {err}"
    return flattened

def generate_answer(context_chunks, question):
    """Pick the best extractive answer to a question across several text chunks.

    Each chunk is run through the question-answering model together with the
    question; the answer span is taken from the highest start logit to the
    highest end logit. The non-empty answer whose chunk produced the highest
    start logit wins.

    Args:
        context_chunks: Iterable of text chunks to search for an answer.
        question: The question to answer.

    Returns:
        A ``(answer, confidence)`` tuple. ``confidence`` is the raw maximum
        start logit of the winning chunk (not a probability). When no chunk
        yields a non-empty answer the result is ``("", -inf)``.
    """
    best_answer = ""
    best_confidence = -float('inf')  # Any real score beats this

    # Inference only: without autograd no gradient graph is built per chunk.
    with torch.no_grad():
        for context in context_chunks:
            # Calling the tokenizer directly replaces the legacy encode_plus()
            inputs = qa_tokenizer(question, context, return_tensors="pt", truncation=True)
            input_ids = inputs["input_ids"][0].tolist()

            outputs = qa_model(**inputs)
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits

            # Most likely start and (exclusive) end token positions. If the
            # end falls before the start the slice is empty and the chunk is
            # skipped by the non-empty check below.
            answer_start = int(torch.argmax(start_logits))
            answer_end = int(torch.argmax(end_logits)) + 1

            answer_tokens = qa_tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
            answer = qa_tokenizer.convert_tokens_to_string(answer_tokens)
            confidence_score = torch.max(start_logits).item()

            # Keep the highest-scoring non-empty answer seen so far
            if confidence_score > best_confidence and answer.strip() != "":
                best_confidence = confidence_score
                best_answer = answer

    return best_answer, best_confidence

def answer_from_pdf(pdf_file, question, progress=gr.Progress()):
    """Answer a question about an uploaded PDF; this is the Gradio callback.

    Args:
        pdf_file: Path of the uploaded PDF, passed on to ``process_pdf``.
        question: The user's question.
        progress: Gradio progress tracker, called with a fraction and a
            status message at each stage. The ``gr.Progress()`` default is
            kept in the signature on purpose so Gradio can hook into it.

    Returns:
        A display string: the answer with its confidence score, an
        "Error..." message, or "No relevant answer found.". Exceptions
        derived from ``Exception`` are reported as text rather than raised.
    """
    try:
        # Fail fast with a clear message instead of indexing for nothing
        if not pdf_file:
            return "Error: Please upload a PDF file."
        if not question or not question.strip():
            return "Error: Please enter a question."

        progress(0.1, "Loading PDF content...")
        results = process_pdf(pdf_file, question)
        progress(0.5, "Retrieving relevant information...")

        # process_pdf signals failure by returning a message string instead of
        # a query result; check the type rather than searching for "Error",
        # which on a dict would only test its keys.
        if isinstance(results, str):
            return results

        # format_docs likewise returns a string only when it failed
        context_chunks = format_docs(results)
        if isinstance(context_chunks, str):
            return context_chunks

        progress(0.8, "Generating answer...")

        answer, confidence_score = generate_answer(context_chunks, question)
        progress(1.0, "Done.")

        if answer:
            return f"Answer: {answer}\n\nConfidence: {confidence_score}"
        return "No relevant answer found."

    except Exception as e:
        return f"Error: {e}"

# Build the Gradio UI: a PDF upload and a question box feed answer_from_pdf, which returns plain text
interface = gr.Interface(
    title="Question Answering with PDF Retrieval",
    description="Ask a question about the uploaded PDF and get an answer generated from relevant sections.",
    fn=answer_from_pdf,
    inputs=[
        gr.File(type="filepath", label="Upload PDF"),  # file input configured with type 'filepath'
        gr.Textbox(label="Your Question"),
    ],
    outputs="text",
)

# Start the app
interface.launch()




Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://56d0ef67da2bbe86d1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


