In [6]:
# This notebook uses a Mistral LLM and an embedding model from Hugging Face (HF).
# The models are downloaded to the local system, so the machine needs enough RAM (min 16 GB) to run the notebook.
# We also require an access token from HF, and sometimes we need to accept the license agreement on the HF page of the model we are using, e.g. Mistral.
# After accepting the agreement, we can download that model.

In [1]:
import gradio as gr
import os
import re
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_community.llms import HuggingFaceHub
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.output_parsers import StrOutputParser
from markdown import markdown
from transformers import pipeline



In [2]:
# Load API keys from .env file.
# os.getenv returns None when HUGGINGFACE_API_KEY is not set; that value is
# passed straight through to the pipeline below as `token`.
load_dotenv()
hf_api_key = os.getenv("HUGGINGFACE_API_KEY")

# Hugging Face model identifiers used by this notebook.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Small, fast embeddings
LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"

# Embedding model; create_vector_db() hands it to Chroma to embed the PDF chunks.
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# Load the LLM as a local text-generation pipeline and wrap it for LangChain.
# max_new_tokens=200 is the generation limit requested from the pipeline.
# NOTE(review): this loads the full 7B checkpoint in-process (the recorded
# output shows 3 checkpoint shards), so this cell is slow and memory hungry.
llm_pipeline = pipeline("text-generation", model=LLM_MODEL, token=hf_api_key, max_new_tokens=200, use_fast=True)
llm = HuggingFacePipeline(pipeline=llm_pipeline)

# Global state shared by the functions defined in the next cell:
#   VECTOR_DB_NAME - Chroma collection name used by create_vector_db()
#   vector_db      - the active vector store; None until a PDF has been
#                    processed (assigned in create_vector_db, cleared in process_pdf)
VECTOR_DB_NAME = "huggingface-rag"
vector_db = None

  return torch._C._cuda_getDeviceCount() > 0


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Function to load PDF
def load_pdf(file_path):
    """Read the PDF at ``file_path`` through ``PyPDFLoader``.

    Args:
        file_path: Location of the PDF file to read.

    Returns:
        Whatever ``PyPDFLoader(...).load()`` produces for that file.
    """
    return PyPDFLoader(file_path=file_path).load()

# Function to split text
def split_text(data, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        data: Documents to split (as returned by ``load_pdf``).
        chunk_size: ``chunk_size`` forwarded to the splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.

    Returns:
        The result of ``split_documents(data)``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = chunker.split_documents(data)
    return pieces

# Function to create vector database
def create_vector_db(chunks, persist_directory="./chroma_db6"):
    """Build a fresh Chroma collection from ``chunks`` and store it in ``vector_db``.

    ``Chroma.from_documents`` adds documents to an already persisted collection
    of the same name instead of replacing it. Without a reset, chunks from
    earlier uploads (and earlier kernel sessions) stay in the index and leak
    into the answers for the current PDF. The collection is therefore deleted
    before it is rebuilt.

    Args:
        chunks: Split documents to embed and index.
        persist_directory: Directory in which Chroma persists the collection.
            Defaults to the directory this notebook has always used.
            NOTE(review): the original cell notes that ``None`` selects
            in-memory storage - confirm against the Chroma docs before use.

    Returns:
        A status message: an error text when ``chunks`` is empty (``vector_db``
        is left untouched in that case), otherwise a success text.
    """
    global vector_db
    if not chunks:
        return "Error: No text extracted from the PDF."

    # Open the collection stored under this name only to drop it, so the index
    # built below contains the current PDF and nothing else.
    Chroma(
        collection_name=VECTOR_DB_NAME,
        embedding_function=embedding_model,
        persist_directory=persist_directory,
    ).delete_collection()

    vector_db = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_model,
        collection_name=VECTOR_DB_NAME,
        persist_directory=persist_directory,
    )
    return "PDF processed successfully! You can now ask questions."

# Function to set up retriever
def get_retriever():
    """Create a multi-query retriever on top of the current ``vector_db``.

    Returns:
        ``None`` while no vector store is available; otherwise the object built
        by ``MultiQueryRetriever.from_llm`` from the store's retriever, the
        notebook-level ``llm`` and a prompt asking for two alternative
        phrasings of the question.
    """
    if vector_db:
        rephrase_prompt = ChatPromptTemplate.from_template(
            "Generate 2 alternative versions of the question to improve retrieval:\n"
            "        Original: {question}"
        )
        base_retriever = vector_db.as_retriever()
        return MultiQueryRetriever.from_llm(base_retriever, llm, prompt=rephrase_prompt)

    return None

# Function to create RAG chain
def create_rag_chain():
    """Assemble the retrieval-augmented generation chain.

    The chain retrieves chunks for the question, fills them into the answer
    prompt together with the question, runs the notebook-level ``llm`` and
    parses the result to a string.

    Returns:
        ``None`` when ``get_retriever()`` yields nothing (no PDF processed
        yet); otherwise a runnable chain that takes the question string.
    """
    retriever = get_retriever()
    if not retriever:
        return None

    def _format_docs(docs):
        # Feed only the chunk text to the prompt. Passing the retriever output
        # directly would interpolate the Python repr of the Document list
        # (metadata and all) into the context section.
        return "\n\n".join(doc.page_content for doc in docs)

    prompt = ChatPromptTemplate.from_template(
        """You are an AI assistant that answers questions based only on the given context.
        Provide a well-structured, coherent, and concise response.

        ### Context:
        {context}

        ### Question:
        {question}

        ### Answer:
        """
    )

    return (
        {"context": retriever | _format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

# Function to process user query
def process_query(question, chat_history):
    """Answer ``question`` with the RAG chain and append the turn to the chat.

    Args:
        question: Text typed by the user.
        chat_history: List of ``(question, answer_html)`` tuples shown in the
            chatbot. ``None`` is treated as an empty history.

    Returns:
        A ``(textbox_value, chat_history)`` pair matching the Gradio outputs
        ``[user_input, chatbot]``. On success the textbox value is ``""`` and
        the history has one more entry; on failure the textbox value carries
        the error message and the history is unchanged.
    """
    # Tolerate a missing history so the append below cannot fail on None.
    if chat_history is None:
        chat_history = []

    if not vector_db:
        return "Error: No vector database found. Please upload and process a PDF first.", chat_history

    # Nothing to ask: skip the expensive LLM call and do not add an empty turn.
    if not question or not question.strip():
        return "", chat_history

    chain = create_rag_chain()
    if not chain:
        return "Error: Unable to initialize the RAG chain.", chat_history

    response = chain.invoke(question)

    # The response may contain the prompt text as well; keep only what follows
    # the "### Answer:" marker, or the whole response when the marker is absent.
    answer_match = re.search(r"### Answer:\s*(.*)", response, re.DOTALL)
    answer = answer_match.group(1).strip() if answer_match else response.strip()

    chat_history.append((question, markdown(answer)))
    return "", chat_history

# Function to process PDF upload
def process_pdf(file):
    """Index an uploaded PDF so that questions can be asked about it.

    Args:
        file: The upload handed over by the UI. Either an object exposing the
            file location as ``.name`` or a plain path (``str`` /
            ``os.PathLike``).

    Returns:
        A status message: a prompt to upload a file when ``file`` is empty,
        otherwise whatever ``create_vector_db`` reports.
    """
    global vector_db
    if not file:
        return "Please upload a valid PDF file."

    # Reset the vector DB before adding new embeddings
    vector_db = None

    # Accept a bare path as well as an object carrying the path in `.name`.
    # Paths are checked first because path objects also have a `.name`
    # attribute, which there means just the final path component.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name

    # Load, split and index the PDF.
    data = load_pdf(file_path)
    chunks = split_text(data)

    return create_vector_db(chunks)

In [4]:
# Gradio UI
def gradio_ui():
    """Build the Gradio Blocks app for chatting about an uploaded PDF.

    Layout: a wide column with the chatbot, the question box and the
    Ask / Clear buttons, next to a narrow column with the PDF upload and a
    read-only status box.

    Returns:
        The ``gr.Blocks`` instance, not yet launched.
    """
    with gr.Blocks(theme="soft") as app:
        gr.Markdown("<h1 style='text-align: center; color: #4A90E2;'>📖 Conversational AI for PDFs</h1>")

        with gr.Row():
            # Left: conversation area.
            with gr.Column(scale=2):
                chat_panel = gr.Chatbot(label="AI Chat")
                question_box = gr.Textbox(placeholder="Ask me a question...", label="Your Question")

                with gr.Row():
                    ask_btn = gr.Button("🔍 Ask", variant="primary")
                    clear_btn = gr.Button("🗑️ Clear")

            # Right: document upload and processing status.
            with gr.Column(scale=1):
                gr.Markdown("### Upload PDFs Here:")
                pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
                status_box = gr.Textbox(label="Status", interactive=False)

        # Event wiring. process_query reads and writes the same two components.
        chat_io = (question_box, chat_panel)
        ask_btn.click(process_query, inputs=list(chat_io), outputs=list(chat_io))
        clear_btn.click(lambda: [], outputs=[chat_panel])
        pdf_file.change(process_pdf, inputs=[pdf_file], outputs=[status_box])

    return app

# Build the interface and start the Gradio server.
# share=True additionally requests a public URL (the recorded output of this
# cell shows a *.gradio.live link), so anyone holding that link can query the
# uploaded PDF while the server is running.
# NOTE(review): with debug=True the cell appears to keep running until it is
# interrupted (the recorded output ends with a keyboard interruption) - confirm
# against the Gradio launch() documentation.
demo = gradio_ui()
demo.launch(share=True, debug=True)

Running on local URL:  http://127.0.0.1:7866
Running on public URL: https://6490426722ddecb799.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7866 <> https://6490426722ddecb799.gradio.live


