In [20]:
!pip install pytesseract
!pip install Pillow
!pip install pypdf
!pip install pypdf2 
!pip install langchain
!pip install langchain_community 
!pip install faiss-cpu 
!pip install pdf2image 
!pip install opencv-python==4.8.0.74
!pip install gradio 
!pip install langchain-huggingface 
!pip install -U langchain-ollama 
!pip install hf_xet 
!pip install sentence-transformers
!pip install langchain-ollama



In [21]:
from langchain_ollama import OllamaLLM # Our bridge to Ollama's local LLMs
import cv2 # OpenCV for image manipulation
import pytesseract # The star for Optical Character Recognition (OCR)
from PIL import Image # Pillow, for handling images (PyTesseract likes it)
import pandas as pd # Though not directly used in the final flow, often handy for data
import sqlite3 # Our little in-memory database for storing text chunks temporarily
import os # For interacting with the operating system, like checking file paths
import re # Regular expressions, super useful for text cleaning (though not heavily used here)
from PyPDF2 import PdfReader # For reading PDF files page by page

In [22]:
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter # To break down our big PDF text into smaller, manageable pieces
from langchain_community.vectorstores import FAISS # Our vector database for storing text embeddings
from langchain_community.embeddings import HuggingFaceEmbeddings # This is how we turn text into searchable numbers!
from langchain.prompts import PromptTemplate # To craft specific instructions for our LLM
from langchain_core.runnables import RunnablePassthrough # A neat LangChain utility for passing inputs through
from langchain_core.output_parsers import StrOutputParser # To get clean string output from our LLM
from langchain_community.llms import Ollama # Another way to interact with Ollama, used later
from langchain_huggingface import HuggingFaceEmbeddings # Just confirming this one, sometimes there's a slight overlap!

In [23]:
# Configuration
# Path to the textbook PDF (a relative path, so it is resolved against the current working directory).
PDF_FILE = 'NCERT_science_10th.pdf'
# 0-based index of the first page to process (0 is the first page of the PDF).
TARGET_PDF_PAGE_START = 0
# 0-based index of the last page to process, INCLUSIVE: setup_rag_system loops over
# range(start_page, end_page + 1), so 70 here means PDF pages 1 through 71 are read.
# NOTE(review): if exactly 70 pages were intended, this should be 69 — confirm.
TARGET_PDF_PAGE_END = 70


In [24]:
# Function to extract text using PyTesseract 
# It tries to pull text out of an image using OCR.
def extract_text_from_image_with_pytesseract(image_path_or_array):
    """
    Run Tesseract OCR on an image and return the recognised text.

    Args:
        image_path_or_array: One of
            - a file path (``str`` or ``os.PathLike``),
            - an already-loaded ``PIL.Image.Image``,
            - an image array as produced by OpenCV: either 2-D grayscale
              (e.g. after thresholding) or colour in BGR channel order.

    Returns:
        str: The text returned by ``pytesseract.image_to_string``. On failure,
        no exception is raised; instead a message starting with ``"ERROR:"``
        is returned (callers should check for that prefix).
    """
    try:
        if isinstance(image_path_or_array, (str, os.PathLike)):
            # Use a context manager so the underlying file handle is closed
            # as soon as OCR is finished instead of lingering until GC.
            with Image.open(image_path_or_array) as img:
                return pytesseract.image_to_string(img)

        if isinstance(image_path_or_array, Image.Image):
            # Already a Pillow image: nothing to convert.
            img = image_path_or_array
        elif getattr(image_path_or_array, "ndim", None) == 2:
            # Single-channel (grayscale/binary) array: BGR->RGB conversion would
            # fail on it, and Pillow can wrap it directly.
            img = Image.fromarray(image_path_or_array)
        else:
            # Colour array assumed to be in OpenCV's BGR order; Pillow expects RGB.
            img = Image.fromarray(cv2.cvtColor(image_path_or_array, cv2.COLOR_BGR2RGB))

        return pytesseract.image_to_string(img)
    except pytesseract.TesseractNotFoundError:
        # The Python wrapper is installed but the Tesseract binary itself is missing.
        return "ERROR: Tesseract-OCR engine not found. Please install it first. See instructions: https://tesseract-ocr.github.io/tessdoc/Installation.html"
    except Exception as e:
        # Any other problem (unreadable file, unsupported array, OCR failure).
        return f"ERROR: Could not perform OCR with pytesseract: {e}"


In [25]:
# RAG Setup Function 
def _format_retrieved_docs(docs):
    """Join the text of the retrieved documents into one plain-text context string."""
    return "\n\n".join(doc.page_content for doc in docs)


def setup_rag_system(pdf_path, start_page, end_page):
    """
    Build the full RAG pipeline for one PDF.

    Steps: extract text from the requested pages with PyPDF2, split it into
    overlapping chunks, store the chunks in an in-memory SQLite table, embed
    them into a FAISS vector store, connect to a local Ollama 'llama2' model
    and assemble the retrieval -> prompt -> LLM -> string chain.

    Args:
        pdf_path: Path of the PDF file to load.
        start_page: 0-based index of the first page to process.
        end_page: 0-based index of the last page to process (inclusive).
            Values beyond the end of the document are clamped.

    Returns:
        tuple: ``(rag_chain, conn)`` on success, where ``conn`` is the open
        in-memory SQLite connection holding the chunks; ``(None, error_message)``
        when any setup step fails. The SQLite connection is closed on failure.
    """
    print("📖 Alright, let's load up that PDF and get it ready for some serious Q&A!")
    page_sections = []      # One entry per processed page (text or a placeholder marker)
    pages_with_text = 0     # Counts pages that actually yielded text
    try:
        reader_pdf = PdfReader(pdf_path)
        num_pdf_pages = len(reader_pdf.pages)
        print(f"The PDF has {num_pdf_pages} pages in total.")

        # Clamp both ends: a negative start would silently index from the back of the PDF.
        first_page = max(start_page, 0)
        last_page_exclusive = min(end_page + 1, num_pdf_pages)
        for i in range(first_page, last_page_exclusive):
            print(f"Processing page {i + 1}...")
            page_text = reader_pdf.pages[i].extract_text()
            if page_text and page_text.strip():
                page_sections.append(f"\n--- PAGE {i+1} ---\n" + page_text)
                pages_with_text += 1
            else:
                print(f"Hmm, page {i+1} didn't yield any direct text. If I had pdf2image fully integrated, I'd try OCR here.")
                # Keep a marker so page numbering in the text stays traceable.
                page_sections.append(f"\n--- PAGE {i+1} (No direct text extracted) ---\n")

    except FileNotFoundError:
        # Report the path that was actually requested rather than a hard-coded name.
        return None, f"ERROR: The PDF file was not found at '{pdf_path}'. Please ensure the file exists and the path is correct."
    except Exception as e:
        return None, f"An unexpected error occurred while trying to process the PDF: {e}"

    # The placeholder markers make the combined string non-empty even for a fully
    # scanned PDF, so the real test is whether any page produced text.
    if pages_with_text == 0:
        return None, "ERROR: No text could be extracted from the PDF pages. Can't proceed with RAG. Is the PDF empty or corrupted?"

    full_extracted_text = "".join(page_sections)

    print("Okay, now to break this big block of text into smaller, more manageable chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,       # Target size of each chunk, in characters
        chunk_overlap=200,     # Characters shared between neighbouring chunks to keep context
        length_function=len,   # Measure chunk length as plain character count
        add_start_index=True,  # Record each chunk's offset in the source text as metadata
    )
    chunks = text_splitter.create_documents([full_extracted_text])
    print(f"Split the entire textbook content into {len(chunks)} bite-sized chunks.")

    # Instruction prompt: tells the LLM to answer strictly from the retrieved context.
    template = """
    Hey there! You're an intelligent assistant, and your main job is to answer questions
    specifically about the NCERT Science Class 10 textbook.
    Please use *only* the context I provide below to answer the question accurately and concisely.
    If you can't find the answer in the provided context, it's totally okay to just say you don't know.
    Please don't try to make up answers!

    Context:
    {context}

    Question: {question}

    Answer:
    """

    # Data Storage (SQLite): an in-memory database that lives only as long as the connection.
    print("Alright, let's tuck these text chunks into a temporary SQLite database (in memory)...")
    conn = sqlite3.connect(':memory:')
    setup_succeeded = False
    try:
        conn.execute('''
        CREATE TABLE IF NOT EXISTS document_chunks (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            chunk_text TEXT NOT NULL
        )
        ''')
        conn.executemany(
            "INSERT INTO document_chunks (chunk_text) VALUES (?)",
            ((chunk.page_content,) for chunk in chunks),
        )
        conn.commit()
        print("Text chunks are safely stored in our SQLite database. Nice!")

        print("Time to wire up the core components of our RAG system...")

        # 1. Embedding model: turns text chunks into vectors for similarity search.
        print("First, initializing our Embedding Model (sentence-transformers/msmarco-bert-base-dot-v5)...")
        embedding_model_name = "sentence-transformers/msmarco-bert-base-dot-v5"
        try:
            embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
        except Exception as e:
            return None, (f"Error initializing HuggingFaceEmbeddings: {e}\n"
                          "This model is public and doesn't require a token for inference. "
                          "If you are in Colab, ensure no HF_TOKEN secret is inadvertently set or remove it.")

        # 2. Vector store: embed every chunk and index it in FAISS for fast retrieval.
        print("Next, building our super-fast FAISS Vector Store from all those text chunks...")
        chunk_texts_for_embedding = [chunk.page_content for chunk in chunks]
        vectorstore = FAISS.from_texts(chunk_texts_for_embedding, embeddings)
        retriever = vectorstore.as_retriever()

        # 3. Ollama LLM. Requires a running Ollama server with the 'llama2' model pulled.
        # OllamaLLM (langchain-ollama) replaces the deprecated langchain_community Ollama class.
        print("Now, hooking up our Ollama LLM (I'm using 'llama2' for this)...")
        try:
            llm = OllamaLLM(model="llama2", temperature=0.7)
            # Fail fast: one tiny request proves the server is reachable before the UI starts.
            print("Just a quick ping to Ollama to make sure it's alive...")
            llm.invoke("Hello!")
            print("Ollama is online and ready! 🎉")
        except Exception as e:
            return None, (f"Error initializing Ollama LLM: {e}\n"
                          "**CRITICAL:** Please ensure Ollama is installed, running, and the 'llama2' model is pulled. "
                          "You'll need to run 'ollama run llama2' in a **separate terminal** and wait for it to load completely *before* starting this script.")

        # 4. Prompt template built from the instructions defined above.
        prompt = PromptTemplate.from_template(template)

        # 5. RAG chain: question -> retrieve chunks -> plain-text context -> prompt -> LLM -> string.
        # The retriever returns Document objects; they are flattened to text so the prompt
        # contains the passages themselves rather than the Python repr of a list.
        rag_chain = (
            {"context": retriever | _format_retrieved_docs, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )
        print("Awesome! The RAG chain is all constructed and ready to answer your questions! ✨")

        setup_succeeded = True
        return rag_chain, conn
    finally:
        # Any early error return or unexpected exception must not leak the connection.
        if not setup_succeeded:
            conn.close()


In [37]:
# Initialize RAG components globally
# These module-level names hold the single RAG system shared by the whole app.
# They are given real initial values here (a bare `global` statement at module
# level is a no-op and defines nothing), so later cells can safely test them
# against None even if the setup cell has not run or has failed.
global_rag_chain = None      # The runnable RAG chain once setup succeeds
global_db_conn = None        # The SQLite connection holding the text chunks
setup_error_message = None   # Human-readable reason if setup failed, else None

In [39]:
# Let's perform the setup right when our script runs.
# setup_rag_system returns (chain, sqlite_connection) on success and
# (None, error_message) on failure, so the second value is interpreted
# according to whether a chain came back.
print("Kicking off the RAG system setup now...")
global_rag_chain, global_db_conn_or_error = setup_rag_system(PDF_FILE, TARGET_PDF_PAGE_START, TARGET_PDF_PAGE_END)
if global_rag_chain is None:
    global_db_conn = None
    setup_error_message = global_db_conn_or_error
    # Surface the failure here instead of only storing it for the UI.
    print(f"RAG system setup failed: {setup_error_message}")
else:
    global_db_conn = global_db_conn_or_error
    # Clear any error left over from an earlier failed run of this cell.
    setup_error_message = None
    print("RAG system initialized successfully. We're good to go!")


Kicking off the RAG system setup now...
📖 Alright, let's load up that PDF and get it ready for some serious Q&A!
The PDF has 209 pages in total.
Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 6...
Processing page 7...
Processing page 8...
Processing page 9...
Processing page 10...
Processing page 11...
Processing page 12...
Processing page 13...
Processing page 14...
Processing page 15...
Processing page 16...
Processing page 17...
Processing page 18...
Processing page 19...
Processing page 20...
Processing page 21...
Processing page 22...
Processing page 23...
Processing page 24...
Processing page 25...
Processing page 26...
Processing page 27...
Processing page 28...
Processing page 29...
Processing page 30...
Processing page 31...
Processing page 32...
Processing page 33...
Processing page 34...
Processing page 35...
Processing page 36...
Processing page 37...
Processing page 38...
Processing page 39...
Proces

In [40]:
# Gradio Interface Function 
# This function is what Gradio calls every time someone types a message into the chat.
def respond(message, chat_history):
    """
    Gradio chat callback: answer one user message with the RAG chain.

    Args:
        message: The text the user typed into the textbox.
        chat_history: The current conversation as a list of
            ``{'role': ..., 'content': ...}`` dicts, or ``None`` on first use.

    Returns:
        tuple: ``("", updated_history)`` — the empty string clears the textbox
        and ``updated_history`` is a new list (the input list is not modified).
        Setup failures and LLM errors are reported as assistant messages
        rather than raised.
    """
    # Work on a copy so the caller's list is never mutated behind its back.
    history = list(chat_history) if chat_history else []

    # Nothing to answer: don't spend an LLM call on an empty question.
    if not message or not message.strip():
        return "", history

    # Look the shared state up at call time; if the setup cells were skipped the
    # names may not exist at all, which should produce a chat error, not a NameError.
    setup_error = globals().get("setup_error_message")
    rag_chain = globals().get("global_rag_chain")

    user_turn = {'role': 'user', 'content': message}

    if setup_error:
        return "", history + [user_turn, {'role': 'assistant', 'content': f"Setup Error: {setup_error}"}]

    if rag_chain is None:
        return "", history + [user_turn, {'role': 'assistant', 'content': "Error: RAG system not initialized. Please check your console for errors that occurred during startup."}]

    history.append(user_turn)
    try:
        response = rag_chain.invoke(message)
        history.append({'role': 'assistant', 'content': response})
    except Exception as e:
        # E.g. the Ollama server went away mid-session: show it in the chat and log it.
        error_msg = f"Oops! An error occurred while getting a response from the LLM: {e}. Please ensure your Ollama server is still running and the 'llama2' model is loaded."
        history.append({'role': 'assistant', 'content': error_msg})
        print(f"Error during RAG chain invocation: {e}")

    return "", history

# Gradio UI Layout
# Time to build our chatbot interface using Gradio!
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📚 NCERT Science Class 10 Textbook Q&A (Ollama & RAG)")
    gr.Markdown("Hi there! Ask me any question about the NCERT Science Class 10 textbook. "
                "I'll use information extracted directly from the book to give you answers. "
                "This is all powered by a local AI (Llama 2 via Ollama) and smart search (msmarco-bert-base-dot-v5 sentence embeddings)!")

    # Single source of truth for "can the app answer questions?"
    rag_ready = global_rag_chain is not None

    if setup_error_message:
        # gr.Warning only shows a pop-up when called from inside an event handler;
        # at layout-build time it renders nothing, so the error is shown as Markdown.
        gr.Markdown(f"⚠️ **Heads up! There was an application setup error:** {setup_error_message}")
        gr.Markdown("Please resolve the above error and restart the application to get me working!")

    # Main chat window; type='messages' means history is a list of role/content dicts.
    chatbot = gr.Chatbot(label="Our Chat", type='messages')

    with gr.Row():
        # Inputs are only enabled when the RAG system is available.
        msg = gr.Textbox(label="Your Question", placeholder="e.g., What are the types of chemical reactions?", lines=2,
                         interactive=rag_ready,
                         scale=4)
        submit_btn = gr.Button("Submit Question",
                               interactive=rag_ready,
                               scale=1)

    clear = gr.ClearButton([msg, chatbot])  # Resets both the textbox and the conversation

    if rag_ready:
        # Enter in the textbox and the button both route to the same handler.
        msg.submit(respond, [msg, chatbot], [msg, chatbot])
        submit_btn.click(respond, [msg, chatbot], [msg, chatbot])
    else:
        # Setup failed: the handlers only warn / pass values through unchanged.
        submit_btn.click(lambda: gr.Warning("Sorry, the application setup failed. Please fix the errors and restart!"), [], [])
        msg.submit(lambda x,y: (x,y), [msg, chatbot], [msg, chatbot])

print("\nAlright, launching our Gradio app now! Check your browser for the link. ✨")
demo.launch(share=False)  # Local only; no public share link is created


Alright, launching our Gradio app now! Check your browser for the link. ✨
* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




In [43]:
# Shut down the Gradio server started by demo.launch() above so its port is released.
demo.close()

Closing server running on port: 7860
