In [1]:
# --- Cell 1: Installations ---
# This cell installs the necessary system-level OCR engine and all required Python libraries.

# Install Tesseract OCR
!apt-get install -y tesseract-ocr

# Install Python libraries for AI, PDF processing, and the RAG pipeline
!pip install -q langchain langchain-community sentence-transformers faiss-cpu
!pip install -q transformers pymupdf pillow bitsandbytes accelerate pytesseract

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88

In [2]:
# --- Cell 2: Imports ---
# All the necessary libraries for the entire notebook are imported here.

# Core Python and File Management
import os
import fitz  # PyMuPDF
from PIL import Image
import torch
import pytesseract

# LangChain Components
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Hugging Face Transformers for Vision and Language Models
# BitsAndBytesConfig is imported so the 4-bit quantization settings can be passed explicitly
from transformers import AutoProcessor, LlavaForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Google Colab specific
from google.colab import files

# Suppress warnings for a cleaner output
import warnings
warnings.filterwarnings("ignore")


In [3]:
# Destination directory for every uploaded document (created on first run)
upload_folder = "/content/uploaded_docs"
os.makedirs(upload_folder, exist_ok=True)

# Prompt the user through Colab's upload widget
print("Please upload your PDF files.")
uploaded = files.upload()

# Relocate each uploaded file from the working directory into the upload folder
for filename in uploaded:
    destination = os.path.join(upload_folder, filename)
    os.rename(filename, destination)

print(f"\n✅ Uploaded files moved to {upload_folder}:", os.listdir(upload_folder))

Please upload your PDF files.


Saving Ancient-India-RS-Sharma.pdf to Ancient-India-RS-Sharma.pdf

✅ Uploaded files moved to /content/uploaded_docs: ['Ancient-India-RS-Sharma.pdf']


In [4]:
model_id = "llava-hf/llava-1.5-7b-hf"
print("Loading Vision Model (LLaVA)...")

# Quantization settings are spelled out explicitly: 4-bit NF4 weights
# with float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# The processor and the model are both loaded from the same checkpoint id.
vision_processor = AutoProcessor.from_pretrained(model_id)
vision_model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
)

print("✅ Vision model loaded successfully.")

Loading Vision Model (LLaVA)...


processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/674 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

✅ Vision model loaded successfully.


In [5]:
def extract_pdf_content(pdf_path, image_output_dir):
    """Extract the text of every page and save the embedded images of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        image_output_dir: Directory that receives the extracted image files.
            It is created if it does not exist.

    Returns:
        A tuple ``(text_content_by_page, image_paths_by_page)``:
        - ``text_content_by_page``: list with one text string per page, in page order.
        - ``image_paths_by_page``: dict mapping the 0-based page index to the list of
          image file paths written for that page. Pages without images have no key.
    """
    os.makedirs(image_output_dir, exist_ok=True)
    text_content_by_page = []
    image_paths_by_page = {}

    # The context manager closes the document even if extraction raises midway.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            text_content_by_page.append(page.get_text())

            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]  # first item of each image entry is its cross-reference number
                base_image = doc.extract_image(xref)
                if not base_image:
                    # Defensive: nothing was extracted for this reference, so there is nothing to save.
                    continue

                # File names use the 1-based page number; the returned dict keys stay 0-based.
                image_filename = f"page_{page_num+1}_img_{img_index}.{base_image['ext']}"
                image_path = os.path.join(image_output_dir, image_filename)
                with open(image_path, "wb") as f:
                    f.write(base_image["image"])

                image_paths_by_page.setdefault(page_num, []).append(image_path)

    return text_content_by_page, image_paths_by_page


def generate_caption(image_path, model, processor):
    """Generate a text caption for an image using the LLaVA model.

    Args:
        image_path: Path of the image file to describe.
        model: Loaded generation model; must expose ``device`` and ``generate``.
        processor: Matching processor, used to build the model inputs and to
            decode the generated token ids.

    Returns:
        The stripped text that follows the first "ASSISTANT:" marker in the
        decoded output. If the marker is absent, the whole decoded text is
        returned (stripped) instead of a truncated slice.
    """
    prompt = "USER: <image>\nDescribe this image in detail. It is a map, figure, or diagram from a history document. Explain what it is about. ASSISTANT:"

    # Close the file handle promptly; convert() loads the pixel data, so the
    # converted image stays usable after the file is closed.
    with Image.open(image_path) as source_image:
        raw_image = source_image.convert('RGB')

    inputs = processor(text=prompt, images=raw_image, return_tensors="pt").to(model.device)

    output = model.generate(**inputs, max_new_tokens=200)
    decoded = processor.decode(output[0], skip_special_tokens=True)

    # partition() reports whether the marker was found, unlike find(), whose -1
    # result would otherwise silently chop the first characters of the caption.
    assistant_keyword = "ASSISTANT:"
    _, marker, reply = decoded.partition(assistant_keyword)
    caption = reply if marker else decoded
    return caption.strip()

def extract_text_from_image(image_path):
    """Extract text from an image using Tesseract OCR.

    Args:
        image_path: Path of the image file to run OCR on.

    Returns:
        The text returned by Tesseract. If opening the image or running OCR
        fails, the failure is printed and an empty string is returned, so an
        error message is never mistaken for text found in the image.
    """
    try:
        # The context manager releases the file handle once OCR has finished.
        with Image.open(image_path) as image:
            return pytesseract.image_to_string(image)
    except Exception as e:
        print(f"    - Error during OCR for {image_path}: {e}")
        return ""

print("✅ Helper functions (LLaVA captioning and Tesseract OCR) are defined.")

✅ Helper functions (LLaVA captioning and Tesseract OCR) are defined.


In [6]:
upload_folder = "/content/uploaded_docs"
image_folder = "/content/extracted_images"
all_rich_docs = []  # rebuilt from scratch on every run of this cell

os.makedirs(upload_folder, exist_ok=True)

for filename in os.listdir(upload_folder):
    # Only PDF files are processed; anything else in the folder is ignored.
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(upload_folder, filename)
    print(f"Processing {pdf_path}...")

    texts, images_by_page = extract_pdf_content(pdf_path, image_folder)

    # Describe every extracted image, grouped by 0-based page index.
    captions_by_page = {}
    for page_num, image_paths in images_by_page.items():
        page_captions = []
        print(f"  - Analyzing {len(image_paths)} images on page {page_num + 1}...")
        for img_path in image_paths:
            try:
                llava_caption = generate_caption(img_path, vision_model, vision_processor)
                ocr_text = extract_text_from_image(img_path)

                combined_description = (
                    f"VISUAL DESCRIPTION: {llava_caption}\n\n"
                    f"TEXT EXTRACTED FROM IMAGE (OCR):\n---\n{ocr_text}\n---"
                )
                page_captions.append(combined_description)

            except Exception as e:
                # Best effort: one failing image must not abort the whole document.
                print(f"    - Error processing image {img_path}: {e}")

        # Record the page only when at least one image was described, so a page
        # whose images all failed does not receive an empty analysis section.
        if page_captions:
            captions_by_page[page_num] = page_captions

    # One Document per page: page text plus any image descriptions for that page.
    for page_num, page_text in enumerate(texts):
        page_content = page_text
        if page_num in captions_by_page:
            rich_image_descriptions = "\n\n".join(captions_by_page[page_num])
            page_content += f"\n\n--- Image Content Analysis ---\n{rich_image_descriptions}"

        # Metadata stores the 1-based page number.
        doc = Document(page_content=page_content, metadata={"source": filename, "page": page_num + 1})
        all_rich_docs.append(doc)

print(f"\n✅ Created {len(all_rich_docs)} documents with combined text, LLaVA descriptions, and OCR text.")

Processing /content/uploaded_docs/Ancient-India-RS-Sharma.pdf...
  - Analyzing 1 images on page 1...
  - Analyzing 1 images on page 2...
  - Analyzing 1 images on page 3...
  - Analyzing 1 images on page 4...
  - Analyzing 1 images on page 5...
  - Analyzing 1 images on page 6...
  - Analyzing 1 images on page 7...
  - Analyzing 1 images on page 8...
  - Analyzing 1 images on page 9...
  - Analyzing 1 images on page 10...
  - Analyzing 1 images on page 11...
  - Analyzing 1 images on page 12...
  - Analyzing 1 images on page 13...
  - Analyzing 1 images on page 14...
  - Analyzing 1 images on page 15...
  - Analyzing 1 images on page 16...
  - Analyzing 1 images on page 17...
  - Analyzing 1 images on page 18...
  - Analyzing 1 images on page 19...
  - Analyzing 1 images on page 20...
  - Analyzing 1 images on page 21...
  - Analyzing 1 images on page 22...
  - Analyzing 1 images on page 23...
  - Analyzing 1 images on page 24...
  - Analyzing 1 images on page 25...
  - Analyzing 1 ima

In [7]:
print("Splitting documents into chunks...")

# Chunks of up to 800 characters, with 100 characters shared between neighbours.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
)
chunks = splitter.split_documents(all_rich_docs)

print(f"✅ Total chunks created: {len(chunks)}")

Splitting documents into chunks...
✅ Total chunks created: 1172


In [8]:
print("Creating vector store...")

# Embed every chunk with the MiniLM sentence-transformer and index the vectors in FAISS.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
db = FAISS.from_documents(chunks, embedding_model)

print("✅ Vector store ready!")

Creating vector store...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Vector store ready!


In [9]:
text_model_id = "Qwen/Qwen1.5-1.8B-Chat"
print("Loading Text Generation Model (Qwen)...")

# Tokenizer and causal language model are loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(text_model_id)
llm = AutoModelForCausalLM.from_pretrained(
    text_model_id,
    device_map="auto",
)

print("✅ Qwen 1.8B model loaded")

Loading Text Generation Model (Qwen)...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

✅ Qwen 1.8B model loaded


In [12]:
# --- Cell 10: RAG pipeline functions (answer generation and retrieval) ---

def generate_answer(question, context, max_context_chars=2000, max_new_tokens=256):
    """Generate an answer to ``question`` using only the supplied ``context``.

    Args:
        question: The user's question.
        context: Retrieved text the answer must be based on.
        max_context_chars: Maximum number of context characters placed in the
            prompt; anything beyond this is cut off. Defaults to 2000.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 256.

    Returns:
        The decoded, stripped text of the newly generated tokens only.
    """
    context = context[:max_context_chars]

    messages = [
        {
            "role": "system",
            "content": (
                "You are a factual assistant. Your ONLY source of information is the text provided in the 'Context' section. "
                "You are to answer the user's 'Question' based *only* on the information in the 'Context'. "
                "Do NOT use any external knowledge. Do not invent any details, dates, names, or numbers that are not explicitly in the 'Context'. "
                "If the answer is not available in the context, you MUST reply with the exact phrase: 'Not available in the document.'"
            )
        },
        {
            "role": "user",
            "content": (
                f"Context:\n{context}\n\n"
                f"Question: {question}\n\n"
                "Based on the context provided, answer the question comprehensively. If the question asks for a list, provide the full list."
            )
        }
    ]

    text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text_prompt, return_tensors="pt").to(llm.device)

    # Greedy decoding (do_sample=False). No temperature is passed: it has no
    # effect without sampling and only produces an "invalid generation flags" warning.
    outputs = llm.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # The output sequence starts with the prompt tokens; decode only what follows them.
    prompt_length = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return answer.strip()


def rag_pipeline(question, k=10):
    """Answer ``question`` by retrieving the ``k`` most similar chunks and generating from them.

    Retrieved chunk texts are stripped and de-duplicated, keeping the order of
    first appearance, then joined with blank lines to form the context.
    """
    hits = db.similarity_search(question, k=k)

    # A dict used as an insertion-ordered set: repeated texts keep their first position.
    ordered_texts = {}
    for hit in hits:
        ordered_texts.setdefault(hit.page_content.strip(), None)

    context = "\n\n".join(ordered_texts)
    return generate_answer(question, context)

print("✅ RAG pipeline functions are defined and corrected.")

✅ RAG pipeline functions are defined and corrected.


In [13]:
print("\n--- Starting the RAG system. Type 'exit' to end. ---")
while True:
    # Strip once so that both the exit check and the query see the cleaned input.
    question = input("\nEnter your question: ").strip()
    if question.lower() == "exit":
        print("✅ Exiting RAG system.")
        break
    if not question:
        # Blank input: ask again instead of running retrieval and generation on an empty query.
        continue
    answer = rag_pipeline(question)
    print("\n✅ Answer:", answer)


--- Starting the RAG system. Type 'exit' to end. ---

Enter your question: Who were the early Harappans and what characterized their culture?


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



✅ Answer: The Harappan civilization emerged in the north-western region of India, specifically in the regions of Baluchistan and Kalibangan in Rajasthan. Their culture was characterized by several key features:

1. Well-Planned Cities: The Harappan civilization planned their towns with a specific layout, featuring a chessboard system, streets, drainage pipes, and cess pits. This system allowed for efficient organization and management of urban spaces, making them highly organized and functional.

2. Agricultural Practices: Harappan agriculture was based on terrace farming, where crops were grown on raised platforms called "khas" or terraces. They also cultivated grains like wheat, barley, and rice, along with vegetables, fruits, and nuts. This led to a surplus of food production and helped the society maintain a stable economy.

3. Trade Networks: The Harappan civilization had a sophisticated trading network that connected different parts of the Indian subcontinent through ports, such

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



✅ Answer: The main features of the Indus Valley Civilization include:

1. Well-Planned Cities: The Harappan civilization built large, well-planned cities that were centered around water sources such as rivers, lakes, and canals. These cities featured a grid-like layout, with streets, markets, and public buildings arranged in a systematic manner.

2. Agricultural Practices: The Harappan civilization was primarily agricultural, relying on crops like wheat, rice, and cotton. They developed sophisticated irrigation systems, terraced fields, and crop rotation techniques to maximize yields and ensure food security.

3. Trade Networks: The Harappan civilization had extensive trade networks, connecting different parts of the region through ports, roads, and trade routes. This allowed them to exchange goods, technology, and ideas with neighboring civilizations, fostering cultural exchange and economic growth.

4. Political Organization: The Harappan civilization had a complex political system,

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



✅ Answer: The Vedic society differed from the Harappan civilization in several ways:

1. Geographical Extent: The Harappan civilization covered a vast area, including the Indus Basin, while the Vedic society primarily existed in the lower Gangetic plain. This difference in geographical extent allowed for the development of different cultures and societies within the same region.

2. Town Planning and Structures: The Harappan civilization had a well-planned city layout, with distinct neighborhoods and public spaces, such as marketplaces, temples, and residential areas. In contrast, the Vedic society's cities were smaller and more dispersed, with a focus on agriculture and domestication rather than urbanization.

3. Architecture: The Harappan civilization built grand structures like palaces, temples, and fortresses, often adorned with intricate carvings and sculptures. These buildings served as symbols of power and status, reflecting the hierarchical social structure of the time. On the

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



✅ Answer: The Vedic society differed from the Harappan civilization in several ways:

1. Geographical Extent: The Harappan civilization covered a vast area, including the Indus Basin, while the Vedic society primarily existed in the lower Gangetic plain. This difference in geographical extent allowed for the development of different cultures and societies within the same region.

2. Town Planning and Structures: The Harappan civilization had a well-planned city layout, with distinct neighborhoods and public spaces, such as marketplaces, temples, and residential areas. In contrast, the Vedic society's cities were smaller and more dispersed, with a focus on agriculture and domestication rather than urbanization.

3. Architecture: The Harappan civilization built grand structures like palaces, temples, and fortresses, often adorned with intricate carvings and sculptures. These buildings served as symbols of power and status, reflecting the hierarchical social structure of the time. On the

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



✅ Answer: Main Causes of the Decline of the Maurya Empire:

1. Brahmanical Reaction: The Maurya Empire faced a significant challenge due to the rise of the Buddhist religion and the influence of the Vedic scriptures. The Mauryans believed in the superiority of the Hindu gods over the Buddhist deities, leading to a rejection of Buddhism as a dominant religion in the region. This religious divide weakened the social cohesion and unity of the empire, making it more vulnerable to external threats.

2. Asoka's Exit from Power: Asoka, the emperor of the Maurya dynasty, stepped down from power in 232 BC due to a series of conflicts and defeats. His departure marked the end of the Mauryan era and the beginning of a period of instability and fragmentation. Asoka's successor, Ashoka, attempted to restore order and stability to the empire, but he faced resistance from various regional leaders who sought independence from the central authority.

3. Military Defeats: The Maurya Empire suffered sev