In [1]:
# pip install -q langchain langchain-community langchain-huggingface sentence-transformers python-dotenv openai faiss-cpu PyMuPDF gradio tiktoken google-generativeai langchain-google-genai

In [2]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Now you can use GOOGLE_API_KEY wherever required
print(GOOGLE_API_KEY)


AIzaSyAJVD0jTifmfrnXWnRd9_OGClZO_QzwKwA


In [3]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
import re
from collections import defaultdict

# Globals
faiss_db = None
qa_chain = None
all_docs = []
year_event_map = {}  # new: year -> list of events


def load_and_split_pdf(pdf_path):
    loader = PyMuPDFLoader(pdf_path)
    docs = loader.load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    return splitter.split_documents(docs)


def create_faiss_index(chunks):
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return FAISS.from_documents(chunks, embeddings)
def create_qa_chain(faiss_index):
    retriever = faiss_index.as_retriever(search_kwargs={"k": 5})
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash",
        temperature=0.2,
        google_api_key=GOOGLE_API_KEY  # make sure this is defined globally
    )
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff")


def answer_question(question):
    global qa_chain
    if qa_chain is None:
        return "❌ Please upload and process a PDF first."
    try:
        result = qa_chain.run(question)
        return result
    except Exception as e:
        return f"❌ Error answering question: {str(e)}"

In [4]:
def show_timeline(): 
    global all_docs

    if not all_docs:
        return "❌ Please upload and process a PDF first."

    from collections import defaultdict
    import re

    year_event_map = defaultdict(list)
    pattern = r"\b(1[5-9]\d{2}|20\d{2})\b"  # Match years from 1500–2099

    for doc in all_docs:
        sentences = re.split(r'(?<=[.!?])\s+', doc.page_content)
        for sentence in sentences:
            sentence = sentence.strip()

            # Skip short or noisy sentences
            if len(sentence) < 30 or re.search(r'\b(page|figure|table|chapter)\b', sentence, re.IGNORECASE):
                continue

            matches = re.findall(pattern, sentence)
            for year in matches:
                year_event_map[int(year)].append(sentence)

    if not year_event_map:
        return "⚠️ No historical events with years were found in the document."

    # Format timeline with better structure
    timeline = []
    timeline.append("📜 Chronological Timeline of Events")
    timeline.append("=" * 40)

    for year in sorted(year_event_map.keys()):
        timeline.append(f"\n🗓 Year: {year}")
        timeline.append("-" * 40)
        events = list(dict.fromkeys(year_event_map[year]))[:2]  # Remove duplicates, max 2
        for i, event in enumerate(events, start=1):
            timeline.append(f"{i}) {event}")
    
    return "\n".join(timeline)


In [5]:
import os

def upload_pdf(file):
    global faiss_db, qa_chain, all_docs

    try:
        if file is None:
            return "❌ No file uploaded."

        file_path = file.name
        if not os.path.exists(file_path):
            return f"❌ File not found: {file_path}"

        all_docs = load_and_split_pdf(file_path)
        faiss_db = create_faiss_index(all_docs)
        qa_chain = create_qa_chain(faiss_db)

        return f"✅ Processed file: {os.path.basename(file_path)}"

    except Exception as e:
        return f"❌ Error: {str(e)}"


In [6]:
import gradio as gr

with gr.Blocks() as demo:
    with gr.Row():
        pdf_input = gr.File(label="Upload History PDF", file_types=[".pdf"], type="filepath")
        upload_btn = gr.Button("Process PDF")
    status = gr.Textbox(label="Status")

    with gr.Row():
        question_input = gr.Textbox(label="Ask a historical question (e.g., What happened in 1945?)")
        answer_btn = gr.Button("Get Answer")
    answer_output = gr.Textbox(label="Answer")

    with gr.Row():
        timeline_btn = gr.Button("Generate Timeline")
        timeline_output = gr.Textbox(label="Chronological Timeline", lines=20)

    upload_btn.click(upload_pdf, inputs=[pdf_input], outputs=[status])
    answer_btn.click(answer_question, inputs=[question_input], outputs=[answer_output])
    timeline_btn.click(show_timeline, outputs=[timeline_output])

demo.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




  result = qa_chain.run(question)
