In [3]:
# PyMuPDF provides the `fitz` module used for PDF text extraction.
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install -q pymupdf



In [4]:
# Streamlit renders the web UI of the chatbot.
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install -q streamlit



In [5]:
# LangChain stack, Groq chat client and FAISS vector index.
# langchain-text-splitters is listed explicitly because the app imports it directly.
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install -q langchain langchain-community langchain-core langchain-groq langchain-text-splitters faiss-cpu
%pip install -q langchain-huggingface
%pip install -q sentence-transformers



In [24]:
# -*- coding: utf-8 -*-
"""AI PDF Chatbot - PyMuPDF + Streamlit + Groq (Colab + Local Friendly)"""

import os
import sys
import fitz  # PyMuPDF
import streamlit as st
from typing import List
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema import Document
import getpass

# Streamlit re-executes this whole script on every user interaction, so an
# unconditional getpass() prompt would block (or fail without a terminal) on
# each rerun. Prompt only when no key is set yet and no Streamlit runtime is
# active; inside the app the key is collected through the sidebar instead.
# To re-enter a key in the notebook, delete os.environ["GROQ_API_KEY"] first.
from streamlit import runtime as _st_runtime

if not os.environ.get("GROQ_API_KEY") and not _st_runtime.exists():
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your API key here:")

# Detect if running in Google Colab
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# ==============================
# ⚙️ Configuration
# ==============================
PDFS_DIRECTORY = "pdfs/"                      # where uploaded PDFs are stored
FAISS_DB_PATH = "vectorstore/db_faiss"        # where the FAISS index is persisted
LLM_MODEL = "deepseek-r1-distill-llama-70b"   # model name passed to ChatGroq
HUGGINGFACE_MODEL_NAME = "all-MiniLM-L6-v2"   # model name passed to HuggingFaceEmbeddings

# Ensure directories exist. The index directory is derived from FAISS_DB_PATH
# so that changing the path above cannot leave its parent folder missing.
os.makedirs(PDFS_DIRECTORY, exist_ok=True)
os.makedirs(os.path.dirname(FAISS_DB_PATH) or ".", exist_ok=True)


# ==============================
# 📄 PDF Handling
# ==============================
class PyMuPDFLoader:
    """Load a PDF with PyMuPDF and return one ``Document`` per non-empty page."""

    def __init__(self, file_path: str):
        """Store the path of the PDF; the file is not opened until ``load()``."""
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Extract the plain text of every page.

        Returns:
            A list of ``Document`` objects, one per page whose extracted text
            is not whitespace-only. Each carries ``source`` (the file path) and
            ``page`` (1-based page number) in its metadata.

        Raises:
            Whatever ``fitz.open`` or page extraction raises; the document
            handle is closed in every case.
        """
        docs = []
        # The context manager closes the document even if extraction fails
        # part-way through, which the previous explicit close() did not.
        with fitz.open(self.file_path) as pdf_doc:
            for page_index in range(pdf_doc.page_count):
                text = pdf_doc[page_index].get_text("text")
                if text.strip():
                    metadata = {"source": self.file_path, "page": page_index + 1}
                    docs.append(Document(page_content=text, metadata=metadata))
        return docs


def upload_pdf(file):
    """Write an uploaded file into ``PDFS_DIRECTORY`` and return the saved path.

    Args:
        file: Upload object exposing ``name`` and ``getbuffer()``.

    Returns:
        The path the bytes were written to.

    Raises:
        ValueError: If ``file.name`` has no usable file-name component.
    """
    # The name is supplied by the client, so keep only its final component;
    # otherwise a name such as "../x.pdf" would be written outside the
    # upload directory. Backslashes are normalised so Windows-style
    # separators are stripped on POSIX hosts as well.
    safe_name = os.path.basename(file.name.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"Invalid upload file name: {file.name!r}")

    file_path = os.path.join(PDFS_DIRECTORY, safe_name)
    with open(file_path, "wb") as f:
        f.write(file.getbuffer())
    return file_path


def create_chunks(documents, chunk_size=1200, chunk_overlap=300):
    """Split documents into overlapping chunks for embedding.

    Args:
        documents: Documents to split (passed to ``split_documents``).
        chunk_size: Value given to the splitter's ``chunk_size`` (default 1200).
        chunk_overlap: Value given to the splitter's ``chunk_overlap``
            (default 300).

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for ``documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)


def create_vector_store(chunks):
    """Embed ``chunks``, index them with FAISS, save the index and return it.

    The embedding model is ``HUGGINGFACE_MODEL_NAME`` and the index is saved
    to ``FAISS_DB_PATH`` before being returned.
    """
    embedding_model = HuggingFaceEmbeddings(model_name=HUGGINGFACE_MODEL_NAME)
    vector_index = FAISS.from_documents(chunks, embedding_model)
    vector_index.save_local(FAISS_DB_PATH)
    return vector_index


def answer_query(docs, llm, query):
    """Ask ``llm`` the question using the retrieved passages as context.

    Args:
        docs: Iterable of ``(document, score)`` pairs; only each document's
            ``page_content`` is used.
        llm: Object that can be piped after a ``ChatPromptTemplate``.
        query: The user's question.

    Returns:
        The ``content`` attribute of the chain's response.
    """
    template = ChatPromptTemplate.from_template("Context:\n{context}\n\nQ:{question}\nA:")

    # Passages are separated by a blank line in the prompt.
    passages = []
    for document, _score in docs:
        passages.append(document.page_content)

    pipeline = template | llm
    reply = pipeline.invoke({"context": "\n\n".join(passages), "question": query})
    return reply.content


def initialize_llm(api_key):
    """Return a ``ChatGroq`` client for ``LLM_MODEL`` using ``api_key``."""
    client_options = {"model_name": LLM_MODEL, "api_key": api_key}
    return ChatGroq(**client_options)


# ==============================
# 🌐 Streamlit App
# ==============================
def main():
    """Render the Streamlit UI and answer a question about an uploaded PDF.

    Flow: read the Groq API key from the sidebar, obtain a PDF, take a
    question (typed, or filled in from a quick-question button) and, when
    "Analyze Document" is pressed, extract the PDF text, chunk it, build a
    FAISS index, retrieve the 4 most similar chunks and pass them with the
    question to the LLM. The answer and the retrieved chunks are displayed.

    Any exception raised during processing is shown with ``st.error``
    instead of being propagated.
    """
    # Local import so this function does not depend on edits to the
    # module-level import block.
    from streamlit import runtime as st_runtime

    st.set_page_config(page_title="AI PDF Chatbot", page_icon="🤖")
    st.title("🤖 AI PDF Chatbot")

    st.markdown("## ⚖️ AI PDF Chatbot")
    st.caption("Powered by PyMuPDF for Superior Document Processing")

    # Sidebar API Key
    api_key = st.sidebar.text_input("🔑 Groq API Key", type="password")
    if api_key:
        st.sidebar.success("✅ API key configured")
    else:
        st.sidebar.warning("⚠️ API key required")

    # Handle PDF depending on environment
    file_path = None

    # IN_COLAB is also True inside a `streamlit run` process on a Colab VM,
    # where the notebook upload dialog has no front end to talk to. Use the
    # Colab dialog only when the code runs directly in the notebook (no
    # Streamlit runtime); otherwise use Streamlit's own uploader.
    if IN_COLAB and not st_runtime.exists():
        from google.colab import files
        st.info("📂 Running in Colab - please upload your PDF below:")
        uploaded = files.upload()
        # If several files are selected, each is saved and the last one is used.
        for filename, payload in uploaded.items():
            # Keep only the final path component of the client-supplied name.
            file_path = os.path.join(PDFS_DIRECTORY, os.path.basename(filename))
            with open(file_path, "wb") as f:
                f.write(payload)
        if file_path:
            st.success(f"✅ PDF uploaded: {os.path.basename(file_path)}")

    else:
        st.markdown("### 📄 Document Upload")
        uploaded_file = st.file_uploader("Upload PDF", type="pdf", accept_multiple_files=False)
        if uploaded_file:
            file_path = upload_pdf(uploaded_file)
            st.success(f"✅ Uploaded: {uploaded_file.name} ({uploaded_file.size/1024:.1f} KB)")

    # Question input
    st.markdown("### 💬 Ask Your Question")

    def _use_template(question):
        """Button callback: put a quick question into the text area."""
        st.session_state["query_input"] = question

    query = st.text_area("Enter your question:", key="query_input")

    # Predefined quick templates
    quick_questions = [
        "What are the main terms and conditions?",
        "What are the rights and obligations?",
        "What are the termination clauses?",
        "Summarize the key points"
    ]
    # A button only reports True during the rerun caused by its own click, so
    # assigning `query` from its return value was lost as soon as "Analyze
    # Document" triggered the next rerun. The callback stores the question in
    # the text area's session state, so it persists until analysis.
    cols = st.columns(len(quick_questions))
    for col, q in zip(cols, quick_questions):
        col.button(q, on_click=_use_template, args=(q,))

    # Analyze button
    if st.button("😊 Analyze Document"):
        if not api_key or not file_path or not query:
            st.error("⚠️ Please upload a PDF, enter your API key, and type a question.")
            return

        try:
            # Load and process PDF
            docs = PyMuPDFLoader(file_path).load()
            if not docs:
                st.error("❌ Could not extract text from PDF")
                return

            # Create chunks + vector store (rebuilt on every analysis)
            chunks = create_chunks(docs)
            db = create_vector_store(chunks)
            retrieved = db.similarity_search_with_score(query, k=4)

            # Initialize LLM
            llm = initialize_llm(api_key)
            response = answer_query(retrieved, llm, query)

            # Display result
            st.subheader("📋 AI Analysis")
            st.success(response)

            # Context explorer: show the first 500 characters of each chunk
            with st.expander("📚 Retrieved Context"):
                for doc, score in retrieved:
                    st.markdown(f"**Page {doc.metadata.get('page')} - Score {score:.3f}**")
                    st.text(doc.page_content[:500] + ("..." if len(doc.page_content) > 500 else ""))

        except Exception as e:
            # Show the failure in the UI instead of crashing the app.
            st.error(f"❌ Error: {str(e)}")


# ==============================
# 🚀 Run
# ==============================
# Entry point: build the UI when this code is executed directly (as a script
# or as a notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()


Enter your API key here:··········






Saving wormhole.pdf to wormhole.pdf


In [22]:
# Print this machine's public IPv4 address.
# NOTE(review): presumably needed as the localtunnel "tunnel password" for the URL opened by the next cell — confirm.
!wget -q -O - ipv4.icanhazip.com

35.197.15.53


In [25]:
# Start the Streamlit server in the background and expose port 8501 through localtunnel.
# NOTE(review): no visible cell writes app.py — presumably the app code was saved to app.py separately (e.g. with %%writefile); confirm before Restart & Run All.
! streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.197.15.53:8501[0m
[0m
[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0Kyour url is: https://fast-bars-peel.loca.lt
[34m  Stopping...[0m
^C
