<a href="https://colab.research.google.com/github/Qasim-Gill/rag-pdf-upload-chatbot/blob/main/rag_exercise_for_company_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q pymupdf sentence-transformers faiss-cpu transformers langchain pdfplumber streamlit gdown



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m79.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import fitz  # PyMuPDF
import faiss
import numpy as np
import streamlit as st
import gdown
import pdfplumber
from sentence_transformers import SentenceTransformer, util
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load Models
@st.cache_resource(show_spinner=False)
def _load_models(llm_name):
    """Load the embedding model and the seq2seq tokenizer/model once.

    Streamlit re-runs this script on every widget interaction; caching the
    loader keeps the (large) models in memory across those re-runs instead
    of reloading them each time.

    Args:
        llm_name: Hugging Face model id passed to the T5 tokenizer and model.

    Returns:
        A ``(embedder, tokenizer, model)`` tuple.
    """
    print("🔄 Loading models...")
    embedder = SentenceTransformer("all-MiniLM-L6-v2")  # Embedding Model
    t5_tokenizer = T5Tokenizer.from_pretrained(llm_name)
    t5_model = T5ForConditionalGeneration.from_pretrained(llm_name)
    print("✅ Models loaded successfully.")
    return embedder, t5_tokenizer, t5_model


model_name = "google/flan-t5-large"  # LLM Model
embed_model, tokenizer, model = _load_models(model_name)

# Define Google Drive file IDs
# NOTE: every entry ends with a comma so that re-enabling one of the
# commented-out IDs cannot silently merge two adjacent string literals
# into a single (invalid) ID through implicit concatenation.
pdf_links = [
    "1QxsNzr-9BsSFGlVEJFcuL63OF06-ssrp",
    # "1CEam1bfUKukBv23K72mS7aGy10I5oJDN",
    # "1yW3kxVc5ruoIzKp37thGHWoAPoTjTlOD",
    # "1oRzbim1rmgd3dOQT55QSI40MmZv0jTSr",
]

# Directory to store PDFs (created up front so downloads have a target)
pdf_dir = "rag_company_pdfs"
os.makedirs(pdf_dir, exist_ok=True)

# Extract text from PDFs
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of ``pdf_path``, joined by newlines.

    A page for which ``extract_text()`` yields nothing contributes an empty
    string. Any exception raised while opening or reading the file is
    reported on stdout and an empty string is returned instead.
    """
    try:
        with pdfplumber.open(pdf_path) as document:
            page_texts = []
            for page in document.pages:
                extracted = page.extract_text()
                page_texts.append(extracted if extracted else "")
            combined = "\n".join(page_texts)
    except Exception as err:
        print(f"❌ Error extracting text from {pdf_path}: {err}")
        return ""
    return combined

# Process PDFs
# The splitter is configured identically for every PDF, so build it once
# rather than once per loop iteration.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)

chunks = []
for file_id in pdf_links:
    pdf_path = os.path.join(pdf_dir, f"{file_id}.pdf")

    # Reuse a copy that is already on disk: this script is re-executed often
    # (e.g. on every Streamlit interaction) and should not hit the network
    # each time.
    if not os.path.exists(pdf_path):
        try:
            gdown.download(f"https://drive.google.com/uc?id={file_id}", pdf_path, quiet=False)
        except Exception as e:
            # One unreachable file must not abort processing of the others.
            print(f"❌ Error downloading {file_id}: {e}")

    if not os.path.exists(pdf_path):
        print(f"❌ PDF {pdf_path} was not downloaded correctly.")
        continue

    pdf_text = extract_text_from_pdf(pdf_path)
    if not pdf_text.strip():
        print(f"❌ No text extracted from {pdf_path}, skipping.")
        continue

    # Chunking text
    pdf_chunks = text_splitter.split_text(pdf_text)

    if not pdf_chunks:
        print(f"❌ No chunks created for {pdf_path}, skipping.")
        continue

    print(f"✅ {len(pdf_chunks)} chunks created.")
    chunks.extend(pdf_chunks)

if not chunks:
    print("❌ No text chunks available for processing. Exiting...")
else:
    print(f"✅ Total {len(chunks)} chunks created across all PDFs.")

# Generate embeddings for all chunks
if chunks:
    print("🔄 Generating embeddings...")
    # Encode the whole list in one call so the model can batch the work;
    # the result is one row per chunk.
    embeddings = np.asarray(embed_model.encode(chunks), dtype=np.float32)
    print(f"✅ Generated {embeddings.shape[0]} embeddings of dimension {embeddings.shape[1]}")
else:
    print("❌ No chunks found, skipping embeddings.")
    # Always bind `embeddings` so the index step below cannot hit a NameError.
    embeddings = np.empty((0, 0), dtype=np.float32)

# Store embeddings in FAISS
# `index` stays None when there is nothing to search.
index = None
if embeddings.shape[0] > 0:
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    print(f"✅ Added {index.ntotal} embeddings to FAISS index.")
else:
    print("❌ No embeddings available, FAISS index not created.")

# Retrieve Relevant Chunks (Improved Re-ranking)
def retrieve_relevant_chunks(query, top_k=8):
    """Return up to three chunks most similar to ``query``.

    The module-level FAISS ``index`` supplies ``top_k`` nearest candidates,
    which are then re-ranked by cosine similarity to the query embedding.

    Args:
        query: The user's question.
        top_k: Number of candidates to fetch from the index before re-ranking.

    Returns:
        A list of at most three chunk strings, most similar first. Empty when
        no chunks are indexed, ``top_k`` is not positive, or nothing matched.
    """
    # Never ask the index for more neighbours than it holds.
    candidate_count = min(top_k, len(chunks))
    if candidate_count <= 0:
        return []

    # Encode the query once and reuse the vector for search and re-ranking.
    query_embedding = embed_model.encode(query).reshape(1, -1).astype(np.float32)
    print(f"🔎 Searching FAISS for query: {query}")

    _, indices = index.search(query_embedding, candidate_count)
    # FAISS pads missing results with -1; without the lower bound that value
    # would silently select the last chunk via negative indexing.
    retrieved = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
    if not retrieved:
        print("✅ Retrieved 0 chunks")
        return []

    # Re-rank based on similarity score (one batched encode, one cos-sim call)
    chunk_embeddings = embed_model.encode(retrieved)
    similarities = util.pytorch_cos_sim(query_embedding, chunk_embeddings)[0].tolist()

    # Sort by highest similarity; keying on the score avoids comparing chunk text
    ranked = sorted(zip(similarities, retrieved), key=lambda pair: pair[0], reverse=True)
    sorted_chunks = [chunk for _, chunk in ranked]

    print(f"✅ Retrieved {len(sorted_chunks)} chunks")
    for i, chunk in enumerate(sorted_chunks[:3]):  # Show top 3 chunks in logs
        print(f"Chunk {i+1}: {chunk[:200]}...")

    return sorted_chunks[:3]  # Keep only the top 3 most relevant chunks

# Generate Answer with FLAN-T5 (More Context + Detailed Response)
def generate_answer(question, context):
    """Produce an answer to ``question`` from ``context`` with the loaded LLM.

    A blank (empty or whitespace-only) context short-circuits to a fixed
    warning message without invoking the model. Otherwise the prompt is
    tokenized (truncated to 1024 tokens) and decoded from a sampled
    generation of at most 768 tokens.
    """
    if context.strip() == "":
        return "⚠️ No relevant information found in the documents."

    llm_prompt = f"""
    The following is information from company policy documents.
    Use the given context to answer the question as accurately and fully as possible.

    Context: {context}
    Question: {question}

    Answer the question in detail, ensuring that the explanation is thorough and user-friendly.
    """

    encoded = tokenizer(llm_prompt, return_tensors="pt", truncation=True, max_length=1024)
    # Sampling settings are kept together so they are easy to spot and tune.
    sampling_options = {
        "max_length": 768,  # Increased max tokens
        "do_sample": True,
        "top_p": 0.95,
        "temperature": 0.7,
    }
    generated = model.generate(**encoded, **sampling_options)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Streamlit UI
st.title("📄 PDF Chatbot with RAG")
st.subheader("Ask a question based on the company PDFs")

user_question = st.text_input("Ask a question:")
# Render the button unconditionally, then act only when it was pressed
# and a non-empty question was entered.
ask_clicked = st.button("Get Answer")
if ask_clicked and user_question:
    retrieved_chunks = retrieve_relevant_chunks(user_question)
    context = " ".join(retrieved_chunks)
    answer = generate_answer(user_question, context)

    if not answer.strip():
        st.write("⚠️ No relevant answer found.")
    else:
        st.write("**Answer:**", answer)


🔄 Loading models...
✅ Models loaded successfully.


Downloading...
From: https://drive.google.com/uc?id=1QxsNzr-9BsSFGlVEJFcuL63OF06-ssrp
To: /content/rag_company_pdfs/1QxsNzr-9BsSFGlVEJFcuL63OF06-ssrp.pdf
100%|██████████| 450k/450k [00:00<00:00, 6.23MB/s]


✅ 146 chunks created.
✅ Total 146 chunks created across all PDFs.
🔄 Generating embeddings...




✅ Generated 146 embeddings of dimension 384
✅ Added 146 embeddings to FAISS index.


In [None]:
!wget -qO- ipv4.icanhazip.com

34.16.163.174


In [None]:
!streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.16.163.174:8501[0m
[0m
[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0Kyour url is: https://swift-ghosts-warn.loca.lt
2025-02-14 13:38:06.327255: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739540286.351159   19463 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739540286.358675   19463 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has alre