In [1]:
# ==========================================
# üìò OFFLINE DOCUMENT Q&A CHATBOT (RAG SYSTEM)
# ==========================================

# ‚úÖ Step 1: Install dependencies
# (Uncomment below if not installed)
# !pip install PyPDF2 faiss-cpu sentence-transformers transformers torch accelerate tqdm

# ‚úÖ Step 2: Imports
import os
import PyPDF2
import numpy as np
import faiss
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ‚úÖ Step 3: Load your document
def load_document(file_path):
    text = ""
    if file_path.endswith(".pdf"):
        with open(file_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                text += page.extract_text() + "\n"
    elif file_path.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    else:
        raise ValueError("Unsupported file type! Use PDF or TXT.")
    return text.strip()

In [3]:
# ‚úÖ Step 4: Split document into chunks
def chunk_text(text, chunk_size=300):
    words = text.split()
    return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

In [4]:
# ‚úÖ Step 5: Create embeddings and FAISS index
def build_faiss_index(chunks, model_name="all-MiniLM-L6-v2"):
    embedder = SentenceTransformer(model_name)
    embeddings = embedder.encode(chunks, show_progress_bar=True, convert_to_numpy=True)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    print(f"‚úÖ FAISS index built with {index.ntotal} chunks.")
    return embedder, index, embeddings

In [5]:
# ‚úÖ Step 6: Load local QA model
def load_qa_model():
    print("Loading local QA model (google/flan-t5-base)...")
    return pipeline("text2text-generation", model="google/flan-t5-base")

In [6]:
# ‚úÖ Step 7: Retrieve top chunks
def retrieve_context(query, embedder, index, chunks, k=2):
    query_emb = embedder.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_emb, k)
    return " ".join([chunks[i] for i in indices[0]])

In [7]:
# ‚úÖ Step 8: Generate an answer
def generate_answer(query, embedder, index, chunks, qa_model):
    context = retrieve_context(query, embedder, index, chunks)
    prompt = f"Answer the question using only the context below:\nContext: {context}\n\nQuestion: {query}\nAnswer:"
    response = qa_model(prompt, max_new_tokens=150)[0]["generated_text"]
    return response.strip()

In [8]:
# ‚úÖ Step 9: Run the chatbot
file_path = input("üìÑ Enter path of your document (PDF or TXT): ").strip()

if not os.path.exists(file_path):
    print("‚ùå File not found! Please check the path and try again.")
else:
    text = load_document(file_path)
    chunks = chunk_text(text)
    embedder, index, embeddings = build_faiss_index(chunks)
    qa_model = load_qa_model()

    print("\n‚úÖ Chatbot is ready! Ask questions related to your document.\n(Type 'exit' to quit.)")
    while True:
        query = input("\nüí¨ Your question: ")
        if query.lower() in ["exit", "quit"]:
            print("üëã Exiting chatbot.")
            break
        answer = generate_answer(query, embedder, index, chunks, qa_model)
        print("\nü§ñ Answer:", answer)

üìÑ Enter path of your document (PDF or TXT):  C:\Users\prana\OneDrive\Desktop\CHEMICAL KINETICS\SD4.pdf


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  2.71it/s]


‚úÖ FAISS index built with 6 chunks.
Loading local QA model (google/flan-t5-base)...


Device set to use cpu



‚úÖ Chatbot is ready! Ask questions related to your document.
(Type 'exit' to quit.)



üí¨ Your question:  define elastic scattering


Token indices sequence length is longer than the specified maximum sequence length for this model (958 > 512). Running this sequence through the model will result in indexing errors



ü§ñ Answer: the energy of the scattered light is the same as the incident light



üí¨ Your question:  who got noble prize for electron microscope



ü§ñ Answer: Ernst Ruska



üí¨ Your question:  which year



ü§ñ Answer: 2017



üí¨ Your question:   who got noble prize for electron microscope and in which year



ü§ñ Answer: Ernst Ruska won the Nobel Prize in Physics in 1986 for his work on electron microscopes, including designing the first commercially available electron microscope . The Nobel Prize in Chemistry 2017 is awarded toJacques Dubochet , Joachim Frank , and Richard Henderson for the development ofcryo-electron microscopy , which both simplifies and improves the imaging ofbiomolecules (colloids)



üí¨ Your question:  exit


üëã Exiting chatbot.
