In [1]:
%pip install -q langchain langchain-community transformers sentence-transformers faiss-cpu requests
!pip install pypdf

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m444.0/444.0 kB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.[0m[31m
[0mCollecting pypdf
  Downloading pypdf-6.0.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.0.0-py3-none

In [3]:
import requests
import tempfile
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import pipeline

# Sources
text_sources = {
    'I.P.C': 'https://github.com/SaiSudheerKankanala/SAIbot/raw/main/ipc.pdf',
    'Constitution': 'https://github.com/SaiSudheerKankanala/SAIbot/raw/main/indian%20constitution.pdf',
    'Garuda': 'https://github.com/SaiSudheerKankanala/SAIbot/raw/main/GarudaPurana.pdf',
    'Bhagavad Gita': 'https://github.com/SaiSudheerKankanala/SAIbot/raw/main/Bhagavad-gita_As_It_Is.pdf',
    'Quran': 'https://github.com/SaiSudheerKankanala/SAIbot/raw/main/quran-allah.pdf',
    'Bible':'https://github.com/SaiSudheerKankanala/SAIbot/raw/main/bb.pdf'
}

# Function to download and load PDF
def load_pdf(url, source_name):
    response = requests.get(url)
    response.raise_for_status()
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(response.content)
        tmp_path = tmp.name
    loader = PyPDFLoader(tmp_path)
    docs = loader.load()
    for d in docs:
        d.metadata["source"] = source_name
    return docs

# Load all documents
all_docs = {name: load_pdf(url, name) for name, url in text_sources.items()}

# Overlapping chunking
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,   # smaller for better focus
    chunk_overlap=50
)

# Embeddings + VectorDB
vector_dbs = {}
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

for name, docs in all_docs.items():
    chunks = text_splitter.split_documents(docs)
    if chunks:
        vector_dbs[name] = FAISS.from_documents(chunks, embedding=embedder)

# Local answer generator (FLAN-T5)
generator = pipeline("text2text-generation", model="google/flan-t5-base")

def generate_answer(question, context):
    prompt = f"""
    You are a helpful assistant.
    Answer the question ONLY if it is clearly answered in the given context.
    If the context is unrelated or unclear, respond with exactly: "Not mentioned in this source."

    Question: {question}
    Context: {context}

    Answer:
    """
    return generator(prompt, max_new_tokens=200, clean_up_tokenization_spaces=True)[0]['generated_text']

# Main QA function
def answer_question(question, k=2):
    final_output = []
    for source_name in text_sources.keys():
        if source_name in vector_dbs:
            docs = vector_dbs[source_name].similarity_search(question, k=k)
            if docs:
                context = "\n\n".join([doc.page_content for doc in docs])
                answer = generate_answer(question, context).strip()
                # Only include if real answer
                if answer and answer != "Not mentioned in this source.":
                    final_output.append(f"According to {source_name.capitalize()}: {answer}")
    return "\n\n".join(final_output) if final_output else "No relevant answer found in any source."

# Chat loop
if __name__ == "__main__":
    while True:
        question = input("You: ")
        if question.lower() in ["exit", "quit", "bye"]:
            print("Bot: Goodbye!")
            break
        print(answer_question(question))

  embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


You: what is the punishment for theft?
According to I.p.c: ten years, and shall also be liable to fine.

According to Constitution: a penalty greater than that which might have been inflicted under the law in force at the time of the commission of the offence

According to Garuda: poverty-stricken

According to Quran: Cut their hands.

According to Bible: he must pay seven times as much; he must give up all the wealth in his house.
You: stop


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


No relevant answer found in any source.
You: exit
Bot: Goodbye!
