In [1]:
!pip uninstall -y langchain langchain-community langchain-core
!pip install langchain==0.1.20
!pip install langchain-community==0.0.36
!pip install langchain-core==0.1.52
!pip install langchain-text-splitters
!pip install transformers sentence-transformers faiss-cpu pypdf accelerate bitsandbytes

Found existing installation: langchain 0.1.20
Uninstalling langchain-0.1.20:
  Successfully uninstalled langchain-0.1.20
Found existing installation: langchain-community 0.0.38
Uninstalling langchain-community-0.0.38:
  Successfully uninstalled langchain-community-0.0.38
Found existing installation: langchain-core 0.1.53
Uninstalling langchain-core-0.1.53:
  Successfully uninstalled langchain-core-0.1.53
Collecting langchain==0.1.20
  Using cached langchain-0.1.20-py3-none-any.whl.metadata (13 kB)
Collecting langchain-community<0.1,>=0.0.38 (from langchain==0.1.20)
  Using cached langchain_community-0.0.38-py3-none-any.whl.metadata (8.7 kB)
Collecting langchain-core<0.2.0,>=0.1.52 (from langchain==0.1.20)
  Using cached langchain_core-0.1.53-py3-none-any.whl.metadata (5.9 kB)
Using cached langchain-0.1.20-py3-none-any.whl (1.0 MB)
Using cached langchain_community-0.0.38-py3-none-any.whl (2.0 MB)
Using cached langchain_core-0.1.53-py3-none-any.whl (303 kB)
Installing collected packages:

In [2]:
import os
import re
import requests
import logging
from io import BytesIO

import torch
from pypdf import PdfReader

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    logging as hf_logging
)

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA


In [3]:
hf_logging.set_verbosity_error()
logging.basicConfig(level=logging.INFO)

In [4]:
text_sources = {
    "Garuda": "https://github.com/SaiSudheerKankanala/SAIbot/raw/main/GarudaPurana.pdf",
    "Bhagavad Gita": "https://github.com/SaiSudheerKankanala/SAIbot/raw/main/Bhagavad-gita_As_It_Is.pdf",
}
documents = []

for name, url in text_sources.items():
    response = requests.get(url)
    reader = PdfReader(BytesIO(response.content))

    for i, page in enumerate(reader.pages):
        text = page.extract_text() or ""
        documents.append(
            Document(
                page_content=text,
                metadata={"source": name, "page": i}
            )
        )

logging.info(f"Loaded {len(documents)} pages")


In [5]:
legal_stopwords = {
    "herein","hereinafter","therein","thereof","thereby","whereas",
    "wherein","thereafter","notwithstanding","pursuant",
    "agreement","party","parties","contract","plaintiff","defendant",
    "court","judge","jury","appeal","order","decision","shall","may","must"
}

def clean_text(text):
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    words = [w for w in text.split() if w not in legal_stopwords and len(w) > 2]
    return " ".join(words)

clean_docs = [
    Document(
        page_content=clean_text(doc.page_content),
        metadata=doc.metadata
    )
    for doc in documents
]

In [6]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=200
)

chunks = []
for doc in clean_docs:
    for i, chunk in enumerate(splitter.split_text(doc.page_content)):
        meta = doc.metadata.copy()
        meta["chunk_id"] = i
        chunks.append(Document(page_content=chunk, metadata=meta))

logging.info(f"Created {len(chunks)} chunks")


In [7]:
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
INDEX_PATH = "faiss_index"

if os.path.exists(INDEX_PATH):
    vectorstore = FAISS.load_local(
        INDEX_PATH,
        embedding,
        allow_dangerous_deserialization=True
    )
    logging.info("Loaded FAISS index safely")
else:
    vectorstore = FAISS.from_documents(chunks, embedding)
    vectorstore.save_local(INDEX_PATH)
    logging.info("Created & saved FAISS index")

In [10]:
model_name = "tiiuae/falcon-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map="auto",
    torch_dtype=torch.float16
)

text_gen_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=150,
    temperature=0.3,
    do_sample=True
)

llm = HuggingFacePipeline(pipeline=text_gen_pipeline)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="map_reduce"
)


In [12]:
while True:
    query = input("\nAsk your question (type 'exit' to quit): ")

    if query.lower() in {"exit", "quit"}:
        break

    result = rag_chain.invoke({"query": query})

    print("\n==== ANSWER ====\n")
    print(result["result"])
    print("\n================")



Ask your question (type 'exit' to quit): what is karma?


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]


==== ANSWER ====

Given the following extracted parts of a long document and a question, create a final answer. 
If you don't know the answer, just say that you don't know. Don't try to make up an answer.

QUESTION: Which state/country's law governs the interpretation of the contract?
Content: This Agreement is governed by English law and the parties submit to the exclusive jurisdiction of the English courts in  relation to any dispute (contractual or non-contractual) concerning this Agreement save that either party may apply to any court for an  injunction or other relief to protect its Intellectual Property Rights.

Content: No Waiver. Failure or delay in exercising any right or remedy under this Agreement shall not constitute a waiver of such (or any other)  right or remedy.

11.7 Severability. The invalidity, illegality or unenforceability of any term (or part of a term) of this Agreement shall not affect the continuation  in force of the remainder of the term (if any) and this Ag