In [5]:
import requests
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import pipeline
text_sources = {
    'ipc': 'https://raw.githubusercontent.com/SaiSudheerKankanala/SAIbot/main/ipc.txt',
    'gita': 'https://raw.githubusercontent.com/SaiSudheerKankanala/SAIbot/main/Bhagavad-gita_As_It_Is.txt',
    'quran': 'https://raw.githubusercontent.com/SaiSudheerKankanala/SAIbot/main/quran-allah.txt',
    'bible': 'https://raw.githubusercontent.com/SaiSudheerKankanala/SAIbot/main/Bible.txt',
    'constitution': 'https://raw.githubusercontent.com/SaiSudheerKankanala/SAIbot/main/indian%20constitution.txt',
    'garuda': 'https://raw.githubusercontent.com/SaiSudheerKankanala/SAIbot/main/GarudaPurana.txt'
}

def load_text(url):
    response = requests.get(url)
    response.raise_for_status()
    return response.text
text_data = {name: load_text(url) for name, url in text_sources.items()}
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=["\n\nSECTION", "\n\nVERSE", "\n\nARTICLE", "\n\n"]
)

chunks = []
for name, text in text_data.items():
    for chunk in text_splitter.split_text(text):
        chunks.append({"text": chunk, "source": name})

texts = [chunk["text"] for chunk in chunks]
metadatas = [{"source": chunk["source"]} for chunk in chunks]
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vector_db = FAISS.from_texts(
    texts=texts,
    embedding=embedder,
    metadatas=metadatas
)

generator = pipeline("text2text-generation", model="google/flan-t5-base", device="cpu")

def generate_answer(question, context):
    input_text = f"question: {question} context: {context}"
    return generator(input_text, max_length=200)[0]['generated_text']

def answer_question(question, k=3):
    docs = vector_db.similarity_search(question, k=k)
    context = "\n\n".join([doc.page_content for doc in docs])
    sources = ", ".join(set(doc.metadata["source"] for doc in docs))
    answer = generate_answer(question, context)
    return f"Answer: {answer}\nSources: {sources}"
question = input("")
print(answer_question(question))

Device set to use cpu


punishment for rape 


Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Answer: death, or rigorous imprisonment for a term which shall not be less than twenty years, but which may extend to imprisonment for life, which shall mean imprisonment for the remainder of that person’s natural life, and shall also be liable to fine
Sources: ipc
