<a href="https://colab.research.google.com/github/Thilak21/mini-RAG-chatbot/blob/main/notebook/mini_RAG_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Upload Documents

In [None]:
from google.colab import files
# Opens the Colab file-upload widget; the keys of `uploaded` are used by the
# loading cell below as the paths of the uploaded files.
uploaded = files.upload()

Load and Split Documents

In [29]:

from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# Load every uploaded file into LangChain documents.
# PDFs and plain-text files are supported; anything else is reported and skipped.
documents = []
loaded_file_count = 0
for file in uploaded.keys():
  print("Loading:", file)
  # Compare extensions case-insensitively so "REPORT.PDF" is not silently ignored.
  lowered_name = file.lower()
  if lowered_name.endswith(".pdf"):
    loader = PyPDFLoader(file)
  elif lowered_name.endswith(".txt"):
    loader = TextLoader(file)
  else:
    print("Skipping unsupported file type:", file)
    continue

  docs = loader.load()
  documents.extend(docs)
  loaded_file_count += 1

# Count only the files that were actually loaded, not every uploaded file.
print("Total documents loaded:", loaded_file_count)
print('Total pages:', len(documents))


# Split the loaded documents into overlapping chunks for embedding.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)

chunks = splitter.split_documents(documents)
# Plain strings, in the same order as `chunks`; the retrieval function indexes
# into this list with the row ids returned by the vector index.
texts = [chunk.page_content for chunk in chunks]

# Fail fast with a clear message: with no chunks the later cells break on
# `embeddings.shape[1]` with an unhelpful IndexError.
if not texts:
  raise ValueError(
      "No text chunks were produced; upload at least one non-empty .pdf or .txt file."
  )

print("Total Chunks:", len(texts))

Creating Embeddings

In [None]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model; the same instance is reused later to embed queries
# so that queries and chunks are encoded by the same model.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# One embedding per entry of `texts`, encoded in batches of 32.
# NOTE(review): presumably a 2-D array (n_chunks, dim), since the FAISS cell reads `.shape[1]` — confirm.
embeddings = embedding_model.encode(texts, batch_size=32, show_progress_bar=True)

Store in FAISS

In [32]:
import faiss
import numpy as np

In [33]:
# Cast to float32 up front so the dimension is read from the exact array
# that is handed to FAISS.
embeddings = np.array(embeddings).astype('float32')
dimension = embeddings.shape[1]

# Flat L2 index: one stored vector per chunk, in the same order as `texts`.
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print("Vectors Stored:", index.ntotal)

Vectors Stored: 950


Check that the vector, chunk, and embedding counts match

In [34]:
# Report the size of each stage of the pipeline.
print("Documents:", len(documents))
print("Vectors:", index.ntotal)
print("chunks:", len(chunks))
print("Embeddings:", len(embeddings))

# answer_question maps FAISS row ids straight into `texts`, so the index,
# chunks, texts and embeddings must all have the same length. Turn the visual
# check into an enforced one so stale state from out-of-order runs is caught.
assert index.ntotal == len(chunks) == len(texts) == len(embeddings), (
    "Index, chunks, texts and embeddings are out of sync; re-run the cells above in order."
)


Documents: 87
Vectors: 950
chunks: 950
Embeddings: 950


Load Hugging Face LLM

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hugging Face checkpoint id; the tokenizer and the model are both loaded from it
# so that they stay matched.
model_name = "google/flan-t5-base"

# `tokenizer` and `model` are used by answer_question to build and generate from the prompt.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Build Retrieval and Generation Function

In [36]:
def answer_question(query, k=4, max_context_chars=1000):
  """Answer `query` using the indexed document chunks as context.

  The query is embedded with `embedding_model`, the nearest chunks are looked
  up in the FAISS `index`, and their text is placed in a prompt for the
  seq2seq `model`.

  Args:
    query: The user's question as a string.
    k: Number of nearest chunks to request from the index.
    max_context_chars: Upper bound on the number of context characters put
      into the prompt (the joined chunks are cut at this length).

  Returns:
    The decoded model output as a string. A blank `query` returns a short
    message asking for a question instead of calling the model.
  """
  # Nothing to retrieve or generate for a blank question.
  if not query or not query.strip():
    return "Please enter a question."

  query_embedding = embedding_model.encode([query])
  query_embedding = np.array(query_embedding).astype('float32')
  _, indices = index.search(query_embedding, k)

  # FAISS pads the result with -1 when the index holds fewer than k vectors;
  # without this filter texts[-1] would silently inject the last chunk.
  retrieved_chunks = [texts[i] for i in indices[0] if i >= 0]
  context = "\n\n".join(retrieved_chunks)
  # Keep the context short: the question comes after it in the prompt, so an
  # over-long context could presumably push the question past the tokenizer's
  # truncation limit.
  context = context[:max_context_chars]

  prompt = f"""
  Answer only using the provided context .if the answer is not in the context, say i don't know.
    Context:
    {context}
    Question:
    {query}
    Answer :
    """

  inputs = tokenizer(prompt, return_tensors="pt", truncation = True)

  # `temperature` only applies when sampling is enabled; the original call
  # passed it without do_sample=True, so decoding was already greedy. Make that
  # explicit and drop the ignored flag (it also triggers a warning in recent
  # transformers releases).
  outputs = model.generate(
          **inputs,
          max_new_tokens=150,
          do_sample=False)

  answer = tokenizer.decode(outputs[0], skip_special_tokens = True)
  return answer

Create Gradio UI

In [37]:
import gradio as gr

def chatbot(query):
  """Gradio callback: pass the submitted text to `answer_question` and return its reply."""
  answer = answer_question(query)
  return answer

# Single text box in, single text box out, backed by the `chatbot` callback.
interface = gr.Interface(
    title="Mini RAG Chatbot",
    description="Ask questions based on uploaded documents",
    fn=chatbot,
    inputs="text",
    outputs="text",
)

# share=True requests a public gradio.live URL in addition to the local server.
interface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5ed83c695939b071b7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


