<a href="https://colab.research.google.com/github/VARSHAMG23/Tranformers/blob/main/RAG_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
!pip install sentence-transformers faiss-cpu transformers gradio pypdf nltk




In [21]:
from google.colab import files


In [22]:
import nltk, os
# Download NLTK data for tokenization
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
from pypdf import PdfReader

# Function to read content from different document types
def read_document(path):
    """Return the text content of a ``.txt`` or ``.pdf`` file.

    Args:
        path: Filesystem path as a string. The extension is matched
            case-insensitively, so ``notes.TXT`` and ``paper.PDF`` are handled.

    Returns:
        The file's text. PDF pages are joined with single spaces, and a page
        with no extractable text contributes an empty string. Any other
        extension returns ``""``.

    Raises:
        OSError: If the file cannot be opened.
    """
    lowered = path.lower()
    # Read text files
    if lowered.endswith(".txt"):
        # Context manager guarantees the handle is closed even if read() fails.
        with open(path, "r", encoding="utf-8") as handle:
            return handle.read()
    # Read PDF files
    if lowered.endswith(".pdf"):
        reader = PdfReader(path)
        return " ".join(page.extract_text() or "" for page in reader.pages)
    # Handle unsupported file types
    return ""

# Function to chunk text into smaller segments for processing
def chunk_text(text, chunk_size=8, overlap=2):
    """Split ``text`` into overlapping chunks of whole sentences.

    Args:
        text: Raw document text; split into sentences with ``sent_tokenize``.
        chunk_size: Maximum number of sentences per chunk (must be >= 1).
        overlap: Number of sentences shared between consecutive chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        A list of strings, each the space-joined sentences of one chunk.
        Empty when ``text`` contains no sentences.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    # Validate before tokenizing: a step of zero or less would never advance
    # the window (infinite loop), and a negative overlap would skip sentences.
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    # Tokenize text into sentences
    sentences = sent_tokenize(text)
    step = chunk_size - overlap
    # Slide a window of chunk_size sentences forward by `step` each time.
    return [
        " ".join(sentences[start:start + chunk_size])
        for start in range(0, len(sentences), step)
    ]

# The document is assumed to already be present in the working directory, so
# `uploaded` mimics the {filename: content} mapping that files.upload() returns.
uploaded = {'Virat_Kohli_Info.txt': ''}

# Read every listed file, split it into chunks, and keep one record per chunk
# that remembers which file the chunk came from.
docs = [
    {"text": segment, "source": filename}
    for filename in uploaded
    for segment in chunk_text(read_document(filename))
]

# Report how many chunks were produced in total
print("Total chunks:", len(docs))

Total chunks: 1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [23]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model; the same `embedder` is reused later to encode queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every chunk's text, in `docs` order, so row i of `embeddings`
# corresponds to docs[i].
texts = [d["text"] for d in docs]
embeddings = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=True)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [24]:
import faiss

# Vector dimensionality is taken from the second axis of the embeddings array.
dim = embeddings.shape[1]
# Flat L2 index over the chunk embeddings.
index = faiss.IndexFlatL2(dim)
# Vectors are added in `docs` order; retrieve() maps returned ids straight back to docs[i].
index.add(embeddings)


In [25]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Seq2seq model used to generate answers from the retrieved context.
model_name = "google/flan-t5-base"  # small enough for Colab CPU
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Text-to-text pipeline wrapping the model and tokenizer; answer() calls this
# with the assembled prompt.
# max_length=300 is passed through to the pipeline (presumably a cap on
# generated tokens — TODO confirm against the transformers docs).
generator = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=300
)


Device set to use cpu


In [26]:
def retrieve(query, k=4):
    """Return up to ``k`` chunk records from ``docs`` nearest to ``query``.

    Args:
        query: The user's question as a string.
        k: Maximum number of chunks to return.

    Returns:
        A list of ``docs`` entries ordered as FAISS returns them. It holds
        fewer than ``k`` entries when the index has fewer vectors, and is
        empty when the index is empty or ``k`` is not positive.
    """
    # Never ask for more neighbours than the index holds.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    q_emb = embedder.encode([query], convert_to_numpy=True)
    _, indices = index.search(q_emb, k)
    # FAISS pads missing results with -1; docs[-1] would silently return the
    # last chunk, so negative ids are dropped instead of being indexed.
    return [docs[i] for i in indices[0] if i >= 0]


In [27]:
def build_prompt(question, retrieved_chunks):
    """Assemble the generation prompt from the question and retrieved chunks.

    Args:
        question: The user's question string.
        retrieved_chunks: Iterable of records that each carry a ``"text"`` key.

    Returns:
        A string laid out as ``Context: ...``, ``Question: ...`` and a trailing
        ``Answer: `` cue, with the sections separated by blank lines.
    """
    # Chunk texts are concatenated with single spaces to form the context.
    passages = [record["text"] for record in retrieved_chunks]
    context = " ".join(passages)

    sections = [
        "Context: ", context, "\n\n",
        "Question: ", question, "\n\n",
        "Answer: ",
    ]
    return "".join(sections)


In [28]:
def answer(query):
    """Answer ``query`` by retrieving chunks and generating from them.

    Args:
        query: The user's question string.

    Returns:
        The ``"generated_text"`` field of the first result produced by
        ``generator`` for the assembled prompt.
    """
    # Retrieve -> build prompt -> generate.
    prompt = build_prompt(query, retrieve(query))
    results = generator(prompt)
    first_result = results[0]
    return first_result["generated_text"]


In [30]:
import gradio as gr

def chat_fn(question):
    """Gradio callback: answer a question typed into the UI.

    Args:
        question: Text from the input box; may be empty or ``None``.

    Returns:
        The generated answer, or a short hint when no question was entered.
    """
    # Skip retrieval and generation for blank submissions; the model would
    # otherwise produce an answer to an empty question.
    if question is None or not question.strip():
        return "Please enter a question."
    return answer(question.strip())

# Input and output widgets for the single-turn question/answer form.
question_box = gr.Textbox(label="Ask a question about your documents")
response_box = gr.Textbox(label="Response")

# Bind the chat callback to the widgets.
ui = gr.Interface(
    fn=chat_fn,
    inputs=question_box,
    outputs=response_box,
    title="RAG Chatbot",
)

# share=True asks Gradio for a public URL in addition to the local server.
ui.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://59d7afac9821f59de9.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


