<a href="https://colab.research.google.com/github/Qudsia-jabeen20/RAG-Model/blob/main/RAG_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from functools import lru_cache

from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# 1. Load PDF
def load_pdf_text(filename):
    """Return the extracted text of every page in a PDF, joined by newlines.

    Args:
        filename: Path (or other source accepted by ``PdfReader``) of the PDF.

    Returns:
        A single string with one page's extracted text per newline-separated
        segment, in page order.
    """
    pdf = PdfReader(filename)
    page_texts = []
    for page in pdf.pages:
        page_texts.append(page.extract_text())
    return "\n".join(page_texts)

# 2. Chunk text
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split *text* into overlapping chunks suitable for embedding.

    Args:
        text: The full document text to split.
        chunk_size: Upper bound on the size of each chunk, forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 500.
        chunk_overlap: Amount of content shared between neighbouring chunks,
            forwarded to ``RecursiveCharacterTextSplitter``. Defaults to 50.

    Returns:
        The list of chunks produced by the splitter's ``split_text``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)

# 3. Embed and store in Chroma
def create_vectorstore(
    chunks,
    model_name="all-MiniLM-L6-v2",
    persist_directory="./rag_store",
):
    """Embed *chunks* and store them in a persisted Chroma vector store.

    Args:
        chunks: The text chunks to embed and index.
        model_name: Name of the Hugging Face embedding model handed to
            ``HuggingFaceEmbeddings``. Defaults to ``"all-MiniLM-L6-v2"``.
        persist_directory: Directory handed to Chroma as its on-disk
            location. Defaults to ``"./rag_store"``.

    Returns:
        The ``Chroma`` vector store built from the chunks.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    # NOTE(review): presumably an existing store in persist_directory is
    # added to rather than replaced, so re-runs may duplicate chunks - confirm.
    vectorstore = Chroma.from_texts(
        chunks,
        embeddings,
        persist_directory=persist_directory,
    )
    # Explicit flush to disk. NOTE(review): newer Chroma releases appear to
    # persist automatically, making this call redundant there - confirm.
    vectorstore.persist()
    return vectorstore

# 4. Retrieve context based on question
def retrieve_context(vectorstore, question, k=3):
    """Fetch the passages most relevant to *question* as one context string.

    Args:
        vectorstore: Store exposing ``as_retriever(search_kwargs=...)``.
        question: The query used to look up relevant documents.
        k: Number of documents requested from the retriever. Defaults to 3.

    Returns:
        The ``page_content`` of each retrieved document, joined by newlines.
    """
    search_options = {"k": k}
    matches = vectorstore.as_retriever(
        search_kwargs=search_options
    ).get_relevant_documents(question)
    passages = []
    for match in matches:
        passages.append(match.page_content)
    return "\n".join(passages)

# 5. QA using local model

@lru_cache(maxsize=1)
def _load_qa_model(model_name):
    """Load the tokenizer/model pair for *model_name*, memoising the result."""
    # maxsize=1: only the most recently used model stays resident, so
    # switching model names does not accumulate loaded models in memory.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model


def answer_question(context, question, model_name="google/flan-t5-base"):
    """Generate an answer to *question* grounded in *context*.

    The tokenizer and model are loaded once per model name and reused on
    later calls instead of being reloaded for every question.

    Args:
        context: Retrieved passages to condition the answer on.
        question: The user's question.
        model_name: Hugging Face seq2seq checkpoint to use. Defaults to
            ``"google/flan-t5-base"``.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    prompt = f"Answer the question based on the following context:\n\n{context}\n\nQuestion: {question}\nAnswer:"
    tokenizer, model = _load_qa_model(model_name)

    # Prompts longer than max_length tokens are truncated by the tokenizer.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
    outputs = model.generate(**inputs, max_length=256)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# 🧪 Run the RAG Pipeline
if __name__ == "__main__":
    # Step 1: Load PDF or Text
    text = load_pdf_text("path of your file")  # change to your path

    # Step 2: Chunk
    chunks = chunk_text(text)

    # Step 3: Embed and Store
    vectorstore = create_vectorstore(chunks)

    # Step 4: Ask Q&A
    print("\n🤖 RAG Q&A Chatbot Ready! Type 'exit' to quit.")
    while True:
        try:
            # Strip so that "exit " or " exit" still ends the session.
            question = input("\nQ: ").strip()
        except EOFError:
            # stdin was closed; end the session as if the user typed 'exit'.
            break
        if not question:
            # Skip blank input rather than running retrieval and generation.
            continue
        if question.lower() == "exit":
            break
        context = retrieve_context(vectorstore, question)
        answer = answer_question(context, question)
        print(f"\nA: {answer}")




In [None]:
%pip install PyPDF2

In [None]:
%pip install -U langchain-community transformers

In [None]:
%pip install chromadb