# 📌 Cell 1 — Install Dependencies

In [1]:
# Install required packages for RAG + FastAPI + Ngrok
!pip install -q transformers accelerate sentence-transformers faiss-cpu \
fastapi uvicorn nest_asyncio pyngrok pydantic python-multipart


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25h

# 📌 Cell 2 — Import Libraries

In [2]:
# Core imports
import os
import torch
import nest_asyncio
import faiss
from fastapi import FastAPI, UploadFile, File
from pydantic import BaseModel

# HuggingFace embeddings + LLM
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer

# Ngrok for public API
from pyngrok import ngrok


# 📌 Cell 3 — Load Models (FREE MODELS)

In [4]:
# FREE Embedding model (no login required)
# NOTE(review): `embedding_model` is rebound to a LangChain `HuggingFaceEmbeddings`
# object by later cells, so which object is live depends on execution order.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# FREE LLM (no HuggingFace token required)
# NOTE(review): `model_name`, `tokenizer` and `llm_model` are reassigned by the
# BlenderBot cell further down; confirm this Phi-3 load is still needed.
model_name = "microsoft/Phi-3-mini-4k-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): the recorded output of this cell warns that `torch_dtype` is
# deprecated in favour of `dtype`; switch once the transformers version is pinned.
# NOTE(review): float16 presumably assumes a GPU runtime — TODO confirm.
llm_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

print("Models loaded successfully without HF token.")


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Models loaded successfully without HF token.


# 📌 Cell 4 — RAG: Text Chunking Function

In [5]:
# ---------------------------
# 📌 Cell 4 — Text Chunking for RAG
# ---------------------------

def chunk_text(text, chunk_size=300, overlap=50):
    """
    Split text into overlapping, whitespace-delimited word chunks.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Parameters:
        text (str): Input text; split on any whitespace.
        chunk_size (int): Max words per chunk. Must be positive.
        overlap (int): Words shared between neighbouring chunks.
            Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        list: Chunk strings in document order; ``[]`` for empty or
        whitespace-only text.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` would produce a
            non-positive stride (which previously looped forever) or a
            stride larger than the chunk (which silently dropped words).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    stride = chunk_size - overlap  # guaranteed >= 1 by the checks above

    # Slicing past the end of `words` is safe: the final chunk is just shorter.
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), stride)
    ]

print("Chunking function loaded successfully.")


Chunking function loaded successfully.


# Cell 5 — Build Vector Database (FAISS + Embeddings)

In [7]:
# =========================================================
# 📌 CELL 5 — Upload Documents & Create Embeddings (FAISS)
# =========================================================

from google.colab import files
import os
import glob
import pickle
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings

# Folder to store uploaded docs
# The directory is reused when it already exists, so files written by earlier
# runs stay in place and are picked up again by the directory scan below.
UPLOAD_DIR = "/content/docs"
os.makedirs(UPLOAD_DIR, exist_ok=True)

# ===== STEP 1: UPLOAD FILES =====
print("📁 Upload your files now (PDF, TXT, DOCX, CSV):")
# `uploaded` is iterated by filename; each value is written out in binary mode.
uploaded = files.upload()

for filename in uploaded.keys():
    file_path = os.path.join(UPLOAD_DIR, filename)
    with open(file_path, "wb") as f:
        f.write(uploaded[filename])
# This message is unconditional: it also prints when `uploaded` is empty.
print("✅ Files uploaded successfully!")

# ===== STEP 2: LOAD DOCUMENTS =====
def load_documents(directory):
    """
    Read every supported file directly inside ``directory`` into a string.

    Supported extensions (matched case-insensitively): ``.txt``, ``.pdf``,
    ``.docx`` and ``.csv``. Files with any other extension are ignored.

    Parameters:
        directory (str): Folder to scan (non-recursive).

    Returns:
        list: One text string per supported file, ordered by sorted path so
        that repeated runs produce the same document order.
    """
    docs = []
    # sorted(): glob order depends on the filesystem; keep results reproducible.
    for path in sorted(glob.glob(directory + "/*")):
        lowered = path.lower()  # accept "REPORT.PDF" as well as "report.pdf"

        if lowered.endswith(".txt"):
            with open(path, "r", encoding="utf-8") as handle:
                docs.append(handle.read())

        elif lowered.endswith(".pdf"):
            # Imported lazily so the dependency is only needed for PDF input.
            from PyPDF2 import PdfReader
            reader = PdfReader(path)
            # `or ""` guards against pages that yield no text.
            docs.append("".join(page.extract_text() or "" for page in reader.pages))

        elif lowered.endswith(".docx"):
            import docx
            document = docx.Document(path)
            docs.append("\n".join(p.text for p in document.paragraphs))

        elif lowered.endswith(".csv"):
            import csv
            # newline="" is what the csv module expects from file objects;
            # utf-8 matches the encoding used for .txt files above.
            with open(path, "r", encoding="utf-8", newline="") as csvfile:
                docs.append(
                    "".join(" ".join(row) + "\n" for row in csv.reader(csvfile))
                )

    return docs

documents = load_documents(UPLOAD_DIR)

# Stop here when there is nothing to index. Previously the cell only printed
# the message and then went on to build and pickle a vectorstore from nothing.
if len(documents) == 0:
    raise ValueError("❌ No documents found! Upload files first.")
print(f"📄 Loaded {len(documents)} documents.")

# ===== STEP 3: SPLIT TEXT =====
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)

text_chunks = splitter.create_documents(documents)
print(f"✂️ Created {len(text_chunks)} text chunks.")

# Documents can load successfully yet contain no text at all.
if len(text_chunks) == 0:
    raise ValueError("No text could be extracted from the uploaded documents.")

# ===== STEP 4: CREATE EMBEDDINGS MODEL =====
# NOTE(review): this rebinds `embedding_model`, which an earlier cell created
# as a SentenceTransformer; the FAISS search cell calls `.encode` on that name.
# Confirm which object each consumer expects.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# ===== STEP 5: CREATE VECTORSTORE =====
vectorstore = FAISS.from_documents(text_chunks, embedding_model)

# Save for later use
# SECURITY: pickle files execute arbitrary code when loaded; only ever load a
# vectorstore.pkl that this notebook produced itself.
with open("vectorstore.pkl", "wb") as f:
    pickle.dump(vectorstore, f)

print("🎉 SUCCESS! Vectorstore created and saved as vectorstore.pkl")


📁 Upload your files now (PDF, TXT, DOCX, CSV):


Saving In Big Data Engineering.pdf to In Big Data Engineering (1).pdf
✅ Files uploaded successfully!
📄 Loaded 1 documents.
✂️ Created 20 text chunks.
🎉 SUCCESS! Vectorstore created and saved as vectorstore.pkl


# Cell 6 — Load the Saved Vectorstore and Define the RAG Chatbot

In [9]:
# =========================================================
# 📌 FIXED CELL — Load Vectorstore + RAG Chatbot
# =========================================================

import pickle
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

VECTOR_DB_PATH = "vectorstore.pkl"

# ---------- Load FAISS Vectorstore ----------
def load_vectorstore():
    """
    Unpickle the vectorstore stored at ``VECTOR_DB_PATH``.

    Returns:
        The unpickled object on success, or ``None`` when the file cannot be
        opened or unpickled (the error is printed, not raised).

    SECURITY: unpickling runs arbitrary code from the file; only load files
    created by this notebook.
    """
    try:
        with open(VECTOR_DB_PATH, "rb") as store_file:
            loaded_store = pickle.load(store_file)
        print("✅ Vectorstore loaded successfully.")
    except Exception as load_error:
        print("❌ Could not load vectorstore:", load_error)
        return None
    return loaded_store

vectorstore = load_vectorstore()


# ---------- Load FREE BlenderBot Model ----------
# These assignments rebind `model_name`, `tokenizer` and `llm_model`, replacing
# the Phi-3 objects created by the earlier model-loading cell.
model_name = "facebook/blenderbot-400M-distill"
tokenizer = AutoTokenizer.from_pretrained(model_name)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

print("🤖 LLM Ready")


# ---------- RAG Chatbot Function ----------
def chatbot_answer(query):
    """
    Answer ``query`` using the three most similar chunks from the global
    ``vectorstore`` as context for the global ``tokenizer`` / ``llm_model``.

    Returns:
        str: The decoded model output, or a fixed error message when the
        vectorstore is missing or the search returns nothing.
    """
    if vectorstore is None:
        return "❌ Vectorstore not found. Please upload files and create embeddings first."

    # Top-3 nearest chunks for the question
    matches = vectorstore.similarity_search(query, k=3)
    if len(matches) == 0:
        return "❌ No relevant information found inside your PDF."

    context = "\n\n".join(match.page_content for match in matches)

    # The prompt is assembled from fixed instructions, the context, then the question.
    prompt_parts = [
        "You are a strict PDF-based AI assistant. ",
        "Answer ONLY using the information below. ",
        "If the answer is missing, respond: 'The document does not contain this information.'\n\n",
        f"DOCUMENT CONTENT:\n{context}\n\n",
        f"USER QUESTION: {query}\n\n",
        "ANSWER:",
    ]
    final_prompt = "".join(prompt_parts)

    # NOTE(review): with truncation=True an over-long prompt is cut; since the
    # question sits at the end, confirm it survives for this model's input limit.
    encoded = tokenizer(final_prompt, return_tensors="pt", truncation=True)
    generated = llm_model.generate(
        **encoded,
        max_new_tokens=150,
        num_beams=3,
        early_stopping=True,
    )

    return tokenizer.decode(generated[0], skip_special_tokens=True)


✅ Vectorstore loaded successfully.


tokenizer_config.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/730M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/730M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

🤖 LLM Ready


# Cell 7 — Build the FAISS Index (Final Step of Vector DB)

In [9]:
# -----------------------------------------
# 📌 Cell 7 — Build the FAISS Vector Index
# -----------------------------------------

import numpy as np

def build_vector_database():
    """
    Build the FAISS vector index from the module-level ``vectors`` list and
    publish it as the module-level ``index``.

    ``vectors`` is expected to be a sequence of equal-length embedding
    vectors. When it is undefined or empty, ``index`` is set to ``None`` so
    that callers checking ``index is None`` behave correctly, and a message
    is printed instead of raising.

    Returns:
        None. The result is communicated through the global ``index``.
    """
    global index

    # Look the name up defensively: on a fresh kernel `vectors` may not exist,
    # and a plain reference would raise NameError.
    stored_vectors = globals().get("vectors")
    if stored_vectors is None or len(stored_vectors) == 0:
        index = None  # explicit "no index" state; also clears any stale index
        print("❌ No vectors found! Upload files first.")
        return

    # FAISS requires a float32 matrix of shape (n_vectors, dim).
    vector_array = np.asarray(stored_vectors, dtype="float32")

    dim = vector_array.shape[1]  # embedding dimensions
    index = faiss.IndexFlatL2(dim)
    index.add(vector_array)

    print(f"🔥 FAISS index built successfully with {index.ntotal} vectors!")


# Run to build index
build_vector_database()


🔥 FAISS index built successfully with 5 vectors!


# Cell 8 — FAISS Search (Top-k Retrieval)

In [10]:
# -----------------------------------------
# 📌 Cell 8 — Search Function (FAISS Retrieval)
# -----------------------------------------

def search_documents(query, top_k=5):
    """
    Return the entries of the global ``documents`` list whose vectors are
    closest to ``query`` in the global FAISS ``index``.

    Steps:
    1. Embed the user query with the global ``embedding_model``.
    2. Search FAISS for the ``top_k`` nearest vectors.
    3. Map the returned positions back onto ``documents``.

    Parameters:
        query (str): User question.
        top_k (int): Maximum number of results.

    Returns:
        list: Matching entries of ``documents`` (possibly fewer than
        ``top_k``); ``[]`` when no index has been built.

    NOTE(review): this assumes position ``i`` in the index corresponds to
    ``documents[i]`` — confirm the index was built from the same list.
    """
    # .get() so that a never-built index is reported instead of raising NameError.
    faiss_index = globals().get("index")
    if faiss_index is None:
        print("❌ FAISS index not found! Run Cell 7 first.")
        return []

    # Encode query into vector
    query_vector = embedding_model.encode(query).astype("float32")

    # Perform FAISS similarity search
    distances, indices = faiss_index.search(np.array([query_vector]), top_k)

    # FAISS pads the result with -1 when fewer than top_k vectors exist.
    # Without the lower bound, -1 would silently select the last document.
    return [documents[idx] for idx in indices[0] if 0 <= idx < len(documents)]


# Test search function
test_results = search_documents("What is the summary?")
print("✔ Search function working. Retrieved:", len(test_results), "chunks.")


✔ Search function working. Retrieved: 5 chunks.


# Cell 9 — Helper Functions for Loading Documents (TXT, CSV, PDF, DOCX)

In [2]:
# ============================================
# 📌 CELL 9 — Helper Functions to Load Files
# ============================================

import csv
import json
from docx import Document
from PyPDF2 import PdfReader

def load_txt(file):
    """Read a binary file-like object and return its content decoded as UTF-8."""
    raw_bytes = file.read()
    return raw_bytes.decode("utf-8")

def load_csv(file):
    """
    Convert a UTF-8 CSV (binary file-like object) into plain text.

    Each CSV row becomes one output line whose fields are joined with
    ``", "``; rows are joined with ``"\\n"``.

    Returns:
        str: The flattened CSV text (empty string for empty input).
    """
    import io  # local import: only this helper needs it

    text = file.read().decode("utf-8")
    # Feed the reader a stream that keeps line endings (newline="") so that
    # quoted fields spanning several lines retain their embedded newline;
    # splitting the text into lines first would drop it.
    rows = csv.reader(io.StringIO(text, newline=""))
    return "\n".join(", ".join(row) for row in rows)

def load_pdf(file):
    """
    Extract text from a PDF, one trailing newline per page.

    Parameters:
        file: Anything ``PdfReader`` accepts (path or binary file object).

    Returns:
        str: Concatenated page texts; a page without extractable text
        contributes only its newline.
    """
    reader = PdfReader(file)
    # `or ""` mirrors the directory-based PDF loader in this notebook and
    # keeps a page that yields no text from raising on `None + "\n"`.
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

def load_docx(file):
    """Return the text of every paragraph in a DOCX, joined by newlines."""
    paragraphs = Document(file).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

def process_uploaded_file(uploaded):
    """
    Load the first entry of an upload mapping using the loader that matches
    its file extension (matched case-insensitively).

    Parameters:
        uploaded (dict): Maps filename -> content. Content may be a binary
            file-like object or raw ``bytes`` (the shape the upload cell in
            this notebook writes to disk); bytes are wrapped in ``BytesIO``.

    Returns:
        str | None: Extracted text, an "unsupported format" message for an
        unknown extension, or ``None`` when ``uploaded`` is empty.

    Note: every branch returns, so only the first mapping entry is processed.
    """
    import io  # local import: only needed for the bytes wrapper

    for filename, file in uploaded.items():
        # The loaders call .read(); raw bytes need a file-like wrapper first.
        if isinstance(file, (bytes, bytearray)):
            file = io.BytesIO(file)

        lowered = filename.lower()  # accept "NOTES.TXT" as well as "notes.txt"
        if lowered.endswith(".txt"):
            return load_txt(file)
        elif lowered.endswith(".csv"):
            return load_csv(file)
        elif lowered.endswith(".pdf"):
            return load_pdf(file)
        elif lowered.endswith(".docx"):
            return load_docx(file)
        else:
            return f"❌ Unsupported file format: {filename}"


# Cell 10 — Create Vectorstore (FAISS) and Embed Uploaded Data

In [3]:
!pip install langchain==0.2.10
!pip install langchain-community==0.2.10
!pip install sentence-transformers faiss-cpu


Collecting langchain-community==0.2.10
  Using cached langchain_community-0.2.10-py3-none-any.whl.metadata (2.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community==0.2.10)
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community==0.2.10)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community==0.2.10)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community==0.2.10)
  Downloading mypy_extensions-1.1.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.2.10-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [4]:
# =========================================================
# 📌 CELL 10 — Create Vector Database (FAISS) from Documents
# =========================================================

from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings

# Use FREE embedding model (NO OpenAI required)
# Rebinds the notebook-wide `embedding_model` name to a LangChain embeddings object.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Re-running this cell resets `vectorstore` to None, discarding any store that
# was built or loaded before; build_vectorstore() below repopulates it.
vectorstore = None  # global variable

def build_vectorstore(text_data):
    """
    Chunk ``text_data``, embed the chunks with the global ``embedding_model``
    and store the resulting FAISS vectorstore in the global ``vectorstore``.

    Parameters:
        text_data (str): Raw text to index.

    Returns:
        The newly created vectorstore (also assigned to ``vectorstore``).
    """
    global vectorstore

    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", "!", "?"],
        chunk_size=500,
        chunk_overlap=50,
    )
    pieces = text_splitter.split_text(text_data)
    print(f"📌 Total text chunks created: {len(pieces)}")

    # Embed all chunks and index them in one step
    vectorstore = FAISS.from_texts(pieces, embedding_model)
    print("✅ Vectorstore created successfully!")

    return vectorstore


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Cell 11 — Build the Retrieval QA Chain (FREE LLM + FAISS Retrieval)

In [5]:
# =========================================================
# 📌 CELL 11 — Build RetrievalQA Chatbot (FREE)
# =========================================================

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load FREE chatbot model
# Rebinds `model_name` and `tokenizer`; the model itself is stored in `model`
# (not `llm_model`), which is the name chatbot_answer() below reads.
model_name = "facebook/blenderbot_small-90M"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def chatbot_answer(user_query):
    """
    Answer ``user_query`` from the global ``vectorstore``.

    Retrieves the 3 most relevant chunks, joins them into a context block,
    and lets the global ``tokenizer`` / ``model`` generate the reply.

    Returns:
        str: The decoded reply, or an error message when no vectorstore is set.

    Note: running this cell replaces the ``chatbot_answer`` defined by the
    earlier RAG chatbot cell.
    """
    if vectorstore is None:
        return "❌ Vectorstore not found. Upload files and create embeddings first."

    # Retrieve top-k relevant chunks
    top_chunks = vectorstore.as_retriever(
        search_kwargs={"k": 3}
    ).get_relevant_documents(user_query)
    retrieved_context = "\n".join(chunk.page_content for chunk in top_chunks)

    prompt = "Context:\n{}\n\nQuestion: {}\nAnswer:".format(retrieved_context, user_query)

    # Encode, generate with beam search, decode the best beam
    model_inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    generation_options = {"max_length": 256, "num_beams": 4, "early_stopping": True}
    reply_ids = model.generate(**model_inputs, **generation_options)

    return tokenizer.decode(reply_ids[0], skip_special_tokens=True)


tokenizer_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/350M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/311 [00:00<?, ?B/s]

# CELL 12 — Gradio Chat UI (Prototype Level, Free)

In [10]:
# =========================================================
# 📌 CELL 12 — Gradio Chat UI for RAG Chatbot
# =========================================================

import gradio as gr

def chatbot_interface(user_input, history):
    """
    Gradio callback: answer ``user_input`` and extend the chat history.

    Parameters:
        user_input (str): Text from the question box.
        history (list): Previous ``(question, answer)`` tuples; not mutated.

    Returns:
        tuple: The new history twice — once for the chat display and once
        for the state component.
    """
    latest_turn = (user_input, chatbot_answer(user_input))
    updated_history = history + [latest_turn]
    return updated_history, updated_history

# Assemble the chat page: two header lines, the chat display, a hidden state
# holding the conversation, and one row with the question box and a button.
with gr.Blocks() as demo:
    gr.Markdown("## 📘 AI Document Assistant — Free RAG Prototype")
    gr.Markdown("Upload documents → Create embeddings → Ask questions")

    chatbot = gr.Chatbot(height=450)
    # Starts as an empty list; chatbot_interface appends (question, answer) tuples.
    state = gr.State([])

    with gr.Row():
        user_input = gr.Textbox(
            placeholder="Ask anything from your uploaded documents...",
            label="Your Question"
        )
        submit_btn = gr.Button("Ask")

    # Only the button click is wired to the callback, and the textbox is not
    # listed in `outputs`, so this handler does not clear it after a question.
    submit_btn.click(
        chatbot_interface,
        inputs=[user_input, state],
        outputs=[chatbot, state]
    )

# NOTE(review): the recorded output shows this ran with a public share link;
# confirm exposing the uploaded documents' content that way is intended.
demo.launch()


  chatbot = gr.Chatbot(height=450)


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://34914a5c51b93d772a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# ✅ STEP 1 — Mount Google Drive

In [11]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# ✅ STEP 2 — Install Required Libraries

In [12]:
!pip install langchain sentence-transformers faiss-cpu




# STEP 3 — Create the Drive Folder and Load Documents

In [17]:
# =========================================================
# STEP 3 — Auto-create folder + Load documents
# =========================================================

!pip install -q pypdf langchain_community python-docx docx2txt

import os
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    Docx2txtLoader
)

# 📂 TARGET FOLDER
folder_path = "/content/drive/MyDrive/my_documents"

# 🔧 Auto-create folder if missing
# When the folder has just been created it is empty, so the scan below loads
# nothing; the cell must be re-run after files are added to it.
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
    print("📂 Folder did NOT exist. A new folder is created at:")
    print(folder_path)
    print("\n👉 Please upload PDF/TXT/DOCX files into this folder in Google Drive.")
else:
    print("📁 Folder found:", folder_path)

# NOTE(review): this rebinds `documents`, which earlier cells use for a list
# of plain strings; here it collects whatever the loaders' .load() returns.
documents = []

print("\n🔍 Scanning folder for files...")

# Non-recursive scan. Every directory entry is dispatched on its lower-cased
# extension; anything else (including sub-folders) hits the "skipped" branch.
for file in os.listdir(folder_path):
    full_path = os.path.join(folder_path, file)

    # A failure in one file is reported and the scan continues with the next.
    try:
        # ========== PDF ==========
        if file.lower().endswith(".pdf"):
            loader = PyPDFLoader(full_path)
            loaded_docs = loader.load()
            documents.extend(loaded_docs)
            print(f"✅ Loaded PDF: {file} ({len(loaded_docs)} pages)")

        # ========== TXT ==========
        elif file.lower().endswith(".txt"):
            # NOTE(review): no encoding is passed to TextLoader — confirm the
            # default suits the expected files.
            loader = TextLoader(full_path)
            loaded_docs = loader.load()
            documents.extend(loaded_docs)
            print(f"✅ Loaded TXT: {file}")

        # ========== DOCX ==========
        elif file.lower().endswith(".docx"):
            loader = Docx2txtLoader(full_path)
            loaded_docs = loader.load()
            documents.extend(loaded_docs)
            print(f"✅ Loaded DOCX: {file}")

        else:
            print(f"⚠ Unsupported file skipped: {file}")

    except Exception as e:
        print(f"❌ Error loading {file}: {e}")

print("\n📄 Total documents loaded:", len(documents))


📂 Folder did NOT exist. A new folder is created at:
/content/drive/MyDrive/my_documents

👉 Please upload PDF/TXT/DOCX files into this folder in Google Drive.

🔍 Scanning folder for files...

📄 Total documents loaded: 0


# STEP 4 — Load Vectorstore from Google Drive

In [None]:
# =========================================================
# STEP 4 — Load Vectorstore from Google Drive
# =========================================================

from langchain_community.vectorstores import FAISS
import os

from langchain_community.embeddings import HuggingFaceEmbeddings

DB_PATH = "/content/drive/MyDrive/my_vector_db"

# Model used to embed queries against the loaded index.
# NOTE(review): assumed to be the model the stored vectors were created with
# (it is the one used everywhere else in this notebook) — TODO confirm. No
# visible cell writes a vectorstore to DB_PATH, so its origin cannot be checked here.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

if not os.path.exists(DB_PATH):
    print("❌ Vectorstore not found in Google Drive. Please run Step 3 first.")
else:
    try:
        # The embeddings object is what the store uses to embed each query, so
        # passing None produces a store that loads but cannot be searched by text.
        # SECURITY: allow_dangerous_deserialization=True unpickles the stored
        # docstore; only point DB_PATH at folders this notebook created.
        vectorstore = FAISS.load_local(
            folder_path=DB_PATH,
            embeddings=HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME),
            allow_dangerous_deserialization=True
        )
        print("✅ Vectorstore loaded successfully from Google Drive.")
    except Exception as e:
        print("❌ Error loading vectorstore:", e)
