<a href="https://colab.research.google.com/github/Renlim61/MVP_Product001_2025_Tier120pbc/blob/main/Phase1_RAG_MVP_clean_v6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Phase 1 – GenAI RAG MVP (Colab, Clean v6) — this version ran successfully.

**Goal:** Single-tenant RAG assistant (BYO OpenAI key). Ingest PDF/DOCX/TXT → chunk + embed → search with FAISS → answer with citations.

### Highlights
- Upload **.pdf / .docx / .txt**
- OpenAI **embeddings** (falls back to `text-embedding-3-large` if `text-embedding-3-small` fails)
- **FAISS** cosine similarity
- **Gradio UI** with per-prompt document selection
- **Optional**: Save/Load state to Google Drive

**Instructions:** Run all cells in order (the last cell launches the UI), paste your OpenAI key in the UI, upload files, click **Ingest**, then ask questions under **Ask**.


In [1]:
# CELL 0 - INSTALL & IMPORTS
# %%capture
!pip -q install openai gradio faiss-cpu pypdf python-docx

import io
import json
import os
import pickle
import time
import traceback
from dataclasses import dataclass
from typing import Any, Dict, List, Tuple

import faiss
import gradio as gr
import numpy as np
from docx import Document as DocxDocument
from pypdf import PdfReader

try:
    from google.colab import drive as colab_drive
    IN_COLAB = True
except Exception:
    IN_COLAB = False

try:
    from openai import OpenAI
except Exception:
    raise RuntimeError("The 'openai' package failed to import. Make sure the install cell ran successfully.")

In [2]:
# Cell 1 — Config & Globals
# --- Config ---
# Default models; these are the initial values of the UI dropdowns.
EMBED_MODEL_DEFAULT = "text-embedding-3-small"
CHAT_MODEL_DEFAULT = "gpt-4o-mini"
# Chunk window and overlap, both measured in characters (chunk_text slices the raw string).
CHUNK_SIZE = 1200
CHUNK_OVERLAP = 200

# In-memory document store keyed by doc_id. ingest_files writes records with
# the keys "name", "filetype", "chunks", "meta", "embs" and "index".
DOCUMENTS: Dict[str, Dict[str, Any]] = {}

def build_openai_client(api_key: str):
    """Create an OpenAI client for *api_key*.

    The key is also exported as the ``OPENAI_API_KEY`` environment variable.

    Raises:
        ValueError: If ``api_key`` is empty or otherwise falsy.
    """
    if api_key:
        os.environ["OPENAI_API_KEY"] = api_key
        return OpenAI(api_key=api_key)
    raise ValueError("OpenAI API key is required.")

In [3]:
# CELL 2 — Readers (PDF/DOCX/TXT)
# --- Readers ---
def _read_pdf(file_bytes: bytes) -> str:
    """Extract the text of every PDF page and join the pages with newlines.

    A page whose extraction raises, or yields nothing, contributes an empty
    string so one bad page does not abort the whole document.
    """
    def _page_text(pdf_page) -> str:
        try:
            return pdf_page.extract_text() or ""
        except Exception:
            return ""

    pdf = PdfReader(io.BytesIO(file_bytes))
    return "\n".join(_page_text(pg) for pg in pdf.pages)

def _read_docx(file_bytes: bytes) -> str:
    """Return the paragraph text of a DOCX file, one paragraph per line."""
    document = DocxDocument(io.BytesIO(file_bytes))
    lines = [para.text for para in document.paragraphs]
    return "\n".join(lines)

def _read_txt(file_bytes: bytes) -> str:
    try:
        return file_bytes.decode("utf-8")
    except Exception:
        return file_bytes.decode("latin-1", errors="ignore")

# Page-aware PDF reader (returns list of {"page": int, "text": str})
def _read_pdf_pages(file_bytes: bytes):
    """Extract PDF text page by page.

    Returns a list of ``{"page": <1-based page number>, "text": <page text>}``
    dicts, one per page. A page whose extraction raises or yields nothing gets
    an empty ``"text"``.
    """
    pdf = PdfReader(io.BytesIO(file_bytes))
    result = []
    page_no = 0
    for pdf_page in pdf.pages:
        page_no += 1
        try:
            extracted = pdf_page.extract_text()
        except Exception:
            extracted = None
        result.append({"page": page_no, "text": extracted or ""})
    return result


def load_file(file_obj) -> Dict[str, Any]:
    """Read an uploaded file and extract its text.

    ``file_obj`` may be a filesystem path (``str``), an object with a
    ``read()`` method, or an object exposing a ``path`` attribute.

    Returns:
        A dict with keys ``name``, ``text`` and ``filetype``; PDFs additionally
        carry ``pages`` (the per-page list from ``_read_pdf_pages``).

    Raises:
        ValueError: If the object cannot be read, the file is empty, the
            extension is not .pdf/.docx/.txt, or no text could be extracted.
    """
    # --- Resolve the display name and the raw bytes ---
    if isinstance(file_obj, str):
        name = os.path.basename(file_obj)
        with open(file_obj, 'rb') as fh:
            content = fh.read()
    else:
        name = getattr(file_obj, 'orig_name', None) or getattr(file_obj, 'name', 'uploaded_file')
        if not hasattr(file_obj, 'read'):
            src_path = getattr(file_obj, 'path', None)
            if not src_path:
                raise ValueError("Unsupported file object received from Gradio upload.")
            with open(src_path, 'rb') as fh:
                content = fh.read()
        else:
            content = file_obj.read()

    if not content:
        raise ValueError(f"{name}: file is empty.")

    # --- Dispatch on the (case-insensitive) file extension ---
    lowered = name.lower()
    record = {"name": os.path.basename(name)}

    if lowered.endswith('.pdf'):
        page_list = _read_pdf_pages(content)
        joined = "\n".join(entry["text"] for entry in page_list)
        record.update(text=(joined or "").strip(), filetype="pdf", pages=page_list)
    elif lowered.endswith('.docx'):
        record.update(text=(_read_docx(content) or "").strip(), filetype="docx")
    elif lowered.endswith('.txt'):
        record.update(text=(_read_txt(content) or "").strip(), filetype="txt")
    else:
        raise ValueError(f"Unsupported file type for {name}. Use PDF/DOCX/TXT.")

    if record["text"]:
        return record
    raise ValueError(f"{record['name']}: no extractable text found (scanned PDF or empty file?).")

In [4]:
# CELL 3 — Chunking
# --- Chunking ---
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split *text* into overlapping character windows.

    Each window is ``chunk_size`` characters long (the last may be shorter) and
    starts ``chunk_size - overlap`` characters after the previous one. Windows
    are whitespace-stripped and empty ones are dropped.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is not
            smaller than ``chunk_size``. In both cases the window would never
            advance and the loop would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}.")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})."
        )

    step = chunk_size - overlap  # guaranteed > 0 by the checks above
    n = len(text)
    windows = []
    start = 0
    while start < n:
        windows.append(text[start:start + chunk_size])
        if start + chunk_size >= n:
            break  # this window already reached the end of the text
        start += step

    stripped = (w.strip() for w in windows)
    return [w for w in stripped if w]

In [5]:
# CELL 4 — Embeddings
# --- Embeddings ---
def embed_texts(client: OpenAI, texts: List[str], model: str = EMBED_MODEL_DEFAULT, batch_size: int = 128) -> np.ndarray:
    """Embed *texts* in batches and return one float32 row per input.

    If a request fails while ``model`` is ``text-embedding-3-small``, the
    *whole* input is re-embedded with ``text-embedding-3-large``. The retry
    covers every batch rather than only the failing one so that all rows come
    from a single model: the two models produce vectors of different lengths
    by default (see the OpenAI embeddings documentation), and rows of mixed
    length cannot be stacked into one array.

    NOTE(review): the caller is not told when the fallback model was used, so
    queries embedded later with the originally requested model will presumably
    not match the stored vector width. Confirm how callers should handle this.

    Raises:
        RuntimeError: If both the small model and the large fallback fail.
        Exception: Whatever the client raised, when ``model`` is not the small
            model (no fallback is attempted).
    """
    fallback_model = "text-embedding-3-large"

    def _embed_all(mdl: str) -> list:
        # Embed every batch with one model and collect the vectors in order.
        rows = []
        for i in range(0, len(texts), batch_size):
            resp = client.embeddings.create(model=mdl, input=texts[i:i + batch_size])
            rows.extend(d.embedding for d in resp.data)
        return rows

    try:
        vectors = _embed_all(model)
    except Exception as e1:
        if model != "text-embedding-3-small":
            raise
        try:
            vectors = _embed_all(fallback_model)
        except Exception as e2:
            raise RuntimeError(f"Embedding failed on both models: small-> {e1}; large-> {e2}") from e2
    return np.array(vectors, dtype=np.float32)

def normalize(vecs: np.ndarray) -> np.ndarray:
    """Scale each row of *vecs* to (approximately) unit L2 length.

    A tiny epsilon is added to every row norm so all-zero rows divide safely.
    """
    row_norms = np.linalg.norm(vecs, axis=1, keepdims=True)
    return vecs / (row_norms + 1e-12)


In [6]:
#CELL 5 — FAISS (Index/Search)
# --- FAISS ---
def build_faiss_index(embs: np.ndarray) -> faiss.IndexFlatIP:
    """Build a flat inner-product FAISS index holding every row of *embs*.

    The index dimensionality is taken from the second axis of *embs*.
    """
    dim = embs.shape[1]
    ip_index = faiss.IndexFlatIP(dim)
    ip_index.add(embs)
    return ip_index

def search_faiss(index: faiss.IndexFlatIP, query_vec: np.ndarray, k: int = 5):
    """Run a top-*k* search of *query_vec* against *index*.

    Returns the two arrays produced by ``index.search`` as a
    ``(scores, ids)`` pair.
    """
    scores, ids = index.search(query_vec, k)
    return scores, ids

In [7]:
#Cell 6 — Persistence (Optional: Google Drive)
# --- Persistence (Drive optional) ---
def mount_drive() -> str:
    """Mount Google Drive in Colab and return the notebook's save folder.

    Outside Colab nothing is mounted and an empty string is returned. The
    save folder is created if it does not exist yet.
    """
    if IN_COLAB:
        colab_drive.mount('/content/drive', force_remount=False)
        target = '/content/drive/MyDrive/RAG_MVP_Phase1'
        os.makedirs(target, exist_ok=True)
        return target
    return ""

def save_state(save_dir: str):
    """Pickle the in-memory ``DOCUMENTS`` store into *save_dir*.

    The folder is created when missing. Returns the path of the written
    ``documents_state.pkl`` file.
    """
    os.makedirs(save_dir, exist_ok=True)
    target = os.path.join(save_dir, 'documents_state.pkl')
    with open(target, 'wb') as out:
        pickle.dump(DOCUMENTS, out)
    return target

def load_state(save_dir: str):
    """Replace the in-memory ``DOCUMENTS`` store with the state saved in *save_dir*.

    Reads ``documents_state.pkl`` from *save_dir*. The current store is only
    cleared after the file has been read and validated, so a missing or
    malformed file leaves the existing documents untouched.

    Returns:
        True on success.

    Raises:
        FileNotFoundError: If no state file exists in *save_dir*.
        TypeError: If the file does not contain a dict.
    """
    state_path = os.path.join(save_dir, 'documents_state.pkl')
    if not os.path.exists(state_path):
        raise FileNotFoundError(f"No saved state at {state_path}")
    # SECURITY: unpickling can execute arbitrary code embedded in the file.
    # Only load state files that this notebook wrote itself.
    with open(state_path, 'rb') as f:
        loaded = pickle.load(f)
    # Validate before clearing so a bad file cannot wipe the current store.
    if not isinstance(loaded, dict):
        raise TypeError(
            f"Saved state at {state_path} is a {type(loaded).__name__}, expected dict."
        )
    DOCUMENTS.clear()
    DOCUMENTS.update(loaded)
    return True

In [8]:
# CELL 7 - Ingestion
def _chunk_document(meta: Dict[str, Any]):
    """Split a loaded file into chunks paired with per-chunk page metadata.

    Returns ``(chunks, metas)`` where ``metas[i]`` is ``{"page": n}`` for PDF
    chunks (chunked page by page) and ``{"page": None}`` for other file types.
    """
    if meta.get("filetype", "txt") == "pdf" and "pages" in meta:
        chunks, metas = [], []
        for page_entry in meta["pages"]:
            page_text = (page_entry["text"] or "").strip()
            if not page_text:
                continue  # blank or unextractable page
            for ch in chunk_text(page_text, CHUNK_SIZE, CHUNK_OVERLAP):
                chunks.append(ch)
                metas.append({"page": page_entry["page"]})
        return chunks, metas

    # docx/txt: no page information is available
    chunks = chunk_text(meta["text"], CHUNK_SIZE, CHUNK_OVERLAP)
    return chunks, [{"page": None} for _ in chunks]


def ingest_files(api_key: str, embed_model: str, files) -> Dict:
    """Load, chunk, embed and register each file in ``DOCUMENTS``.

    A failure on one file does not stop the others: the traceback is printed
    and the failure is recorded in the result so the UI can show it.

    Args:
        api_key: OpenAI API key.
        embed_model: Embedding model name used for the sanity check and for
            embedding the chunks.
        files: Iterable of uploads accepted by ``load_file``.

    Returns:
        A dict with ``message`` (summary, including any per-file failures),
        ``docs`` (every document currently in ``DOCUMENTS``) and ``errors``
        (a list of ``{"file", "error"}`` dicts for files that were skipped).

    Raises:
        RuntimeError: If the initial key/model sanity check fails.
    """
    # Fail fast with a readable error if the key or the model is unusable.
    try:
        client = build_openai_client(api_key)
        _ = client.embeddings.create(model=embed_model, input=["sanity check"]).data[0].embedding
    except Exception as e:
        raise RuntimeError(f"OpenAI key/model check failed: {type(e).__name__}: {e}") from e

    added = []
    errors = []

    for f in files:
        # Best-effort label for error reporting before load_file resolves the name.
        label = os.path.basename(f) if isinstance(f, str) else str(getattr(f, "name", f))
        try:
            meta = load_file(f)  # {name, text, filetype, pages?}
            name = meta["name"]
            filetype = meta.get("filetype", "txt")

            all_chunks, all_metas = _chunk_document(meta)
            if not all_chunks:
                errors.append({"file": label, "error": "no text chunks produced"})
                continue

            embs = normalize(embed_texts(client, all_chunks, model=embed_model))
            index = build_faiss_index(embs)

            doc_id = f"doc_{int(time.time()*1000)}_{len(DOCUMENTS)+1}"
            DOCUMENTS[doc_id] = {
                "name": name,
                "filetype": filetype,
                "chunks": all_chunks,
                "meta": all_metas,  # per-chunk page info
                "embs": embs,
                "index": index,
            }
            added.append({"doc_id": doc_id, "name": name, "chunks": len(all_chunks)})
        except Exception as e:
            print(f"[ingest warning] {type(e).__name__}: {e}\n", traceback.format_exc())
            errors.append({"file": label, "error": f"{type(e).__name__}: {e}"})

    message = f"Ingested {len(added)} file(s)."
    if errors:
        details = "".join(f"\n- {err['file']}: {err['error']}" for err in errors)
        message += f" {len(errors)} file(s) failed:{details}"

    return {
        "message": message,
        "docs": [{"doc_id": k, "name": v["name"], "chunks": len(v["chunks"])} for k, v in DOCUMENTS.items()],
        "errors": errors,
    }


In [9]:
# CELL 8 - Retrieval + Answering
def assemble_subset_index(selected_doc_ids: List[str]):
    """Build one search index over the chunks of the selected documents.

    Unknown document ids are skipped.

    Returns:
        ``(index, texts, metas)`` where ``texts[i]`` is the chunk stored at
        index position ``i`` and ``metas[i]`` is its
        ``(doc_id, chunk_idx, page)`` tuple, or ``(None, None, None)`` when
        none of the ids matched a stored document.
    """
    texts, metas, emb_blocks = [], [], []
    for did in selected_doc_ids:
        rec = DOCUMENTS.get(did)
        if not rec:
            continue
        doc_chunks = rec["chunks"]
        count = len(doc_chunks)
        texts.extend(doc_chunks)
        # Records without "meta" fall back to empty dicts, i.e. page None.
        page_meta = rec.get("meta", [{}] * count)
        metas.extend([(did, pos, page_meta[pos].get("page")) for pos in range(count)])
        emb_blocks.append(rec["embs"])

    if not emb_blocks:
        return None, None, None

    combined = build_faiss_index(np.vstack(emb_blocks))
    return combined, texts, metas

def retrieve(api_key: str, query: str, embed_model: str, selected_doc_ids: List[str], k: int = 5):
    """Return the top-*k* chunks of the selected documents for *query*.

    The query is embedded with ``embed_model``, normalized and searched
    against an index assembled from the selected documents.

    Returns:
        A list of hit dicts (``rank``, ``doc_id``, ``doc_name``,
        ``chunk_idx``, ``page``, ``score``, ``text``) ordered by rank; empty
        when none of the selected ids is a stored document.

    Raises:
        ValueError: If the query embedding width differs from the index width
            (documents were embedded with a different model).
    """
    client = build_openai_client(api_key)
    index, texts, metas = assemble_subset_index(selected_doc_ids)
    if index is None:
        return []

    q_vec = client.embeddings.create(model=embed_model, input=[query]).data[0].embedding
    q_vec = normalize(np.array(q_vec, dtype=np.float32)[None, :])
    if q_vec.shape[1] != index.d:
        raise ValueError(
            f"Query embedding has {q_vec.shape[1]} dimensions but the selected documents "
            f"were embedded with {index.d}; use the embedding model the documents were ingested with."
        )

    # Never request more neighbours than there are chunks; the UI slider
    # value is coerced to int because FAISS requires an integer k.
    k = max(1, min(int(k), len(texts)))
    D, I = search_faiss(index, q_vec, k)

    hits = []
    for idx, sim in zip(I[0].tolist(), D[0].tolist()):
        if idx < 0:
            # FAISS pads missing results with -1; indexing metas[-1] would
            # silently return the last chunk as a bogus hit.
            continue
        did, chunk_idx, page_no = metas[idx]
        hits.append({
            "rank": len(hits) + 1,
            "doc_id": did,
            "doc_name": DOCUMENTS[did]["name"],
            "chunk_idx": chunk_idx,
            "page": page_no,
            "score": float(sim),
            "text": texts[idx],
        })
    return hits

def make_context_with_citations(hits: List[Dict], max_chars: int = 4000) -> Tuple[str, List[str]]:
    """Format retrieved hits into a prompt context plus a citation list.

    Hits are added in order until the next one would push the context past
    ``max_chars`` characters; that hit and all later ones are dropped.
    Newlines inside a chunk are flattened to spaces.

    Returns:
        ``(context, citations)``: the concatenated ``[Source i: ...]`` blocks
        and one ``"[i] name (...)"`` citation line per included hit.
    """
    ctx_parts, citations, total = [], [], 0
    for i, h in enumerate(hits, start=1):
        chunk = (h["text"] or "").strip().replace("\n", " ")
        page = h.get("page")  # None for non-PDF sources
        page_str = f" · p.{page}" if page else ""
        prefix = f"[Source {i}: {h['doc_name']}{page_str} · chunk {h['chunk_idx']}]\n"
        part = prefix + chunk + "\n\n"
        if total + len(part) > max_chars:
            break
        ctx_parts.append(part)
        if page:
            citations.append(f"[{i}] {h['doc_name']} (p.{page}, chunk {h['chunk_idx']})")
        else:
            citations.append(f"[{i}] {h['doc_name']} (chunk {h['chunk_idx']})")
        total += len(part)
    return "".join(ctx_parts), citations

def answer_with_rag(api_key: str, chat_model: str, query: str, hits: List[Dict]):
    """Ask the chat model to answer *query* from the retrieved *hits*.

    The hits are formatted into a numbered source context, sent with the
    question, and the citation list is appended to the model's reply under a
    ``Sources:`` heading.

    Returns:
        The answer text, followed by the citation list when any hit was used.
    """
    client = build_openai_client(api_key)
    ctx, citations = make_context_with_citations(hits)
    system_prompt = (
        "You are a helpful assistant. Use the provided sources to answer succinctly. "
        "When you rely on a source, include bracketed reference numbers like [1], [2]. If the sources don't contain the answer, say so."
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Question: {query}\n\nSources:\n{ctx}"}
    ]
    resp = client.chat.completions.create(model=chat_model, messages=messages, temperature=0.2)
    # The API may return a null content; fall back to "" so the
    # concatenation below cannot raise TypeError.
    answer = resp.choices[0].message.content or ""
    if citations:
        answer = answer + "\n\nSources:\n" + "\n".join(citations)
    return answer

In [10]:
#CELL 9 - Gradio UI & Launch
# --- Gradio App (defined and launched in this cell) ---
def ui_list_docs():
    """Return one ``"<name> — <doc_id>"`` checkbox label per stored document."""
    labels = []
    for doc_id, record in DOCUMENTS.items():
        labels.append(f"{record['name']} — {doc_id}")
    return labels

def _ids_from_labels(labels: List[str]) -> List[str]:
    ids = []
    for lab in labels or []:
        if '—' in lab:
            ids.append(lab.split('—')[-1].strip())
    return ids

def on_ingest(api_key, embed_model, files):
    """Gradio handler for the Ingest button.

    Returns ``(log, selector_update)``: the text for the ingestion log and an
    update for the shared document selector. On success every known document
    is selected; on any failure the selection is cleared.
    """
    def _reply(log_value, selected):
        # Pair a log value with a refreshed document selector.
        return log_value, gr.update(choices=ui_list_docs(), value=selected)

    if not api_key:
        return _reply(gr.update(value="Please enter your OpenAI API key."), [])
    if not files:
        return _reply(gr.update(value="No files selected."), [])

    # quick API sanity check
    try:
        probe = OpenAI(api_key=api_key).embeddings.create(model=embed_model, input=["sanity check"])
        _ = probe.data[0].embedding
    except Exception as exc:
        return _reply(f"OpenAI key/model check failed: {type(exc).__name__}: {exc}", [])

    try:
        result = ingest_files(api_key, embed_model, files)
        log_text = result["message"] + "\n" + json.dumps(result["docs"], indent=2)
        labels = ui_list_docs()
        return log_text, gr.update(choices=labels, value=labels)
    except Exception as exc:
        trace = traceback.format_exc()
        return _reply(f"Ingest failed: {type(exc).__name__}: {exc}\n\nTraceback:\n{trace}", [])

def on_ask(api_key, chat_model, embed_model, query, selected_labels, top_k):
    """Gradio handler for the Ask button; returns the text to display.

    Missing input (API key, question, document selection) yields a short
    prompt instead of an API error. Any other failure is returned as an error
    message with its traceback.
    """
    try:
        if not api_key:
            return "Please enter your OpenAI API key."
        if not query or not query.strip():
            return "Please enter a question."
        selected_ids = _ids_from_labels(selected_labels)
        if not selected_ids:
            return "Please select at least one ingested file."
        # The slider value is coerced to int before it reaches the search.
        hits = retrieve(api_key, query, embed_model, selected_ids, k=int(top_k))
        if not hits:
            return "No results found. Try ingesting files or broadening your question."
        return answer_with_rag(api_key, chat_model, query, hits)
    except Exception as e:
        tb = traceback.format_exc()
        return f"Error: {type(e).__name__}: {e}\n\nTraceback:\n{tb}"

def on_mount_drive():
    """Gradio handler for the Mount Drive button.

    Returns ``(status_message, save_dir)``; outside Colab the save dir is an
    empty string and nothing is mounted.
    """
    if IN_COLAB:
        mounted_dir = mount_drive()
        return f"Drive mounted. Save dir: {mounted_dir}", mounted_dir
    return "This action is only available in Google Colab.", ""

def on_save(save_dir):
    """Gradio handler for the Save State button; returns the status message.

    Errors raised while saving are reported in the status text, mirroring how
    ``on_load`` reports load failures, instead of surfacing as an unhandled
    exception.
    """
    if not save_dir:
        return "Provide a Google Drive folder path first."
    try:
        path = save_state(save_dir)
    except Exception as e:
        return f"Save failed: {e}"
    return f"Saved state to: {path}"

def on_load(save_dir):
    """Gradio handler for the Load State button.

    Returns ``(status_message, selector_update)``. After a successful load
    every restored document is listed and selected; on failure the selector
    is left unchanged.
    """
    try:
        load_state(save_dir)
        status = f"Loaded state from: {save_dir}"
        selector = gr.update(choices=ui_list_docs(), value=ui_list_docs())
        return status, selector
    except Exception as exc:
        return f"Load failed: {exc}", gr.update()

# Assemble the Gradio app: shared controls on top, then the Ingest, Ask and
# Google Drive tabs, each wired to its on_* handler defined above.
with gr.Blocks(title="Phase 1 – RAG MVP") as demo:
    gr.Markdown("""
    # Phase 1 – RAG MVP
    **Bring Your Own OpenAI Key**. Ingest PDF/DOCX/TXT → chunk + embed → FAISS → ask with citations.
    """)

    # API key and model pickers; their values are passed to the handlers below.
    with gr.Row():
        api_key = gr.Textbox(label="OpenAI API Key", type="password", placeholder="sk-...", show_label=True)
        chat_model = gr.Dropdown(choices=["gpt-4o-mini", "gpt-4o", "gpt-4.1-mini"], value=CHAT_MODEL_DEFAULT, label="Chat Model")
        embed_model = gr.Dropdown(choices=["text-embedding-3-small", "text-embedding-3-large"], value=EMBED_MODEL_DEFAULT, label="Embedding Model")

    # Shared document selector across tabs
    # (updated by on_ingest and on_load, read by on_ask).
    doc_selector = gr.CheckboxGroup(label="Available Documents (shared)", choices=[])

    with gr.Tab("Ingest"):
        # type="filepath" hands the handler filesystem paths, which load_file accepts.
        files = gr.File(label="Upload files (PDF, DOCX, TXT)", file_count="multiple", type="filepath", file_types=[".pdf", ".docx", ".txt"])
        ingest_btn = gr.Button("Ingest")
        ingest_log = gr.Textbox(label="Ingestion Log", lines=10)
        ingest_btn.click(on_ingest, inputs=[api_key, embed_model, files], outputs=[ingest_log, doc_selector])

    with gr.Tab("Ask"):
        with gr.Row():
            query = gr.Textbox(label="Your question", placeholder="Ask me about your documents...", lines=3)
        with gr.Row():
            # Number of chunks retrieved per question (passed to on_ask as top_k).
            top_k = gr.Slider(1, 10, value=5, step=1, label="Top-K Chunks")
        ask_btn = gr.Button("Ask")
        answer_out = gr.Markdown()
        ask_btn.click(on_ask, inputs=[api_key, chat_model, embed_model, query, doc_selector, top_k], outputs=[answer_out,])

    with gr.Tab("Google Drive (Optional)"):
        drive_status = gr.Textbox(label="Status")
        save_dir = gr.Textbox(label="Save folder (e.g., /content/drive/MyDrive/RAG_MVP_Phase1)")
        with gr.Row():
            mount_btn = gr.Button("Mount Drive (Colab)")
            # on_mount_drive returns (status, save_dir), filling in the folder box.
            mount_btn.click(on_mount_drive, inputs=[], outputs=[drive_status, save_dir])
        with gr.Row():
            save_btn = gr.Button("Save State")
            load_btn = gr.Button("Load State")
        save_btn.click(on_save, inputs=[save_dir], outputs=[drive_status])
        # Loading also refreshes the shared document selector.
        load_btn.click(on_load, inputs=[save_dir], outputs=[drive_status, doc_selector])

    gr.Markdown("Built for fast iteration. ⚡️")

# Launch immediately to avoid ordering issues
demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://06498520080b700987.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


