<a href="https://colab.research.google.com/github/Renlim61/MVP_Product001_2025_Tier120pbc/blob/main/Phase1_RAG_MVP_clean_v7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Phase 1 – GenAI RAG MVP (Colab, Clean v7). This version ran successfully.

**Goal:** Single-tenant RAG assistant (BYO OpenAI key). Ingest PDF/DOCX/TXT → chunk + embed → search with FAISS → answer with citations.

### Highlights
- Upload **.pdf / .docx / .txt**
- OpenAI **embeddings** (fallback to large if small not available)
- **FAISS** cosine similarity
- **Gradio UI** with per-prompt document selection
- **Optional**: Save/Load state to Google Drive

**Instructions:** Run the next cell, paste your OpenAI key in the UI, upload files, click **Ingest**, then ask questions under **Ask**.


Cell 8 (updates)
Core backend helpers consolidated: ingestion pipeline, chunking strategy, embedding calls, FAISS index handling, retrieval, RAG-answer assembly.
Persistence helpers expanded: save/load state, directory conventions for storing indexes and metadata.
Improved error handling, API sanity checks, and global state containers (DOCUMENTS, model defaults).




Cell 9 (updates / moved to cell9.py)
Gradio UI and event handlers (ingest, ask, Drive mount/save/load) packaged as a standalone file; UI improvements for document selection and better logging.




Cell 10
Document normalization: cleaned and standardized input text (Unicode, whitespace, simple OCR fixes) before chunking.




Cell 11
Chunking enhancements: configurable chunk size/overlap, sentence-aware splits, and token-count based chunking for consistent embeddings.




Cell 12
Per-document vector store management: create/load a separate FAISS index (and metadata files) per document or logical collection; folder layout conventions for indexes and metadata.




Cell 13
Batch embedding & rate-control: batched embedding calls with retry/backoff logic and optional parallelism to improve throughput and handle API limits.




Cell 14
Metadata and citation support: store source/file offsets, chunk IDs, and human-friendly citations for use in RAG outputs.




Cell 15
Retrieval improvements: hybrid filtering (by document IDs), configurable Top-K, and basic scoring/post-processing to prefer higher-quality chunks.




Cell 16
State management & export: functions to export/import full project state (indexes, docs, metadata) and lightweight snapshots suitable for Google Drive persistence.




Cell 17
Utilities, testing & debug helpers: quick QA checks, ingestion summary reports, verbose logging toggle, and small diagnostic endpoints to inspect index contents and sample embeddings.



Overall workflow impact

More robust ingestion (clean → chunk → batch-embed → per-doc index).
Better retrieval (document-scoped searches, metadata-aware citations).
Reliable persistence (clear folder layout, save/load/export to Drive).
Scalable embedding (batching, retries, rate control) and clearer debugging/logging.

In [None]:
# CELL 0 - INSTALL & IMPORTS
# (Run once) Install / import required packages
!pip install -q faiss-cpu openai gradio
# Standard imports
import io
import json
import os
import pickle
import shutil
import time
import traceback
from typing import Any, Dict, List, Tuple

import faiss
import gradio as gr
import numpy as np
# OpenAI client import placeholder (your notebook likely creates a client later)
from openai import OpenAI
# Defaults
EMBED_MODEL_DEFAULT = "text-embedding-3-small"
CHAT_MODEL_DEFAULT = "gpt-4o-mini"

In [None]:
# Cell 1 — Config & Globals
# Base paths and globals
BASE_DRIVE = "/content/drive/MyDrive/MVP_RAG"  # will be created when Drive mounted
# Root folder holding one sub-folder per cohort. Spelled "cohorts" so it points at
# the same directory as the value assigned in Cell 10.
COHORTS_DIR = os.path.join(BASE_DRIVE, "cohorts")

In [None]:
# CELL 2 — Readers (PDF/DOCX/TXT)
# --- Readers ---
def _read_pdf(file_bytes: bytes) -> str:
    """Extract the text of every page of a PDF and join the pages with newlines.

    A page whose extraction raises, or yields nothing, contributes an empty
    string so one bad page does not abort the whole document.
    """
    def _page_text(pdf_page) -> str:
        # Best-effort per page: any extraction error is treated as "no text".
        try:
            extracted = pdf_page.extract_text()
        except Exception:
            return ""
        return extracted or ""

    document = PdfReader(io.BytesIO(file_bytes))
    return "\n".join(_page_text(pdf_page) for pdf_page in document.pages)

def _read_docx(file_bytes: bytes) -> str:
    """Return the paragraph texts of a DOCX payload, one paragraph per line."""
    document = DocxDocument(io.BytesIO(file_bytes))
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return "\n".join(paragraph_texts)

def _read_txt(file_bytes: bytes) -> str:
    try:
        return file_bytes.decode("utf-8")
    except Exception:
        return file_bytes.decode("latin-1", errors="ignore")

# Page-aware PDF reader (returns list of {"page": int, "text": str})
def _read_pdf_pages(file_bytes: bytes):
    """Read a PDF page by page.

    Returns a list of ``{"page": int, "text": str}`` dicts with 1-based page
    numbers. A page whose extraction raises or yields nothing gets ``""``.
    """
    document = PdfReader(io.BytesIO(file_bytes))
    result = []
    page_number = 0
    for pdf_page in document.pages:
        page_number += 1
        try:
            page_text = pdf_page.extract_text() or ""
        except Exception:
            # Best effort: keep the page slot so numbering stays aligned.
            page_text = ""
        result.append({"page": page_number, "text": page_text})
    return result


def load_file(file_obj) -> Dict[str, Any]:
    """Load an upload and return its extracted text plus basic metadata.

    ``file_obj`` may be a filesystem path (``str``), an object exposing
    ``read()``, or an object carrying a ``path`` attribute. The result has the
    keys ``name``, ``text`` and ``filetype``; PDFs additionally get ``pages``
    (the per-page list produced by ``_read_pdf_pages``).

    Raises:
        ValueError: unusable upload object, empty file, unsupported extension,
            or no extractable text.
    """
    if isinstance(file_obj, str):
        display_name = os.path.basename(file_obj)
        with open(file_obj, 'rb') as handle:
            raw = handle.read()
    else:
        display_name = getattr(file_obj, 'orig_name', None) or getattr(file_obj, 'name', 'uploaded_file')
        if hasattr(file_obj, 'read'):
            raw = file_obj.read()
        else:
            source_path = getattr(file_obj, 'path', None)
            if not source_path:
                raise ValueError("Unsupported file object received from Gradio upload.")
            with open(source_path, 'rb') as handle:
                raw = handle.read()

    if not raw:
        raise ValueError(f"{display_name}: file is empty.")

    record = {"name": os.path.basename(display_name)}
    lowered_name = display_name.lower()

    # Dispatch on the (case-insensitive) file extension.
    if lowered_name.endswith('.pdf'):
        page_list = _read_pdf_pages(raw)
        joined = "\n".join(entry["text"] for entry in page_list)
        record["text"] = (joined or "").strip()
        record["filetype"] = "pdf"
        record["pages"] = page_list
    elif lowered_name.endswith('.docx'):
        record["text"] = (_read_docx(raw) or "").strip()
        record["filetype"] = "docx"
    elif lowered_name.endswith('.txt'):
        record["text"] = (_read_txt(raw) or "").strip()
        record["filetype"] = "txt"
    else:
        raise ValueError(f"Unsupported file type for {display_name}. Use PDF/DOCX/TXT.")

    if not record["text"]:
        raise ValueError(f"{record['name']}: no extractable text found (scanned PDF or empty file?).")

    return record

In [None]:
# CELL 3 — Chunking
# --- Chunking ---
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split ``text`` into character windows of ``chunk_size`` that overlap by ``overlap``.

    Each window is stripped and whitespace-only windows are dropped. The last
    window ends exactly at the end of the text.

    Args:
        text: Text to split; empty or ``None`` yields ``[]``.
        chunk_size: Window length in characters; must be positive.
        overlap: Characters shared by consecutive windows; must be smaller
            than ``chunk_size``.

    Raises:
        ValueError: if ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``. Either would stop the window from
            advancing, which previously looped forever.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )
    if not text:
        return []

    pieces = []
    total = len(text)
    step = chunk_size - overlap  # > 0 thanks to the checks above
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        piece = text[start:end].strip()
        if piece:
            pieces.append(piece)
        if end == total:
            break
        start += step
    return pieces

In [None]:
# CELL 4 — Embeddings
# --- Embeddings ---
def embed_texts(client: OpenAI, texts: List[str], model: str = EMBED_MODEL_DEFAULT, batch_size: int = 128) -> np.ndarray:
    """Embed ``texts`` in batches and return one float32 row per text.

    When ``model`` is ``text-embedding-3-small`` and any batch fails, the whole
    input is embedded again with ``text-embedding-3-large``. Falling back per
    batch (the previous behavior) could mix vectors from two models, and
    therefore two widths, in one result.

    NOTE(review): after a fallback the rows come from the large model; callers
    that later embed queries with the small model will not match — confirm how
    callers should learn which model was used.

    Raises:
        RuntimeError: both the small model and the fallback failed.
        Exception: whatever the client raised, when ``model`` has no fallback.
    """
    fallback_model = "text-embedding-3-large"

    def _embed_all(mdl: str) -> list:
        # Embed every batch with one model so all rows share a dimension.
        rows = []
        for offset in range(0, len(texts), batch_size):
            resp = client.embeddings.create(model=mdl, input=texts[offset:offset + batch_size])
            rows.extend(item.embedding for item in resp.data)
        return rows

    try:
        vectors = _embed_all(model)
    except Exception as e1:
        if model != "text-embedding-3-small":
            raise
        try:
            vectors = _embed_all(fallback_model)
        except Exception as e2:
            raise RuntimeError(f"Embedding failed on both models: small-> {e1}; large-> {e2}") from e2
    return np.array(vectors, dtype=np.float32)

def normalize(vecs: np.ndarray) -> np.ndarray:
    """Scale each row of ``vecs`` to (approximately) unit L2 length.

    A small epsilon is added to every row norm so all-zero rows divide safely.
    """
    row_lengths = np.linalg.norm(vecs, axis=1, keepdims=True)
    return np.divide(vecs, row_lengths + 1e-12)


In [None]:
#CELL 5 — FAISS (Index/Search)
# --- FAISS ---
def build_faiss_index(embs: np.ndarray) -> faiss.IndexFlatIP:
    """Create a flat inner-product FAISS index sized to the width of ``embs`` and add every row to it."""
    dimension = embs.shape[1]
    ip_index = faiss.IndexFlatIP(dimension)
    ip_index.add(embs)
    return ip_index

def search_faiss(index: faiss.IndexFlatIP, query_vec: np.ndarray, k: int = 5):
    """Search ``index`` for the nearest neighbours of ``query_vec``.

    Returns the ``(D, I)`` pair produced by ``index.search``: scores and row
    labels, one row per query vector.

    ``k`` is coerced to ``int`` and capped at the number of stored vectors.
    FAISS pads missing results with label ``-1`` when ``k`` exceeds the index
    size, and a ``-1`` used as a Python list index silently selects the last
    element.
    """
    k = max(1, min(int(k), int(index.ntotal)))
    return index.search(query_vec, k)

In [None]:
#Cell 6 — Persistence (Optional: Google Drive)
# --- Persistence (Drive optional) ---
def mount_drive() -> str:
    """Mount Google Drive and return the folder used for saved state.

    Returns ``""`` when ``IN_COLAB`` is false. Otherwise mounts Drive at
    ``/content/drive``, makes sure the save folder exists and returns its path.
    """
    if IN_COLAB:
        colab_drive.mount('/content/drive', force_remount=False)
        target_dir = '/content/drive/MyDrive/RAG_MVP_Phase1'
        os.makedirs(target_dir, exist_ok=True)
        return target_dir
    return ""

def save_state(save_dir: str):
    """Pickle the global ``DOCUMENTS`` mapping into ``save_dir``.

    The folder is created if needed. Returns the path of the written
    ``documents_state.pkl`` file.
    """
    state_file = os.path.join(save_dir, 'documents_state.pkl')
    os.makedirs(save_dir, exist_ok=True)
    with open(state_file, 'wb') as sink:
        pickle.dump(DOCUMENTS, sink)
    return state_file

def load_state(save_dir: str):
    """Replace the contents of the global ``DOCUMENTS`` with the saved pickle.

    Reads ``documents_state.pkl`` from ``save_dir`` and returns ``True``.

    Raises:
        FileNotFoundError: no state file exists in ``save_dir``.
    """
    state_file = os.path.join(save_dir, 'documents_state.pkl')
    if os.path.exists(state_file):
        # NOTE(review): unpickling can execute code embedded in the file;
        # only load state files that this notebook wrote.
        with open(state_file, 'rb') as source:
            restored = pickle.load(source)
        DOCUMENTS.clear()
        DOCUMENTS.update(restored)
        return True
    raise FileNotFoundError(f"No saved state at {state_file}")

In [None]:
# CELL 7 - Ingestion
def ingest_files(api_key: str, embed_model: str, files) -> Dict:
    """Load, chunk, embed and index each uploaded file into the global ``DOCUMENTS``.

    PDFs are chunked page by page so every chunk keeps its page number; other
    file types are chunked as one text with ``page`` set to ``None``. A file
    that fails is skipped and its error is reported in the result instead of
    only being printed.

    Args:
        api_key: OpenAI API key.
        embed_model: Embedding model name.
        files: Iterable of uploads accepted by ``load_file``; ``None`` is
            treated as empty.

    Returns:
        Dict with ``message`` (summary, including skipped files), ``docs``
        (every document currently in ``DOCUMENTS``) and ``errors`` (one string
        per skipped file).

    Raises:
        RuntimeError: the initial key/model check against the API failed.
    """
    # quick API sanity check: fail once, clearly, instead of once per file
    try:
        _ = OpenAI(api_key=api_key).embeddings.create(model=embed_model, input=["sanity check"]).data[0].embedding
    except Exception as e:
        raise RuntimeError(f"OpenAI key/model check failed: {type(e).__name__}: {e}") from e

    client = build_openai_client(api_key)
    added = []
    failures = []

    for f in files or []:
        label = os.path.basename(f if isinstance(f, str) else str(getattr(f, "name", "uploaded_file")))
        try:
            meta = load_file(f)  # {name, text, filetype, pages?}
            name = meta["name"]
            filetype = meta.get("filetype", "txt")

            # (page number, text) sections: one per PDF page, or the whole text.
            if filetype == "pdf" and "pages" in meta:
                sections = [(pg["page"], (pg["text"] or "").strip()) for pg in meta["pages"]]
            else:
                sections = [(None, meta["text"])]

            all_chunks = []
            all_metas = []  # each item: {"page": int|None}, aligned with all_chunks
            for page_no, section_text in sections:
                if not section_text:
                    continue
                for ch in chunk_text(section_text, CHUNK_SIZE, CHUNK_OVERLAP):
                    all_chunks.append(ch)
                    all_metas.append({"page": page_no})

            if not all_chunks:
                failures.append(f"{name}: no chunks produced")
                continue

            embs = normalize(embed_texts(client, all_chunks, model=embed_model))
            index = build_faiss_index(embs)

            doc_id = f"doc_{int(time.time()*1000)}_{len(DOCUMENTS)+1}"
            DOCUMENTS[doc_id] = {
                "name": name,
                "filetype": filetype,
                "chunks": all_chunks,
                "meta": all_metas,
                "embs": embs,
                "index": index,
            }
            added.append({"doc_id": doc_id, "name": name, "chunks": len(all_chunks)})
        except Exception as e:
            # Best effort per file: keep going, but remember what went wrong.
            print(f"[ingest warning] {type(e).__name__}: {e}\n", traceback.format_exc())
            failures.append(f"{label}: {type(e).__name__}: {e}")

    message = f"Ingested {len(added)} file(s)."
    if failures:
        message += f" Skipped {len(failures)}:\n" + "\n".join(f"- {item}" for item in failures)

    return {
        "message": message,
        "docs": [{"doc_id": k, "name": v["name"], "chunks": len(v["chunks"])} for k, v in DOCUMENTS.items()],
        "errors": failures,
    }


In [None]:
# CELL 8 - Retrieval + Answering (updated to support cohort querying via "cohort:<name>")
def assemble_subset_index(selected_doc_ids: List[str]):
    """Build one FAISS index over the chunks of the selected documents.

    Ids that are not present in ``DOCUMENTS`` are ignored.

    Returns:
        ``(index, texts, metas)`` where ``metas[i]`` is the tuple
        ``(doc_id, chunk_idx, page)`` describing ``texts[i]``, or
        ``(None, None, None)`` when no selected id is known.
    """
    texts, metas, all_embs = [], [], []
    for did in selected_doc_ids or []:
        rec = DOCUMENTS.get(did)
        if not rec:
            continue
        chunk_count = len(rec["chunks"])
        # Records without page info fall back to empty per-chunk metadata.
        chunk_metas = rec.get("meta") or [{}] * chunk_count
        texts.extend(rec["chunks"])
        # keep (doc_id, chunk_idx, page)
        metas.extend((did, i, chunk_metas[i].get("page")) for i in range(chunk_count))
        all_embs.append(rec["embs"])

    # Decide only after every selected document has been collected.
    if not all_embs:
        return None, None, None
    index = build_faiss_index(np.vstack(all_embs))
    return index, texts, metas

def _format_cohort_hits(cohort_results: List[Dict], scores: List[float]):
   # cohort metadata entries expected to have keys like: "chunk", "source_file", optional "page"
   hits = []
   for rank, (meta, score) in enumerate(zip(cohort_results, scores), start=1:
       src = meta.get("source_file", "cohort")
       chunk_text = meta.get("chunk", "")
       page_no = meta.get("page")
       hits.append({
           "rank": rank,
           "doc_id": src,
           "doc_name": src,
           "chunk_idx": 0,
           "page": page_no,
           "score": score,
           "text": chunk_text
           })
  return hits

def retrieve(api_key: str, query: str, embed_model: str, selected_doc_ids: List[str], k: int = 5):
    """Return the top-``k`` chunks for ``query`` as a list of hit dicts.

    Passing ``selected_doc_ids == ["cohort:NAME"]`` queries the named cohort
    through ``query_cohort``. Any other selection searches the matching
    entries of ``DOCUMENTS``.

    Each hit has ``rank``, ``doc_id``, ``doc_name``, ``chunk_idx``, ``page``,
    ``score`` and ``text``. An empty list is returned when nothing can be
    searched or the cohort query fails.
    """
    client = build_openai_client(api_key)
    # Coerce in case a UI widget hands over a float; FAISS needs a positive int.
    k = max(1, int(k))

    # Special case: query an entire cohort by passing selected_doc_ids = ["cohort:NAME"]
    if (
        isinstance(selected_doc_ids, (list, tuple))
        and len(selected_doc_ids) == 1
        and isinstance(selected_doc_ids[0], str)
        and selected_doc_ids[0].startswith("cohort:")
    ):
        cohort_name = selected_doc_ids[0].split("cohort:", 1)[1]
        try:
            cohort_results, scores = query_cohort(cohort_name, client, query, k=k)
        except Exception as e:
            # return empty on error (or optionally return the exception message)
            print("cohort query error:", e)
            return []
        return _format_cohort_hits(cohort_results, scores)

    # Default: assemble subset index from DOCUMENTS
    index, texts, metas = assemble_subset_index(selected_doc_ids)
    if index is None or not texts:
        return []

    q_vec = client.embeddings.create(model=embed_model, input=[query]).data[0].embedding
    q_vec = normalize(np.array(q_vec, dtype=np.float32)[None, :])
    D, I = search_faiss(index, q_vec, min(k, len(texts)))

    hits = []
    for idx, sim in zip(I[0].tolist(), D[0].tolist()):
        if idx < 0:
            # FAISS pads with -1 when fewer than k vectors exist; metas[-1]
            # would wrongly return the last chunk.
            continue
        did, chunk_idx, page_no = metas[idx]
        hits.append({
            "rank": len(hits) + 1,
            "doc_id": did,
            "doc_name": DOCUMENTS[did]["name"],
            "chunk_idx": chunk_idx,
            "page": page_no,
            "score": float(sim),
            "text": texts[idx],
        })
    return hits

def make_context_with_citations(hits: List[Dict], max_chars: int = 4000) -> Tuple[str, List[str]]:
    """Build the prompt context and the matching citation lines from ``hits``.

    Hits are added in order until the next one would push the context past
    ``max_chars``; that hit and all later ones are dropped. Newlines inside a
    chunk are flattened to spaces.

    Returns:
        ``(context, citations)``: the concatenated ``[Source i: ...]`` blocks
        and one ``"[i] name (...)"`` string per included hit.
    """
    ctx_parts, citations, total = [], [], 0
    for i, h in enumerate(hits, start=1):
        chunk = (h["text"] or "").strip().replace("\n", " ")
        page = h.get("page")
        page_str = f" · p.{page}" if page else ""
        part = f"[Source {i}: {h['doc_name']}{page_str} · chunk {h['chunk_idx']}]\n{chunk}\n\n"
        if total + len(part) > max_chars:
            break
        ctx_parts.append(part)
        # The running total must grow for every included hit, with or without a page.
        total += len(part)
        if page:
            citations.append(f"[{i}] {h['doc_name']} (p.{page}, chunk {h['chunk_idx']})")
        else:
            citations.append(f"[{i}] {h['doc_name']} (chunk {h['chunk_idx']})")
    return "".join(ctx_parts), citations

def answer_with_rag(api_key: str, chat_model: str, query: str, hits: List[Dict]):
    """Answer ``query`` with the chat model, grounded in the retrieved ``hits``.

    The hits are turned into a numbered source context. The model is asked to
    cite sources as ``[n]``, and the list of citations is appended to the
    returned answer text when there is at least one.
    """
    client = build_openai_client(api_key)
    ctx, citations = make_context_with_citations(hits)
    system_prompt = (
        "You are a helpful assistant. Use the provided sources to answer succinctly. "
        "When you rely on a source, include bracketed reference numbers like [1], [2]. If the sources don't contain the answer, say so."
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Question: {query}\n\nSources:\n{ctx}"},
    ]
    resp = client.chat.completions.create(model=chat_model, messages=messages, temperature=0.2)
    # Guard against a missing message body so the concatenation below cannot fail.
    answer = resp.choices[0].message.content or ""
    if citations:
        answer = answer + "\n\nSources:\n" + "\n".join(citations)
    return answer

In [None]:
# CELL 9 - Gradio UI & Launch
# --- Gradio App (defined and launched in this cell) ---
def ui_list_docs():
    """Return one ``"<name> — <doc_id>"`` label per entry of ``DOCUMENTS``, in dict order."""
    labels = []
    for doc_id, record in DOCUMENTS.items():
        labels.append(f"{record['name']} — {doc_id}")
    return labels

def _ids_from_labels(labels: List[str]) -> List[str]:
    ids = []
    for lab in labels or []:
        if '—' in lab:
           ids.append(lab.split('—')[-1].strip())
    return ids

def on_ingest(api_key, embed_model, files):
    """Gradio handler for the Ingest button.

    Returns ``(log, selector_update)``: the text for the ingestion log and an
    update for the shared document selector. On success every known document
    is selected; on any failure the selection is cleared.
    """
    def _selector(select_all: bool = False):
        # Refresh the choices; select everything only after a successful ingest.
        labels = ui_list_docs()
        return gr.update(choices=labels, value=labels if select_all else [])

    if not api_key:
        return gr.update(value="Please enter your OpenAI API key."), _selector()
    if not files:
        return gr.update(value="No files selected."), _selector()

    # quick API sanity check
    try:
        _ = OpenAI(api_key=api_key).embeddings.create(model=embed_model, input=["sanity check"]).data[0].embedding
    except Exception as e:
        return f"OpenAI key/model check failed: {type(e).__name__}: {e}", _selector()

    try:
        res = ingest_files(api_key, embed_model, files)
        msg = res["message"] + "\n" + json.dumps(res["docs"], indent=2)
        return msg, _selector(select_all=True)
    except Exception as e:
        tb = traceback.format_exc()
        return f"Ingest failed: {type(e).__name__}: {e}\n\nTraceback:\n{tb}", _selector()

def on_ask(api_key, chat_model, embed_model, query, selected_labels, top_k):
    """Gradio handler for the Ask button; returns the answer or a message.

    Validates the API key, the question and the document selection before
    making any API call. ``top_k`` is coerced to ``int`` before it is passed
    to ``retrieve``. Any exception is returned as text with its traceback
    rather than raised.
    """
    if not api_key:
        return "Please enter your OpenAI API key."
    if not (query or "").strip():
        return "Please enter a question."
    try:
        selected_ids = _ids_from_labels(selected_labels)
        if not selected_ids:
            return "Please select at least one ingested file."
        hits = retrieve(api_key, query, embed_model, selected_ids, k=int(top_k))
        if not hits:
            return "No results found. Try ingesting files or broadening your question."
        return answer_with_rag(api_key, chat_model, query, hits)
    except Exception as e:
        tb = traceback.format_exc()
        return f"Error: {type(e).__name__}: {e}\n\nTraceback:\n{tb}"

def on_mount_drive():
    """Gradio handler: mount Drive and return ``(status, save_dir)``.

    When ``IN_COLAB`` is false nothing is mounted and the save dir is ``""``.
    """
    if IN_COLAB:
        mounted_dir = mount_drive()
        return f"Drive mounted. Save dir: {mounted_dir}", mounted_dir
    return "This action is only available in Google Colab.", ""

def on_save(save_dir):
    """Gradio handler: save the current state into ``save_dir`` and return a status line.

    A failure while saving is reported as ``"Save failed: ..."`` instead of
    raising, mirroring how ``on_load`` reports its errors.
    """
    if not save_dir:
        return "Provide a Google Drive folder path first."
    try:
        path = save_state(save_dir)
    except Exception as e:
        return f"Save failed: {e}"
    return f"Saved state to: {path}"

def on_load(save_dir):
    """Gradio handler: load saved state from ``save_dir``.

    Returns ``(status, selector_update)``. On success the selector lists and
    selects every loaded document; on failure it is left untouched.
    """
    try:
        load_state(save_dir)
        status = f"Loaded state from: {save_dir}"
        selector = gr.update(choices=ui_list_docs(), value=ui_list_docs())
        return status, selector
    except Exception as err:
        return f"Load failed: {err}", gr.update()

# Layout: credentials/model row, a document selector shared by all tabs, then
# three sibling tabs (Ingest, Ask, Google Drive).
with gr.Blocks(title="Phase 1 – RAG MVP") as demo:
    gr.Markdown("""
    # Phase 1 – RAG MVP
    **Bring Your Own OpenAI Key**. Ingest PDF/DOCX/TXT → chunk + embed → FAISS → ask with citations.
    """)

    with gr.Row():
        api_key = gr.Textbox(label="OpenAI API Key", type="password", placeholder="sk-...", show_label=True)
        chat_model = gr.Dropdown(choices=["gpt-4o-mini", "gpt-4o", "gpt-4.1-mini"], value=CHAT_MODEL_DEFAULT, label="Chat Model")
        embed_model = gr.Dropdown(choices=["text-embedding-3-small", "text-embedding-3-large"], value=EMBED_MODEL_DEFAULT, label="Embedding Model")

    # Shared document selector across tabs
    doc_selector = gr.CheckboxGroup(label="Available Documents (shared)", choices=[])

    with gr.Tab("Ingest"):
        files = gr.File(label="Upload files (PDF, DOCX, TXT)", file_count="multiple", type="filepath", file_types=[".pdf", ".docx", ".txt"])
        ingest_btn = gr.Button("Ingest")
        ingest_log = gr.Textbox(label="Ingestion Log", lines=10)
        ingest_btn.click(on_ingest, inputs=[api_key, embed_model, files], outputs=[ingest_log, doc_selector])

    with gr.Tab("Ask"):
        with gr.Row():
            query = gr.Textbox(label="Your question", placeholder="Ask me about your documents...", lines=3)
        with gr.Row():
            top_k = gr.Slider(1, 10, value=5, step=1, label="Top-K Chunks")
        ask_btn = gr.Button("Ask")
        answer_out = gr.Markdown()
        ask_btn.click(on_ask, inputs=[api_key, chat_model, embed_model, query, doc_selector, top_k], outputs=[answer_out,])

    with gr.Tab("Google Drive (Optional)"):
        drive_status = gr.Textbox(label="Status")
        save_dir = gr.Textbox(label="Save folder (e.g., /content/drive/MyDrive/RAG_MVP_Phase1)")

        with gr.Row():
            mount_btn = gr.Button("Mount Drive (Colab)")
        mount_btn.click(on_mount_drive, inputs=[], outputs=[drive_status, save_dir])

        with gr.Row():
            save_btn = gr.Button("Save State")
            load_btn = gr.Button("Load State")

        save_btn.click(on_save, inputs=[save_dir], outputs=[drive_status])
        load_btn.click(on_load, inputs=[save_dir], outputs=[drive_status, doc_selector])

    gr.Markdown("Built for fast iteration. ⚡️")

# Launch immediately to avoid ordering issues
demo.launch()

In [None]:
# CELL 10 - Mount Drive & paths
# Mounts Google Drive and makes sure the cohorts root folder exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
import json
import os
BASE_DRIVE = "/content/drive/MyDrive/MVP_RAG"
COHORTS_DIR = os.path.join(BASE_DRIVE, "cohorts")
os.makedirs(COHORTS_DIR, exist_ok=True)

print("Cohorts root:", COHORTS_DIR)

In [None]:
# CELL 11 - Imports + detect embedding dim
import faiss, pickle, shutil
import numpy as np
from typing import List, Dict
EMBED_MODEL_DEFAULT = "text-embedding-3-small"  # ensure matches Cell 0

def detect_embed_dim(client) -> int:
    """Detect the embedding width at runtime.

    Embeds one tiny sample with ``EMBED_MODEL_DEFAULT`` via ``embed_texts``
    and returns the length of the resulting vector.
    """
    probe_vectors = embed_texts(client, ["test"], model=EMBED_MODEL_DEFAULT)
    first_vector = probe_vectors[0]
    return len(first_vector)

In [None]:
# CELL 12 - Cohort filesystem helpers
def cohort_path(name: str) -> str:
    """Return the folder of cohort ``name`` directly under ``COHORTS_DIR``.

    The name must be a single path component. Empty names, ``.``/``..`` and
    anything containing a path separator are rejected, because the result is
    used for folder creation and for ``shutil.rmtree``: an empty name would
    otherwise resolve to the cohorts root itself.

    Raises:
        ValueError: ``name`` is not a non-empty single path component.
    """
    if not isinstance(name, str) or not name.strip():
        raise ValueError("Cohort name must be a non-empty string.")
    if name in (".", "..") or "/" in name or "\\" in name or os.path.basename(name) != name:
        raise ValueError(f"Invalid cohort name: {name!r}")
    return os.path.join(COHORTS_DIR, name)

def ensure_cohort_dirs(name: str):
    """Create the ``files`` and ``chunks`` sub-folders of cohort ``name`` if they are missing."""
    p = cohort_path(name)
    for sub in ("files", "chunks"):
        os.makedirs(os.path.join(p, sub), exist_ok=True)


def list_cohorts() -> List[str]:
    """Return the sorted names of the directories directly under ``COHORTS_DIR``.

    Returns ``[]`` when the cohorts root does not exist yet.
    """
    if not os.path.exists(COHORTS_DIR):
        return []
    return sorted(d for d in os.listdir(COHORTS_DIR) if os.path.isdir(os.path.join(COHORTS_DIR, d)))

In [None]:
# CELL 13 - Create, load, persist cohort

def create_cohort(name: str, client, description: str = "") -> Dict:
    """Create an empty cohort on disk and return its manifest.

    Writes an empty inner-product FAISS index, an empty metadata list and a
    manifest holding ``description``, ``embed_model`` and ``dim``.

    Raises:
        FileExistsError: a cohort folder with this name already exists.
    """
    p = cohort_path(name)
    if os.path.exists(p):
        raise FileExistsError(f"Cohort '{name}' exists")
    # Probe the embedding width before touching the disk, so a failed API call
    # does not leave a half-created folder that blocks the next attempt.
    dim = detect_embed_dim(client)
    ensure_cohort_dirs(name)
    index = faiss.IndexFlatIP(dim)  # use inner-product on normalized vectors
    manifest = {"description": description, "embed_model": EMBED_MODEL_DEFAULT, "dim": dim}
    persist_cohort(name, index, [], manifest=manifest)
    return manifest


def load_cohort(name: str):
    """Load a cohort from disk.

    Returns a dict with ``index`` (FAISS index), ``metadata`` (list of chunk
    entries), ``manifest`` (dict) and ``path`` (cohort folder).

    Raises:
        FileNotFoundError: the cohort folder does not exist.
    """
    p = cohort_path(name)
    if not os.path.exists(p):
        raise FileNotFoundError(name)
    index = faiss.read_index(os.path.join(p, "index.faiss"))
    # NOTE(review): unpickling can execute code embedded in the file; only
    # load cohort folders that this notebook wrote.
    with open(os.path.join(p, "metadata.pkl"), "rb") as f:
        metadata = pickle.load(f)
    with open(os.path.join(p, "manifest.json"), "r") as f:
        manifest = json.load(f)
    return {"index": index, "metadata": metadata, "manifest": manifest, "path": p}


def persist_cohort(name: str, index, metadata, manifest=None):
    """Write a cohort's index and metadata to disk, and its manifest when one is given."""
    p = cohort_path(name)
    faiss.write_index(index, os.path.join(p, "index.faiss"))
    with open(os.path.join(p, "metadata.pkl"), "wb") as f:
        pickle.dump(metadata, f)
    if manifest is not None:
        with open(os.path.join(p, "manifest.json"), "w") as f:
            json.dump(manifest, f)

In [None]:
# CELL 14 - Ingest files into cohort (relies on chunk_text and embed_texts)

def _normalize_vectors(np_arr: np.ndarray) -> np.ndarray:
    norms = np.linalg.norm(np_arr, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return np_arr / norms

def ingest_file_to_cohort(name: str, client, local_file_path: str, verbose=False) -> int:
    """Copy a file into a cohort, chunk and embed it, and append it to the cohort index.

    The file is read with ``load_file`` and split with ``chunk_text``. PDFs
    are chunked page by page so each metadata entry records its ``page``;
    other types get ``page=None``. Vectors are embedded with the model named
    in the cohort manifest so they match what ``query_cohort`` uses.

    Returns:
        Number of chunks added (0 when the file yields no chunks).

    Raises:
        FileNotFoundError: the cohort does not exist.
        ValueError: the file cannot be read, or the embedding width does not
            match the cohort index.
    """
    cpath = cohort_path(name)
    if not os.path.exists(cpath):
        raise FileNotFoundError(name)
    files_dir = os.path.join(cpath, "files")
    os.makedirs(files_dir, exist_ok=True)
    dest = os.path.join(files_dir, os.path.basename(local_file_path))
    # Re-ingesting a file that already lives in the cohort must not copy it onto itself.
    if os.path.abspath(local_file_path) != os.path.abspath(dest):
        shutil.copy(local_file_path, dest)

    doc = load_file(dest)
    if doc.get("filetype") == "pdf" and "pages" in doc:
        sections = [(pg["page"], (pg["text"] or "").strip()) for pg in doc["pages"]]
    else:
        sections = [(None, doc["text"])]

    # (chunk text, metadata) pairs, in the order they will enter the index.
    chunks = []
    for page_no, section_text in sections:
        if not section_text:
            continue
        for piece in chunk_text(section_text):
            chunks.append((piece, {"page": page_no}))
    texts = [piece for piece, _ in chunks]
    if not texts:
        return 0

    cohort = load_cohort(name)
    index = cohort["index"]
    metadata = cohort["metadata"]

    embed_model = cohort["manifest"].get("embed_model", EMBED_MODEL_DEFAULT)
    embeddings = embed_texts(client, texts, model=embed_model)
    emb_np = _normalize_vectors(np.array(embeddings, dtype=np.float32))
    if emb_np.shape[1] != index.d:
        raise ValueError(
            f"Embedding width {emb_np.shape[1]} does not match cohort index width {index.d}"
        )

    index.add(emb_np)
    source_name = os.path.basename(dest)
    for txt, meta in chunks:
        entry = {"chunk": txt, "source_file": source_name}
        entry.update(meta)
        metadata.append(entry)

    persist_cohort(name, index, metadata, manifest=cohort["manifest"])
    if verbose:
        print(f"Added {len(texts)} chunks to cohort '{name}'")
    return len(texts)

In [None]:
# CELL 15 - Delete file from cohort (rebuild index)

def delete_file_from_cohort(name: str, filename: str, client) -> bool:
    """Remove a file and all of its chunks from a cohort, then rebuild the index.

    The vectors of the remaining chunks are read back from the existing index,
    so no embedding calls are needed. Only if the index and the metadata list
    disagree in length are the remaining chunks embedded again through
    ``client``.

    Returns:
        ``True`` once the cohort has been persisted.
    """
    cohort = load_cohort(name)
    p = cohort["path"]
    # Only the bare file name is honoured, so nothing outside the cohort's
    # files folder can be removed.
    safe_name = os.path.basename(filename or "")
    target = os.path.join(p, "files", safe_name)
    if safe_name and os.path.isfile(target):
        os.remove(target)

    metadata = cohort["metadata"]
    old_index = cohort["index"]
    keep = [i for i, m in enumerate(metadata) if m.get("source_file") != filename]
    remaining = [metadata[i] for i in keep]

    new_index = faiss.IndexFlatIP(old_index.d)
    if keep:
        if old_index.ntotal == len(metadata):
            # Index rows were appended in metadata order; reuse the stored vectors.
            stored = old_index.reconstruct_n(0, old_index.ntotal)
            emb_np = np.ascontiguousarray(stored[keep], dtype=np.float32)
        else:
            texts = [m["chunk"] for m in remaining]
            embeddings = embed_texts(client, texts, model=cohort["manifest"].get("embed_model", EMBED_MODEL_DEFAULT))
            emb_np = _normalize_vectors(np.array(embeddings, dtype=np.float32))
            new_index = faiss.IndexFlatIP(emb_np.shape[1])
        new_index.add(emb_np)

    persist_cohort(name, new_index, remaining, manifest=cohort["manifest"])
    return True

In [None]:
# CELL 16 - Query cohort

def query_cohort(name: str, client, query: str, k: int = 5):
    """Search cohort ``name`` for the chunks most similar to ``query``.

    The query is embedded with the model named in the cohort manifest.

    Returns:
        ``(results, scores)``: the matching metadata entries and their
        similarity scores, best first. Both are empty for an empty cohort.
    """
    cohort = load_cohort(name)
    index = cohort["index"]
    metadata = cohort["metadata"]
    if not metadata or index.ntotal == 0:
        return [], []

    q_emb = embed_texts(client, [query], model=cohort["manifest"].get("embed_model", EMBED_MODEL_DEFAULT))
    q_np = _normalize_vectors(np.array(q_emb, dtype=np.float32))

    # Never ask for more neighbours than exist; FAISS pads the rest with -1.
    k = max(1, min(int(k), int(index.ntotal)))
    D, I = index.search(q_np, k)
    results = []
    scores = []
    for idx, score in zip(I[0], D[0]):
        # A -1 label means "no result"; as a list index it would pick the last entry.
        if 0 <= idx < len(metadata):
            results.append(metadata[int(idx)])
            scores.append(float(score))
    return results, scores

In [None]:
# CELL 17 - Minimal Gradio wiring for cohorts (adapt to your existing UI variables)
import gradio as gr
def ui_create_cohort(name, desc, api_key):
    """Gradio handler: create cohort ``name`` and return ``(cohort choices update, status)``."""
    from openai import OpenAI
    create_cohort(name, OpenAI(api_key=api_key), description=desc)
    refreshed = gr.update(choices=list_cohorts())
    status = f"Created cohort '{name}'"
    return refreshed, status

def ui_ingest_files(selected_cohort, uploaded_files, api_key):
    """Gradio handler: ingest every uploaded file into ``selected_cohort``.

    Uploads may be path strings or objects with a ``name`` attribute; ``None``
    is treated as no uploads. Returns a status line with the number of chunks
    added in total.
    """
    from openai import OpenAI
    client_local = OpenAI(api_key=api_key)
    added = 0
    for f in uploaded_files or []:
        path = f if isinstance(f, str) else f.name
        added += ingest_file_to_cohort(selected_cohort, client_local, path)
    return f"Added {added} chunks to '{selected_cohort}'"

def ui_list_files(selected_cohort):
    """Return the sorted names in the cohort's ``files`` folder, or ``[]`` if that folder does not exist."""
    files_dir = os.path.join(cohort_path(selected_cohort), "files")
    if os.path.exists(files_dir):
        return sorted(os.listdir(files_dir))
    return []

def ui_delete_file(selected_cohort, filename, api_key):
    """Gradio handler: delete ``filename`` from ``selected_cohort`` and return a status line."""
    from openai import OpenAI
    delete_file_from_cohort(selected_cohort, filename, OpenAI(api_key=api_key))
    status = f"Deleted {filename}"
    return status

def ui_delete_cohort(name):
    """Gradio handler: delete cohort ``name`` from disk.

    Returns ``(cohort choices update, status)``. An empty selection is refused
    before any path is built: joining an empty name onto the cohorts root
    yields the root itself, and removing that would delete every cohort.
    """
    if not name or not str(name).strip():
        return gr.update(choices=list_cohorts()), "Select a cohort to delete."
    p = cohort_path(name)
    if os.path.exists(p):
        shutil.rmtree(p)
    return gr.update(choices=list_cohorts()), f"Deleted cohort '{name}'"

