In [None]:
# Mount Google Drive at /content/drive; the later cells read and write under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os, textwrap
# Walk the mounted Drive and, for every folder that holds at least one .json
# file, print the folder path, the JSON count and up to five sample names.
base = '/content/drive/MyDrive'
for root, dirs, files in os.walk(base):
    jsons = [name for name in files if name.endswith('.json')]
    if not jsons:
        continue  # nothing to report for this folder
    print(f"\n📁 {root}\n    {len(jsons)} JSON files (showing up to 5):")
    for sample in jsons[:5]:
        print("     •", sample)



📁 /content/drive/MyDrive/Patient_records/fhir (FHIR R4 Synthea)
    1180 JSON files (showing up to 5):
     • Lucia634_Bahena335_f49e08bc-8983-4835-b87f-26adc9ab0e24.json
     • Allegra202_Nitzsche158_66d017d6-97ae-4de0-b02c-2c1caa5cddf6.json
     • Adam631_Cronin387_aff8f143-2375-416f-901d-b0e4c73e3e58.json
     • Vernon254_Kuphal363_278091a2-c98a-4324-bbc8-9d032aeddaa4.json
     • Gaynell126_Abshire638_243d664f-91db-4a01-b154-c23445218635.json


KeyboardInterrupt: 

In [None]:
# ⚠️ Run *once* per Colab session
!pip install --quiet \
      fhir.resources pandas pyarrow tqdm \
      sentence-transformers faiss-gpu \
      transformers accelerate bitsandbytes

# (Optional) uncomment if you want to track Hugging Face downloads
from huggingface_hub import login; login()

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/49.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss-gpu[0m[31m
[0m

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Print the CUDA version reported by the installed PyTorch build (torch.version.cuda).
# Only `torch` is used in this cell; the other imported modules are not referenced here.
import torch, subprocess, re, os, sys, textwrap
print(torch.version.cuda)         # e.g. '12.1'

12.4


In [None]:
%%bash
# Build FAISS with GPU support and Python bindings from source, then install
# the resulting Python package. Takes ~10 minutes on Colab A100.

# Fail fast: without this, a failed cmake/build step is ignored and pip then
# tries to install from a missing or half-built tree.
set -euo pipefail

apt-get update -qq && apt-get install -y -qq build-essential cmake libopenblas-dev swig libgflags-dev

# Re-running the cell must not die because the checkout already exists.
if [ ! -d faiss ]; then
    git clone --depth 1 https://github.com/facebookresearch/faiss.git
fi
cd faiss

# BUILD_TESTING=OFF skips the C++ unit tests, which this notebook never runs.
cmake -B build -DFAISS_ENABLE_GPU=ON -DFAISS_ENABLE_PYTHON=ON -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=Release .
cmake --build build -j "$(nproc)"
pip install ./build/faiss/python

Selecting previously unselected package libgflags2.2.
(Reading database ... (Reading database ... 5%(Reading database ... 10%(Reading database ... 15%(Reading database ... 20%(Reading database ... 25%(Reading database ... 30%(Reading database ... 35%(Reading database ... 40%(Reading database ... 45%(Reading database ... 50%(Reading database ... 55%(Reading database ... 60%(Reading database ... 65%(Reading database ... 70%(Reading database ... 75%(Reading database ... 80%(Reading database ... 85%(Reading database ... 90%(Reading database ... 95%(Reading database ... 100%(Reading database ... 126281 files and directories currently installed.)
Preparing to unpack .../libgflags2.2_2.2.2-2_amd64.deb ...
Unpacking libgflags2.2 (2.2.2-2) ...
Selecting previously unselected package libgflags-dev.
Preparing to unpack .../libgflags-dev_2.2.2-2_amd64.deb ...
Unpacking libgflags-dev (2.2.2-2) ...
Selecting previously unselected package swig4.0.
Preparing to unpack .../

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Cloning into 'faiss'...
  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value.  Or, use the <min>...<max> syntax
  to tell CMake that the project requires at least <min> but has been updated
  to work with policies introduced by <max> or earlier.


  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value.  Or, use the <min>...<max> syntax
  to tell CMake that the project requires at least <min> but has been updated
  to work with policies introduced by <max> or earlier.


  struct IndexFlatCodes : Index {
         ^


  struct IndexFlatCodes : Index {
         ^

  struct IndexFlat : IndexFlatCodes {
         ^

  struct IndexFlat1D : IndexFlatL2 {
         ^


In [None]:
import faiss, torch
import numpy as np

# Sanity-check the freshly built FAISS: report versions, then actually run a
# tiny add + search on GPU 0. Merely constructing StandardGpuResources and
# calling bool() on it is always truthy, so it proves nothing about the GPU path.
print("FAISS version :", faiss.__version__)          # expect 1.11.0
print("CUDA  version :", torch.version.cuda)         # 12.4
try:
    n = faiss.get_num_gpus()
    print("GPUs visible:", n)                        # expect ≥1
    if n < 1:
        raise RuntimeError("FAISS reports no visible GPUs")
    res = faiss.StandardGpuResources()
    probe_index = faiss.index_cpu_to_gpu(res, 0, faiss.IndexFlatL2(4))
    probe_vecs = np.eye(4, dtype="float32")
    probe_index.add(probe_vecs)
    # The nearest neighbour of the first stored vector must be itself (id 0).
    _, nearest = probe_index.search(probe_vecs[:1], 1)
    print("GPU index OK:", bool(nearest[0, 0] == 0))  # True → everything wired up
    del probe_index                                  # release the probe's GPU memory
except Exception as e:
    print("GPU test failed →", e)

FAISS version : 1.11.0
CUDA  version : 12.4
GPUs visible: 1
GPU index OK: True


In [None]:
# ──────────────────────────────────────────────
#  Ingest  ➜  flatten  ➜  embed  (schema‑agnostic)
# ──────────────────────────────────────────────
# NOTE(review): fhir.resources is also requested by the setup cell's pip command, and
# nothing in this cell imports it — this install looks redundant; confirm before removing.
!pip install fhir.resources

from pathlib import Path
from collections import defaultdict
import json, pandas as pd, tqdm, numpy as np, torch
from sentence_transformers import SentenceTransformer

# Input bundles live in RAW_DIR; derived artefacts are written to PROC_DIR.
RAW_DIR  = Path('/content/drive/MyDrive/Patient_records/fhir (FHIR R4 Synthea)')   # ← adjust if needed
PROC_DIR = Path('/content/drive/MyDrive/Patient_processed')
PROC_DIR.mkdir(parents=True, exist_ok=True)

# Sort the matches: glob order depends on the filesystem, and the order of
# `files` determines the row order of the docs / metadata / embeddings built
# below. A stable order keeps vector ids reproducible across runs.
files = sorted(RAW_DIR.rglob('*.json'))
print(f"📂  Found {len(files):,} JSON bundles")

# 1.  Flatten a handful of resource types without validation
def _concept_text(concept: dict) -> str:
    """Return a CodeableConcept-style dict's label.

    Prefers ``text``; otherwise the first non-empty ``coding[].display``;
    otherwise the empty string.
    """
    concept = concept or {}
    return concept.get("text") or next(
        (c.get("display") for c in concept.get("coding", []) if c.get("display")), "")


def bundle_to_rows(fp: Path) -> dict[str, list[dict]]:
    """Flatten one FHIR bundle JSON file into per-resource-type row dicts.

    Only ``Observation``, ``Condition`` and ``MedicationRequest`` resources are
    kept; every other resource type is ignored. No FHIR validation is done.

    Parameters
    ----------
    fp : Path
        Path to a JSON file with a top-level ``entry`` list.

    Returns
    -------
    dict[str, list[dict]]
        Maps resource type to rows with keys ``bundle_file``, ``resource_id``,
        ``patient_ref``, ``datetime`` and ``text``. For Condition and
        MedicationRequest, ``text`` is ``None`` when no label is available.
        Only ``medicationCodeableConcept`` is read, so a request that carries
        its medication some other way gets ``text=None``.

    Raises
    ------
    OSError, json.JSONDecodeError
        If the file cannot be read or is not valid JSON.
    """
    rows = defaultdict(list)
    # Explicit encoding so parsing does not depend on the platform default.
    with open(fp, encoding="utf-8") as f:
        bundle = json.load(f)

    for ent in bundle.get("entry", []):
        r = ent.get("resource")
        if not r:
            continue  # entry without a resource payload: nothing to flatten
        rtype = r.get("resourceType")
        base  = {"bundle_file": fp.name,
                 "resource_id": r.get("id"),
                 "patient_ref": r.get("subject", {}).get("reference")}

        if rtype == "Observation":
            code_text = _concept_text(r.get("code", {}))
            if "valueQuantity" in r:
                qty = r["valueQuantity"]
                # value and unit may each be absent; join whatever is present.
                val = " ".join(str(part)
                               for part in (qty.get("value"), qty.get("unit"))
                               if part is not None)
            elif "valueCodeableConcept" in r:
                val = _concept_text(r["valueCodeableConcept"])
            else:
                val = ""
            rows["Observation"].append({**base,
                                         "datetime": r.get("effectiveDateTime"),
                                         "text": f"{code_text}: {val}"})

        elif rtype == "Condition":
            # Same text → coding.display fallback as Observation; None if neither exists.
            rows["Condition"].append({**base,
                                      "datetime": r.get("onsetDateTime"),
                                      "text": _concept_text(r.get("code", {})) or None})

        elif rtype == "MedicationRequest":
            med = r.get("medicationCodeableConcept", {})
            rows["MedicationRequest"].append({**base,
                                              "datetime": r.get("authoredOn"),
                                              "text": _concept_text(med) or None})
    return rows

# Merge the rows of every bundle into one list per resource type.
tables = defaultdict(list)
for fp in tqdm.tqdm(files, desc="Parsing bundles"):
    per_bundle = bundle_to_rows(fp)
    for typ in per_bundle:
        tables[typ] += per_bundle[typ]

n_bundles = len(files)
n_rows = sum(map(len, tables.values()))
print(f"✅  Flattened {n_bundles:,} bundles → {n_rows:,} rows")

# (Optional) write one Parquet file per resource type into PROC_DIR for inspection
for typ, lst in tables.items():
    out_path = PROC_DIR / f"{typ}.parquet"
    pd.DataFrame(lst).to_parquet(out_path)

# 2.  Build docs + metadata
# docs[i] is the text to embed and meta[i] describes where it came from;
# the two lists are appended to in lock-step so their positions line up.
docs, meta = [], []
for typ, rows in tables.items():
    frame = pd.DataFrame(rows)
    for row in frame.itertuples(index=False):
        doc_text = f"[{typ}] {row.datetime} — {row.text}"
        meta_row = dict(resource_type=typ,
                        bundle_file=row.bundle_file,
                        resource_id=row.resource_id,
                        patient_ref=getattr(row, 'patient_ref', None))
        docs.append(doc_text)
        meta.append(meta_row)

print(f"✅  Created {len(docs):,} text chunks")

# 3.  Embed on GPU
# Embedding configuration: model name, outer chunk size, and target device.
EMB_MODEL = "pritamdeka/S-PubMedBERT-MS-MARCO"
BATCH_SZ = 1024
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

embedder = SentenceTransformer(EMB_MODEL, device=device)
embedder.max_seq_length = 256

# Encode docs in chunks of BATCH_SZ (one progress tick per chunk); each chunk
# is encoded with batch_size=64 and stored as float32.
vecs = []
for start in tqdm.trange(0, len(docs), BATCH_SZ, desc="Embedding"):
    chunk = docs[start:start + BATCH_SZ]
    chunk_vecs = embedder.encode(chunk,
                                 convert_to_numpy=True,
                                 device=device,
                                 batch_size=64,
                                 show_progress_bar=False)
    vecs.append(chunk_vecs.astype("float32"))

# Stack the per-chunk arrays into one (n_docs, dim) matrix.
X = np.vstack(vecs)
print("✅  Embeddings ready :", X.shape)


📂  Found 1,180 JSON bundles


Parsing bundles: 100%|██████████| 1180/1180 [01:10<00:00, 16.72it/s]


✅  Flattened 1,180 bundles → 282,797 rows
✅  Created 282,797 text chunks


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/388 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding: 100%|██████████| 277/277 [02:35<00:00,  1.79it/s]


✅  Embeddings ready : (282797, 768)


In [None]:
# ───────────────────────────────────────────────
#  Vector DB (build GPU index → save to Drive)
# ───────────────────────────────────────────────
import faiss, numpy as np, torch, pandas as pd
from pathlib import Path

# ==== configuration (edit if your paths differ) ====
PROC_DIR = Path('/content/drive/MyDrive/Patient_processed')
PROC_DIR.mkdir(parents=True, exist_ok=True)

INDEX_FILE = PROC_DIR / 'patient_docs_faiss.index'    # binary file
META_FILE = PROC_DIR / 'patient_docs_meta.parquet'    # parquet table

# ==== 1  build a GPU index if possible, else CPU ====
d = X.shape[1]                      # embedding dimension
cpu_index = faiss.IndexFlatL2(d)    # brute-force exact search

# Probe for GPUs once; the same answer decides both the build and the save path.
use_gpu = faiss.get_num_gpus() > 0
if not use_gpu:
    index = cpu_index
    print("🟡 GPU not found – using CPU index")
else:
    gpu_res = faiss.StandardGpuResources()
    index = faiss.index_cpu_to_gpu(gpu_res, 0, cpu_index)
    print("🟢 Using GPU FAISS index")

index.add(X)                        # X = numpy array (n_docs, d)

# ==== 2  persist: always save a CPU copy ====
if use_gpu:
    cpu_copy = faiss.index_gpu_to_cpu(index)
else:
    cpu_copy = index
faiss.write_index(cpu_copy, str(INDEX_FILE))

meta_table = pd.DataFrame(meta)
meta_table.to_parquet(META_FILE)

print(f"✔ Saved {len(X):,} vectors → {INDEX_FILE.name}")
print(f"✔ Saved metadata table    → {META_FILE.name}")

# ==== 3  retrieval helper ====
# Metadata tables already read from disk, keyed by file path, so repeated
# searches do not re-read the Parquet file on every call.
_META_CACHE: dict = {}


def _load_meta(path):
    """Return the metadata table stored at ``path``, reading it at most once."""
    key = str(path)
    if key not in _META_CACHE:
        _META_CACHE[key] = pd.read_parquet(path)
    return _META_CACHE[key]


def rag_search(query: str, k: int = 5):
    """
    Return [(snippet, metadata_dict, distance), …] sorted by distance (lower = closer).

    Uses the notebook-level ``embedder``, ``index``, ``docs`` and ``META_FILE``.

    Parameters
    ----------
    query : str
        Free-text query; embedded with the same model as the documents.
    k : int
        Maximum number of neighbours to return. Fewer are returned when the
        index holds fewer than ``k`` vectors.
    """
    dev = 'cuda' if torch.cuda.is_available() else 'cpu'
    vec = embedder.encode([query], convert_to_numpy=True, device=dev)
    # The index was built from float32 vectors; force the query to match.
    vec = np.ascontiguousarray(vec, dtype="float32")
    D, I = index.search(vec, k)
    meta_df = _load_meta(META_FILE)

    hits = []
    for dist, doc_id in zip(D[0], I[0]):
        doc_id = int(doc_id)
        if doc_id < 0:
            # FAISS pads missing neighbours with label -1; without this guard
            # docs[-1] would silently return the last document.
            continue
        hits.append((docs[doc_id], meta_df.iloc[doc_id].to_dict(), float(dist)))
    return hits

# ==== 4  quick smoke test ====
# Print distance and the first 120 characters of the top-3 hits.
# Loop names are deliberately not `d`: this cell stores the embedding dimension
# in the notebook-global `d`, and a loop variable of that name would overwrite it.
for snippet, _hit_meta, dist in rag_search("declining kidney function", k=3):
    print(f"{dist:7.4f} | {snippet[:120]}…")


🟢 Using GPU FAISS index
