Create a vector store of 80 question -> SQL example pairs



1.   text-embedding-3-large embeds only the question text; the canonical SQL query (one SQL query per intent) is stored as the payload
2.   FAISS is the vector store






# 0. Mount drive and define paths

# Installs

In [None]:
from google.colab import drive

# Attach Google Drive so the project folder is reachable under /content/drive.
drive.mount("/content/drive")

# Project locations; the example file and the output folder both hang off DEV_PATH.
DEV_PATH = "/content/drive/MyDrive/210_Capstone/210_Factory/210_dev"
EXAMPLE_PAIR_PATH = f"{DEV_PATH}/medintellagent_examples_all_intents.json"  # or .jsonl
OUT_DIR = f"{DEV_PATH}/vectorstores/medintellagent_faiss_v1"


Mounted at /content/drive


In [None]:
# Echo the configured base path as this cell's output.
DEV_PATH

'/content/drive/MyDrive/210_Capstone/210_Factory/210_dev'

# OpenAI Access Checks

In [None]:
!pip -q install --upgrade openai langchain langchain-community langchain-openai faiss-cpu pandas tqdm


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m964.9/964.9 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m76.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m97.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does n

In [None]:
# OpenAI access check: load the API key, then smoke-test the embedding and chat endpoints.

import os

from google.colab import userdata
from openai import OpenAI

# 1) Fetch the secret stored in Colab (left sidebar → Secrets → OPENAI_API_KEY)
key = userdata.get("OPENAI_API_KEY")
if not key:
    raise ValueError("Add OPENAI_API_KEY via the Colab 'Secrets' panel first.")
os.environ["OPENAI_API_KEY"] = key  # OpenAI() will read this

# 2) Smoke tests
client = OpenAI()

# Embedding test (model of choice: text-embedding-3-large); prints the vector length
emb = client.embeddings.create(
    input="What were my most recent lab results?",
    model="text-embedding-3-large",
)
print("Embedding dim:", len(emb.data[0].embedding))

# Chat test (SQL generation model); prints the model's reply text
resp = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "Return a single PostgreSQL SELECT only."},
        {"role": "user", "content": "demo: SELECT 1;"},
    ],
    model="gpt-4o-mini",
    temperature=0,
)
print(resp.choices[0].message.content)

Embedding dim: 3072
```sql
SELECT 1;
```


# Load example pairs from JSON

In [None]:
import json, pathlib

def load_examples(path: str):
    """Read example records from a ``.json`` or ``.jsonl`` file.

    Args:
        path: Location of the examples file.

    Returns:
        For ``.json``: whatever ``json.load`` yields for the file.
        For ``.jsonl``: a list holding one parsed object per non-blank line.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the suffix is neither ``.json`` nor ``.jsonl``
            (compared case-insensitively).
    """
    source = pathlib.Path(path)
    if not source.exists():
        raise FileNotFoundError(f"Not found: {source}")

    ext = source.suffix.lower()
    if ext not in (".json", ".jsonl"):
        raise ValueError("File must be .json or .jsonl")

    with source.open("r", encoding="utf-8") as handle:
        if ext == ".json":
            return json.load(handle)
        # JSON Lines: strip whitespace, skip blank lines, parse the rest in order.
        stripped_rows = (raw.strip() for raw in handle)
        return [json.loads(row) for row in stripped_rows if row]

records = load_examples(EXAMPLE_PAIR_PATH)

# Keep a record only when it has both a non-empty question and non-empty SQL,
# and its (whitespace-trimmed) question has not been seen yet: first occurrence wins.
seen = set()
clean = []
for record in records:
    question = (record.get("question") or "").strip()
    sql = (record.get("sql") or "").strip()
    if question and sql and question not in seen:
        seen.add(question)
        clean.append(
            dict(
                question=question,
                sql=sql,
                intent=record.get("intent"),
                tables=record.get("tables", []),
            )
        )

print(f"Loaded {len(records)} records; using {len(clean)} unique question→SQL pairs.")
assert len(clean) >= 1, "No valid examples loaded."


Loaded 80 records; using 80 unique question→SQL pairs.


# Embed Questions and Build FAISS

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from pathlib import Path

# Single source of truth for the embedding model: the same value configures the
# embedder and is recorded in the manifest, so the two can never disagree.
EMBEDDING_MODEL = "text-embedding-3-large"
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)  # keep same at query time

# Only the question text is embedded; SQL, intent and tables travel as metadata.
texts = [r["question"] for r in clean]
metadatas = [{"sql": r["sql"], "intent": r.get("intent"), "tables": r.get("tables")} for r in clean]

vectorstore = FAISS.from_texts(texts=texts, embedding=embeddings, metadatas=metadatas)

# Persist the index under OUT_DIR, creating the folder (and parents) if needed.
out_dir = Path(OUT_DIR)
out_dir.mkdir(parents=True, exist_ok=True)
vectorstore.save_local(str(out_dir))

# Write a tiny manifest next to the index describing how it was built.
manifest = {
    "model": EMBEDDING_MODEL,
    "examples": len(texts),
    "source_file": EXAMPLE_PAIR_PATH,
    "note": "MedIntellAgent FAISS index; embeddings of question text only; metadata carries SQL/intent/tables."
}
with open(out_dir / "manifest.json", "w", encoding="utf-8") as f:
    json.dump(manifest, f, ensure_ascii=False, indent=2)

print(f"Saved FAISS index to: {out_dir}")


Saved FAISS index to: /content/drive/MyDrive/210_Capstone/210_Factory/210_dev/vectorstores/medintellagent_faiss_v1


# Reload the FAISS index from Google Drive and run a test query to confirm it loaded

In [None]:
# Reload the saved index and run a similarity search.
# NOTE: allow_dangerous_deserialization=True permits unpickling the stored docstore.
# That is acceptable here only because the files were written by this notebook;
# do not point this at an index from an untrusted source.
reloaded = FAISS.load_local(str(out_dir), embeddings, allow_dangerous_deserialization=True)

# NOTE(review): this query looks intentionally off-topic for a medical Q→SQL store,
# presumably to see what scores an unrelated question gets — confirm intent.
query = "My love for Pink Floyd knows no bounds"
docs_and_scores = reloaded.similarity_search_with_score(query, k=3)

# With LangChain's default FAISS setup the score is a distance, so lower means closer.
for rank, (doc, score) in enumerate(docs_and_scores, 1):
    # Normalise a missing/None SQL payload to "" so slicing below cannot raise TypeError.
    sql = doc.metadata.get("sql") or ""
    print(f"\n#{rank}  score={score:.4f}")
    print("Q:", doc.page_content)
    print("intent:", doc.metadata.get("intent"))
    print("tables:", doc.metadata.get("tables"))
    # Show at most the first 200 characters, with an ellipsis when truncated.
    print("SQL:", sql[:200] + ("..." if len(sql) > 200 else ""))



#1  score=1.7321
Q: Display my heart rate trend over time.
intent: vital_sign_trends
tables: ['observations']
SQL: SELECT
  o.patient_id,
  COALESCE(o.display, o.loinc_code) AS vital_name,
  o.value_num  AS value,
  o.value_unit AS unit,
  o.effective_datetime
FROM observations AS o
WHERE o.patient_id = :patient_i...

#2  score=1.7610
Q: Show my latest bloodwork.
intent: recent_lab_results
tables: ['observations']
SQL: SELECT *
FROM (
  SELECT DISTINCT ON (o.patient_id, COALESCE(o.loinc_code, o.display))
    o.patient_id,
    COALESCE(o.display, o.loinc_code) AS test_name,
    o.value_num AS value,
    o.value_unit ...

#3  score=1.7692
Q: Show my current meds and dosages.
intent: current_medications
tables: ['medication_requests']
SQL: SELECT DISTINCT ON (mr.patient_id, mr.med_name)
  mr.patient_id,
  mr.med_name AS medication,
  mr.dose,
  mr.route,
  mr.start_datetime,
  mr.end_datetime,
  mr.refills
FROM medication_requests mr
WH...
