In [1]:
# Install requests if it is not already available (run this in its own cell):
# %pip install requests

import requests
import json
from pathlib import Path


In [3]:
import requests
import json
from pathlib import Path

# Pull OpenFDA drug labels in a single request and keep an untouched copy on disk.
# NOTE(review): 1000 is treated here as the per-request maximum for `limit`;
# fetching more labels would require paging — confirm against the openFDA docs.
url = "https://api.fda.gov/drug/label.json"
params = {
    "limit": 1000  # Pull maximum allowed in one shot
}

# requests applies no timeout by default, so without one a stalled connection
# would block this cell forever.
response = requests.get(url, params=params, timeout=60)
# Fail loudly on HTTP errors (rate limiting, 5xx, ...) instead of silently
# saving the API's error payload and reporting "0 records".
response.raise_for_status()
data = response.json()

# Save RAW: the unmodified API payload, so later cells can re-parse it
# without hitting the network again.
raw_output = Path("E:/MiiHA/app/data/raw/openfda_all_drugs.json")
raw_output.parent.mkdir(parents=True, exist_ok=True)

with open(raw_output, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)

print(f"✅ Saved raw OpenFDA drugs: {len(data.get('results', []))} records")


✅ Saved raw OpenFDA drugs: 1000 records


In [4]:
# Load RAW
with open(raw_output, "r", encoding="utf-8") as f:
    raw_data = json.load(f)


def _first_value(section, key, default=""):
    """Return the first element of the list stored under ``key`` in ``section``.

    Args:
        section: Mapping to look the key up in (a label record or its
            ``openfda`` sub-object).
        key: Name of the field to read.
        default: Value returned when the key is missing or does not hold a
            non-empty list.

    Returns:
        The first list element, or ``default``. Unlike a bare ``[...][0]``,
        an empty list yields ``default`` instead of raising ``IndexError``.
    """
    values = section.get(key)
    if isinstance(values, list) and values:
        return values[0]
    return default


# Parse Meaningful Text
# A record is kept only if it has usage and/or warnings text; that combined
# text becomes its "description".
processed = []
for item in raw_data.get("results", []):
    usage_text = _first_value(item, "indications_and_usage")
    warning_text = _first_value(item, "warnings")
    description = (usage_text + "\n" + warning_text).strip()
    if not description:
        continue

    # Fetch the openfda sub-object once; `or {}` also covers an explicit null.
    openfda = item.get("openfda") or {}
    processed.append({
        "id": item.get("id", ""),
        "brand_name": _first_value(openfda, "brand_name", "Unknown"),
        "generic_name": _first_value(openfda, "generic_name", "Unknown"),
        "route": _first_value(openfda, "route", "Unknown"),
        "purpose": _first_value(item, "purpose"),
        "description": description,
    })

# Write the processed records as JSON Lines: one JSON object per line.
processed_output = Path("E:/MiiHA/app/data/processed/openfda_all_drugs.jsonl")
processed_output.parent.mkdir(parents=True, exist_ok=True)

with processed_output.open("w", encoding="utf-8") as jsonl_file:
    jsonl_file.writelines(json.dumps(entry) + "\n" for entry in processed)

print(f"✅ Saved processed OpenFDA data: {len(processed)} records")


✅ Saved processed OpenFDA data: 974 records


In [7]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Read the processed JSON Lines file back: each line is one drug record.
with open("E:/MiiHA/app/data/processed/openfda_all_drugs.jsonl", "r", encoding="utf-8") as jsonl_file:
    docs = [json.loads(raw_line) for raw_line in jsonl_file]

# The "description" field of each record is the text that gets embedded.
texts = [entry["description"] for entry in docs]

# Embed every description with the MiniLM sentence-transformer model.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.encode(texts, show_progress_bar=True)

# Build FAISS: an L2 index sized to the embedding width, fed float32 vectors.
dimension = embeddings.shape[1]
vectors = np.array(embeddings).astype("float32")
index = faiss.IndexFlatL2(dimension)
index.add(vectors)

# Persist the index (write_index is given the path as a string).
faiss_output_path = Path("E:/MiiHA/app/db/openfda_all_drugs.index")
faiss_output_path.parent.mkdir(parents=True, exist_ok=True)
faiss.write_index(index, str(faiss_output_path))

# Save metadata: one entry per document, in the same order as `docs`,
# keeping only the listed fields (the description text is not included).
metadata = [
    {field: entry[field] for field in ("id", "brand_name", "generic_name", "purpose", "route")}
    for entry in docs
]

metadata_output_path = Path("E:/MiiHA/app/data/metadata/openfda_all_drugs_metadata.json")
metadata_output_path.parent.mkdir(parents=True, exist_ok=True)
with metadata_output_path.open("w", encoding="utf-8") as metadata_file:
    json.dump(metadata, metadata_file, indent=2)

print(f"✅ FAISS index and metadata saved: {len(metadata)} entries")


Batches: 100%|██████████| 31/31 [00:09<00:00,  3.16it/s]

✅ FAISS index and metadata saved: 974 entries



