In [None]:
!pip install pandas numpy requests lxml sentence-transformers chromadb

Collecting chromadb
  Downloading chromadb-1.4.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.3-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.39.1-py3-none-any.whl.metadata (2.5 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

## Project Scope

This system answers drug–drug interaction and contraindication questions
using authoritative pharmaceutical data sources. The system must refuse
to answer if sufficient evidence is not available.


In [None]:
import requests
import json

# Download a sample of drug labels from the openFDA label endpoint.
url = "https://api.fda.gov/drug/label.json?limit=500"

# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP failures here instead of letting them show
# up later as a KeyError on data["results"].
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()

print("Number of records:", len(data["results"]))

Number of records: 500


In [None]:
import pandas as pd

# Flatten every label into one row: generic name plus the two free-text
# sections this notebook uses as evidence.
records = []

for item in data["results"]:
    # `or` (rather than a .get default) also covers a key that is present but
    # holds None or an empty list, which would otherwise fail on [0].
    generic_names = (item.get("openfda") or {}).get("generic_name") or ["Unknown"]
    drug_name = generic_names[0]

    # Each section is a list of strings in the payload; join them into one text.
    interactions = " ".join(item.get("drug_interactions", []))
    contraindications = " ".join(item.get("contraindications", []))

    records.append({
        "drug": drug_name,
        "drug_interactions": interactions,
        "contraindications": contraindications
    })

df = pd.DataFrame(records)
df.head()

Unnamed: 0,drug,drug_interactions,contraindications
0,SILICEA,,
1,Unknown,,
2,POVIDONE-IODINE,,
3,Unknown,,
4,Unknown,Drug Interactions: Specific drug interaction s...,CONTRAINDICATIONS Ofloxacin ophthalmic solutio...


In [None]:
# Keep a label only when at least one of its two sections has non-blank text.
has_interactions = df["drug_interactions"].str.strip().ne("")
has_contraindications = df["contraindications"].str.strip().ne("")
df_clean = df[has_interactions | has_contraindications]

print("Usable records:", len(df_clean))
df_clean.head()

Usable records: 185


Unnamed: 0,drug,drug_interactions,contraindications
4,Unknown,Drug Interactions: Specific drug interaction s...,CONTRAINDICATIONS Ofloxacin ophthalmic solutio...
5,NAPROXEN,7 DRUG INTERACTIONS See Table 1 for clinically...,4 CONTRAINDICATIONS Naproxen tablets and napro...
12,TRAMETINIB,7 DRUG INTERACTIONS MEKINIST is indicated for ...,4 CONTRAINDICATIONS None. None. ( 4 )
13,Unknown,Drug Interactions In vitro studies were conduc...,CONTRAINDICATIONS Gabapentin tablets USP are c...
14,GLIMEPIRIDE,7 DRUG INTERACTIONS Certain medications may af...,4 CONTRAINDICATIONS Glimepiride tablets are co...


In [None]:
# One evidence row per non-empty section of each label, interaction first and
# contraindication second, tagged with the section type and the source.
section_kinds = (
    ("drug_interactions", "interaction"),
    ("contraindications", "contraindication"),
)

evidence = [
    {
        "drug": label["drug"],
        "type": kind,
        "text": label[column],
        "source": "FDA"
    }
    for _, label in df_clean.iterrows()
    for column, kind in section_kinds
    if label[column]
]

evidence_df = pd.DataFrame(evidence)
evidence_df.head()

Unnamed: 0,drug,type,text,source
0,Unknown,interaction,Drug Interactions: Specific drug interaction s...,FDA
1,Unknown,contraindication,CONTRAINDICATIONS Ofloxacin ophthalmic solutio...,FDA
2,NAPROXEN,interaction,7 DRUG INTERACTIONS See Table 1 for clinically...,FDA
3,NAPROXEN,contraindication,4 CONTRAINDICATIONS Naproxen tablets and napro...,FDA
4,TRAMETINIB,interaction,7 DRUG INTERACTIONS MEKINIST is indicated for ...,FDA


In [None]:
# Write the evidence table to disk without the index column.
evidence_csv_path = "pharma_evidence_fda.csv"
evidence_df.to_csv(evidence_csv_path, index=False)
print("Dataset saved!")

Dataset saved!


## Refusal Policy

The system must refuse to answer when:
1. The drug identity cannot be confidently determined.
2. No authoritative evidence exists for the query.
3. Evidence is incomplete, contradictory, or outdated.
4. Answering may result in unsafe medical guidance.


In [None]:
def should_refuse(drug_name, interactions, contraindications):
    """Decide whether a label record must be refused.

    Parameters
    ----------
    drug_name : str or None
        Generic name of the drug. A missing, blank or NaN name, or the
        placeholder "Unknown" (any casing), means the identity is unclear.
    interactions, contraindications : str or None
        Free-text evidence sections. Blank, whitespace-only, None and NaN
        values all count as "no evidence".

    Returns
    -------
    tuple
        ``(True, reason)`` when the record must be refused, otherwise
        ``(False, None)``.
    """
    def _is_blank(value):
        # Strings are blank when nothing is left after stripping whitespace.
        if isinstance(value, str):
            return not value.strip()
        # NaN is truthy in Python, so it has to be detected explicitly;
        # value != value is true only for NaN.
        if isinstance(value, float) and value != value:
            return True
        return not value

    name_is_placeholder = (
        isinstance(drug_name, str) and drug_name.strip().lower() == "unknown"
    )
    if _is_blank(drug_name) or name_is_placeholder:
        return True, "Drug identity is unclear."

    if _is_blank(interactions) and _is_blank(contraindications):
        return True, "No authoritative interaction or contraindication data found."

    return False, None

In [None]:
# Apply the refusal rule to every usable label and tabulate the verdicts.
refusal_results = []

for _, label in df_clean.iterrows():
    verdict = should_refuse(
        label["drug"],
        label["drug_interactions"],
        label["contraindications"],
    )
    # verdict is the (refuse, reason) pair returned by should_refuse.
    refusal_results.append(
        dict(zip(("drug", "refuse", "reason"), (label["drug"], *verdict)))
    )

refusal_df = pd.DataFrame(refusal_results)
refusal_df.head()


Unnamed: 0,drug,refuse,reason
0,Unknown,True,Drug identity is unclear.
1,NAPROXEN,False,
2,TRAMETINIB,False,
3,Unknown,True,Drug identity is unclear.
4,GLIMEPIRIDE,False,


In [None]:
import pandas as pd

# Load the evidence table back from the CSV file on disk.
evidence_csv = "pharma_evidence_fda.csv"
evidence_df = pd.read_csv(evidence_csv)
evidence_df.head()


Unnamed: 0,drug,type,text,source
0,Unknown,interaction,Drug Interactions: Specific drug interaction s...,FDA
1,Unknown,contraindication,CONTRAINDICATIONS Ofloxacin ophthalmic solutio...,FDA
2,NAPROXEN,interaction,7 DRUG INTERACTIONS See Table 1 for clinically...,FDA
3,NAPROXEN,contraindication,4 CONTRAINDICATIONS Naproxen tablets and napro...,FDA
4,TRAMETINIB,interaction,7 DRUG INTERACTIONS MEKINIST is indicated for ...,FDA


In [None]:
from sentence_transformers import SentenceTransformer

# Load the "all-MiniLM-L6-v2" sentence-transformers model; the same model
# object encodes both the evidence texts and the user queries in this notebook.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def build_embedding_text(row):
    """Render one evidence row as the labelled text block that gets embedded.

    Reads the 'drug', 'type' and 'text' entries of ``row``. The source line
    is always the literal "FDA". Every line carries a four-space indent, and
    the block starts with a newline and ends with an indented empty line.
    """
    indent = "    "
    fields = (
        f"Drug: {row['drug']}",
        f"Type: {row['type']}",
        f"Evidence: {row['text']}",
        "Source: FDA",
    )
    body = "".join(f"{indent}{field}\n" for field in fields)
    return "\n" + body + indent


In [None]:
# Turn every evidence row into its text block, then embed all blocks in one call.
embedding_texts = evidence_df.apply(build_embedding_text, axis=1)
texts = embedding_texts.tolist()

embeddings = embedding_model.encode(texts, show_progress_bar=True)

print("Total embeddings:", len(embeddings))


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Total embeddings: 331


In [None]:
import chromadb

# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when a collection with this name already exists on the client.
client = chromadb.Client()
collection = client.get_or_create_collection(name="pharma_fda_evidence")


In [None]:
# Store each text block with its embedding; the full evidence row travels
# along as metadata and ids are numbered in row order.
evidence_ids = [f"evidence_{position}" for position, _ in enumerate(texts)]
evidence_metadata = evidence_df.to_dict("records")

collection.add(
    ids=evidence_ids,
    documents=texts,
    embeddings=embeddings,
    metadatas=evidence_metadata,
)


In [None]:
def retrieve_evidence_safe(query, drug_name, top_k=5):
    """Retrieve evidence for ``query`` and keep only hits tied to ``drug_name``.

    Parameters
    ----------
    query : str
        Natural-language question; embedded with ``embedding_model``.
    drug_name : str
        Drug the evidence must relate to. A hit is kept when this name occurs
        (case-insensitively) in the hit's ``drug`` metadata or in its document
        text.
    top_k : int, default 5
        Number of nearest neighbours requested from the collection. Filtering
        happens afterwards, so fewer than ``top_k`` hits may be returned.

    Returns
    -------
    tuple of (list, list)
        ``(metadatas, documents)`` of the surviving hits, in retrieval order.
        Both lists are empty when ``drug_name`` is missing or blank.
    """
    # Fail closed on a missing drug name: "" is a substring of every string,
    # so without this guard a blank name would let every hit through, and
    # None would raise AttributeError on .lower().
    if not isinstance(drug_name, str) or not drug_name.strip():
        return [], []

    needle = drug_name.lower()
    query_embedding = embedding_model.encode([query])

    results = collection.query(
        query_embeddings=query_embedding,
        n_results=top_k
    )

    safe_docs = []
    safe_meta = []

    # Results are nested one list per query; a single query was sent.
    for meta, doc in zip(results["metadatas"][0], results["documents"][0]):
        # Reject evidence whose drug identity is unknown.
        if meta["drug"] == "Unknown":
            continue

        # Enforce drug-specific matching on the metadata or the document text.
        if needle in meta["drug"].lower() or needle in doc.lower():
            safe_docs.append(doc)
            safe_meta.append(meta)

    return safe_meta, safe_docs


In [None]:
# Demo: a query about a drug pair; refuse when nothing survives the filter.
meta, docs = retrieve_evidence_safe(
    query="Does Metformin interact with Ciprofloxacin?",
    drug_name="Metformin",
)

if docs:
    print("✅ Evidence found:")
    for evidence_text in docs:
        print(evidence_text)
else:
    print("❌ REFUSAL: No authoritative evidence found for this drug interaction.")


❌ REFUSAL: No authoritative evidence found for this drug interaction.


In [None]:
# Demo: print every evidence block retrieved for the Warfarin query.
warfarin_question = "Does Warfarin interact with Aspirin?"
metadata, docs = retrieve_evidence_safe(warfarin_question, drug_name="Warfarin")

if docs:
    print("\n".join(docs))



    Drug: NAPROXEN
    Type: interaction
    Source: FDA
    

    Drug: NAPROXEN
    Type: interaction
    Source: FDA
    

    Drug: NAPROXEN
    Type: interaction
    Source: FDA
    


## Claim-Level Verification Policy

Each retrieved evidence text is decomposed into atomic claims.
A claim is considered valid only if:
1. The drug identity matches the query drug.
2. The claim is explicitly supported by authoritative evidence.
3. The claim does not contradict other retrieved evidence.

Claims failing verification are discarded to prevent hallucinations.


In [None]:
import re

def extract_claims(text):
    """Split ``text`` into sentence-like claims.

    The text is cut wherever a period is followed by whitespace (the period
    itself is dropped). Each fragment is stripped, and only fragments longer
    than 20 characters are returned, in their original order.
    """
    claims = []
    for fragment in re.split(r'\.\s+', text):
        candidate = fragment.strip()
        if len(candidate) > 20:
            claims.append(candidate)
    return claims


In [None]:
# Three-sentence example used to show what extract_claims returns.
sample_text = """
Metformin may interact with fluoroquinolones.
This interaction can increase the risk of hypoglycemia.
Monitoring blood glucose is recommended.
"""

sample_claims = extract_claims(sample_text)
sample_claims


['Metformin may interact with fluoroquinolones',
 'This interaction can increase the risk of hypoglycemia',
 'Monitoring blood glucose is recommended']

In [None]:
def is_claim_about_drug(claim, drug_name):
    """Return True when ``claim`` mentions ``drug_name`` (case-insensitive).

    Parameters
    ----------
    claim : str
        A single claim sentence.
    drug_name : str
        Name to look for as a substring of the claim.

    Returns
    -------
    bool
        False when either argument is not a string or the drug name is blank.
        A blank name must never verify a claim: "" is a substring of every
        string, so it would otherwise match everything.
    """
    if not isinstance(claim, str) or not isinstance(drug_name, str):
        return False

    needle = drug_name.strip().lower()
    if not needle:
        return False

    return needle in claim.lower()


In [None]:
# A claim that never names the drug is not attributed to it.
is_claim_about_drug(
    claim="This interaction can increase hypoglycemia risk",
    drug_name="Metformin",
)


False

In [None]:
def verify_claims(evidence_docs, drug_name):
    """Sort the claims found in ``evidence_docs`` by whether they name the drug.

    Every document is split with ``extract_claims``. A claim goes to the
    verified list when ``is_claim_about_drug`` accepts it for ``drug_name``,
    otherwise to the rejected list. Order follows the documents and the
    claims within them.

    Returns
    -------
    tuple of (list, list)
        ``(verified_claims, rejected_claims)``.
    """
    verified_claims, rejected_claims = [], []

    for doc in evidence_docs:
        for claim in extract_claims(doc):
            bucket = (
                verified_claims
                if is_claim_about_drug(claim, drug_name)
                else rejected_claims
            )
            bucket.append(claim)

    return verified_claims, rejected_claims


In [None]:
# Retrieve and verify for the SAME drug inside this cell. Previously the cell
# verified whatever `docs` the last-run retrieval cell had left behind (the
# Warfarin query) against "Metformin", so every claim was rejected because of
# a variable mismatch rather than because of the evidence.
claim_query = "Does Warfarin interact with Aspirin?"
claim_drug = "Warfarin"

_, claim_docs = retrieve_evidence_safe(claim_query, drug_name=claim_drug)
verified, rejected = verify_claims(claim_docs, claim_drug)

print("✅ VERIFIED CLAIMS:")
for v in verified:
    print("-", v)

print("\n❌ REJECTED CLAIMS:")
for r in rejected:
    print("-", r)


✅ VERIFIED CLAIMS:

❌ REJECTED CLAIMS:
- Drug: NAPROXEN
    Type: interaction
    Evidence: 7 DRUG INTERACTIONS See Table 1 for clinically significant drug interactions with naproxen
- Table 1: Clinically Significant Drug Interactions with naproxen Drugs That Interfere with Hemostasis Clinical Impact: Naproxen and anticoagulants such as warfarin have a synergistic effect on bleeding
- The concomitant use of naproxen and anticoagulants have an increased risk of serious bleeding compared to the use of either drug alone
- Serotonin release by platelets plays an important role in hemostasis
- Case-control and cohort epidemiological studies showed that concomitant use of drugs that interfere with serotonin reuptake and an NSAID may potentiate the risk of bleeding more than an NSAID alone
- Aspirin Clinical Impact: A pharmacodynamic (PD) study has demonstrated an interaction in which lower dose naproxen (220mg/day or 220mg twice daily) interfered with the antiplatelet effect of low-dose imme

In [None]:
def final_decision(verified_claims):
    """Turn the verified claims into an answer-or-refuse verdict.

    Returns ``("ANSWER", verified_claims)`` when at least one claim was
    verified, otherwise ``("REFUSE", <reason string>)``.
    """
    has_claims = len(verified_claims) != 0
    if has_claims:
        return "ANSWER", verified_claims
    return "REFUSE", "No verified drug-specific claims found."


In [None]:
# Show the verdict and its payload on separate lines.
decision, output = final_decision(verified)

print(decision, output, sep="\n")


REFUSE
No verified drug-specific claims found.


In [None]:
def classify_evidence_type_strict(doc, drug_a, drug_b=None):
    """Classify how directly ``doc`` speaks about a drug or a drug pair.

    Parameters
    ----------
    doc : str
        Evidence text; matching is case-insensitive.
    drug_a : str
        Primary drug of the query.
    drug_b : str, optional
        Second drug of the pair, if the query names one.

    Returns
    -------
    tuple of (str, float)
        * ``("direct_explicit", 1.0)``: the two names are joined by "and" or
          "with", in either order.
        * ``("direct_cooccurrence", 0.6)``: both names occur somewhere in
          the text.
        * ``("indirect", 0.4)``: only ``drug_a`` occurs.
        * ``("generic", 0.2)``: neither case applies, or ``drug_a`` is blank.
    """
    doc_lower = doc.lower()
    name_a = (drug_a or "").strip().lower()
    name_b = (drug_b or "").strip().lower()

    # A blank name is a substring of every text; without this guard a missing
    # drug_a would be scored as "indirect" evidence for any document.
    if not name_a:
        return "generic", 0.2

    if name_b:
        # Explicit pair wording, symmetric in both connectives. Phrases such
        # as "concomitant use of A and B" already contain "A and B".
        explicit_patterns = (
            f"{name_a} and {name_b}",
            f"{name_b} and {name_a}",
            f"{name_a} with {name_b}",
            f"{name_b} with {name_a}",
        )
        if any(pattern in doc_lower for pattern in explicit_patterns):
            return "direct_explicit", 1.0

        # Both drugs are mentioned, but not in an explicit pair phrase.
        if name_a in doc_lower and name_b in doc_lower:
            return "direct_cooccurrence", 0.6

    # Only the primary drug is mentioned.
    if name_a in doc_lower:
        return "indirect", 0.4

    return "generic", 0.2



In [None]:
def compute_ecs_strict(docs, drug_a, drug_b=None):
    """Score a set of evidence documents for a drug or drug pair.

    Each document is classified with ``classify_evidence_type_strict``. The
    overall score is the highest per-document score.

    Returns
    -------
    tuple of (float, list)
        ``(best_score, details)`` where ``details`` holds one
        ``(evidence_type, score)`` tuple per document, in input order.
        ``(0.0, [])`` when ``docs`` is empty or None.
    """
    if not docs:
        return 0.0, []

    classified = (
        classify_evidence_type_strict(doc, drug_a, drug_b) for doc in docs
    )
    details = [(etype, score) for etype, score in classified]

    return max(score for _, score in details), details



In [None]:
def ecs_decision(ecs_score, meta=None):
    """Map an evidence score plus safety flags to a final decision.

    Parameters
    ----------
    ecs_score : float
        Evidence score, compared against the 0.8 / 0.6 / 0.4 thresholds below.
    meta : dict, optional
        Safety flags, all read with ``.get`` so missing keys count as unset:

        - unknown_drug (bool)
        - black_box_warning (bool)
        - severity ("minor" | "moderate" | "major")
        - conditional_safe (bool)
        - too_broad (bool)

        ``None`` (the default) is treated as an empty dict, so a call with
        only a score falls through to the plain thresholds.

    Returns
    -------
    str
        "ANSWER", "PARTIAL" or "REFUSE". The rules are checked in the order
        written below, and the first one that applies decides.
    """
    meta = meta or {}

    # 1. Unknown or hallucination-prone drug -> refuse.
    if meta.get("unknown_drug", False):
        return "REFUSE"

    # 2. Overly broad question -> refuse.
    if meta.get("too_broad", False):
        return "REFUSE"

    # 3. Black-box warnings are answered regardless of the score.
    if meta.get("black_box_warning", False):
        return "ANSWER"

    # 4. Major interactions are answered once the score reaches 0.6.
    if meta.get("severity") == "major" and ecs_score >= 0.6:
        return "ANSWER"

    # 5. Conditionally allowed combinations get a partial answer, even when
    #    the score alone would qualify for a full answer.
    if meta.get("conditional_safe", False):
        return "PARTIAL"

    # 6. Default score thresholds.
    if ecs_score >= 0.8:
        return "ANSWER"
    elif ecs_score >= 0.4:
        return "PARTIAL"
    else:
        return "REFUSE"



In [None]:
# End-to-end demo for the Warfarin / Aspirin question.
# NOTE(review): build_meta is defined in a later cell of this notebook, so on
# a fresh kernel this cell raises NameError until that cell has been run.
warfarin_query = "Does Warfarin interact with Aspirin?"
metadata, docs = retrieve_evidence_safe(warfarin_query, drug_name="Warfarin")

ecs, details = compute_ecs_strict(docs, "Warfarin", "Aspirin")
meta = build_meta(metadata, details)
decision = ecs_decision(ecs, meta)

for label, value in (("ECS:", ecs), ("Meta:", meta), ("Decision:", decision)):
    print(label, value)


ECS: 0.6
Decision: PARTIAL


In [None]:
def build_meta(retrieved_metadata, ecs_details):
    """Derive the safety flags consumed by ``ecs_decision``.

    Parameters
    ----------
    retrieved_metadata : iterable of dict
        Metadata of the retrieved evidence; only each item's "text" entry is
        read (a missing entry counts as empty text).
    ecs_details : list of (str, float)
        Per-document ``(evidence_type, score)`` pairs.

    Returns
    -------
    dict
        Keys ``unknown_drug``, ``black_box_warning``, ``severity``,
        ``conditional_safe`` and ``too_broad``. Severity follows the evidence
        type with the highest positive score: "direct_explicit" maps to
        "major", "direct_cooccurrence" to "moderate", anything else to
        "minor". ``unknown_drug`` and ``too_broad`` are always False here.
    """
    # First entry with the highest strictly positive score; "" when none exists.
    top_type, top_score = "", 0
    for etype, score in ecs_details or ():
        if score > top_score:
            top_type, top_score = etype, score

    severity_by_type = {
        "direct_explicit": "major",
        "direct_cooccurrence": "moderate",
    }

    evidence_texts = [item.get("text", "").lower() for item in retrieved_metadata]
    caution_words = ("conditional", "caution", "monitor")

    return {
        "unknown_drug": False,
        "black_box_warning": any(
            "black box warning" in text for text in evidence_texts
        ),
        "severity": severity_by_type.get(top_type, "minor"),
        "conditional_safe": any(
            word in text for text in evidence_texts for word in caution_words
        ),
        "too_broad": False,
    }

def generate_safe_response(decision, drug_a, drug_b=None):
    """Render the user-facing message for a decision.

    Parameters
    ----------
    decision : str
        "ANSWER", "PARTIAL" or "REFUSE".
    drug_a : str
        Primary drug named in the message.
    drug_b : str, optional
        Second drug; appended as " and <drug_b>" when given.

    Returns
    -------
    str
        The message for the decision. Any unrecognised decision gets the
        refusal message, so an unexpected value can never produce an empty
        (None) response.
    """
    pair = f"{drug_a}" + (f" and {drug_b}" if drug_b else "")

    if decision == "PARTIAL":
        return (
            f"Related FDA evidence suggests potential safety concerns involving {pair}"
            + ", but no explicit authoritative statement confirms a direct interaction. "
            "Clinical monitoring and professional consultation are advised."
        )

    if decision == "ANSWER":
        return (
            f"Authoritative FDA evidence confirms an interaction involving {pair}"
            + ". Appropriate precautions should be taken."
        )

    # "REFUSE" and anything unexpected: fail closed with the refusal text.
    return (
        "There is insufficient authoritative evidence to determine this interaction. "
        "To ensure safety, the system cannot provide a definitive answer."
    )


In [None]:
# Render the message for the current `decision` value for the Warfarin / Aspirin pair.
print(generate_safe_response(decision, "Warfarin", "Aspirin"))


Related FDA evidence suggests potential safety concerns involving Warfarin and Aspirin, but no explicit authoritative statement confirms a direct interaction. Clinical monitoring and professional consultation are advised.


In [None]:
# 1️⃣ Helper: extract drug name from query
def extract_drug_from_query(query, known_drugs):
    """Find the known drug that is named in ``query``.

    Parameters
    ----------
    query : str
        User question; matching is a case-insensitive substring test.
    known_drugs : iterable
        Candidate drug names. Entries that are not strings (for example NaN)
        or that are blank are ignored.

    Returns
    -------
    str or None
        The longest matching name, so "NAPROXEN SODIUM" wins over "NAPROXEN"
        whatever the order of ``known_drugs``. Ties go to the earliest entry.
        None when nothing matches or ``query`` is not a string.
    """
    if not isinstance(query, str):
        return None

    query_lower = query.lower()
    matches = [
        drug
        for drug in known_drugs
        # Blank names are skipped: "" is a substring of every query.
        if isinstance(drug, str) and drug.strip() and drug.lower() in query_lower
    ]

    return max(matches, key=len) if matches else None


# 2️⃣ Build known drug list from your dataset
# Distinct drug names in the evidence table, without the "Unknown" placeholder.
known_drugs = list(
    {name for name in evidence_df["drug"].unique() if name != "Unknown"}
)


# 3️⃣ Safe auto-retrieval wrapper
def retrieve_evidence_auto(query, top_k=5):
    """Retrieve evidence for ``query`` after detecting its drug automatically.

    The drug is looked up in the module-level ``known_drugs`` list with
    ``extract_drug_from_query``. When no known drug is found, the result is
    the empty pair ``([], [])``. Otherwise the call is forwarded to
    ``retrieve_evidence_safe`` with the detected drug and ``top_k``.
    """
    detected_drug = extract_drug_from_query(query, known_drugs)

    if detected_drug:
        return retrieve_evidence_safe(query, detected_drug, top_k)

    # No recognizable drug -> nothing to retrieve.
    return [], []


In [None]:
# End-to-end demo: auto-detect the drug, score the evidence, decide, respond.
query = "Does Naproxen interact with Warfarin?"
drug_pair = ("Naproxen", "Warfarin")

metadata, docs = retrieve_evidence_auto(query)

ecs, details = compute_ecs_strict(docs, *drug_pair)
meta = build_meta(metadata, details)
decision = ecs_decision(ecs, meta)

print("ECS:", ecs)
print("Decision:", decision)
print(generate_safe_response(decision, *drug_pair))


TypeError: ecs_decision() missing 1 required positional argument: 'meta'