In [14]:
import re
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from typing import Dict, Optional

def _clean_label(label: str) -> str:
    return re.sub(r"\s+", " ", (label or "")).strip()

def _requests_session(user_agent: str = "pdf-to-json-row/1.0 (you@example.com)") -> requests.Session:
    """Create a ``requests.Session`` with a retrying HTTPS adapter and a custom User-Agent.

    The retry policy allows up to 5 retries overall (at most 3 on connect and
    5 on read), uses a backoff factor of 0.8, and retries GET and POST requests
    that come back with status 429, 500, 502, 503 or 504. The adapter is
    mounted for the ``https://`` prefix only.

    Args:
        user_agent: Value sent in the ``User-Agent`` header of every request.

    Returns:
        The configured session.
    """
    retry_policy = Retry(
        total=5,
        connect=3,
        read=5,
        backoff_factor=0.8,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "POST"],
    )
    session = requests.Session()
    session.headers.update({"User-Agent": user_agent})
    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
    return session

def _mediawiki_exact_qid(s: requests.Session, label_en: str) -> Optional[str]:
    # Try to find an exact English label match first (fast & robust).
    r = s.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbsearchentities",
            "format": "json",
            "language": "en",
            "search": label_en,
            "limit": 5,
            "strictlanguage": 1
        },
        timeout=(5, 30)
    )
    r.raise_for_status()
    hits = r.json().get("search", [])
    for h in hits:
        if h.get("label", "") == label_en:
            return h.get("id")
    # fall back to first hit if you want a “best” match:
    # return hits[0]["id"] if hits else None
    return None

def _claims_for_qid(s: requests.Session, qid: str) -> Dict[str, str]:
    r = s.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbgetentities",
            "format": "json",
            "ids": qid,
            "props": "claims"
        },
        timeout=(5, 30)
    )
    r.raise_for_status()
    ent = r.json()["entities"][qid].get("claims", {})
    def first(prop):
        try:
            return ent[prop][0]["mainsnak"]["datavalue"]["value"]
        except Exception:
            return ""
    omim = str(first("P492") or "")
    orpha = str(first("P1550") or "")
    return {"OMIM": f"OMIM:{omim}" if omim else "", "OrphaNet": f"Orphanet:{orpha}" if orpha else ""}

def _sparql_quote(text: str) -> str:
    """Return ``text`` as a double-quoted SPARQL string literal with escapes applied."""
    escaped = (
        text.replace("\\", "\\\\")  # backslash first, so later escapes are not doubled
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
        .replace("\t", "\\t")
    )
    return f'"{escaped}"'


def _orphanet_id_from_ols_doc(doc: dict) -> str:
    """Extract the bare Orphanet identifier from one OLS search document, or ``""``."""
    # NOTE(review): OLS search docs appear to expose ``obo_id`` as a plain string
    # ("Orphanet:558") next to ``short_form`` ("Orphanet_558") - confirm against a
    # live response. Both a single string and a list are accepted, with either
    # separator, so that iterating a string character by character can no longer
    # make the lookup silently find nothing.
    for field in ("obo_id", "short_form", "iri"):
        raw = doc.get(field)
        candidates = raw if isinstance(raw, (list, tuple)) else [raw]
        for candidate in candidates:
            if not isinstance(candidate, str):
                continue
            found = re.search(r"Orphanet[_:]([A-Za-z0-9]+)$", candidate)
            if found:
                return found.group(1)
    return ""


def resolve_omim_and_orphanet_from_disease(label: str) -> Dict[str, str]:
    """
    Returns {"OMIM": "OMIM:123456", "OrphaNet": "Orphanet:123"} (or blanks if not found).

    Strategy:
      1. MediaWiki label search -> QID -> claims (OMIM and Orphanet).
      2. SPARQL exact English label lookup, only if step 1 produced neither id.
      3. EBI OLS (ORDO) exact-label lookup, whenever the Orphanet id is still
         missing - including when step 1 found an OMIM id only.

    Every stage is best-effort: a failure is reported with ``print`` and the
    next stage is tried, so this function does not raise on network errors.
    The label is whitespace-normalised first; a blank label returns blanks
    without any network access.
    """
    q = _clean_label(label)
    if not q:
        return {"OMIM": "", "OrphaNet": ""}

    omim, orpha = "", ""
    s = _requests_session()
    try:
        # 1) MediaWiki exact label → QID → claims
        try:
            qid = _mediawiki_exact_qid(s, q)
            if qid:
                ids = _claims_for_qid(s, qid)
                omim = ids.get("OMIM", "") or ""
                orpha = ids.get("OrphaNet", "") or ""
        except Exception as e:  # deliberate catch-all: fall through to the next source
            print(f"⚠️ MediaWiki lookup failed for '{q}': {e}")

        # 2) SPARQL fallback (exact English label), only when step 1 found nothing
        if not (omim or orpha):
            sparql = "\n".join([
                "SELECT ?omim ?orpha WHERE {",
                f"  ?d rdfs:label {_sparql_quote(q)}@en .",
                "  OPTIONAL { ?d wdt:P492 ?omim. }",
                "  OPTIONAL { ?d wdt:P1550 ?orpha. }",
                # Without this, LIMIT 1 may select a same-named entity carrying neither id.
                "  FILTER(BOUND(?omim) || BOUND(?orpha))",
                "}",
                "ORDER BY ?d",
                "LIMIT 1",
            ])
            try:
                resp = s.post(
                    "https://query.wikidata.org/sparql",
                    data={"query": sparql},
                    headers={"Accept": "application/sparql-results+json"},
                    timeout=(5, 60),
                )
                resp.raise_for_status()
                rows = resp.json()["results"]["bindings"]
                if rows:
                    first_row = rows[0]
                    omim_value = (first_row.get("omim") or {}).get("value", "") or ""
                    orpha_value = (first_row.get("orpha") or {}).get("value", "") or ""
                    omim = f"OMIM:{omim_value}" if omim_value else ""
                    orpha = f"Orphanet:{orpha_value}" if orpha_value else ""
            except Exception as e:
                print(f"⚠️ SPARQL failed for '{q}': {e}")

        # 3) Orphanet fallback via EBI OLS if still missing
        if not orpha:
            try:
                ols = s.get(
                    "https://www.ebi.ac.uk/ols4/api/search",
                    params={"q": q, "ontology": "ordo", "queryFields": "label", "exact": "true"},
                    timeout=(5, 30),
                )
                ols.raise_for_status()
                docs = (ols.json().get("response") or {}).get("docs") or []
                for doc in docs:
                    orpha_value = _orphanet_id_from_ols_doc(doc)
                    if orpha_value:
                        orpha = f"Orphanet:{orpha_value}"
                        break
            except Exception as e:
                print(f"⚠️ OLS lookup failed for '{q}': {e}")
    finally:
        # Release pooled connections; a fresh session is created on every call.
        s.close()

    return {"OMIM": omim, "OrphaNet": orpha}

In [16]:
# Smoke test: resolve a label for which both identifiers were found in the
# recorded run. This performs live HTTP requests.
ids = resolve_omim_and_orphanet_from_disease("Marfan syndrome")
print(ids)  # Expect something like {'OMIM': 'OMIM:154700', 'OrphaNet': 'Orphanet:558'}

{'OMIM': 'OMIM:154700', 'OrphaNet': 'Orphanet:558'}


In [20]:
# A label for which the recorded run returned blanks for both identifiers.
# Every lookup stage asks for a whole-label match, so a name that differs from
# the stored label comes back empty rather than resolving to a near match.
resolve_omim_and_orphanet_from_disease("B-cell lymphoblastic leukemia")

{'OMIM': '', 'OrphaNet': ''}