Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/mavedb/lib/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,18 @@ class UniProtPollingEnqueueError(ValueError):
pass


class HGNCGeneNotFoundError(Exception):
    """Signal that HGNC REST returned no gene record for the requested symbol."""


class HGNCServiceError(Exception):
    """Signal that gene information could not be obtained from HGNC REST."""


class LDHSubmissionFailureError(Exception):
"""Raised when submission to ClinGen Linked Data Hub (LDH) fails for all submissions."""

Expand Down
Empty file added src/mavedb/lib/hgnc/__init__.py
Empty file.
99 changes: 99 additions & 0 deletions src/mavedb/lib/hgnc/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import logging
from dataclasses import dataclass
from typing import Optional
from urllib.parse import quote

import requests

from mavedb.lib.exceptions import HGNCGeneNotFoundError, HGNCServiceError
from mavedb.lib.hgnc.constants import HGNC_REST_BASE_URL
from mavedb.lib.logging.context import format_raised_exception_info_as_dict, logging_context, save_to_logging_context

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Headers sent with the HGNC fetch request; asks the service for a JSON body.
HGNC_FETCH_HEADERS = {"Accept": "application/json"}
# Passed as the `timeout` argument of `requests.get` (seconds, per the requests API).
HGNC_FETCH_TIMEOUT = 10


def _optional_doc_string(doc: dict, key: str) -> Optional[str]:
value = doc.get(key)
if value is None:
return None
if not isinstance(value, str):
logger.error(msg=f"HGNC REST response contained malformed {key}.", extra=logging_context())
raise HGNCServiceError("Gene information service temporarily unavailable")
return value


@dataclass
class HGNCGeneInfo:
    """Gene identity fields taken from a single HGNC REST gene document."""

    # Required identity fields.
    symbol: str
    name: str

    # Optional annotations; each one defaults to None when not supplied.
    hgnc_id: Optional[str] = None
    locus_group: Optional[str] = None
    location: Optional[str] = None
    omim_id: Optional[str] = None


def fetch_gene_info(symbol: str) -> HGNCGeneInfo:
    """Look up a gene symbol through the HGNC REST ``fetch/symbol`` endpoint.

    The symbol is percent-encoded into the request path, the JSON response is
    validated step by step, and the first returned gene document is converted
    into an ``HGNCGeneInfo``.

    Args:
        symbol: Gene symbol to look up.

    Returns:
        Gene identity built from the first document in the response. When that
        document has no ``symbol`` field, the requested symbol is used instead.
        ``omim_id`` is the first entry of the document's OMIM identifier list,
        or ``None`` when that list is absent or empty.

    Raises:
        HGNCGeneNotFoundError: If the response contains an empty document list.
        HGNCServiceError: If the request fails, the body is not valid JSON, or
            the response does not have the expected structure.
    """
    # safe="" also encodes "/" so the symbol can never add path segments.
    quoted_symbol = quote(symbol, safe="")
    url = f"{HGNC_REST_BASE_URL}/fetch/symbol/{quoted_symbol}"
    save_to_logging_context({"hgnc_symbol": symbol})

    try:
        response = requests.get(url, headers=HGNC_FETCH_HEADERS, timeout=HGNC_FETCH_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        save_to_logging_context(format_raised_exception_info_as_dict(exc))
        logger.error(msg="HGNC REST request failed.", exc_info=exc, extra=logging_context())
        raise HGNCServiceError("Gene information service temporarily unavailable") from exc

    # Decode in a separate try block: recent requests releases raise a JSON decode
    # error that is both a RequestException and a ValueError, so sharing the block
    # above would report an invalid body as a failed request.
    try:
        data = response.json()
    except ValueError as exc:
        save_to_logging_context(format_raised_exception_info_as_dict(exc))
        logger.error(msg="HGNC REST returned invalid JSON.", exc_info=exc, extra=logging_context())
        raise HGNCServiceError("Gene information service temporarily unavailable") from exc

    if not isinstance(data, dict):
        logger.error(msg="HGNC REST response was not a JSON object.", extra=logging_context())
        raise HGNCServiceError("Gene information service temporarily unavailable")

    response_data = data.get("response")
    if not isinstance(response_data, dict):
        logger.error(msg="HGNC REST response did not contain expected response object.", extra=logging_context())
        raise HGNCServiceError("Gene information service temporarily unavailable")

    docs = response_data.get("docs")
    if not isinstance(docs, list):
        logger.error(msg="HGNC REST response did not contain expected docs list.", extra=logging_context())
        raise HGNCServiceError("Gene information service temporarily unavailable")

    # A well-formed response with no documents means the symbol is unknown.
    if not docs:
        raise HGNCGeneNotFoundError(f"Gene symbol not found: {symbol}")

    doc = docs[0]
    if not isinstance(doc, dict):
        logger.error(msg="HGNC REST response contained a malformed gene document.", extra=logging_context())
        raise HGNCServiceError("Gene information service temporarily unavailable")

    doc_symbol = doc.get("symbol", symbol)
    doc_name = doc.get("name")
    if not isinstance(doc_symbol, str) or not doc_symbol or not isinstance(doc_name, str) or not doc_name:
        logger.error(msg="HGNC REST response contained invalid gene identity fields.", extra=logging_context())
        raise HGNCServiceError("Gene information service temporarily unavailable")

    # A missing or empty OMIM field is fine; a present one must be a list whose
    # first entry is a string. The list check short-circuits before indexing.
    omim_ids = doc.get("omim_id") or []
    if omim_ids and not (isinstance(omim_ids, list) and isinstance(omim_ids[0], str)):
        logger.error(msg="HGNC REST response contained malformed OMIM identifiers.", extra=logging_context())
        raise HGNCServiceError("Gene information service temporarily unavailable")

    return HGNCGeneInfo(
        symbol=doc_symbol,
        name=doc_name,
        hgnc_id=_optional_doc_string(doc, "hgnc_id"),
        locus_group=_optional_doc_string(doc, "locus_group"),
        location=_optional_doc_string(doc, "location"),
        omim_id=omim_ids[0] if omim_ids else None,
    )
1 change: 1 addition & 0 deletions src/mavedb/lib/hgnc/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Base URL of the HGNC REST service; request paths are appended to it.
HGNC_REST_BASE_URL = "https://rest.genenames.org"
159 changes: 159 additions & 0 deletions src/mavedb/routers/genes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import logging
from typing import Any

from fastapi import APIRouter, Depends, Query
from sqlalchemy import func
from sqlalchemy.orm import Session, joinedload, selectinload
from starlette.convertors import Convertor, register_url_convertor

from mavedb import deps
from mavedb.lib.experiments import enrich_experiment_with_num_score_sets
from mavedb.lib.hgnc.client import fetch_gene_info
from mavedb.lib.logging import LoggedRoute
from mavedb.lib.logging.context import logging_context, save_to_logging_context
from mavedb.models.ensembl_offset import EnsemblOffset
from mavedb.models.experiment import Experiment
from mavedb.models.experiment_controlled_keyword import ExperimentControlledKeywordAssociation
from mavedb.models.experiment_publication_identifier import ExperimentPublicationIdentifierAssociation
from mavedb.models.refseq_offset import RefseqOffset
from mavedb.models.score_set import ScoreSet
from mavedb.models.score_set_publication_identifier import ScoreSetPublicationIdentifierAssociation
from mavedb.models.target_gene import TargetGene
from mavedb.models.target_sequence import TargetSequence
from mavedb.models.uniprot_offset import UniprotOffset
from mavedb.routers.shared import GATEWAY_ERROR_RESPONSES, PUBLIC_ERROR_RESPONSES, ROUTER_BASE_PREFIX
from mavedb.view_models.gene import GeneResponse
from mavedb.view_models.score_set import ShortScoreSet

# Tag applied to this router's routes and used as the name of its metadata entry.
TAG_NAME = "Genes"
logger = logging.getLogger(__name__)

# Upper bound enforced on the `limit` query parameter of the gene endpoint.
GENE_SCORE_SETS_MAX_LIMIT = 100


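# See the equivalent pattern in publication_identifiers.py for context on this approach.
# HGNC-approved symbols consist of Latin letters and Arabic numerals, with hyphens
# allowed for specific gene groups (e.g. HLA-A). Most symbols are uppercase, but the
# pattern below also admits lowercase letters, which appear in some approved symbols
# (e.g. C9orf72).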
class GeneSymbolConverter(Convertor):
    """Starlette path convertor for gene symbols; matched values stay plain strings."""

    # A leading letter followed by any mix of letters, digits, and hyphens.
    regex = r"[A-Za-z][A-Za-z0-9-]*"

    def convert(self, value: str) -> str:
        """Return the matched path segment unchanged."""
        return value

    def to_string(self, value: str) -> str:
        """Return ``value`` converted to ``str``."""
        return str(value)


# Make the convertor available to route paths under the name "gene_symbol"
# (used below as "{symbol:gene_symbol}").
register_url_convertor("gene_symbol", GeneSymbolConverter())

# Router for the gene endpoints, mounted under the shared base prefix.
router = APIRouter(
prefix=f"{ROUTER_BASE_PREFIX}/genes",
tags=[TAG_NAME],
responses={**PUBLIC_ERROR_RESPONSES, **GATEWAY_ERROR_RESPONSES},
route_class=LoggedRoute,
)

# Tag metadata for this router; server_main lists it with the other routers' metadata.
metadata = {
"name": TAG_NAME,
"description": "Retrieve gene identity and associated public MaveDB score sets.",
}


def _gene_score_set_base_query(db: Session, symbol: str):
    """Build the query selecting the score sets reported for a gene symbol.

    A score set matches when all of the following hold:

    - at least one of its target genes has ``mapped_hgnc_name`` equal to ``symbol``;
    - it is not private;
    - it has a published date;
    - it has no superseding score set that itself has a published date.

    Args:
        db: Session used to create the query.
        symbol: Gene symbol compared against ``TargetGene.mapped_hgnc_name``.

    Returns:
        The filtered ``ScoreSet`` query, with no ordering or paging applied.
    """
    targets_gene = ScoreSet.target_genes.any(TargetGene.mapped_hgnc_name == symbol)
    is_public = ScoreSet.private.is_(False)
    is_published = ScoreSet.published_date.isnot(None)
    # Exclude score sets that have been replaced by a published successor.
    not_superseded = ~ScoreSet.superseding_score_set.has(ScoreSet.published_date.isnot(None))

    return db.query(ScoreSet).filter(targets_gene, is_public, is_published, not_superseded)


def _score_set_load_options():
    """Return the eager-loading options applied to the gene endpoint's score set query.

    The options cover each score set's experiment (with its own related
    collections), license, DOI and publication identifiers, and target genes
    (with their offsets, target sequence taxonomy, and target accession).

    Returns:
        A tuple of loader options suitable for unpacking into ``Query.options``.
    """
    experiment_loader = selectinload(ScoreSet.experiment).options(
        selectinload(Experiment.experiment_set),
        selectinload(Experiment.keyword_objs).joinedload(ExperimentControlledKeywordAssociation.controlled_keyword),
        selectinload(Experiment.created_by),
        selectinload(Experiment.modified_by),
        selectinload(Experiment.doi_identifiers),
        selectinload(Experiment.publication_identifier_associations).joinedload(
            ExperimentPublicationIdentifierAssociation.publication
        ),
        selectinload(Experiment.raw_read_identifiers),
        selectinload(Experiment.score_sets),
        selectinload(Experiment.official_collections),
    )
    publication_loader = selectinload(ScoreSet.publication_identifier_associations).joinedload(
        ScoreSetPublicationIdentifierAssociation.publication
    )
    target_gene_loader = selectinload(ScoreSet.target_genes).options(
        joinedload(TargetGene.ensembl_offset).joinedload(EnsemblOffset.identifier),
        joinedload(TargetGene.refseq_offset).joinedload(RefseqOffset.identifier),
        joinedload(TargetGene.uniprot_offset).joinedload(UniprotOffset.identifier),
        joinedload(TargetGene.target_sequence).joinedload(TargetSequence.taxonomy),
        joinedload(TargetGene.target_accession),
    )

    return (
        experiment_loader,
        selectinload(ScoreSet.license),
        selectinload(ScoreSet.doi_identifiers),
        publication_loader,
        target_gene_loader,
    )


@router.get(
    "/{symbol:gene_symbol}",
    status_code=200,
    response_model=GeneResponse,
    response_model_exclude_none=True,
    summary="Fetch a gene and associated published score sets",
)
def get_gene(
    symbol: str,
    limit: int = Query(
        default=20,
        ge=1,
        le=GENE_SCORE_SETS_MAX_LIMIT,
        description=f"Number of score sets to return (maximum {GENE_SCORE_SETS_MAX_LIMIT}).",
    ),
    offset: int = Query(default=0, ge=0, description="Number of score sets to skip."),
    db: Session = Depends(deps.get_db),
) -> Any:
    # Documented with comments rather than a docstring: FastAPI publishes a route
    # function's docstring as the operation description.
    #
    # Resolves `symbol` through HGNC, then returns the gene identity together with
    # one page of matching score sets and totals computed over all matches.
    save_to_logging_context({"requested_resource": "gene", "hgnc_symbol": symbol, "limit": limit, "offset": offset})

    # Errors raised by the HGNC lookup propagate out of this function unchanged.
    gene_info = fetch_gene_info(symbol)

    # Score sets are matched on the symbol reported by HGNC, not the raw path value.
    matching = _gene_score_set_base_query(db, gene_info.symbol)

    # Totals describe every matching score set, so ordering and paging are cleared.
    unpaged = matching.order_by(None).limit(None).offset(None)
    total = unpaged.count()
    total_scored_variants = unpaged.with_entities(
        func.coalesce(func.sum(ScoreSet.num_variants), 0)
    ).scalar()

    # Newest publications first; the URN breaks ties so paging stays stable.
    page = (
        matching.options(*_score_set_load_options())
        .order_by(ScoreSet.published_date.desc(), ScoreSet.urn.desc())
        .offset(offset)
        .limit(limit)
        .all()
    )
    logger.debug(msg=f"Gene endpoint yielded {len(page)} score sets.", extra=logging_context())

    def _as_short_score_set(score_set):
        # Enrich the experiment first, then swap it into the validated view model.
        enriched_experiment = enrich_experiment_with_num_score_sets(score_set.experiment, None)
        return ShortScoreSet.model_validate(score_set).model_copy(update={"experiment": enriched_experiment})

    response_score_sets = [_as_short_score_set(score_set) for score_set in page]

    return GeneResponse(
        symbol=gene_info.symbol,
        name=gene_info.name,
        hgnc_id=gene_info.hgnc_id,
        locus_group=gene_info.locus_group,
        location=gene_info.location,
        omim_id=gene_info.omim_id,
        score_sets=response_score_sets,
        limit=limit,
        offset=offset,
        total=total,
        total_scored_variants=total_scored_variants,
    )
21 changes: 21 additions & 0 deletions src/mavedb/server_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from mavedb import __version__
from mavedb.lib.exceptions import (
AmbiguousIdentifierError,
HGNCGeneNotFoundError,
HGNCServiceError,
MixedTargetError,
NonexistentIdentifierError,
)
Expand All @@ -44,6 +46,7 @@
doi_identifiers,
experiment_sets,
experiments,
genes,
hgvs,
job_runs,
licenses,
Expand Down Expand Up @@ -95,6 +98,7 @@
app.include_router(doi_identifiers.router)
app.include_router(experiment_sets.router)
app.include_router(experiments.router)
app.include_router(genes.router)
app.include_router(hgvs.router)
app.include_router(job_runs.router)
app.include_router(licenses.router)
Expand Down Expand Up @@ -153,6 +157,22 @@ async def nonexistent_identifier_error_exception_handler(request: Request, exc:
return response


@app.exception_handler(HGNCGeneNotFoundError)
async def hgnc_gene_not_found_error_exception_handler(request: Request, exc: HGNCGeneNotFoundError):
    """Answer an ``HGNCGeneNotFoundError`` with a 404 carrying the exception message."""
    # Record the exception details before the request is logged.
    save_to_logging_context(format_raised_exception_info_as_dict(exc))

    not_found = JSONResponse(status_code=404, content={"message": str(exc)})
    log_request(request, not_found, time.time_ns())
    return not_found


@app.exception_handler(HGNCServiceError)
async def hgnc_service_error_exception_handler(request: Request, exc: HGNCServiceError):
    """Answer an ``HGNCServiceError`` with a 503 and a fixed message.

    The response body does not include the exception's own message.
    """
    # Record the exception details before the request is logged.
    save_to_logging_context(format_raised_exception_info_as_dict(exc))

    unavailable = JSONResponse(
        status_code=503,
        content={"message": "Gene information service temporarily unavailable"},
    )
    log_request(request, unavailable, time.time_ns())
    return unavailable


@app.exception_handler(EutilsRequestError)
async def nonexistent_pmid_error_exception_handler(request: Request, exc: EutilsRequestError):
response = JSONResponse(status_code=404, content={"message": str(exc)})
Expand Down Expand Up @@ -226,6 +246,7 @@ def customize_openapi_schema():
doi_identifiers.metadata,
experiment_sets.metadata,
experiments.metadata,
genes.metadata,
hgvs.metadata,
licenses.metadata,
# log.metadata,
Expand Down
21 changes: 21 additions & 0 deletions src/mavedb/view_models/gene.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from typing import Optional

from mavedb.view_models.base.base import BaseModel
from mavedb.view_models.score_set import ShortScoreSet


class GeneResponse(BaseModel):
    # Response body of the gene endpoint: gene identity fields plus one page of
    # score sets. Kept as comments rather than a docstring; a model docstring would
    # presumably surface as the schema description (confirm against the base model).

    # Gene identity.
    symbol: str
    name: str
    hgnc_id: Optional[str] = None
    locus_group: Optional[str] = None
    location: Optional[str] = None
    omim_id: Optional[str] = None

    # The requested page of score sets and the paging values that produced it.
    score_sets: list[ShortScoreSet]
    limit: int
    offset: int

    # Totals over all matching score sets, not just the returned page.
    total: int
    total_scored_variants: int

    class Config:
        from_attributes = True
Empty file added tests/lib/hgnc/__init__.py
Empty file.
Empty file.
32 changes: 32 additions & 0 deletions tests/lib/hgnc/network/test_hgnc_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# ruff: noqa: E402
"""Network tests for the HGNC REST client. Require a live connection to rest.genenames.org."""

import pytest

# starlette is required for logging context functionality.
pytest.importorskip("starlette")

from mavedb.lib.exceptions import HGNCGeneNotFoundError
from mavedb.lib.hgnc.client import HGNCGeneInfo, fetch_gene_info


@pytest.mark.network
class TestFetchGeneInfoNetwork:
    """Checks of ``fetch_gene_info`` that call the live HGNC service."""

    @staticmethod
    def _assert_resolved(result, expected_symbol):
        # Assertions shared by every lookup that is expected to succeed.
        assert isinstance(result, HGNCGeneInfo)
        assert result.symbol == expected_symbol

    def test_known_gene_returns_expected_fields(self):
        gene = fetch_gene_info("BRCA1")

        self._assert_resolved(gene, "BRCA1")
        assert gene.name
        assert gene.hgnc_id and gene.hgnc_id.startswith("HGNC:")

    def test_hyphen_containing_symbol_resolves(self):
        gene = fetch_gene_info("HLA-A")

        self._assert_resolved(gene, "HLA-A")
        assert gene.hgnc_id and gene.hgnc_id.startswith("HGNC:")

    def test_unknown_symbol_raises_not_found(self):
        with pytest.raises(HGNCGeneNotFoundError):
            fetch_gene_info("NOTAREALSYMBOL99999")
Loading
Loading