diff --git a/src/mavedb/lib/exceptions.py b/src/mavedb/lib/exceptions.py index 416ff8b3..8b8633f1 100644 --- a/src/mavedb/lib/exceptions.py +++ b/src/mavedb/lib/exceptions.py @@ -210,6 +210,18 @@ class UniProtPollingEnqueueError(ValueError): pass +class HGNCGeneNotFoundError(Exception): + """Raised when HGNC REST does not recognize a gene symbol.""" + + pass + + +class HGNCServiceError(Exception): + """Raised when HGNC REST cannot provide gene information.""" + + pass + + class LDHSubmissionFailureError(Exception): """Raised when submission to ClinGen Linked Data Hub (LDH) fails for all submissions.""" diff --git a/src/mavedb/lib/hgnc/__init__.py b/src/mavedb/lib/hgnc/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/mavedb/lib/hgnc/client.py b/src/mavedb/lib/hgnc/client.py new file mode 100644 index 00000000..5a1636ea --- /dev/null +++ b/src/mavedb/lib/hgnc/client.py @@ -0,0 +1,99 @@ +import logging +from dataclasses import dataclass +from typing import Optional +from urllib.parse import quote + +import requests + +from mavedb.lib.exceptions import HGNCGeneNotFoundError, HGNCServiceError +from mavedb.lib.hgnc.constants import HGNC_REST_BASE_URL +from mavedb.lib.logging.context import format_raised_exception_info_as_dict, logging_context, save_to_logging_context + +logger = logging.getLogger(__name__) + +HGNC_FETCH_HEADERS = {"Accept": "application/json"} +HGNC_FETCH_TIMEOUT = 10 + + +def _optional_doc_string(doc: dict, key: str) -> Optional[str]: + value = doc.get(key) + if value is None: + return None + if not isinstance(value, str): + logger.error(msg=f"HGNC REST response contained malformed {key}.", extra=logging_context()) + raise HGNCServiceError("Gene information service temporarily unavailable") + return value + + +@dataclass +class HGNCGeneInfo: + symbol: str + name: str + hgnc_id: Optional[str] = None + locus_group: Optional[str] = None + location: Optional[str] = None + omim_id: Optional[str] = None + + +def fetch_gene_info(symbol: str) -> HGNCGeneInfo: + quoted_symbol = quote(symbol, safe="") + url = f"{HGNC_REST_BASE_URL}/fetch/symbol/{quoted_symbol}" + save_to_logging_context({"hgnc_symbol": symbol}) + + try: + response = requests.get(url, headers=HGNC_FETCH_HEADERS, timeout=HGNC_FETCH_TIMEOUT) + response.raise_for_status() + data = response.json() + except requests.exceptions.RequestException as exc: + save_to_logging_context(format_raised_exception_info_as_dict(exc)) + logger.error(msg="HGNC REST request failed.", exc_info=exc, extra=logging_context()) + raise HGNCServiceError("Gene information service temporarily unavailable") from exc + except ValueError as exc: + save_to_logging_context(format_raised_exception_info_as_dict(exc)) + logger.error(msg="HGNC REST returned invalid JSON.", exc_info=exc, extra=logging_context()) + raise HGNCServiceError("Gene information service temporarily unavailable") from exc + + if not isinstance(data, dict): + logger.error(msg="HGNC REST response was not a JSON object.", extra=logging_context()) + raise HGNCServiceError("Gene information service temporarily unavailable") + + response_data = data.get("response") + if not isinstance(response_data, dict): + logger.error(msg="HGNC REST response did not contain expected response object.", extra=logging_context()) + raise HGNCServiceError("Gene information service temporarily unavailable") + + docs = response_data.get("docs") + if not isinstance(docs, list): + logger.error(msg="HGNC REST response did not contain expected docs list.", extra=logging_context()) + raise HGNCServiceError("Gene information service temporarily unavailable") + + if not docs: + raise HGNCGeneNotFoundError(f"Gene symbol not found: {symbol}") + + doc = docs[0] + if not isinstance(doc, dict): + logger.error(msg="HGNC REST response contained a malformed gene document.", extra=logging_context()) + raise HGNCServiceError("Gene information service temporarily unavailable") + + doc_symbol = doc.get("symbol", symbol) + doc_name = doc.get("name") + if not isinstance(doc_symbol, str) or not doc_symbol or not isinstance(doc_name, str) or not doc_name: + logger.error(msg="HGNC REST response contained invalid gene identity fields.", extra=logging_context()) + raise HGNCServiceError("Gene information service temporarily unavailable") + + omim_ids = doc.get("omim_id") or [] + if omim_ids and not isinstance(omim_ids, list): + logger.error(msg="HGNC REST response contained malformed OMIM identifiers.", extra=logging_context()) + raise HGNCServiceError("Gene information service temporarily unavailable") + if omim_ids and not isinstance(omim_ids[0], str): + logger.error(msg="HGNC REST response contained malformed OMIM identifiers.", extra=logging_context()) + raise HGNCServiceError("Gene information service temporarily unavailable") + + return HGNCGeneInfo( + symbol=doc_symbol, + name=doc_name, + hgnc_id=_optional_doc_string(doc, "hgnc_id"), + locus_group=_optional_doc_string(doc, "locus_group"), + location=_optional_doc_string(doc, "location"), + omim_id=omim_ids[0] if omim_ids else None, + ) diff --git a/src/mavedb/lib/hgnc/constants.py b/src/mavedb/lib/hgnc/constants.py new file mode 100644 index 00000000..d3883db8 --- /dev/null +++ b/src/mavedb/lib/hgnc/constants.py @@ -0,0 +1 @@ +HGNC_REST_BASE_URL = "https://rest.genenames.org" diff --git a/src/mavedb/routers/genes.py b/src/mavedb/routers/genes.py new file mode 100644 index 00000000..5e6b10a0 --- /dev/null +++ b/src/mavedb/routers/genes.py @@ -0,0 +1,159 @@ +import logging +from typing import Any + +from fastapi import APIRouter, Depends, Query +from sqlalchemy import func +from sqlalchemy.orm import Session, joinedload, selectinload +from starlette.convertors import Convertor, register_url_convertor + +from mavedb import deps +from mavedb.lib.experiments import enrich_experiment_with_num_score_sets +from mavedb.lib.hgnc.client import fetch_gene_info +from mavedb.lib.logging import LoggedRoute +from mavedb.lib.logging.context import logging_context, save_to_logging_context +from mavedb.models.ensembl_offset import EnsemblOffset +from mavedb.models.experiment import Experiment +from mavedb.models.experiment_controlled_keyword import ExperimentControlledKeywordAssociation +from mavedb.models.experiment_publication_identifier import ExperimentPublicationIdentifierAssociation +from mavedb.models.refseq_offset import RefseqOffset +from mavedb.models.score_set import ScoreSet +from mavedb.models.score_set_publication_identifier import ScoreSetPublicationIdentifierAssociation +from mavedb.models.target_gene import TargetGene +from mavedb.models.target_sequence import TargetSequence +from mavedb.models.uniprot_offset import UniprotOffset +from mavedb.routers.shared import GATEWAY_ERROR_RESPONSES, PUBLIC_ERROR_RESPONSES, ROUTER_BASE_PREFIX +from mavedb.view_models.gene import GeneResponse +from mavedb.view_models.score_set import ShortScoreSet + +TAG_NAME = "Genes" +logger = logging.getLogger(__name__) + +GENE_SCORE_SETS_MAX_LIMIT = 100 + + +# See the equivalent pattern in publication_identifiers.py for context on this approach. +# HGNC-approved symbols contain uppercase Latin letters and Arabic numerals, with hyphens +# allowed for specific gene groups (e.g. HLA-A, BRCA1). +class GeneSymbolConverter(Convertor): + regex = r"[A-Za-z][A-Za-z0-9-]*" + + def convert(self, value: str) -> str: + return value + + def to_string(self, value: str) -> str: + return str(value) + + +register_url_convertor("gene_symbol", GeneSymbolConverter()) + +router = APIRouter( + prefix=f"{ROUTER_BASE_PREFIX}/genes", + tags=[TAG_NAME], + responses={**PUBLIC_ERROR_RESPONSES, **GATEWAY_ERROR_RESPONSES}, + route_class=LoggedRoute, +) + +metadata = { + "name": TAG_NAME, + "description": "Retrieve gene identity and associated public MaveDB score sets.", +} + + +def _gene_score_set_base_query(db: Session, symbol: str): + return db.query(ScoreSet).filter( + ScoreSet.target_genes.any(TargetGene.mapped_hgnc_name == symbol), + ScoreSet.private.is_(False), + ScoreSet.published_date.isnot(None), + ~ScoreSet.superseding_score_set.has(ScoreSet.published_date.isnot(None)), + ) + + +def _score_set_load_options(): + return ( + selectinload(ScoreSet.experiment).options( + selectinload(Experiment.experiment_set), + selectinload(Experiment.keyword_objs).joinedload(ExperimentControlledKeywordAssociation.controlled_keyword), + selectinload(Experiment.created_by), + selectinload(Experiment.modified_by), + selectinload(Experiment.doi_identifiers), + selectinload(Experiment.publication_identifier_associations).joinedload( + ExperimentPublicationIdentifierAssociation.publication + ), + selectinload(Experiment.raw_read_identifiers), + selectinload(Experiment.score_sets), + selectinload(Experiment.official_collections), + ), + selectinload(ScoreSet.license), + selectinload(ScoreSet.doi_identifiers), + selectinload(ScoreSet.publication_identifier_associations).joinedload( + ScoreSetPublicationIdentifierAssociation.publication + ), + selectinload(ScoreSet.target_genes).options( + joinedload(TargetGene.ensembl_offset).joinedload(EnsemblOffset.identifier), + joinedload(TargetGene.refseq_offset).joinedload(RefseqOffset.identifier), + joinedload(TargetGene.uniprot_offset).joinedload(UniprotOffset.identifier), + joinedload(TargetGene.target_sequence).joinedload(TargetSequence.taxonomy), + joinedload(TargetGene.target_accession), + ), + ) + + +@router.get( + "/{symbol:gene_symbol}", + status_code=200, + response_model=GeneResponse, + response_model_exclude_none=True, + summary="Fetch a gene and associated published score sets", +) +def get_gene( + symbol: str, + limit: int = Query( + default=20, + ge=1, + le=GENE_SCORE_SETS_MAX_LIMIT, + description=f"Number of score sets to return (maximum {GENE_SCORE_SETS_MAX_LIMIT}).", + ), + offset: int = Query(default=0, ge=0, description="Number of score sets to skip."), + db: Session = Depends(deps.get_db), +) -> Any: + save_to_logging_context({"requested_resource": "gene", "hgnc_symbol": symbol, "limit": limit, "offset": offset}) + gene_info = fetch_gene_info(symbol) + + base_query = _gene_score_set_base_query(db, gene_info.symbol) + total = base_query.order_by(None).limit(None).offset(None).count() + total_scored_variants = ( + base_query.order_by(None) + .limit(None) + .offset(None) + .with_entities(func.coalesce(func.sum(ScoreSet.num_variants), 0)) + .scalar() + ) + score_sets = ( + base_query.options(*_score_set_load_options()) + .order_by(ScoreSet.published_date.desc(), ScoreSet.urn.desc()) + .offset(offset) + .limit(limit) + .all() + ) + + logger.debug(msg=f"Gene endpoint yielded {len(score_sets)} score sets.", extra=logging_context()) + response_score_sets = [] + for score_set in score_sets: + enriched_experiment = enrich_experiment_with_num_score_sets(score_set.experiment, None) + response_score_sets.append( + ShortScoreSet.model_validate(score_set).model_copy(update={"experiment": enriched_experiment}) + ) + + return GeneResponse( + symbol=gene_info.symbol, + name=gene_info.name, + hgnc_id=gene_info.hgnc_id, + locus_group=gene_info.locus_group, + location=gene_info.location, + omim_id=gene_info.omim_id, + score_sets=response_score_sets, + limit=limit, + offset=offset, + total=total, + total_scored_variants=total_scored_variants, + ) diff --git a/src/mavedb/server_main.py b/src/mavedb/server_main.py index c8296555..15071488 100644 --- a/src/mavedb/server_main.py +++ b/src/mavedb/server_main.py @@ -22,6 +22,8 @@ from mavedb import __version__ from mavedb.lib.exceptions import ( AmbiguousIdentifierError, + HGNCGeneNotFoundError, + HGNCServiceError, MixedTargetError, NonexistentIdentifierError, ) @@ -44,6 +46,7 @@ doi_identifiers, experiment_sets, experiments, + genes, hgvs, job_runs, licenses, @@ -95,6 +98,7 @@ app.include_router(doi_identifiers.router) app.include_router(experiment_sets.router) app.include_router(experiments.router) +app.include_router(genes.router) app.include_router(hgvs.router) app.include_router(job_runs.router) app.include_router(licenses.router) @@ -153,6 +157,22 @@ async def nonexistent_identifier_error_exception_handler(request: Request, exc: return response +@app.exception_handler(HGNCGeneNotFoundError) +async def hgnc_gene_not_found_error_exception_handler(request: Request, exc: HGNCGeneNotFoundError): + response = JSONResponse(status_code=404, content={"message": str(exc)}) + save_to_logging_context(format_raised_exception_info_as_dict(exc)) + log_request(request, response, time.time_ns()) + return response + + +@app.exception_handler(HGNCServiceError) +async def hgnc_service_error_exception_handler(request: Request, exc: HGNCServiceError): + response = JSONResponse(status_code=503, content={"message": "Gene information service temporarily unavailable"}) + save_to_logging_context(format_raised_exception_info_as_dict(exc)) + log_request(request, response, time.time_ns()) + return response + + @app.exception_handler(EutilsRequestError) async def nonexistent_pmid_error_exception_handler(request: Request, exc: EutilsRequestError): response = JSONResponse(status_code=404, content={"message": str(exc)}) @@ -226,6 +246,7 @@ def customize_openapi_schema(): doi_identifiers.metadata, experiment_sets.metadata, experiments.metadata, + genes.metadata, hgvs.metadata, licenses.metadata, # log.metadata, diff --git a/src/mavedb/view_models/gene.py b/src/mavedb/view_models/gene.py new file mode 100644 index 00000000..c4917236 --- /dev/null +++ b/src/mavedb/view_models/gene.py @@ -0,0 +1,21 @@ +from typing import Optional + +from mavedb.view_models.base.base import BaseModel +from mavedb.view_models.score_set import ShortScoreSet + + +class GeneResponse(BaseModel): + symbol: str + name: str + hgnc_id: Optional[str] = None + locus_group: Optional[str] = None + location: Optional[str] = None + omim_id: Optional[str] = None + score_sets: list[ShortScoreSet] + limit: int + offset: int + total: int + total_scored_variants: int + + class Config: + from_attributes = True diff --git a/tests/lib/hgnc/__init__.py b/tests/lib/hgnc/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/lib/hgnc/network/__init__.py b/tests/lib/hgnc/network/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/lib/hgnc/network/test_hgnc_client.py b/tests/lib/hgnc/network/test_hgnc_client.py new file mode 100644 index 00000000..4a8df303 --- /dev/null +++ b/tests/lib/hgnc/network/test_hgnc_client.py @@ -0,0 +1,32 @@ +# ruff: noqa: E402 +"""Network tests for the HGNC REST client. Require a live connection to rest.genenames.org.""" + +import pytest + +# starlette is required for logging context functionality. +pytest.importorskip("starlette") + +from mavedb.lib.exceptions import HGNCGeneNotFoundError +from mavedb.lib.hgnc.client import HGNCGeneInfo, fetch_gene_info + + +@pytest.mark.network +class TestFetchGeneInfoNetwork: + def test_known_gene_returns_expected_fields(self): + result = fetch_gene_info("BRCA1") + + assert isinstance(result, HGNCGeneInfo) + assert result.symbol == "BRCA1" + assert result.name + assert result.hgnc_id and result.hgnc_id.startswith("HGNC:") + + def test_hyphen_containing_symbol_resolves(self): + result = fetch_gene_info("HLA-A") + + assert isinstance(result, HGNCGeneInfo) + assert result.symbol == "HLA-A" + assert result.hgnc_id and result.hgnc_id.startswith("HGNC:") + + def test_unknown_symbol_raises_not_found(self): + with pytest.raises(HGNCGeneNotFoundError): + fetch_gene_info("NOTAREALSYMBOL99999") diff --git a/tests/lib/hgnc/test_hgnc_client.py b/tests/lib/hgnc/test_hgnc_client.py new file mode 100644 index 00000000..d65bdd7a --- /dev/null +++ b/tests/lib/hgnc/test_hgnc_client.py @@ -0,0 +1,201 @@ +# ruff: noqa: E402 + +import pytest + +# starlette is required for logging context functionality. +pytest.importorskip("starlette") + +from unittest import mock + +import requests + +from mavedb.lib.exceptions import HGNCGeneNotFoundError, HGNCServiceError +from mavedb.lib.hgnc.client import HGNCGeneInfo, fetch_gene_info + + +def _valid_doc(symbol="BRCA1"): + return { + "symbol": symbol, + "name": "BRCA1 DNA repair associated", + "hgnc_id": "HGNC:1100", + "locus_group": "protein-coding gene", + "location": "17q21.31", + "omim_id": ["113705"], + } + + +def _valid_response(symbol="BRCA1"): + return { + "response": { + "numFound": 1, + "docs": [_valid_doc(symbol)], + } + } + + +@pytest.mark.unit +@mock.patch("mavedb.lib.hgnc.client.requests.get") +class TestFetchGeneInfo: + def test_valid_response_returns_gene_info(self, mock_get): + mock_response = mock.Mock() + mock_response.json.return_value = _valid_response() + mock_get.return_value = mock_response + + result = fetch_gene_info("BRCA1") + + assert isinstance(result, HGNCGeneInfo) + assert result.symbol == "BRCA1" + assert result.name == "BRCA1 DNA repair associated" + assert result.hgnc_id == "HGNC:1100" + assert result.locus_group == "protein-coding gene" + assert result.location == "17q21.31" + assert result.omim_id == "113705" + + def test_optional_fields_absent_returns_none(self, mock_get): + doc = {"symbol": "BRCA1", "name": "BRCA1 DNA repair associated"} + mock_response = mock.Mock() + mock_response.json.return_value = {"response": {"numFound": 1, "docs": [doc]}} + mock_get.return_value = mock_response + + result = fetch_gene_info("BRCA1") + + assert result.hgnc_id is None + assert result.locus_group is None + assert result.location is None + assert result.omim_id is None + + def test_hyphen_symbol_is_preserved_in_request_url(self, mock_get): + mock_response = mock.Mock() + mock_response.json.return_value = _valid_response("HLA-A") + mock_get.return_value = mock_response + + fetch_gene_info("HLA-A") + + call_url = mock_get.call_args[0][0] + assert "HLA-A" in call_url + + def test_empty_docs_raises_not_found(self, mock_get): + mock_response = mock.Mock() + mock_response.json.return_value = {"response": {"numFound": 0, "docs": []}} + mock_get.return_value = mock_response + + with pytest.raises(HGNCGeneNotFoundError): + fetch_gene_info("NOTAREAL") + + def test_http_error_raises_service_error(self, mock_get): + mock_response = mock.Mock() + mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError("503 Service Unavailable") + mock_get.return_value = mock_response + + with pytest.raises(HGNCServiceError): + fetch_gene_info("BRCA1") + + def test_connection_error_raises_service_error(self, mock_get): + mock_get.side_effect = requests.exceptions.ConnectionError("Connection refused") + + with pytest.raises(HGNCServiceError): + fetch_gene_info("BRCA1") + + def test_invalid_json_raises_service_error(self, mock_get): + mock_response = mock.Mock() + mock_response.json.side_effect = ValueError("No JSON object could be decoded") + mock_get.return_value = mock_response + + with pytest.raises(HGNCServiceError): + fetch_gene_info("BRCA1") + + def test_non_dict_top_level_response_raises_service_error(self, mock_get): + mock_response = mock.Mock() + mock_response.json.return_value = ["not", "a", "dict"] + mock_get.return_value = mock_response + + with pytest.raises(HGNCServiceError): + fetch_gene_info("BRCA1") + + def test_missing_response_key_raises_service_error(self, mock_get): + mock_response = mock.Mock() + mock_response.json.return_value = {"unexpected": "shape"} + mock_get.return_value = mock_response + + with pytest.raises(HGNCServiceError): + fetch_gene_info("BRCA1") + + def test_non_dict_response_data_raises_service_error(self, mock_get): + mock_response = mock.Mock() + mock_response.json.return_value = {"response": "not a dict"} + mock_get.return_value = mock_response + + with pytest.raises(HGNCServiceError): + fetch_gene_info("BRCA1") + + def test_non_list_docs_raises_service_error(self, mock_get): + mock_response = mock.Mock() + mock_response.json.return_value = {"response": {"docs": "not a list"}} + mock_get.return_value = mock_response + + with pytest.raises(HGNCServiceError): + fetch_gene_info("BRCA1") + + def test_non_dict_doc_raises_service_error(self, mock_get): + mock_response = mock.Mock() + mock_response.json.return_value = {"response": {"docs": ["not a dict"]}} + mock_get.return_value = mock_response + + with pytest.raises(HGNCServiceError): + fetch_gene_info("BRCA1") + + def test_missing_symbol_key_falls_back_to_input(self, mock_get): + """When the doc lacks a 'symbol' key, the input symbol is used as a fallback.""" + doc = {"name": "BRCA1 DNA repair associated"} + mock_response = mock.Mock() + mock_response.json.return_value = {"response": {"docs": [doc]}} + mock_get.return_value = mock_response + + result = fetch_gene_info("BRCA1") + + assert result.symbol == "BRCA1" + + def test_empty_symbol_raises_service_error(self, mock_get): + doc = {**_valid_doc(), "symbol": ""} + mock_response = mock.Mock() + mock_response.json.return_value = {"response": {"docs": [doc]}} + mock_get.return_value = mock_response + + with pytest.raises(HGNCServiceError): + fetch_gene_info("BRCA1") + + def test_missing_name_raises_service_error(self, mock_get): + doc = {"symbol": "BRCA1"} + mock_response = mock.Mock() + mock_response.json.return_value = {"response": {"docs": [doc]}} + mock_get.return_value = mock_response + + with pytest.raises(HGNCServiceError): + fetch_gene_info("BRCA1") + + def test_non_list_omim_id_raises_service_error(self, mock_get): + doc = {**_valid_doc(), "omim_id": "113705"} + mock_response = mock.Mock() + mock_response.json.return_value = {"response": {"docs": [doc]}} + mock_get.return_value = mock_response + + with pytest.raises(HGNCServiceError): + fetch_gene_info("BRCA1") + + def test_non_string_omim_id_element_raises_service_error(self, mock_get): + doc = {**_valid_doc(), "omim_id": [113705]} + mock_response = mock.Mock() + mock_response.json.return_value = {"response": {"docs": [doc]}} + mock_get.return_value = mock_response + + with pytest.raises(HGNCServiceError): + fetch_gene_info("BRCA1") + + def test_non_string_optional_field_raises_service_error(self, mock_get): + doc = {**_valid_doc(), "locus_group": ["protein-coding gene"]} + mock_response = mock.Mock() + mock_response.json.return_value = {"response": {"docs": [doc]}} + mock_get.return_value = mock_response + + with pytest.raises(HGNCServiceError): + fetch_gene_info("BRCA1") diff --git a/tests/routers/test_genes.py b/tests/routers/test_genes.py new file mode 100644 index 00000000..70463e5a --- /dev/null +++ b/tests/routers/test_genes.py @@ -0,0 +1,246 @@ +# ruff: noqa: E402 + +from copy import deepcopy +from datetime import date +from unittest.mock import patch +from urllib.parse import quote + +import pytest + +pytestmark = pytest.mark.unit + +fastapi = pytest.importorskip("fastapi") + +from mavedb.lib.exceptions import HGNCGeneNotFoundError, HGNCServiceError +from mavedb.lib.hgnc.client import HGNCGeneInfo +from mavedb.models.score_set import ScoreSet as ScoreSetDbModel +from tests.helpers.constants import TEST_MINIMAL_SEQ_SCORESET, VALID_GENE +from tests.helpers.util.experiment import create_experiment +from tests.helpers.util.score_set import create_multi_target_score_set, create_seq_score_set + +INVALID_GENE = "NOTAREAL" +SLASH_GENE = "TRAV29/DV5" + + +def _hgnc_gene_info(symbol=VALID_GENE): + return HGNCGeneInfo( + symbol=symbol, + name="BRCA1 DNA repair associated", + hgnc_id="HGNC:1100", + locus_group="protein-coding gene", + location="17q21.31", + omim_id="113705", + ) + + +def _make_score_set( + client, + session, + *, + symbol=VALID_GENE, + title="Gene endpoint score set", + private=False, + published=True, + num_variants=7, +): + experiment = create_experiment(client, {"title": f"{title} experiment"}) + payload = deepcopy(TEST_MINIMAL_SEQ_SCORESET) + payload["title"] = title + payload["shortDescription"] = f"{title} short description" + score_set = create_seq_score_set(client, experiment["urn"], payload) + + db_score_set = session.query(ScoreSetDbModel).filter(ScoreSetDbModel.urn == score_set["urn"]).one() + db_score_set.private = private + db_score_set.published_date = date(2024, 1, 1) if published else None + db_score_set.num_variants = num_variants + for target_gene in db_score_set.target_genes: + target_gene.mapped_hgnc_name = symbol + session.commit() + return db_score_set.urn + + +def _make_multi_target_score_set(client, session, second_symbol="TP53"): + experiment = create_experiment(client, {"title": "Multi-target experiment"}) + score_set = create_multi_target_score_set(client, experiment["urn"]) + db_score_set = session.query(ScoreSetDbModel).filter(ScoreSetDbModel.urn == score_set["urn"]).one() + db_score_set.private = False + db_score_set.published_date = date(2024, 1, 1) + db_score_set.num_variants = 11 + db_score_set.target_genes[0].mapped_hgnc_name = VALID_GENE + db_score_set.target_genes[1].mapped_hgnc_name = second_symbol + session.commit() + return db_score_set.urn + + +def test_get_gene_valid_with_data(client, session, setup_router_db): + visible_score_set_urn = _make_score_set(client, session, title="BRCA1 visible score set", num_variants=13) + + with patch("mavedb.routers.genes.fetch_gene_info") as mock_fetch: + mock_fetch.return_value = _hgnc_gene_info() + response = client.get(f"/api/v1/genes/{VALID_GENE}") + + mock_fetch.assert_called_once_with(VALID_GENE) + assert response.status_code == 200 + body = response.json() + assert body["symbol"] == VALID_GENE + assert body["name"] == "BRCA1 DNA repair associated" + assert body["hgncId"] == "HGNC:1100" + assert body["locusGroup"] == "protein-coding gene" + assert body["location"] == "17q21.31" + assert body["omimId"] == "113705" + assert body["limit"] == 20 + assert body["offset"] == 0 + assert body["total"] == 1 + assert body["totalScoredVariants"] == 13 + assert body["scoreSets"][0]["urn"] == visible_score_set_urn + assert body["scoreSets"][0]["numVariants"] == 13 + assert body["scoreSets"][0]["targetGenes"][0]["mappedHgncName"] == VALID_GENE + + +def test_get_gene_valid_no_data(client, setup_router_db): + with patch("mavedb.routers.genes.fetch_gene_info") as mock_fetch: + mock_fetch.return_value = _hgnc_gene_info() + response = client.get(f"/api/v1/genes/{VALID_GENE}") + + assert response.status_code == 200 + body = response.json() + assert body["scoreSets"] == [] + assert body["total"] == 0 + assert body["totalScoredVariants"] == 0 + + +def test_get_gene_slash_symbol_returns_404(): + # No approved HGNC symbols currently contain a slash — the GeneSymbolConverter regex + # intentionally excludes them. This test documents that behaviour: a percent-encoded + # slash in the URL path is not routed to the gene handler. + from mavedb.server_main import app + from starlette.testclient import TestClient as StarletteTestClient + + with StarletteTestClient(app) as c: + response = c.get(f"/api/v1/genes/{quote(SLASH_GENE, safe='')}") + + assert response.status_code == 404 + + +def test_get_gene_multi_target_includes_all_target_genes(client, session, setup_router_db): + multi_target_score_set_urn = _make_multi_target_score_set(client, session) + + with patch("mavedb.routers.genes.fetch_gene_info") as mock_fetch: + mock_fetch.return_value = _hgnc_gene_info() + response = client.get(f"/api/v1/genes/{VALID_GENE}") + + assert response.status_code == 200 + score_set = response.json()["scoreSets"][0] + assert score_set["urn"] == multi_target_score_set_urn + assert len(score_set["targetGenes"]) == 2 + assert {target["mappedHgncName"] for target in score_set["targetGenes"]} == {VALID_GENE, "TP53"} + + +def test_get_gene_multi_target_same_gene_counted_once(client, session, setup_router_db): + multi_target_score_set_urn = _make_multi_target_score_set(client, session, second_symbol=VALID_GENE) + + with patch("mavedb.routers.genes.fetch_gene_info") as mock_fetch: + mock_fetch.return_value = _hgnc_gene_info() + response = client.get(f"/api/v1/genes/{VALID_GENE}?limit=1") + + assert response.status_code == 200 + body = response.json() + assert body["total"] == 1 + assert body["totalScoredVariants"] == 11 + assert len(body["scoreSets"]) == 1 + assert body["scoreSets"][0]["urn"] == multi_target_score_set_urn + + +def test_get_gene_invalid_symbol(client, setup_router_db): + with patch("mavedb.routers.genes.fetch_gene_info") as mock_fetch: + mock_fetch.side_effect = HGNCGeneNotFoundError(f"Gene symbol not found: {INVALID_GENE}") + response = client.get(f"/api/v1/genes/{INVALID_GENE}") + + assert response.status_code == 404 + + +def test_get_gene_private_excluded(client, session, setup_router_db): + private_score_set_urn = _make_score_set(client, session, title="BRCA1 private score set", private=True) + + with patch("mavedb.routers.genes.fetch_gene_info") as mock_fetch: + mock_fetch.return_value = _hgnc_gene_info() + response = client.get(f"/api/v1/genes/{VALID_GENE}") + + assert response.status_code == 200 + returned_urns = {score_set["urn"] for score_set in response.json()["scoreSets"]} + assert private_score_set_urn not in returned_urns + assert response.json()["total"] == 0 + assert response.json()["totalScoredVariants"] == 0 + + +def test_get_gene_unpublished_excluded(client, session, setup_router_db): + unpublished_score_set_urn = _make_score_set( + client, session, title="BRCA1 unpublished score set", private=False, published=False + ) + + with patch("mavedb.routers.genes.fetch_gene_info") as mock_fetch: + mock_fetch.return_value = _hgnc_gene_info() + response = client.get(f"/api/v1/genes/{VALID_GENE}") + + assert response.status_code == 200 + returned_urns = {score_set["urn"] for score_set in response.json()["scoreSets"]} + assert unpublished_score_set_urn not in returned_urns + assert response.json()["total"] == 0 + assert response.json()["totalScoredVariants"] == 0 + + +def test_get_gene_superseded_excluded(client, session, setup_router_db): + superseded_urn = _make_score_set(client, session, title="BRCA1 superseded score set", num_variants=3) + superseding_urn = _make_score_set(client, session, title="BRCA1 superseding score set", num_variants=5) + + superseded = session.query(ScoreSetDbModel).filter(ScoreSetDbModel.urn == superseded_urn).one() + superseding = session.query(ScoreSetDbModel).filter(ScoreSetDbModel.urn == superseding_urn).one() + superseding.superseded_score_set_id = superseded.id + session.commit() + + with patch("mavedb.routers.genes.fetch_gene_info") as mock_fetch: + mock_fetch.return_value = _hgnc_gene_info() + response = client.get(f"/api/v1/genes/{VALID_GENE}") + + assert response.status_code == 200 + body = response.json() + returned_urns = {ss["urn"] for ss in body["scoreSets"]} + assert superseded_urn not in returned_urns + assert superseding_urn in returned_urns + assert body["total"] == 1 + + +def test_get_gene_pagination(client, session, setup_router_db): + first_urn = _make_score_set(client, session, title="BRCA1 first score set", num_variants=5) + second_urn = _make_score_set(client, session, title="BRCA1 second score set", num_variants=7) + third_urn = _make_score_set(client, session, title="BRCA1 third score set", num_variants=11) + + first = session.query(ScoreSetDbModel).filter(ScoreSetDbModel.urn == first_urn).one() + second = session.query(ScoreSetDbModel).filter(ScoreSetDbModel.urn == second_urn).one() + third = session.query(ScoreSetDbModel).filter(ScoreSetDbModel.urn == third_urn).one() + first.published_date = date(2024, 1, 3) + second.published_date = date(2024, 1, 2) + third.published_date = date(2024, 1, 1) + session.commit() + + with patch("mavedb.routers.genes.fetch_gene_info") as mock_fetch: + mock_fetch.return_value = _hgnc_gene_info() + response = client.get(f"/api/v1/genes/{VALID_GENE}?limit=1&offset=1") + + assert response.status_code == 200 + body = response.json() + assert body["limit"] == 1 + assert body["offset"] == 1 + assert body["total"] == 3 + assert body["totalScoredVariants"] == 23 + assert len(body["scoreSets"]) == 1 + assert body["scoreSets"][0]["urn"] == second_urn + + +def test_get_gene_hgnc_service_error_returns_503(client, setup_router_db): + with patch("mavedb.routers.genes.fetch_gene_info") as mock_fetch: + mock_fetch.side_effect = HGNCServiceError("Gene information service temporarily unavailable") + response = client.get(f"/api/v1/genes/{VALID_GENE}") + + assert response.status_code == 503 + assert "Gene information service temporarily unavailable" in response.json()["message"]