In [9]:
from ga4gh.vrs.utils.hgvs_tools import HgvsTools
from bioutils.accessions import coerce_namespace
import hgvs.parser
import hgvs.location
import hgvs.posedit
import hgvs.edit
import hgvs.sequencevariant
import hgvs.dataproviders.uta

In [1]:
from ga4gh.vrs.dataproxy import SeqRepoRESTDataProxy
from biocommons.seqrepo import SeqRepo

seqrepo_rest_service_url = "https://services.genomicmedlab.org/seqrepo"
dp = SeqRepoRESTDataProxy(base_url=seqrepo_rest_service_url)

Removing allOf attribute from AbsoluteCopyNumber to avoid python-jsonschema-objects error.
Removing allOf attribute from SequenceInterval to avoid python-jsonschema-objects error.
Removing allOf attribute from RepeatedSequenceExpression to avoid python-jsonschema-objects error.


In [2]:
example = 'NC_000016.10:1510991:GG:G'

In [16]:
def ir_stype(a):
    if a.startswith("refseq:NM_"):
        return "n"
    if a.startswith("refseq:NP_"):
        return "p"
    if a.startswith("refseq:NG_"):
        return "g"
    if a.startswith("refseq:NC_"):
        return "g"
    if a.startswith("GRCh"):
        return "g"
    return None

In [3]:
sequence_id = 'NC_000016.10'
start, end = 1510991-10, 1510991+10

In [None]:
dp.get_sequence(sequence_id,start,end)

In [5]:
if start == end:    # insert: hgvs uses *exclusive coords*
    ref = None
    end += 1
else:               # else: hgvs uses *inclusive coords*
    ref = dp.get_sequence(sequence_id, start, end)
    start += 1

In [6]:
print(sequence_id,start,end)

NC_000016.10 1510982 1511001


In [11]:
ival = hgvs.location.Interval(
    start=hgvs.location.SimplePosition(base=start),
    end=hgvs.location.SimplePosition(base=end))
print(ival)

1510982_1511001


In [13]:
# TODO: checking what the state of the mutaiton is. because im trying to do something different than vrs i need to figure out how to do it. 
# alt = str(vo.state.sequence) or None  # "" => None

edit = hgvs.edit.NARefAlt(ref=ref, alt=None)
print(edit)

del


In [14]:
posedit = hgvs.posedit.PosEdit(pos=ival, edit=edit)
var = hgvs.sequencevariant.SequenceVariant(
    ac=None,
    type='stype',
    posedit=posedit)


In [15]:
print(posedit)
print(var)

1510982_1511001del
g.1510982_1511001del


In [21]:
from ga4gh.vrs.extras.translator import Translator
tlr = Translator(data_proxy=dp,
                 translate_sequence_identifiers=True,  # default
                 normalize=True,                       # default
                 identify=True)                        # default

In [26]:
namespace = 'refseq'
hgvs_exprs = []
for alias in dp.translate_sequence_identifier(sequence_id,'hgvs'):
    ns, a = alias.split(":")
    # skip GRCh accessions unless specifically requested
    # because they are ambiguous without their namespace,
    # which can't be included in HGVS expressions
    # TODO: use default_assembly_name here
    if ns.startswith("GRC") and namespace is None:
        continue

    if not (any(a.startswith(pfx) for pfx in ("NM", "NP", "NC", "NG"))):
        continue

    var.ac = a

    try:
        if not namespace.startswith("GRC"):
            # if the namespace is GRC, can't normalize, since hgvs can't deal with it
            hgvs_tools = tlr._get_hgvs_tools()
            parsed = hgvs_tools.parse(str(var))
            var = hgvs_tools.normalize(parsed)

        hgvs_exprs += [str(var)]
    except hgvs.exceptions.HGVSDataNotAvailableError:
        print(f"No data found for accession {a}")

print(hgvs_exprs) 

[]
