In [3]:
from ga4gh.core import sha512t24u
from ga4gh.core import ga4gh_digest, ga4gh_identify, ga4gh_serialize
from ga4gh.vrs import __version__, models
from ga4gh.vrs.dataproxy import SeqRepoRESTDataProxy
import json



In [4]:
# A SeqRepo REST service must be reachable at this URL (e.g., one started from the docker image).
# The resulting data proxy `dp` is what later cells use to translate sequence identifiers.
seqrepo_rest_service_url = "https://services.genomicmedlab.org/seqrepo"
dp = SeqRepoRESTDataProxy(base_url=seqrepo_rest_service_url)

In [5]:
def ppo(o, indent=2):
    """Print an object's dictionary form as key-sorted, indented JSON.

    Args:
        o: any object exposing an ``as_dict()`` method whose result is
            JSON-serializable.
        indent: number of spaces used per nesting level.

    Returns:
        None; the JSON text is written to standard output.
    """
    payload = o.as_dict()
    rendered = json.dumps(payload, indent=indent, sort_keys=True)
    print(rendered)

# Guide to Breaking Down a Variation

## What type of variation is occurring?
    - substitution, deletion, or insertion?
        - NOTE: this will help you identify how to use interbase coordinates to calculate your start and end location.
        
## Location
    - Chromosome or Sequence level 
    - What location is present to infer from? 
        - Start 
        - End
## State
    - Literal, Derived, Repeated ? 

Citation:
* These examples were contributed by Ronak Patel to assess the ClinGen Allele Registry implementation of VR. 
* https://github.com/ga4gh/vrs-python/blob/8161aef297b257fe118c1bcc306a456851a7df1e/notebooks/appendices/Contributed%20Tests.ipynb

### Example 1: NC_000013.11:g.32936732C=

Variation
* “=” (equals) is used to indicate a sequence was tested but found unchanged; p.(Arg234=)
* NOTE: I'm assuming this is treated as a substitution

Location
* Sequence Level
* Start: 32936731
* End: 32936732

State
* Literal: "C"

In [6]:
# Example 1 — NC_000013.11:g.32936732C= expressed as a VRS Allele.
# The interval covers the single residue at 1-based position 32936732.
ex1_interval = models.SequenceInterval(
    start=models.Number(value=32936731, type="Number"),
    end=models.Number(value=32936732, type="Number"),
    type="SequenceInterval",
)

# translate_sequence_identifier returns a list of identifiers in the requested
# namespace. Its first entry is used as the sequence_id so the location is tied
# to the lookup rather than to a hand-copied digest
# (previously hard-coded as "ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT").
ex1_compid = dp.translate_sequence_identifier("NC_000013.11", "ga4gh")
ex1_location = models.SequenceLocation(
    sequence_id=ex1_compid[0],
    interval=ex1_interval,
    type="SequenceLocation",
)

# The reference base is restated literally because the variant is "unchanged" (C=).
ex1_state = models.LiteralSequenceExpression(sequence="C", type="LiteralSequenceExpression")
ex1_allele = models.Allele(location=ex1_location, state=ex1_state, type="Allele")

# Show the assembled Allele and its computed ga4gh identifier.
ppo(ex1_allele)
print(ga4gh_identify(ex1_allele))

{
  "location": {
    "interval": {
      "end": {
        "type": "Number",
        "value": 32936732
      },
      "start": {
        "type": "Number",
        "value": 32936731
      },
      "type": "SequenceInterval"
    },
    "sequence_id": "ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT",
    "type": "SequenceLocation"
  },
  "state": {
    "sequence": "C",
    "type": "LiteralSequenceExpression"
  },
  "type": "Allele"
}
ga4gh:VA.DkZLLMnwoH6zIncSRh2c05nzCNLdTqHl


### Example 2: NC_000007.14:g.55181320A>T

Variation
* “>” (greater than) is used to describe substitution variants (DNA and RNA level); g.12345A>T, r.123a>u (see DNA, RNA)
* NOTE: Treat as a Substitution

Location
* Sequence Level
* Start: 55181319
* End: 55181320

State
* Literal: "T"

In [7]:
# Look up the ga4gh-namespace identifier(s) for NC_000007.14 and show them;
# the printed value is the sequence_id used when building the Example 2 allele.
ex2_compid = dp.translate_sequence_identifier("NC_000007.14", "ga4gh")
print(ex2_compid)

['ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul']


In [8]:
# Example 2 — NC_000007.14:g.55181320A>T built as one nested Allele expression.
ex2_allele = models.Allele(

    # The location is built with models.SequenceLocation (not models.Location)
    # and the interval's type is given explicitly, matching how the other
    # examples in this notebook construct their locations.
    location = models.SequenceLocation(
        type = "SequenceLocation",
        sequence_id = "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
        interval = models.SequenceInterval(
            start = models.Number(value = 55181319, type = "Number"),
            end = models.Number(value = 55181320, type = "Number"),
            type = "SequenceInterval"
        )
    ),

    # The substituted base.
    state = models.LiteralSequenceExpression(sequence = "T", type = "LiteralSequenceExpression"),

    type = "Allele"
)

# Show the assembled Allele and its computed ga4gh identifier.
ppo(ex2_allele)
print(ga4gh_identify(ex2_allele))

{
  "location": {
    "interval": {
      "end": {
        "type": "Number",
        "value": 55181320
      },
      "start": {
        "type": "Number",
        "value": 55181319
      },
      "type": "SequenceInterval"
    },
    "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
    "type": "SequenceLocation"
  },
  "state": {
    "sequence": "T",
    "type": "LiteralSequenceExpression"
  },
  "type": "Allele"
}
ga4gh:VA.5Z7gWQGUuGAPe4Pw2_kJvnkhS2Q5jRhY


### Example 3: NC_000007.14:g.55181220del

Variation
* “del” indicates a deletion; c.76delA
* NOTE: Treat as a deletion

Location
* Sequence Level
* Start: 55181219
* End: 55181220

State
* Literal: "" (empty string — the deleted residue is replaced by nothing)

In [9]:
# Look up the ga4gh-namespace identifier(s) for NC_000007.14 and show them;
# the printed value is the sequence_id used when building the Example 3 allele.
ex3_compID = dp.translate_sequence_identifier("NC_000007.14", "ga4gh")
print(ex3_compID)

['ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul']


In [10]:
# Example 3 — NC_000007.14:g.55181220del, assembled piece by piece.

# Interval spanning the one deleted residue.
ex3_interval = models.SequenceInterval(
    type="SequenceInterval",
    start=models.Number(type="Number", value=55181219),
    end=models.Number(type="Number", value=55181220),
)

ex3_location = models.SequenceLocation(
    type="SequenceLocation",
    sequence_id="ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
    interval=ex3_interval,
)

# A deletion is stated as an empty literal sequence.
ex3_state = models.LiteralSequenceExpression(
    type="LiteralSequenceExpression",
    sequence="",
)

ex3_allele = models.Allele(type="Allele", location=ex3_location, state=ex3_state)

# Show the assembled Allele and its computed ga4gh identifier.
ppo(ex3_allele)
print(ga4gh_identify(ex3_allele))

{
  "location": {
    "interval": {
      "end": {
        "type": "Number",
        "value": 55181220
      },
      "start": {
        "type": "Number",
        "value": 55181219
      },
      "type": "SequenceInterval"
    },
    "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
    "type": "SequenceLocation"
  },
  "state": {
    "sequence": "",
    "type": "LiteralSequenceExpression"
  },
  "type": "Allele"
}
ga4gh:VA.h6WuolTwZJYZh86qP2a8YVA1WXpHuY_X


### Example 4: NC_000007.14:g.55181230_55181231insGGCT

Variation
* “ins” indicates an insertion; c.76_77insG (duplicating insertions are described as duplications, not as insertions)
* NOTE: Treat as an insertion

Location
* Sequence Level
* Start: 55181230
* End: 55181230

State
* Literal: "GGCT"

In [11]:
# Look up the ga4gh-namespace identifier(s) for NC_000007.14 and show them;
# the printed value is the sequence_id used when building the Example 4 allele.
ex4_compID = dp.translate_sequence_identifier("NC_000007.14", "ga4gh")
print(ex4_compID)

['ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul']


In [12]:
# Example 4 — NC_000007.14:g.55181230_55181231insGGCT, assembled piece by piece.

# An insertion sits between two residues, so start and end are the same coordinate.
ex4_interval = models.SequenceInterval(
    start=models.Number(type="Number", value=55181230),
    end=models.Number(type="Number", value=55181230),
    type="SequenceInterval",
)

ex4_location = models.SequenceLocation(
    sequence_id="ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
    interval=ex4_interval,
    type="SequenceLocation",
)

# The inserted bases.
ex4_state = models.LiteralSequenceExpression(
    sequence="GGCT",
    type="LiteralSequenceExpression",
)

ex4_allele = models.Allele(location=ex4_location, state=ex4_state, type="Allele")

# Show the assembled Allele and its computed ga4gh identifier.
ppo(ex4_allele)
print(ga4gh_identify(ex4_allele))

{
  "location": {
    "interval": {
      "end": {
        "type": "Number",
        "value": 55181230
      },
      "start": {
        "type": "Number",
        "value": 55181230
      },
      "type": "SequenceInterval"
    },
    "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
    "type": "SequenceLocation"
  },
  "state": {
    "sequence": "GGCT",
    "type": "LiteralSequenceExpression"
  },
  "type": "Allele"
}
ga4gh:VA.JKGCs07cFu2wlDydCAe2ea06jMFXyK56


By default, Translator
1) translates sequence identifiers to ga4gh digest-based identifiers,
2) normalizes alleles
3) adds a ga4gh identifier. These may be disabled as desired. (However, ga4gh_identify requires that all objects use identifiers, including sequence identifiers, in the ga4gh namespace.)


In [13]:
from ga4gh.vrs.extras.translator import Translator


In [14]:
# Re-create the manually built examples above in an automated way, using the
# Translator approach presented in the vrs-python notebooks.

tlr = Translator(data_proxy=dp,
                 translate_sequence_identifiers=True,  # default
                 normalize=True,                       # default
                 identify=False)                        # NOT the default: identifiers are computed explicitly with ga4gh_identify below



In [17]:
def pjson(o, indent=2):
    """Return an object's dictionary form as key-sorted, indented JSON text.

    Unlike ``ppo``, nothing is printed; the caller receives the string.

    Args:
        o: any object exposing an ``as_dict()`` method whose result is
            JSON-serializable.
        indent: number of spaces used per nesting level.

    Returns:
        str: the JSON rendering of ``o.as_dict()``.
    """
    as_mapping = o.as_dict()
    return json.dumps(as_mapping, indent=indent, sort_keys=True)

In [24]:
# HGVS-only version of the inputs (the four manual examples above), kept for reference:
# hgvs_exprs = [
#     "NC_000013.11:g.32936732C=",
#     "NC_000007.14:g.55181320A>T",
#     "NC_000007.14:g.55181220del",
#     "NC_000007.14:g.55181230_55181231insGGCT"]

# One expression per input syntax; no format is passed to translate_from below.
test_exprs = [ 
    "NC_000017.11:7674857:C:T", # SPDI example
    "NC_000013.11:g.32936732C=", # HGVS example
    "1-55516888-G-GA" # VCF-style (chrom-pos-ref-alt) example
]


# Translate each expression, compute its ga4gh identifier, and print both.
for expr in test_exprs:
    trans = tlr.translate_from(expr)
    # Alternative that names the input format explicitly:
    #trans = tlr.translate_from(expr,"hgvs")
    vrs_id = ga4gh_identify(trans)
    print(vrs_id, pjson(trans), sep = '\n')


# TODO: do the vrs-python deployment, generate the Read the Docs documentation, and generate the Swagger UI.

ga4gh:VA.n5Q-4Nx8xGkaKZH8T2oWXHkdyacFA3pZ
{
  "location": {
    "interval": {
      "end": {
        "type": "Number",
        "value": 7674858
      },
      "start": {
        "type": "Number",
        "value": 7674857
      },
      "type": "SequenceInterval"
    },
    "sequence_id": "ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7",
    "type": "SequenceLocation"
  },
  "state": {
    "sequence": "T",
    "type": "LiteralSequenceExpression"
  },
  "type": "Allele"
}
ga4gh:VA.DkZLLMnwoH6zIncSRh2c05nzCNLdTqHl
{
  "location": {
    "interval": {
      "end": {
        "type": "Number",
        "value": 32936732
      },
      "start": {
        "type": "Number",
        "value": 32936731
      },
      "type": "SequenceInterval"
    },
    "sequence_id": "ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT",
    "type": "SequenceLocation"
  },
  "state": {
    "sequence": "C",
    "type": "LiteralSequenceExpression"
  },
  "type": "Allele"
}
ga4gh:VA.dovgcwGwoUvhK3bWMzaoAuvKQrdkKYHG
{
  "location": 

In [None]:
import pandas as pd

In [None]:
# Load the tab-separated file into a DataFrame.
# NOTE(review): the path points into a user-specific Downloads folder; adjust it to wherever erepo.tabbed.txt is stored.
df = pd.read_csv("~/Downloads/erepo.tabbed.txt", sep='\t')

In [None]:
def df_info(data):
    """Print a frame's shape and column index, then return its first rows.

    Args:
        data: an object with ``shape``, ``columns`` and ``head()`` (a pandas
            DataFrame in this notebook).

    Returns:
        The result of ``data.head()``, so the notebook renders it as the
        cell output.
    """
    # Printed one at a time, in this order: shape first, then the columns.
    for attribute in ("shape", "columns"):
        print(getattr(data, attribute))
    return data.head()

In [None]:
# Print the loaded table's shape and column names, and display its first rows.
df_info(df)

In [None]:
# Keep only the variation name and its HGVS expressions, under shorter column names.
mini_data = df[["#Variation", "HGVS Expressions"]].rename(
    columns={"#Variation": "variation", "HGVS Expressions": "hgvs_exprs"}
)
mini_data.head()


In [None]:
# Count the rows whose variation name is missing.
missing_variations = mini_data['variation'].isnull().sum()
print(f'Total Numbers of Nan in variation column = {missing_variations}')
# Those rows could be dropped with: mini_data[mini_data['variation'].notna()]


In [None]:
# Display 15 randomly chosen rows to pick example variants from.
# NOTE: no random_state is passed, so a different sample is drawn on every run.
mini_data.sample(n = 15)

1) 1434 ---	NM_004360.5(CDH1):c.1312del (p.Thr438fs)
2) 4247	--- NM_005629.4(SLC6A8):c.626_627del (p.Pro209fs)
3) 3636	--- NM_000277.3(PAH):c.1117G>A (p.Ala373Thr)	
4) 3509	--- NM_000527.5(LDLR):c.895G>T (p.Ala299Ser)
5) 3595	--- NM_000261.2:c.1187_1189dup

In [None]:
# Sample dataframe: six rows of mini_data selected by index label
# (presumably picked from a random sample of mini_data — confirm).
testDF = mini_data.loc[[1434,4247,3636,3509,3595,3245],]
testDF

In [None]:
# Build a dictionary mapping each variation name (key) to the list of its
# HGVS expressions (value).

varList = list(testDF["variation"])
exprsList = list(testDF["hgvs_exprs"])

# Pair each variation with the expression string from the SAME row.
# The previous nested loop re-assigned every key once per row, so every
# variation ended up with the last row's expressions.
val_exp_dict = {
    variation: expr_string.split(',')
    for variation, expr_string in zip(varList, exprsList)
}

In [None]:
# Spot-check the sixth selected row: its variation name and its raw HGVS expression string.
print('key: {}'.format(varList[5]))
print('values: {}'.format(exprsList[5]))

In [None]:
# Hand-written examples: variation name -> a few of its HGVS expressions.
# The first key now includes the ':' after "(CDH1)" so that it matches the
# variation-name format used by every other key.
clingenexamples = {
 'NM_004360.5(CDH1):c.1312del (p.Thr438fs)' : ['ENST00000612417.4:c.1309del','NC_000016.9:g.68847387del', 'NM_004360.5:c.1309del'],
 'NM_005629.4(SLC6A8):c.626_627del (p.Pro209fs)' : ['ENST00000253122.10:c.626_627del', 'NC_000023.11:g.153691535_153691536del','NM_005629.3:c.626_627del'],
 'NM_000277.3(PAH):c.1117G>A (p.Ala373Thr)' : ['ENST00000635528.1:n.632G>A', 'NM_000277.3:c.1117G>A', 'NC_000012.12:g.102843728C>T'],
 'NM_000527.5(LDLR):c.895G>T (p.Ala299Ser)' : ['ENST00000558013.5:c.895G>T', 'NM_000527.5:c.895G>T', 'NC_000019.10:g.11107469G>T'],
 'NM_000261.2:c.1187_1189dup' : ['ENST00000037502.10:c.1187_1189dup', 'NM_000261.2:c.1187_1189dup', 'NC_000001.11:g.171636252_171636254dup'],
 'NM_000051.4(ATM):c.5556_5557delinsGA (p.Asp1853Asn)' : ['ENST00000683174.1:n.7040_7041delinsGA', 'NM_000051.4:c.5556_5557delinsGA', 'NC_000011.10:g.108304734_108304735delinsGA']
 }


# varList is already a list, so it is printed directly.
print(varList)
print('\n')

# Translate every HGVS expression of every example, then print its ga4gh
# identifier followed by the JSON form of the translated object.
for hgvs_list in clingenexamples.values():
    for expr in hgvs_list:

        trans = tlr.translate_from(expr,"hgvs")
        vrs_id = ga4gh_identify(trans)

        print(vrs_id, pjson(trans), sep = '\n')
        print('\n')




In [None]:
# come back to this idea

# exprsLists = list(testDF["hgvs_exprs"])

# mylist = []

# for exprs in exprsLists:
#     values = exprs.strip().split(',')
#     for x in values: 
#         val_nam = x.split('_')
#         if ("NM" in val_nam[0]) or  ("NC" in val_nam[0]): 
#             mylist.append(x)

In [None]:
# Come back to this concept
# Everything works, but there needs to be some edits
# need to change that fact that I need to index the dictionary. 
# also some of the values cant be translated

# refseq = []


# for x in  val_exp_dict['NM_000261.2:c.1187_1189dup']:

#     names = x.split('_',maxsplit = 1 )
#     if ("NM" in names[0]) or  ("NC" in names[0]):
#         refseq.append(x)
# refseq


# for expr in refseq:
#     trans = tlr.translate_from(expr,"hgvs")
#     vrs_id = ga4gh_identify(trans)
#     print(vrs_id, pjson(trans), sep = '\n')

