In [1]:
using Requests

In [2]:
# single letter amino acid codes and STOP
ipuac="ACDEFGHIKLMNPQRSTVWY*";

In [3]:
# all transitions from one aa to another (inc symmeteric)
allalt = []
for aa in ipuac
    for bb in ipuac
        # if(aa != bb)  # now including silent subsitutions   
            push!(allalt, string(aa) * string(bb))
        #end
    end
end
length(allalt)

441

Allowing silent subsitutions here means we may have to check for exactly equal codons later  

```human_aa_codon_freq.txt```
is derived from the first three columns of a table I think I got from wikiopedia   
a year or so ago when I was thinking about gray codes for protien sequences.  
I am not having luck finding the exact source just now, 
but here is a similar table from a different source  
http://www.genscript.com/tools/codon-frequency-table



In [4]:
aa_codon = Dict{String, Array{String,1}}() 
codon_bias = Dict{String, Float64}()

for line in readlines("./human_aa_codon_freq.txt")
    (aa, codon, freq) = split(line)
    if haskey(aa_codon, aa)
        push!(aa_codon[aa], codon)
    else
        aa_codon[aa] = [codon]
    end
    codon_bias[codon] =  float(freq)
end   

Although we have generated data for all possible replacements of one
amino acid for another, PR has indicated we will be limiting ourselves to
replacments due to Single Nuclotide Variations (SNV)s which eliminates
a fair number of choices.   
It also means we need to check.

In [5]:
# check that a reference->variant pair of codons differ only by a single nucleotide
function is_snv(r::String, v::String)
    n = 0
    for i in 1:3
        n+=r[i]==v[i]?0:1
    end
    return n==1
end

is_snv (generic function with 1 method)

given a replacement  
want the nuculotide choices for both 
reference and variant

In [6]:
# WIP
# Types as light weight objects closer to a struct)
# immutable 
#type Key <: AbstractString
#        alteration::String
#end

In [7]:
# By defining a Composite Type
# we can get variable names in in the json
type Replacement
    ref_codon::String
    var_codon::String
    likelyhood::Float64
end

In [8]:
lookup = Dict{String, Array{Replacement,1}}()

scale=10000
for alteration in allalt  
    reference = string(alteration[1]) 
    variant = string(alteration[2])
    #alteration = Key(alteration[1:2])
    for ref_codon in aa_codon[reference]
        #println(ref_codon)
        for var_codon in aa_codon[variant]      
            if is_snv(ref_codon, var_codon)
                likelyhood = (round(codon_bias[ref_codon] * codon_bias[var_codon] * scale)/scale)
                if haskey(lookup, alteration)                     
                    push!(lookup[alteration], Replacement(ref_codon, var_codon, likelyhood))
                else 
                   lookup[alteration] = [Replacement(ref_codon, var_codon, likelyhood)]
                end
            end
        end
    end
    if haskey(lookup, alteration)
        sort!(lookup[alteration], by = x -> x.likelyhood, rev = true)
    end
end

In [9]:
typeof(lookup)

Dict{String,Array{Replacement,1}}

In [10]:
# dump the lookup table as json for future use
fh = open("replacement_codon_score.json","w")
write(fh, JSON.json(lookup))
close(fh)

In [11]:
# report single variant and location within a codon
type Varloc 
    index::Integer
    transition::String
end

function SNV(r::Replacement)
    a=r.ref_codon;b=r.var_codon
    for i in 1:3 
        if a[i]!=b[i] 
            return Varloc(i-1, string(a[i]) * ">" * string(b[i])) 
        end
    end
end

SNV (generic function with 1 method)

In [12]:
# read in the Onckb variant 
oncodb = JSON.parse(readstring("variants.json"));

In [13]:
# pick a random record
# n =  Int(floor(length(oncodb)*rand()))
n = 73

73

In [14]:
oncodb[n::Integer]    

Dict{String,Any} with 9 entries:
  "uniqueId"        => "BCL2L11&R153W&R153W&MUTATION&missense_variant&153&153&R…
  "name"            => "R153W"
  "gene"            => Dict{String,Any}(Pair{String,Any}("curatedIsoform","ENST…
  "consequence"     => Dict{String,Any}(Pair{String,Any}("term","missense_varia…
  "variantResidues" => "W"
  "alteration"      => "R153W"
  "refResidues"     => "R"
  "proteinStart"    => 153
  "proteinEnd"      => 153

In [15]:
oncodb[n]["gene"]

Dict{String,Any} with 8 entries:
  "curatedIsoform" => "ENST00000393256"
  "name"           => "BCL2 like 11"
  "curatedRefSeq"  => "NM_138621.4"
  "tsg"            => true
  "entrezGeneId"   => 10018
  "oncogene"       => false
  "geneAliases"    => Any["BOD","BIM","BAM"]
  "hugoSymbol"     => "BCL2L11"

In [16]:
k = oncodb[n]["refResidues"] * oncodb[n]["variantResidues"]
variant_choices = lookup[k]
length(variant_choices)

2

The RefSeq should give us a variant substrate sequence 

In [17]:
function fetch_sequence(refseq_id::String)
    # refseq_id = oncodb[n]["gene"]["curatedRefSeq"]
    efetch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&retmode=text&id=" 
    resp = get(efetch * refseq_id ) 
    if resp != 200  printnl("Didnt get sequence") end   
    page = readstring(resp)
    lines = split(page, "\n")
    # loose the defline  > id ,,,
    lines[1] = ""
    return join(lines)
end        

fetch_sequence (generic function with 1 method)

In [18]:
refseq_id = oncodb[n]["gene"]["curatedRefSeq"]
seq = fetch_sequence(refseq_id)
offset = 3 * oncodb[n]["proteinStart"]
refseq_codon = seq[offset:offset + 2]
refseq_codon

LoadError: UndefVarError: printnl not defined

In [19]:
for vc in variant_choices 
    println(vc, "\t", SNV(vc))
end
# 0 based index here

Replacement("AGG","TGG",0.21)	Varloc(0,"A>T")
Replacement("CGG","TGG",0.2)	Varloc(0,"C>T")


In [20]:
# constructor
possible_variants = Array{Varloc,1}();                          

In [21]:
println("Reference codon observed \t",refseq_codon)
println(possible_variants)

LoadError: UndefVarError: refseq_codon not defined

In [22]:
refseq_id * ":c." * string(offset - possible_variants[1].index) * possible_variants[1].transition

LoadError: UndefVarError: offset not defined

Striking out finding variants with this type of syntax "NM_138621.4:c.459C>T"  
try to get them all and see how it goes 

In [23]:
for rec in oncodb
    refseq_id  = rec["gene"]["curatedRefSeq"]
    seq = fetch_sequence(refseq_id)
    offset = 3 * rec["proteinStart"]
    refseq_codon = seq[offset:offset+2]
    key = rec["refResidues"] * rec["variantResidues"]
    variant_choices = lookup[key]
    rec["possible_allele"]=[]
    
    for vc in variant_choices
        if refseq_codon == vc.ref_codon
            codon_snv = SNV(vc)   
            push!(rec["possible_allele_id"], 
                refseq_id * ":c." * string(offset - codon_snv.index) * codon_snv.transition)
        end    
    end

    #if 1 > length(rec["possible_allele_id"])
    #    println("corblimy, Observed ", refseq_codon ," at ", offset,
    #        " BP within ", refseq_id, " but it is not found in\n", variant_choices)
    #end   
end

LoadError: UndefVarError: printnl not defined

In [24]:
code_llvm(is_snv, (String,String))


define i8 @julia_is_snv_71655(%jl_value_t*, %jl_value_t*) #0 {
top:
  %2 = call i32 @jlsys_getindex_44220(%jl_value_t* %0, i64 1)
  %3 = call i32 @jlsys_getindex_44220(%jl_value_t* %1, i64 1)
  %4 = icmp eq i32 %2, %3
  br i1 %4, label %if6, label %L3

L3:                                               ; preds = %top, %if6
  %"#temp#1.0" = phi i64 [ 0, %if6 ], [ 1, %top ]
  %5 = call i32 @jlsys_getindex_44220(%jl_value_t* %0, i64 2)
  %6 = call i32 @jlsys_getindex_44220(%jl_value_t* %1, i64 2)
  %7 = icmp eq i32 %5, %6
  br i1 %7, label %if6.1, label %L3.1

if6:                                              ; preds = %top
  br label %L3

if6.1:                                            ; preds = %L3
  br label %L3.1

L3.1:                                             ; preds = %if6.1, %L3
  %"#temp#1.0.1" = phi i64 [ 0, %if6.1 ], [ 1, %L3 ]
  %8 = call i32 @jlsys_getindex_44220(%jl_value_t* %0, i64 3)
  %9 = call i32 @jlsys_getindex_44220(%jl_value_t* %1, i64 3)
  %10 = icmp eq i32 %8, 

In [25]:
oncodb[6]["possible_allele_id"]


LoadError: KeyError: key "possible_allele_id" not found

Just for fun look at how to go from go from high levle python-y script-y  
all the way down to assembly with one of the little functions defined above

In [26]:
code_native(is_snv, (String,String))

	.text
Filename: In[5]
	pushq	%rbp
	movq	%rsp, %rbp
	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	pushq	%rbx
	pushq	%rax
	movq	%rsi, %r14
	movq	%rdi, %r15
Source line: 5
	movabsq	$getindex, %r13
	movl	$1, %r12d
	movl	$1, %esi
	callq	*%r13
	movl	%eax, %ebx
	movl	$1, %esi
	movq	%r14, %rdi
	callq	*%r13
	cmpl	%eax, %ebx
	jne	L64
	xorl	%r12d, %r12d
L64:
	movq	%r12, -48(%rbp)
	movl	$2, %esi
	movq	%r15, %rdi
	callq	*%r13
	movl	%eax, %ebx
	movl	$2, %esi
	movq	%r14, %rdi
	callq	*%r13
	movl	$1, %r12d
	cmpl	%eax, %ebx
	jne	L105
	xorl	%r12d, %r12d
L105:
	movl	$3, %esi
	movq	%r15, %rdi
	callq	*%r13
	movl	%eax, %ebx
	movl	$3, %esi
	movq	%r14, %rdi
	callq	*%r13
	movl	$1, %ecx
	cmpl	%eax, %ebx
	jne	L140
	xorl	%ecx, %ecx
L140:
	addq	-48(%rbp), %r12
	addq	%rcx, %r12
Source line: 7
	cmpq	$1, %r12
	sete	%al
	addq	$8, %rsp
	popq	%rbx
	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15
	popq	%rbp
	retq
	nopl	(%rax)


M.Dayhoff's PAM1 
http://www.deduveinstitute.be/~opperd/private/pam1.html