In [1]:
%run setup.ipynb
import pyfasta

In [2]:
# Load the pickled table of selected missense variants
# (`etl` comes from setup.ipynb -- presumably petl; confirm there).
variants_pickle_path = '../data/tbl_variants_missense_selected.pkl'
tbl_variants_selected = etl.frompickle(variants_pickle_path)

## Add housefly numbering

In [3]:
#to easily split the codon numbers from the codon letters
import re

#load the codon map from the blog post (with the header info removed)
md_tbl = etl.fromtsv('../data/domestica_gambiae_map.txt')

#get codons (one entry per row of the map, as strings)
dom = list(md_tbl['domestica_codon'])
ano = list(md_tbl['gambiae_codon'])

#dictionary with the gambiae codon number as key and the domestica codon as value
map_dict = dict(zip(ano, dom))

#get the snpeff annotations (e.g. 'R254K'), one per variant row
gam_cod = list(tbl_variants_selected['AGAP004707-RA'])

#ditch the codon letters and keep only the codon number.
#The regex stops at the last digit by itself, so the annotation is matched
#whole rather than being truncated to a fixed width first.
r = re.compile("([a-zA-Z]+)([0-9]+)")
gam_cod_cl = []
for annotation in gam_cod:
    if not annotation:
        # Keep a placeholder so this list stays row-aligned with the variant
        # table; '.' is the "no housefly codon" marker handled downstream.
        gam_cod_cl.append('.')
        continue
    codon_match = r.match(annotation)
    if codon_match is None:
        raise ValueError(
            'cannot parse codon number from annotation %r' % (annotation,))
    gam_cod_cl.append(codon_match.group(2))

# one codon entry per variant row, otherwise positional joins would misalign
assert len(gam_cod_cl) == len(gam_cod)

# translate gambiae codon numbers to housefly numbering; placeholders pass through
MD = ['.' if codon == '.' else map_dict[codon] for codon in gam_cod_cl]
MD

['261',
 '410',
 '410',
 '.',
 '508',
 '508',
 '549',
 '724',
 '810',
 '1014',
 '1014',
 '.',
 '1532',
 '1575',
 '1602',
 '1608',
 '1751',
 '1858',
 '1873',
 '1879',
 '1879',
 '1939',
 '1945']

In [5]:
# Fetch the Musca residue letter at each mapped position and prefix it to the
# codon number (e.g. 'R261').
fs = pyfasta.Fasta('../data/domestica_gambiae_PROT_MEGA.fas')

# housefly record from the protein alignment
dom = fs.get('domestica_vgsc')

# strip the alignment gaps ('-') so that list index + 1 is the residue number
dom_fix = [residue for residue in dom if residue != '-']
# check: the first and last mapped positions can be indexed
dom_fix[261-1],dom_fix[1945-1]

# '.' (no housefly codon) becomes '-'; everything else becomes letter + number
MD_fix = ['-' if pos == '.' else dom_fix[int(pos)-1]+pos for pos in MD]

MD_fix

['R261',
 'V410',
 'V410',
 '-',
 'M508',
 'M508',
 'G549',
 'Q724',
 'T810',
 'L1014',
 'L1014',
 '-',
 'I1532',
 'N1575',
 'E1602',
 'K1608',
 'A1751',
 'V1858',
 'I1873',
 'P1879',
 'P1879',
 'A1939',
 'I1945']

## Build LaTeX table

In [8]:
# Functional annotation for each missense variant, keyed on the SnpEff
# amino-acid change ('AGAP004707-RA'). It is joined onto the variant table on
# that key. Every cell containing LaTeX markup is a raw string so backslashes
# are never interpreted as (invalid) Python escape sequences.
tbl_function = etl.wrap([
    ['AGAP004707-RA', 'domain', 'phenotype', 'evidence', 'study'],
    ['R254K', 'IL45', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['V402L', 'IS6', r'known \texttt{I1527T} driver/enhancer', r'assoc./\emph{in vitro}', r'\cite{Yoon2008,Hopkins2010,Park1997,Lee2013,Haddi2017}'],
    ['D466H', 'LI/II', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['M490I', 'LI/II', 'no known or putative phenotype', '-', '-'],
    ['G531V', 'LI/II', 'no known or putative phenotype', '-', '-'],
    ['Q697P', 'LI/II', 'no known or putative phenotype', '-', '-'],
    ['T791M', 'IIS1', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['L995S', 'IIS6', r'known driver', r'assoc./\emph{in vitro}', r'\cite{Burton2011}'],
    ['L995F', 'IIS6', r'known driver', r'assoc./\emph{in vitro}', r'\cite{Burton2011}'],
    ['V1507I', 'IIIL56', 'no known or putative phenotype', '-', '-'],
    ['I1527T', 'IIIS6', r'putative driver and two residues from known enhancer', r'\emph{in vitro}', r'\cite{Haddi2017}'],
    ['N1570Y', 'LIII/IV', r' known \texttt{L995F} enhancer', r'assoc./\emph{in vitro}', r'\cite{Jones2012,Wang2015}'],
    ['E1597G', 'LIII/IV', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['K1603T', 'IVS1', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['A1746S', 'IVS5', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['V1853I', 'COOH', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['I1868T', 'COOH', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['P1874S', 'COOH', r'putative \texttt{L995F} enhancer', 'assoc.', r'\cite{Sonoda2008}'],
    ['P1874L', 'COOH', r'putative \texttt{L995F} enhancer', 'assoc.', r'\cite{Sonoda2008}'],
    ['A1934V', 'COOH', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['I1940T', 'COOH', r'putative \texttt{L995F} enhancer', '-', '-'],
])

In [9]:
pop_ids = phase2_ar1.pop_ids
# allele-frequency field names, one per population
af_fields = ['AF_' + p for p in pop_ids]

# MD_fix was built in the row order of tbl_variants_selected. The join and the
# re-sort below can reorder rows, so attach the housefly codon through a
# (POS, ALT) lookup instead of by row position.
# NOTE(review): assumes (POS, ALT) identifies a variant row uniquely -- confirm.
variant_keys = list(zip(tbl_variants_selected['POS'], tbl_variants_selected['ALT']))
assert len(variant_keys) == len(MD_fix), (
    'MD_fix has %d entries but the variant table has %d rows'
    % (len(MD_fix), len(variant_keys)))
md_by_variant = dict(zip(variant_keys, MD_fix))

tbl_variants_display = (
    tbl_variants_selected
    # keep only the fields we need
    .cut(['POS', 'REF', 'ALT', 'ALTIX', 'FILTER_PASS', 'AGAP004707-RA'] + af_fields)
    # join in function
    .leftjoin(tbl_function, key='AGAP004707-RA', missing='')
    # resort by position
    .sort(key='POS')
    # convert allele frequencies to integer percentages
    .convert(af_fields, lambda v: int(np.rint(v * 100)))
    # add the column of M. domestica codons (looked up per variant, appended last)
    .addfield('Md', lambda row: md_by_variant[(row['POS'], row['ALT'])])
    # add a formatted "substitution" field
    .addfield('substitution', lambda row: '{:,} {}>{}'.format(row['POS'], row['REF'], row['ALT']), index=5)
    # typeset the identifier-like columns in monospace
    .convert(['substitution', 'Md', 'AGAP004707-RA', 'domain'], lambda v: r'\texttt{%s}' % v)
    # flag variants that did not pass filters with a trailing asterisk
    .convert('substitution', lambda v, row: v + '*' if not row['FILTER_PASS'] else v, pass_row=True)
#    .cutout('POS', 'REF', 'ALT', 'ALTIX', 'FILTER_PASS')
)
tbl_variants_display.displayall()

0|POS,1|REF,2|ALT,3|ALTIX,4|FILTER_PASS,5|substitution,6|AGAP004707-RA,7|AF_AOcol,8|AF_GHcol,9|AF_BFcol,10|AF_CIcol,11|AF_GNcol,12|AF_GW,13|AF_GM,14|AF_CMgam,15|AF_GHgam,16|AF_BFgam,17|AF_GNgam,18|AF_GAgam,19|AF_UGgam,20|AF_GQgam,21|AF_FRgam,22|AF_KE,23|domain,24|phenotype,25|evidence,26|study,27|Md
2390177,G,A,0,True,"\texttt{2,390,177 G>A}",\texttt{R254K},0,1,0,0,0,0,0,31,0,0,0,20,0,0,0,0,\texttt{IL45},putative \texttt{L995F} enhancer,-,-,\texttt{R261}
2391228,G,C,0,True,"\texttt{2,391,228 G>C}",\texttt{V402L},0,13,7,8,12,0,0,0,0,0,0,0,0,0,0,0,\texttt{IS6},known \texttt{I1527T} driver/enhancer),assoc./\emph{in vitro},"\cite{Yoon2008,Hopkins2010,Park1997,Lee2013,Haddi2017}",\texttt{V410}
2391228,G,T,1,True,"\texttt{2,391,228 G>T}",\texttt{V402L},0,5,6,0,0,0,0,0,0,0,0,0,0,0,0,0,\texttt{IS6},known \texttt{I1527T} driver/enhancer),assoc./\emph{in vitro},"\cite{Yoon2008,Hopkins2010,Park1997,Lee2013,Haddi2017}",\texttt{V410}
2399997,G,C,0,True,"\texttt{2,399,997 G>C}",\texttt{D466H},0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,\texttt{LI/II},putative \texttt{L995F} enhancer,-,-,\texttt{-}
2400071,G,A,0,True,"\texttt{2,400,071 G>A}",\texttt{M490I},0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,19,\texttt{LI/II},no known or putative phenotype,-,-,\texttt{M508}
2400071,G,T,1,True,"\texttt{2,400,071 G>T}",\texttt{M490I},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\texttt{LI/II},no known or putative phenotype,-,-,\texttt{M508}
2402466,G,T,0,True,"\texttt{2,402,466 G>T}",\texttt{G531V},0,0,0,0,0,0,0,0,0,0,0,1,0,6,0,0,\texttt{LI/II},no known or putative phenotype,-,-,\texttt{G549}
2407967,A,C,0,True,"\texttt{2,407,967 A>C}",\texttt{Q697P},0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,\texttt{LI/II},no known or putative phenotype,-,-,\texttt{Q724}
2416980,C,T,0,True,"\texttt{2,416,980 C>T}",\texttt{T791M},0,1,2,0,0,0,0,0,29,15,11,0,0,0,0,0,\texttt{IIS1},putative \texttt{L995F} enhancer,-,-,\texttt{T810}
2422651,T,C,0,True,"\texttt{2,422,651 T>C}",\texttt{L995S},0,0,0,0,0,0,0,16,0,0,0,67,100,0,0,76,\texttt{IIS6},known driver,assoc./\emph{in vitro},\cite{Burton2011},\texttt{L1014}


In [12]:
prologue = r"""
\begin{tabular}{llllrrrrrrrrr}
\toprule
\multicolumn{4}{c}{Variant} &
\multicolumn{9}{c}{Population allele frequency (\%)}\\
\cmidrule(r){1-4}
\cmidrule(r){5-13}
Position\tnote{1} & 
\emph{Ag}\tnote{2} & 
\emph{Md}\tnote{3} &
Domain\tnote{4} &
AO\emph{Ac} &
GH\emph{Ac} &
BF\emph{Ac} &
CI\emph{Ac} &
GN\emph{Ac} &
GW &
GM &
CM\emph{Ag} &
GH\emph{Ag} &
BF\emph{Ag} & 
GN\emph{Ag} &  
GA\emph{Ag} & 
UG\emph{Ag} &
GQ\emph{Ag} &
FR\emph{Ag} & 
KE\\
\midrule
"""
template = r"""
{substitution} & {AGAP004707-RA} & {Md} & {domain} & {AF_AOcol} & {AF_GHcol} & {AF_BFcol} & {AF_CIcol} & {AF_GNcol} & {AF_GW} & {AF_GM} & {AF_CMgam} & {AF_GHgam} & {AF_BFgam} & {AF_GNgam} & {AF_GAgam} & {AF_UGgam} & {AF_GQgam} & {AF_FRgam} & {AF_KE} \\
"""
epilogue = r"""
\bottomrule
\end{tabular}
"""
tbl_variants_display.totext('../tables/variants_missense.tex', 
                            encoding='ascii',
                            prologue=prologue, 
                            template=template,
                            epilogue=epilogue)

!cat ../tables/variants_missense.tex


\begin{tabular}{llllrrrrrrrrr}
\toprule
\multicolumn{4}{c}{Variant} &
\multicolumn{9}{c}{Population allele frequency (\%)}\\
\cmidrule(r){1-4}
\cmidrule(r){5-13}
Position\tnote{1} & 
\emph{Ag}\tnote{2} & 
\emph{Md}\tnote{3} &
Domain\tnote{4} &
AO\emph{Ac} &
GH\emph{Ac} &
BF\emph{Ac} &
CI\emph{Ac} &
GN\emph{Ac} &
GW &
GM &
CM\emph{Ag} &
GH\emph{Ag} &
BF\emph{Ag} & 
GN\emph{Ag} &  
GA\emph{Ag} & 
UG\emph{Ag} &
GQ\emph{Ag} &
FR\emph{Ag} & 
KE\\
\midrule

\texttt{2,390,177 G>A} & \texttt{R254K} & \texttt{R261} & \texttt{IL45} & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 31 & 0 & 0 & 0 & 20 & 0 & 0 & 0 & 0 \\

\texttt{2,391,228 G>C} & \texttt{V402L} & \texttt{V410} & \texttt{IS6} & 0 & 13 & 7 & 8 & 12 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\

\texttt{2,391,228 G>T} & \texttt{V402L} & \texttt{V410} & \texttt{IS6} & 0 & 5 & 6 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\

\texttt{2,399,997 G>C} & \texttt{D466H} & \texttt{-} & \texttt{LI/II} & 0 & 0 & 0 & 0 

## Table 2. Phenotype

In [8]:
# tbl_pheno = etl.wrap([
#     ['AGAP004707-RA', 'Md', 'domain', 'phenotype', 'evidence', 'study'],
#     ['R254K', 'R261', 'IN (I.S4--I.S5)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['V402L', 'V410', 'TM (I.S6)', r'known driver/enhancer', 'assoc./\emph{in vitro}', '\cite{Yoon2008,Hopkins2010,Park1997,Lee2013,Haddi2017}'],
#     ['D466H', '-', 'IN (I.S6--II.S1)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['M490I', 'M508', 'IN (I.S6--II.S1)', 'no known or putative phenotype', '-', '-'],
#     ['T791M', 'T810', 'TM (II.S1)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['L995S', 'L1014', 'TM (II.S6)', r'known driver', 'assoc./\emph{in vitro}', '\cite{Burton2011}'],
#     ['L995F', 'L1014', 'TM (II.S6)', r'known driver', 'assoc./\emph{in vitro}', '\cite{Burton2011}'],
#     ['A1125V', 'K1133', 'IN (II.S6--III.S1)', r'no known or putative phenotype', '-', '-'],
#     ['V1254I', 'I1262', 'IN (II.S6--III.S1)', r'no known or putative phenotype', '-', '-'],
#     ['I1527T', 'I1532', 'TM (III.S6)', r'putative driver\tnote{4}', '\emph{in vitro}', '\cite{Haddi2017}'],
#     ['N1570Y', 'N1575', 'IN (III.S6--IV.S1)', r' known \texttt{L995F} enhancer', 'assoc./\emph{in vitro}', '\cite{Jones2012,Wang2015}'],
#     ['E1597G', 'E1602', 'IN (III.S6--IV.S1)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['K1603T', 'K1608', 'TM (IV.S1)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['A1746S', 'A1751', 'TM (IV.S5)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['V1853I', 'V1858', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['I1868T', 'I1873', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['P1874S', 'P1879', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', 'assoc.', '\cite{Sonoda2008}'],
#     ['P1874L', 'P1879', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', 'assoc.', '\cite{Sonoda2008}'],
#     ['F1920S', 'Y1925', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['A1934V', 'A1939', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['I1940T', 'I1945', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', '-', '-'],
# ])

# prologue = r"""
# \begin{tabular}{llllll}
# \toprule
# \multicolumn{2}{c}{Variant} &
# \multicolumn{4}{c}{Function}\\
# \cmidrule(r){1-2}
# \cmidrule(r){3-6}
# \emph{Ag} & 
# \emph{Md} & Domain\tnote{1} & 
# Phenotype\tnote{2} &
# Experimental evidence\tnote{3} &
# Publication\\
# \midrule
# """
# template = r"""
# {AGAP004707-RA} & {Md} & {domain} & {phenotype} & {evidence} & {study} \\
# """
# epilogue = r"""
# \bottomrule
# \end{tabular}
# """
# tbl_pheno.totext('../tables/variants_pheno.tex', 
#                             encoding='ascii',
#                             prologue=prologue, 
#                             template=template,
#                             epilogue=epilogue)

# !cat ../tables/variants_pheno.tex