# Extract mutations in VGSC

This notebook extracts data on all mutations in the VGSC gene.

## Setup

In [1]:
%run setup.ipynb

In [2]:
# download gene annotations from vectorbase
!wget \
    --no-clobber \
    -O ../data/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.gff3.gz \
    https://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp44gff3gz

File `../data/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.gff3.gz' already there; not retrieving.


In [3]:
# download the Davies et al. (2007) gene models
!wget \
    --no-clobber \
    -O ../data/davies_vgsc_model_20170125.gff3 \
    http://alimanfoo.github.io/assets/davies_vgsc_model_20170125.gff3

File `../data/davies_vgsc_model_20170125.gff3' already there; not retrieving.


In [4]:
# load the vectorbase geneset
geneset_agamp44 = allel.FeatureTable.from_gff3('../data/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.gff3.gz',
                                               attributes=['ID', 'Parent'])
geneset_agamp44 = geneset_to_pandas(geneset_agamp44)
geneset_agamp44.head()

  return pandas.DataFrame.from_items(items)


Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent
0,2L,VectorBase,chromosome,1,49364325,-1.0,.,-1,2L,.
1,2L,VectorBase,gene,157348,186936,-1.0,-,-1,AGAP004677,.
2,2L,VectorBase,mRNA,157348,181305,-1.0,-,-1,AGAP004677-RA,AGAP004677
3,2L,VectorBase,three_prime_UTR,157348,157495,-1.0,-,-1,.,AGAP004677-RA
4,2L,VectorBase,exon,157348,157623,-1.0,-,-1,.,AGAP004677-RA


In [5]:
# subset to VGSC
geneset_agamp44_vgsc = geneset_agamp44.query(region_vgsc.query_str).copy()
# replace CDS IDs as not informative
geneset_agamp44_vgsc['ID'].values[(geneset_agamp44_vgsc.type == 'CDS').values] = ''
geneset_agamp44_vgsc.type.value_counts()

CDS     93
exon    93
mRNA     3
gene     1
Name: type, dtype: int64

In [6]:
# load the Davies geneset
geneset_davies = allel.FeatureTable.from_gff3('../data/davies_vgsc_model_20170125.gff3',
                                              attributes=['ID', 'Parent'])
geneset_davies = geneset_to_pandas(geneset_davies)
geneset_davies.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent
0,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C1N2,AGAP004707
1,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C3N2,AGAP004707
2,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C5N2,AGAP004707
3,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C7N2,AGAP004707
4,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C8N2,AGAP004707


In [7]:
# make a combined geneset
geneset_vgsc_combined = pandas.concat([geneset_agamp44_vgsc, geneset_davies])
geneset_vgsc_combined.query("type == 'mRNA'")

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent
666,2L,VectorBase,mRNA,2358158,2431617,-1.0,+,-1,AGAP004707-RA,AGAP004707
729,2L,VectorBase,mRNA,2358158,2431617,-1.0,+,-1,AGAP004707-RB,AGAP004707
792,2L,VectorBase,mRNA,2358158,2431617,-1.0,+,-1,AGAP004707-RC,AGAP004707
0,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C1N2,AGAP004707
1,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C3N2,AGAP004707
2,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C5N2,AGAP004707
3,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C7N2,AGAP004707
4,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C8N2,AGAP004707
5,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C10N2,AGAP004707
6,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C11N2,AGAP004707


In [8]:
# setup a variant annotator
annotator = veff.Annotator(
    fasta_path='../phase2.AR1/genome/agamP3/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa', 
    gff3_path=['../data/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.gff3.gz',
               '../data/davies_vgsc_model_20170125.gff3'],
    seqid='2L'
)

In [9]:
# identify VGSC transcripts
transcript_ids = [f.feature_id for f in annotator.get_children('AGAP004707')]
transcript_ids

['AGAP004707-RA',
 'AGAP004707-RB',
 'AGAP004707-RC',
 'Davies-C1N2',
 'Davies-C3N2',
 'Davies-C5N2',
 'Davies-C7N2',
 'Davies-C8N2',
 'Davies-C10N2',
 'Davies-C11N2',
 'Davies-C1N9',
 'Davies-C8N9',
 'Davies-C1N9ck']

In [10]:
# tabulate Davies exons
tbl_davies_exons = (
    etl
    .fromdataframe(geneset_davies)
    .eq('type', 'CDS')
    .cutout('Parent', 'source', 'type', 'score', 'strand', 'phase')
    .merge(key=('start', 'end'))
    .rename('seqid', 'exon_seqid')
    .rename('ID', 'exon')
    .rename('start', 'exon_start')
    .rename('end', 'exon_end')
    .movefield('exon_seqid', 0)
)
tbl_davies_exons.displayall()

0|exon_seqid,1|exon_start,2|exon_end,3|exon
2L,2358158,2358304,1
2L,2359640,2359672,2j
2L,2361989,2362144,3
2L,2381065,2381270,4
2L,2382270,2382398,5
2L,2385694,2385785,6
2L,2390129,2390341,7
2L,2390425,2390485,8
2L,2390594,2390738,9
2L,2391156,2391320,10


## Extract table of variants

In [11]:
callset = phase2_ar1.callset
callset

<zarr.hierarchy.Group '/' read-only>

In [12]:
# what fields are available?
print(', '.join(callset['2L/variants']))

AC, ALT, AN, CHROM, FILTER_PASS, FS, HRun, HighCoverage, HighMQ0, LowCoverage, LowMQ, NoCoverage, POS, QD, REF, ReadPosRankSum, RepeatDUST, RepeatMasker, RepeatTRF, num_alleles


In [13]:
# #get SNPEFF annotations from HDF5 - also find out why these aren't in zarr format
# snpeff_h5_fn = '../phase2.AR1/variation/main/hdf5/all_snpeff/ag1000g.phase2.ar1.snpeff.AgamP4.2.2L.h5'
# snpf = h5py.File(snpeff_h5_fn, mode='r')

In [13]:
# # what SNPEFF fields are available?
# print(', '.join(snpf['2L/variants/ANN'].dtype.names))

In [14]:
samples = phase2_ar1.df_samples
samples.head()

Unnamed: 0_level_0,src_code,population,country,region,contributor,contact,year,m_s,sex,n_sequences,mean_coverage
ox_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AA0040-C,Twifo_Praso__E2,GHcol,Ghana,Twifo_Praso,David Weetman,,2012.0,M,F,95033368,30.99
AA0041-C,Twifo_Praso__H3,GHcol,Ghana,Twifo_Praso,David Weetman,,2012.0,M,F,95843804,31.7
AA0042-C,Takoradi_C7,GHcol,Ghana,Takoradi,David Weetman,,2012.0,M,F,107420666,35.65
AA0043-C,Takoradi_H8,GHcol,Ghana,Takoradi,David Weetman,,2012.0,M,F,95993752,29.46
AA0044-C,Takoradi_D10,GHcol,Ghana,Takoradi,David Weetman,,2012.0,M,F,103044262,33.67


In [23]:
#samples needs to have a numeric index - this should probably be tested in future versions
samples = samples.reset_index()

In [24]:
samples

Unnamed: 0,ox_code,src_code,population,country,region,contributor,contact,year,m_s,sex,n_sequences,mean_coverage
0,AA0040-C,Twifo_Praso__E2,GHcol,Ghana,Twifo_Praso,David Weetman,,2012.0,M,F,95033368,30.99
1,AA0041-C,Twifo_Praso__H3,GHcol,Ghana,Twifo_Praso,David Weetman,,2012.0,M,F,95843804,31.70
2,AA0042-C,Takoradi_C7,GHcol,Ghana,Takoradi,David Weetman,,2012.0,M,F,107420666,35.65
3,AA0043-C,Takoradi_H8,GHcol,Ghana,Takoradi,David Weetman,,2012.0,M,F,95993752,29.46
4,AA0044-C,Takoradi_D10,GHcol,Ghana,Takoradi,David Weetman,,2012.0,M,F,103044262,33.67
5,AA0048-C,Madina_A1,GHgam,Ghana,Madina,David Weetman,,2012.0,S,F,101183648,32.64
6,AA0049-C,Madina_E3,GHcol,Ghana,Madina,David Weetman,,2012.0,M,F,106641272,32.84
7,AA0050-C,Madina_E5,GHgam,Ghana,Madina,David Weetman,,2012.0,S,F,93961924,29.82
8,AA0051-C,Twifo_Praso__C1,GHcol,Ghana,Twifo_Praso,David Weetman,,2012.0,M,F,99910300,33.33
9,AA0052-C,Twifo_Praso__F2,GHcol,Ghana,Twifo_Praso,David Weetman,,2012.0,M,F,100899280,32.29


In [37]:
def tabulate_variants(callset, seqid, start, end, pop_ids, subpops):
    """Build a table of variants for a given callset and genome region."""
    
    variants = callset[seqid]['variants']
    #ann = snpeff[seqid]['variants']['ANN']
    pos = allel.SortedIndex(variants['POS'])
    loc = pos.locate_range(start, end)
    genotype = allel.GenotypeArray(callset[seqid]['calldata/genotype'][loc])
    acs = genotype.count_alleles_subpops(max_allele=3, subpops=subpops)
    
    # extract columns
    variants_fields = [
        'CHROM',
        'POS',
        'num_alleles',
        'REF',
        'ALT',
        'AC',
        'FILTER_PASS',
        'NoCoverage',
        'LowCoverage',
        'HighCoverage',
        'LowMQ',
        'HighMQ0',
        'RepeatDUST',
        'RepeatMasker',
        'RepeatTRF',
        'FS',
        'HRun',
        'QD',
        'ReadPosRankSum',
    ]
#     ann_fields = ['Allele', 'Annotation', 'HGVS_c', 'HGVS_p', 'Feature_ID', 'CDS_pos']
    cols = (
        [variants[f][loc] for f in variants_fields] + 
#         [ann[loc][f] for f in ann_fields] + 
        [acs[p].to_frequencies() for p in pop_ids]
    )

    def split_alleles(row):
        for i in range(row.num_alleles - 1):
            # break down alleles
            out = [
                row['CHROM'], 
                row['POS'], 
                row['num_alleles'], 
                row['REF'], 
                row['ALT'][i], 
                row['AC'][i], 
                i, 
            ]
#             # add in remaining variant annotations
#             out += [row[f] for f in variants_fields[6:]]
#             # SNPEFF annotation only applies to first allele
#             if i == 0:
#                 out += [row[f] for f in ann_fields]
#             else:
#                 out += [None for f in ann_fields]
            # add in population allele frequencies
            out += [row[p][i+1] for p in pop_ids]
            yield out
        
    tbl = (
        etl
        .fromcolumns(cols, header=variants_fields + list(pop_ids))
        .rowmapmany(split_alleles, header=variants_fields[:6] + ['ALTIX'] + variants_fields[6:] + list(pop_ids), failonerror=True)
        .convert('CHROM REF ALT'.split(), lambda v: str(v, 'ascii'))
        .rename({p: 'AF_%s' % p for p in pop_ids})
    )
    
    return tbl

In [38]:
pop_ids = phase2_ar1.pop_ids
print(', '.join(pop_ids))

AOcol, GHcol, BFcol, CIcol, GNcol, GW, GM, CMgam, GHgam, BFgam, GNgam, GAgam, UGgam, GQgam, FRgam, KE


In [39]:
subpops = {p: samples[samples.population == p].index.values.tolist() for p in pop_ids}

In [40]:
# build a table of variants from phase 1
tbl_variants_phase2 = tabulate_variants(callset, 
                                        seqid=region_vgsc.seqid, start=region_vgsc.start, end=region_vgsc.end, 
                                        pop_ids=pop_ids, subpops=subpops)
tbl_variants_phase2

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|AF_AOcol,21|AF_GHcol,22|AF_BFcol,23|AF_CIcol,24|AF_GNcol,25|AF_GW,26|AF_GM,27|AF_CMgam,28|AF_GHgam,29|AF_BFgam,30|AF_GNgam,31|AF_GAgam,32|AF_UGgam,33|AF_GQgam,34|AF_FRgam,35|AF_KE
2L,2358254,2,G,A,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,
2L,2358309,2,A,G,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,
2L,2358316,2,T,G,81,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1363636363636363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,
2L,2358328,2,T,C,8,0,0.0,0.0,0.0066666666666666,0.0,0.0,0.0164835164835164,0.0307692307692307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,
2L,2358353,2,C,T,1,0,0.0,0.0,0.0,0.0,0.0,0.0054945054945054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,


## Annotate effects for all transcripts

In [26]:
cds_effects = [
    'NON_SYNONYMOUS_CODING', 
    'SYNONYMOUS_CODING',    
]
intron_effects = [
    'INTRONIC', 
    'SPLICE_CORE',
    'SPLICE_REGION',        
]
selected_effects = cds_effects + intron_effects

In [28]:
def lpop(l, default=None):
    """Pop the first item from a list if not empty."""
    try:
        return l[0]
    except IndexError:
        return default

In [29]:
def transcript_effect(transcript_id):
    def f(row):
        e = lpop([e for e in row.VEFF if e.transcript_id == transcript_id])
        if e and e.effect in cds_effects:
            return (e.effect, e.aa_change)
        elif e and e.effect in intron_effects:
            return (e.effect, e.intron_cds_5prime, e.intron_5prime_dist, e.intron_cds_3prime, e.intron_3prime_dist)
        else:
            return None
    return f

In [30]:
#the latest version of intervaltree breaks petl - here I roll back to the previous version
# conda uninstall intervaltree
# conda install intervaltree=2.1.0

In [32]:
#Check versions
conda list

# packages in environment at /home/chris/malariagen/binder/conda/envs/ag1000g-phase2-vgsc-report-13624de:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                        main  
adal                      1.2.2                      py_0    conda-forge
aioeasywebdav             2.4.0                 py36_1000    conda-forge
aiohttp                   3.5.4            py36h14c3975_0    conda-forge
alabaster                 0.7.12                   pypi_0    pypi
alembic                   1.0.11                     py_0    conda-forge
anhima                    0.11.2                   pypi_0    pypi
appdirs                   1.4.3                      py_1    conda-forge
argh                      0.26.2                   pypi_0    pypi
asciitree                 0.3.3                      py_2    conda-forge
asn1crypto                0.24.0                py36_1003    conda-forge
asteval                   0.9.14          


Note: you may need to restart the kernel to use updated packages.


In [82]:
import intervaltree

In [34]:
tbl_variants_phase2_eff = (
    tbl_variants_phase2
    # join in Davies exon information
    .intervalleftjoin(
        # don't include shorter exon alternatives
        tbl_davies_exons.select('exon', lambda v: v[-1] != '-'),
        lkey='CHROM', rkey='exon_seqid', lstart='POS', rstart='exon_start', lstop='POS', rstop='exon_end', include_stop=True)
    .cutout('exon_seqid')
    .addfield('VEFF', lambda row: [e for e in annotator.get_effects(chrom=row.CHROM, pos=row.POS, ref=row.REF, alt=row.ALT) if e.effect in selected_effects])
    .addfield(transcript_ids[0], transcript_effect(transcript_ids[0]))
    .addfield(transcript_ids[1], transcript_effect(transcript_ids[1]))
    .addfield(transcript_ids[2], transcript_effect(transcript_ids[2]))
    .addfield(transcript_ids[3], transcript_effect(transcript_ids[3]))
    .addfield(transcript_ids[4], transcript_effect(transcript_ids[4]))
    .addfield(transcript_ids[5], transcript_effect(transcript_ids[5]))
    .addfield(transcript_ids[6], transcript_effect(transcript_ids[6]))
    .addfield(transcript_ids[7], transcript_effect(transcript_ids[7]))
    .addfield(transcript_ids[8], transcript_effect(transcript_ids[8]))
    .addfield(transcript_ids[9], transcript_effect(transcript_ids[9]))
    .addfield(transcript_ids[10], transcript_effect(transcript_ids[10]))
    .addfield(transcript_ids[11], transcript_effect(transcript_ids[11]))
    .addfield(transcript_ids[12], transcript_effect(transcript_ids[12]))
    .cutout('VEFF')
    .replaceall('.', None)
    .replaceall('', None)
    .cache()
)

In [35]:
tbl_variants_phase2_eff

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|SNPEFF_CDS_pos,26|AF_AOcol,27|AF_GHcol,28|AF_BFcol,29|AF_CIcol,30|AF_GNcol,31|AF_GW,32|AF_GM,33|AF_CMgam,34|AF_GHgam,35|AF_BFgam,36|AF_GNgam,37|AF_GAgam,38|AF_UGgam,39|AF_GQgam,40|AF_FRgam,41|AF_KE,42|check_allele,43|exon_start,44|exon_end,45|exon,46|AGAP004707-RA,47|AGAP004707-RB,48|AGAP004707-RC,49|Davies-C1N2,50|Davies-C3N2,51|Davies-C5N2,52|Davies-C7N2,53|Davies-C8N2,54|Davies-C10N2,55|Davies-C11N2,56|Davies-C1N9,57|Davies-C8N9,58|Davies-C1N9ck
2L,2358254,2,G,A,1,0,True,0,0,15,0,0,False,False,False,11.836,1,17.3,-0.022,A,missense_variant,n.97G>A,p.Asp33Asn,AGAP004707-RA,97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2358158.0,2358304.0,1.0,"('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')"
2L,2358309,2,A,G,1,0,True,0,0,20,0,0,False,False,False,2.266,0,16.39,-2.092,G,splice_region_variant&intron_varia,n.147+5A>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('SPLICE_REGION', 'AGAP004707-PA', 5, 'AGAP004707-PA', -3698)","('SPLICE_REGION', 'AGAP004707-PB', 5, 'AGAP004707-PB', -3698)","('SPLICE_REGION', 'AGAP004707-PC', 5, 'AGAP004707-PC', -3698)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '2j', -1331)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '2j', -1331)","('SPLICE_REGION', '1', 5, '3', -3680)"
2L,2358316,2,T,G,81,0,True,0,0,20,0,0,False,False,False,2.404,0,16.11,1.204,G,intron_variant,n.147+12T>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1363636363636363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 12, 'AGAP004707-PA', -3691)","('INTRONIC', 'AGAP004707-PB', 12, 'AGAP004707-PB', -3691)","('INTRONIC', 'AGAP004707-PC', 12, 'AGAP004707-PC', -3691)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '2j', -1324)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '2j', -1324)","('INTRONIC', '1', 12, '3', -3673)"
2L,2358328,2,T,C,8,0,True,0,0,18,0,0,False,False,False,3.373,0,14.76,-0.945,C,intron_variant,n.147+24T>C,,AGAP004707-RA,-1,0.0,0.0,0.0066666666666666,0.0,0.0,0.0164835164835164,0.0307692307692307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 24, 'AGAP004707-PA', -3679)","('INTRONIC', 'AGAP004707-PB', 24, 'AGAP004707-PB', -3679)","('INTRONIC', 'AGAP004707-PC', 24, 'AGAP004707-PC', -3679)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '2j', -1312)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '2j', -1312)","('INTRONIC', '1', 24, '3', -3661)"
2L,2358353,2,C,T,1,0,True,0,2,19,0,0,False,False,False,7.008,0,9.79,1.307,T,intron_variant,n.147+49C>T,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0054945054945054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 49, 'AGAP004707-PA', -3654)","('INTRONIC', 'AGAP004707-PB', 49, 'AGAP004707-PB', -3654)","('INTRONIC', 'AGAP004707-PC', 49, 'AGAP004707-PC', -3654)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '2j', -1287)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '2j', -1287)","('INTRONIC', '1', 49, '3', -3636)"


## Inspect missense variants

In [38]:
def simplify_missense_effect(v):
    if v and v[0] == 'NON_SYNONYMOUS_CODING':
        return v[1]
    else:
        return ''

    
td_styles = {
    'FILTER_PASS': lambda v: 'background-color: red' if not v else '',
    'NoCoverage': lambda v: 'background-color: red' if v > 1 else '',
    'LowCoverage': lambda v: 'background-color: red' if v > 76 else '',
    'HighCoverage': lambda v: 'background-color: red' if v > 15 else '',
    'LowMQ': lambda v: 'background-color: red' if v > 76 else '',
    'HighMQ0': lambda v: 'background-color: red' if v > 1 else '',
    'RepeatDUST': lambda v: 'background-color: red' if v else '',
    'FS': lambda v: 'background-color: red' if v > 60 else '',
    'QD': lambda v: 'background-color: red' if v < 5 else '',
    'ReadPosRankSum': lambda v: 'background-color: red' if v < -8 else '',
    'HRun': lambda v: 'background-color: red' if v > 4 else '',
    'num_alleles': lambda v: 'background-color: orange' if v > 2 else '',
}


def tr_style(row):
    """Colour row by alternate allele count."""
    return 'background-color:rgba(0, 255, 0, %.3f)' % (min(1, row['AC']/100))


tbl_variants_phase2_missense = (
    tbl_variants_phase2_eff
    .select(lambda row: any(row[t] and row[t][0] == 'NON_SYNONYMOUS_CODING' for t in transcript_ids))
    .convert(transcript_ids, simplify_missense_effect)
)
tbl_variants_phase2_missense.displayall(td_styles=td_styles, tr_style=tr_style)

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|SNPEFF_CDS_pos,26|AF_AOcol,27|AF_GHcol,28|AF_BFcol,29|AF_CIcol,30|AF_GNcol,31|AF_GW,32|AF_GM,33|AF_CMgam,34|AF_GHgam,35|AF_BFgam,36|AF_GNgam,37|AF_GAgam,38|AF_UGgam,39|AF_GQgam,40|AF_FRgam,41|AF_KE,42|check_allele,43|exon_start,44|exon_end,45|exon,46|AGAP004707-RA,47|AGAP004707-RB,48|AGAP004707-RC,49|Davies-C1N2,50|Davies-C3N2,51|Davies-C5N2,52|Davies-C7N2,53|Davies-C8N2,54|Davies-C10N2,55|Davies-C11N2,56|Davies-C1N9,57|Davies-C8N9,58|Davies-C1N9ck
2L,2358254,2,G,A,1,0,True,0,0,15,0,0,False,False,False,11.836,1,17.3,-0.022,A,missense_variant,n.97G>A,p.Asp33Asn,AGAP004707-RA,97.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2358158,2358304,1,D33N,D33N,D33N,D33N,D33N,D33N,D33N,D33N,D33N,D33N,D33N,D33N,D33N
2L,2359670,2,G,A,7,0,False,1,271,1,1,0,False,False,False,5.78,6,14.13,-0.201,A,intron_variant,n.147+1366G>,,AGAP004707-RA,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0101010101010101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0104166666666666,True,2359640,2359672,2j,,,,,,,,E60K,,,,E60K,
2L,2362002,2,A,T,3,0,True,0,1,3,0,0,False,False,False,6.375,0,13.59,-0.221,T,splice_region_variant&intron_varia,n.148-5A>T,,AGAP004707-RA,-1.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2361989,2362144,3,,,,D54V,D54V,D54V,D54V,D65V,D54V,D54V,D54V,D65V,D54V
2L,2362019,2,G,T,3,0,True,0,0,6,0,0,False,False,False,7.254,0,14.89,-0.303,T,missense_variant,n.160G>T,p.Gly54Cys,AGAP004707-RA,160.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2361989,2362144,3,G54C,G54C,G54C,G60C,G60C,G60C,G60C,G71C,G60C,G60C,G60C,G71C,G60C
2L,2362023,2,C,T,1,0,True,0,1,4,0,0,False,False,False,0.0,0,13.4,-2.068,T,missense_variant,n.164C>T,p.Pro55Leu,AGAP004707-RA,164.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0054347826086956,0.0,0.0,0.0,0.0,0.0,0.0,True,2361989,2362144,3,P55L,P55L,P55L,P61L,P61L,P61L,P61L,P72L,P61L,P61L,P61L,P72L,P61L
2L,2390168,2,A,G,2,0,True,0,2,17,0,0,False,False,False,0.0,1,15.01,-0.057,G,missense_variant,n.752A>G,p.Lys251Arg,AGAP004707-RA,752.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0144927536231884,0.0,0.0,0.0,0.0,True,2390129,2390341,7,K251R,K251R,K251R,K257R,K214R,K257R,K257R,K268R,K257R,K257R,K257R,K268R,K257R
2L,2390177,2,G,A,215,0,True,0,4,13,0,0,False,False,False,0.479,1,19.5,1.877,A,missense_variant,n.761G>A,p.Arg254Lys,AGAP004707-RA,761.0,0.0,0.009090909090909,0.0,0.0,0.0,0.0,0.0,0.3131313131313131,0.0,0.0,0.0,0.2028985507246377,0.0,0.0,0.0,0.0,True,2390129,2390341,7,R254K,R254K,R254K,R260K,R217K,R260K,R260K,R271K,R260K,R260K,R260K,R271K,R260K
2L,2390305,2,A,T,1,0,True,0,1,18,0,0,False,False,False,15.07,0,10.16,1.525,T,missense_variant,n.889A>T,p.Thr297Ser,AGAP004707-RA,889.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0044642857142857,0.0,0.0,0.0,True,2390129,2390341,7,T297S,T297S,T297S,T303S,T260S,T303S,T303S,T314S,T303S,T303S,T303S,T314S,T303S
2L,2390311,2,G,A,1,0,True,0,1,15,0,0,False,False,False,0.922,3,12.84,-0.744,A,missense_variant,n.895G>A,p.Glu299Lys,AGAP004707-RA,895.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2390129,2390341,7,E299K,E299K,E299K,E305K,E262K,E305K,E305K,E316K,E305K,E305K,E305K,E316K,E305K
2L,2390448,2,G,A,6,0,True,0,0,18,0,0,False,False,False,1.945,0,16.11,-0.958,A,missense_variant,n.949G>A,p.Gly317Ser,AGAP004707-RA,949.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0101010101010101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2390425,2390485,8,G317S,G317S,G317S,G323S,G280S,G323S,G323S,G334S,G323S,G323S,G323S,G334S,G323S


## Inspect splice site variants

In [39]:
def simplify_intron_effect(v):
    if v and v[0] in ['SPLICE_REGION', 'SPLICE_CORE']:
        if math.fabs(v[2]) < math.fabs(v[4]):
            return v[1], v[2]
        else:
            return v[3], v[4]
    else:
        return ''

    
td_styles = {
    'FILTER_PASS': lambda v: 'background-color: red' if not v else '',
    'NoCoverage': lambda v: 'background-color: red' if v > 1 else '',
    'LowCoverage': lambda v: 'background-color: red' if v > 76 else '',
    'HighCoverage': lambda v: 'background-color: red' if v > 15 else '',
    'LowMQ': lambda v: 'background-color: red' if v > 76 else '',
    'HighMQ0': lambda v: 'background-color: red' if v > 1 else '',
    'RepeatDUST': lambda v: 'background-color: red' if v else '',
    'FS': lambda v: 'background-color: red' if v > 60 else '',
    'QD': lambda v: 'background-color: red' if v < 5 else '',
    'ReadPosRankSum': lambda v: 'background-color: red' if v < -8 else '',
    'HRun': lambda v: 'background-color: red' if v > 4 else '',
    'num_alleles': lambda v: 'background-color: orange' if v > 2 else '',
}


def tr_style(row):
    """Colour row by alternate allele count."""
    return 'background-color:rgba(0, 255, 0, %.3f)' % (min(1, row['AC']/100))


tbl_variants_phase2_splice = (
    tbl_variants_phase2_eff
    .select(lambda row: any(row[t] and row[t][0] in ['SPLICE_REGION', 'SPLICE_CORE'] for t in transcript_ids))
    .convert(transcript_ids, simplify_intron_effect)
)
tbl_variants_phase2_splice.displayall(td_styles=td_styles, tr_style=tr_style)

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|SNPEFF_CDS_pos,26|AF_AOcol,27|AF_GHcol,28|AF_BFcol,29|AF_CIcol,30|AF_GNcol,31|AF_GW,32|AF_GM,33|AF_CMgam,34|AF_GHgam,35|AF_BFgam,36|AF_GNgam,37|AF_GAgam,38|AF_UGgam,39|AF_GQgam,40|AF_FRgam,41|AF_KE,42|check_allele,43|exon_start,44|exon_end,45|exon,46|AGAP004707-RA,47|AGAP004707-RB,48|AGAP004707-RC,49|Davies-C1N2,50|Davies-C3N2,51|Davies-C5N2,52|Davies-C7N2,53|Davies-C8N2,54|Davies-C10N2,55|Davies-C11N2,56|Davies-C1N9,57|Davies-C8N9,58|Davies-C1N9ck
2L,2358309,2,A,G,1,0,True,0,0,20,0,0,False,False,False,2.266,0,16.39,-2.092,G,splice_region_variant&intron_varia,n.147+5A>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('AGAP004707-PA', 5)","('AGAP004707-PB', 5)","('AGAP004707-PC', 5)","('1', 5)","('1', 5)","('1', 5)","('1', 5)","('1', 5)","('1', 5)","('1', 5)","('1', 5)","('1', 5)","('1', 5)"
2L,2362002,2,A,T,3,0,True,0,1,3,0,0,False,False,False,6.375,0,13.59,-0.221,T,splice_region_variant&intron_varia,n.148-5A>T,,AGAP004707-RA,-1,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2361989.0,2362144.0,3.0,"('AGAP004707-PA', -5)","('AGAP004707-PB', -5)","('AGAP004707-PC', -5)",,,,,,,,,,
2L,2362003,2,C,T,3,0,True,0,1,4,0,0,False,False,False,1.722,0,14.51,0.279,T,splice_region_variant&intron_varia,n.148-4C>T,,AGAP004707-RA,-1,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2361989.0,2362144.0,3.0,"('AGAP004707-PA', -4)","('AGAP004707-PB', -4)","('AGAP004707-PC', -4)",,,,,,,,,,
2L,2382263,2,A,G,180,0,True,0,50,2,0,0,False,False,False,11.8,0,25.22,-3.188,G,splice_region_variant&intron_varia,n.492-7A>G,,AGAP004707-RA,-1,0.0064102564102564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4732142857142857,0.0,0.0,0.7604166666666666,True,,,,"('AGAP004707-PA', -7)","('AGAP004707-PB', -7)","('AGAP004707-PC', -7)","('5', -7)",,"('5', -7)","('5', -7)","('5', -7)","('5', -7)","('5', -7)","('5', -7)","('5', -7)","('5', -7)"
2L,2390125,2,A,C,1,0,True,0,5,15,0,0,False,False,False,2.447,1,15.18,0.548,C,splice_region_variant&intron_varia,n.713-4A>C,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0070422535211267,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('AGAP004707-PA', -4)","('AGAP004707-PB', -4)","('AGAP004707-PC', -4)","('7', -4)","('7', -4)","('7', -4)","('7', -4)","('7', -4)","('7', -4)","('7', -4)","('7', -4)","('7', -4)","('7', -4)"
2L,2390126,2,C,T,2,0,True,0,5,15,0,0,False,False,False,5.613,0,14.27,-1.009,T,splice_region_variant&intron_varia,n.713-3C>T,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0033670033670033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('AGAP004707-PA', -3)","('AGAP004707-PB', -3)","('AGAP004707-PC', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)"
2L,2400176,2,A,G,1,0,True,0,1,13,0,0,False,False,False,0.0,0,22.17,0.751,G,splice_region_variant&intron_varia,n.1572+3A>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0054945054945054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('AGAP004707-PA', 3)","('AGAP004707-PB', 3)","('AGAP004707-PC', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)"
2L,2407888,2,T,C,9,0,True,0,1,14,0,0,False,False,False,2.314,0,16.67,0.381,C,splice_region_variant&intron_varia,n.2017-6T>C,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0494505494505494,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('AGAP004707-PA', -6)","('AGAP004707-PB', -6)","('AGAP004707-PC', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)"
2L,2408000,2,A,G,1,0,True,0,8,9,0,0,False,False,False,3.068,0,11.13,0.794,G,splice_region_variant&intron_varia,n.2116+7A>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0555555555555555,0.0,0.0,True,,,,"('AGAP004707-PA', 7)","('AGAP004707-PB', 7)","('AGAP004707-PC', 7)","('16', 7)","('16', 7)","('16', 7)","('16', 7)","('16', 7)","('16', 7)","('16', 7)","('16', 7)","('16', 7)","('16', 7)"
2L,2417362,2,A,G,838,0,False,5,1067,0,0,0,False,False,False,43.97,1,29.84,1.503,G,splice_region_variant&intron_varia,n.2637+4A>G,,AGAP004707-RA,-1,0.8181818181818182,0.8272727272727273,0.8466666666666667,0.9154929577464788,0.875,0.0555555555555555,0.023076923076923,0.0929054054054054,1.0,1.0,1.0,0.0,0.0,0.1666666666666666,0.0,0.0,True,,,,"('AGAP004707-PA', 4)","('AGAP004707-PB', 4)","('AGAP004707-PC', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)"


## Write out variants to file

In [40]:
(tbl_variants_phase2_eff
 .teepickle('../data/tbl_variants_phase2.pkl')
 .convert(transcript_ids, lambda v: ':'.join(map(str, v)))
 .replaceall(None, 'NA')
 .tocsv('../data/tbl_variants_phase2.csv')
)

In [41]:
# check OK
etl.frompickle('../data/tbl_variants_phase2.pkl')

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|SNPEFF_CDS_pos,26|AF_AOcol,27|AF_GHcol,28|AF_BFcol,29|AF_CIcol,30|AF_GNcol,31|AF_GW,32|AF_GM,33|AF_CMgam,34|AF_GHgam,35|AF_BFgam,36|AF_GNgam,37|AF_GAgam,38|AF_UGgam,39|AF_GQgam,40|AF_FRgam,41|AF_KE,42|check_allele,43|exon_start,44|exon_end,45|exon,46|AGAP004707-RA,47|AGAP004707-RB,48|AGAP004707-RC,49|Davies-C1N2,50|Davies-C3N2,51|Davies-C5N2,52|Davies-C7N2,53|Davies-C8N2,54|Davies-C10N2,55|Davies-C11N2,56|Davies-C1N9,57|Davies-C8N9,58|Davies-C1N9ck
2L,2358254,2,G,A,1,0,True,0,0,15,0,0,False,False,False,11.836,1,17.3,-0.022,A,missense_variant,n.97G>A,p.Asp33Asn,AGAP004707-RA,97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2358158.0,2358304.0,1.0,"('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')"
2L,2358309,2,A,G,1,0,True,0,0,20,0,0,False,False,False,2.266,0,16.39,-2.092,G,splice_region_variant&intron_varia,n.147+5A>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('SPLICE_REGION', 'AGAP004707-PA', 5, 'AGAP004707-PA', -3698)","('SPLICE_REGION', 'AGAP004707-PB', 5, 'AGAP004707-PB', -3698)","('SPLICE_REGION', 'AGAP004707-PC', 5, 'AGAP004707-PC', -3698)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '2j', -1331)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '3', -3680)","('SPLICE_REGION', '1', 5, '2j', -1331)","('SPLICE_REGION', '1', 5, '3', -3680)"
2L,2358316,2,T,G,81,0,True,0,0,20,0,0,False,False,False,2.404,0,16.11,1.204,G,intron_variant,n.147+12T>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1363636363636363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 12, 'AGAP004707-PA', -3691)","('INTRONIC', 'AGAP004707-PB', 12, 'AGAP004707-PB', -3691)","('INTRONIC', 'AGAP004707-PC', 12, 'AGAP004707-PC', -3691)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '2j', -1324)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '2j', -1324)","('INTRONIC', '1', 12, '3', -3673)"
2L,2358328,2,T,C,8,0,True,0,0,18,0,0,False,False,False,3.373,0,14.76,-0.945,C,intron_variant,n.147+24T>C,,AGAP004707-RA,-1,0.0,0.0,0.0066666666666666,0.0,0.0,0.0164835164835164,0.0307692307692307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 24, 'AGAP004707-PA', -3679)","('INTRONIC', 'AGAP004707-PB', 24, 'AGAP004707-PB', -3679)","('INTRONIC', 'AGAP004707-PC', 24, 'AGAP004707-PC', -3679)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '2j', -1312)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '2j', -1312)","('INTRONIC', '1', 24, '3', -3661)"
2L,2358353,2,C,T,1,0,True,0,2,19,0,0,False,False,False,7.008,0,9.79,1.307,T,intron_variant,n.147+49C>T,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0054945054945054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 49, 'AGAP004707-PA', -3654)","('INTRONIC', 'AGAP004707-PB', 49, 'AGAP004707-PB', -3654)","('INTRONIC', 'AGAP004707-PC', 49, 'AGAP004707-PC', -3654)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '2j', -1287)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '2j', -1287)","('INTRONIC', '1', 49, '3', -3636)"


In [42]:
etl.fromcsv('../data/tbl_variants_phase2.csv')

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|SNPEFF_CDS_pos,26|AF_AOcol,27|AF_GHcol,28|AF_BFcol,29|AF_CIcol,30|AF_GNcol,31|AF_GW,32|AF_GM,33|AF_CMgam,34|AF_GHgam,35|AF_BFgam,36|AF_GNgam,37|AF_GAgam,38|AF_UGgam,39|AF_GQgam,40|AF_FRgam,41|AF_KE,42|check_allele,43|exon_start,44|exon_end,45|exon,46|AGAP004707-RA,47|AGAP004707-RB,48|AGAP004707-RC,49|Davies-C1N2,50|Davies-C3N2,51|Davies-C5N2,52|Davies-C7N2,53|Davies-C8N2,54|Davies-C10N2,55|Davies-C11N2,56|Davies-C1N9,57|Davies-C8N9,58|Davies-C1N9ck
2L,2358254,2,G,A,1,0,True,0,0,15,0,0,False,False,False,11.836,1,17.3,-0.022,A,missense_variant,n.97G>A,p.Asp33Asn,AGAP004707-RA,97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2358158.0,2358304.0,1.0,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N
2L,2358309,2,A,G,1,0,True,0,0,20,0,0,False,False,False,2.266,0,16.39,-2.092,G,splice_region_variant&intron_varia,n.147+5A>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,SPLICE_REGION:AGAP004707-PA:5:AGAP004707-PA:-3698,SPLICE_REGION:AGAP004707-PB:5:AGAP004707-PB:-3698,SPLICE_REGION:AGAP004707-PC:5:AGAP004707-PC:-3698,SPLICE_REGION:1:5:3:-3680,SPLICE_REGION:1:5:3:-3680,SPLICE_REGION:1:5:3:-3680,SPLICE_REGION:1:5:3:-3680,SPLICE_REGION:1:5:2j:-1331,SPLICE_REGION:1:5:3:-3680,SPLICE_REGION:1:5:3:-3680,SPLICE_REGION:1:5:3:-3680,SPLICE_REGION:1:5:2j:-1331,SPLICE_REGION:1:5:3:-3680
2L,2358316,2,T,G,81,0,True,0,0,20,0,0,False,False,False,2.404,0,16.11,1.204,G,intron_variant,n.147+12T>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1363636363636363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,INTRONIC:AGAP004707-PA:12:AGAP004707-PA:-3691,INTRONIC:AGAP004707-PB:12:AGAP004707-PB:-3691,INTRONIC:AGAP004707-PC:12:AGAP004707-PC:-3691,INTRONIC:1:12:3:-3673,INTRONIC:1:12:3:-3673,INTRONIC:1:12:3:-3673,INTRONIC:1:12:3:-3673,INTRONIC:1:12:2j:-1324,INTRONIC:1:12:3:-3673,INTRONIC:1:12:3:-3673,INTRONIC:1:12:3:-3673,INTRONIC:1:12:2j:-1324,INTRONIC:1:12:3:-3673
2L,2358328,2,T,C,8,0,True,0,0,18,0,0,False,False,False,3.373,0,14.76,-0.945,C,intron_variant,n.147+24T>C,,AGAP004707-RA,-1,0.0,0.0,0.0066666666666666,0.0,0.0,0.0164835164835164,0.0307692307692307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,INTRONIC:AGAP004707-PA:24:AGAP004707-PA:-3679,INTRONIC:AGAP004707-PB:24:AGAP004707-PB:-3679,INTRONIC:AGAP004707-PC:24:AGAP004707-PC:-3679,INTRONIC:1:24:3:-3661,INTRONIC:1:24:3:-3661,INTRONIC:1:24:3:-3661,INTRONIC:1:24:3:-3661,INTRONIC:1:24:2j:-1312,INTRONIC:1:24:3:-3661,INTRONIC:1:24:3:-3661,INTRONIC:1:24:3:-3661,INTRONIC:1:24:2j:-1312,INTRONIC:1:24:3:-3661
2L,2358353,2,C,T,1,0,True,0,2,19,0,0,False,False,False,7.008,0,9.79,1.307,T,intron_variant,n.147+49C>T,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0054945054945054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,INTRONIC:AGAP004707-PA:49:AGAP004707-PA:-3654,INTRONIC:AGAP004707-PB:49:AGAP004707-PB:-3654,INTRONIC:AGAP004707-PC:49:AGAP004707-PC:-3654,INTRONIC:1:49:3:-3636,INTRONIC:1:49:3:-3636,INTRONIC:1:49:3:-3636,INTRONIC:1:49:3:-3636,INTRONIC:1:49:2j:-1287,INTRONIC:1:49:3:-3636,INTRONIC:1:49:3:-3636,INTRONIC:1:49:3:-3636,INTRONIC:1:49:2j:-1287,INTRONIC:1:49:3:-3636
