# Extract mutations in VGSC

This notebook extracts data on all mutations in the VGSC gene.

## Setup

In [66]:
%run setup.ipynb

In [2]:
# download gene annotations from vectorbase
# --no-clobber keeps the cell safe to re-run: when the -O target already
# exists nothing is fetched (see the "already there" message in the output)
!wget \
    --no-clobber \
    -O ../data/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.gff3.gz \
    https://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp44gff3gz

File `../data/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.gff3.gz' already there; not retrieving.


In [3]:
# download the Davies et al. (2007) gene models
# --no-clobber keeps the cell safe to re-run: when the -O target already
# exists nothing is fetched (see the "already there" message in the output)
!wget \
    --no-clobber \
    -O ../data/davies_vgsc_model_20170125.gff3 \
    http://alimanfoo.github.io/assets/davies_vgsc_model_20170125.gff3

File `../data/davies_vgsc_model_20170125.gff3' already there; not retrieving.


In [4]:
# load the vectorbase geneset: parse the GFF3 (keeping the ID and Parent
# attributes) and convert the feature table straight to a pandas DataFrame
geneset_agamp44 = geneset_to_pandas(
    allel.FeatureTable.from_gff3(
        '../data/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.gff3.gz',
        attributes=['ID', 'Parent'],
    )
)
geneset_agamp44.head()

  return pandas.DataFrame.from_items(items)


Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent
0,2L,VectorBase,chromosome,1,49364325,-1.0,.,-1,2L,.
1,2L,VectorBase,gene,157348,186936,-1.0,-,-1,AGAP004677,.
2,2L,VectorBase,mRNA,157348,181305,-1.0,-,-1,AGAP004677-RA,AGAP004677
3,2L,VectorBase,three_prime_UTR,157348,157495,-1.0,-,-1,.,AGAP004677-RA
4,2L,VectorBase,exon,157348,157623,-1.0,-,-1,.,AGAP004677-RA


In [5]:
# subset to VGSC (copy, so the edit below does not touch the full geneset)
geneset_agamp44_vgsc = geneset_agamp44.query(region_vgsc.query_str).copy()
# replace CDS IDs as not informative
# assign through .loc so the frame itself is updated; writing into
# `column.values` only works while pandas hands back a view of the data
geneset_agamp44_vgsc.loc[geneset_agamp44_vgsc.type == 'CDS', 'ID'] = ''
geneset_agamp44_vgsc.type.value_counts()

exon    93
CDS     93
mRNA     3
gene     1
Name: type, dtype: int64

In [6]:
# load the Davies geneset: parse the GFF3 (keeping the ID and Parent
# attributes) and convert the feature table straight to a pandas DataFrame
geneset_davies = geneset_to_pandas(
    allel.FeatureTable.from_gff3(
        '../data/davies_vgsc_model_20170125.gff3',
        attributes=['ID', 'Parent'],
    )
)
geneset_davies.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent
0,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C1N2,AGAP004707
1,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C3N2,AGAP004707
2,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C5N2,AGAP004707
3,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C7N2,AGAP004707
4,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C8N2,AGAP004707


In [7]:
# make a combined geneset
# (concat keeps each frame's own row labels, so labels from the two sources
# can repeat in the combined frame -- see the index column in the output)
geneset_vgsc_combined = pandas.concat([geneset_agamp44_vgsc, geneset_davies])
# show the transcripts (mRNA features) from both sources
geneset_vgsc_combined.query("type == 'mRNA'")

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent
666,2L,VectorBase,mRNA,2358158,2431617,-1.0,+,-1,AGAP004707-RA,AGAP004707
729,2L,VectorBase,mRNA,2358158,2431617,-1.0,+,-1,AGAP004707-RB,AGAP004707
792,2L,VectorBase,mRNA,2358158,2431617,-1.0,+,-1,AGAP004707-RC,AGAP004707
0,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C1N2,AGAP004707
1,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C3N2,AGAP004707
2,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C5N2,AGAP004707
3,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C7N2,AGAP004707
4,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C8N2,AGAP004707
5,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C10N2,AGAP004707
6,2L,Davies et al. (2007),mRNA,2358158,2431617,-1.0,+,-1,Davies-C11N2,AGAP004707


In [8]:
# setup a variant annotator
# both GFF3 files are passed in, so transcripts from VectorBase and from the
# Davies et al. models are both available to the annotator; only 2L is loaded
annotator = veff.Annotator(
    fasta_path='../phase2.AR1/genome/agamP3/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa', 
    gff3_path=['../data/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.gff3.gz',
               '../data/davies_vgsc_model_20170125.gff3'],
    seqid='2L'
)

In [9]:
# identify VGSC transcripts: the IDs of all features whose parent is the
# VGSC gene (AGAP004707)
transcript_ids = [
    child.feature_id
    for child in annotator.get_children('AGAP004707')
]
transcript_ids

['AGAP004707-RA',
 'AGAP004707-RB',
 'AGAP004707-RC',
 'Davies-C1N2',
 'Davies-C3N2',
 'Davies-C5N2',
 'Davies-C7N2',
 'Davies-C8N2',
 'Davies-C10N2',
 'Davies-C11N2',
 'Davies-C1N9',
 'Davies-C8N9',
 'Davies-C1N9ck']

In [10]:
# tabulate Davies exons
tbl_davies_exons = (
    etl
    .fromdataframe(geneset_davies)
    # keep coding features only and drop the columns not needed downstream
    .eq('type', 'CDS')
    .cutout('Parent', 'source', 'type', 'score', 'strand', 'phase')
    # collapse rows sharing the same (start, end) into a single row
    .merge(key=('start', 'end'))
    # prefix the coordinate columns so they are unambiguous after joins
    .rename({
        'seqid': 'exon_seqid',
        'ID': 'exon',
        'start': 'exon_start',
        'end': 'exon_end',
    })
    .movefield('exon_seqid', 0)
)
tbl_davies_exons.displayall()

0|exon_seqid,1|exon_start,2|exon_end,3|exon
2L,2358158,2358304,1
2L,2359640,2359672,2j
2L,2361989,2362144,3
2L,2381065,2381270,4
2L,2382270,2382398,5
2L,2385694,2385785,6
2L,2390129,2390341,7
2L,2390425,2390485,8
2L,2390594,2390738,9
2L,2391156,2391320,10


## Extract table of variants

In [17]:
# handle on the variant callset
# NOTE(review): phase2_ar1 is not defined in this notebook; presumably it is
# provided by setup.ipynb -- confirm
callset = phase2_ar1.callset
callset

<zarr.hierarchy.Group '/' read-only>

In [18]:
# what fields are available?
# NOTE(review): chromosome '2L' is hard-coded here, whereas the extraction
# call further down takes the chromosome from region_vgsc.seqid
print(', '.join(callset['2L/variants']))

AC, ALT, CHROM, FILTER_PASS, FS, HRun, HighCoverage, HighMQ0, LowCoverage, LowMQ, NoCoverage, POS, QD, REF, ReadPosRankSum, RepeatDUST, RepeatMasker, RepeatTRF, num_alleles


In [50]:
# get SNPEFF annotations from HDF5
# TODO: find out why these aren't available in zarr format
snpeff_h5_fn = '../phase2.AR1/variation/main/hdf5/all_snpeff/ag1000g.phase2.ar1.snpeff.AgamP4.2.2L.h5'
# opened read-only; the handle stays open because later cells read from it
snpf = h5py.File(snpeff_h5_fn, mode='r')

In [55]:
# what SNPEFF fields are available?
# ANN is read as a structured dataset: the names of its dtype are the
# individual annotation fields
print(', '.join(snpf['2L/variants/ANN'].dtype.names))

Allele, Annotation, Annotation_Impact, Gene_Name, Gene_ID, Feature_Type, Feature_ID, Transcript_BioType, Rank, HGVS_c, HGVS_p, cDNA_pos, cDNA_length, CDS_pos, CDS_length, AA_pos, AA_length, Distance


In [56]:
# sample metadata; as the output shows, the frame is indexed by ox_code and
# carries a `population` column
samples = phase2_ar1.df_samples
samples.head()

Unnamed: 0_level_0,src_code,population,country,region,contributor,contact,year,m_s,sex,n_sequences,mean_coverage
ox_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AA0040-C,Twifo_Praso__E2,GHcol,Ghana,Twifo_Praso,David Weetman,,2012.0,M,F,95033368,30.99
AA0041-C,Twifo_Praso__H3,GHcol,Ghana,Twifo_Praso,David Weetman,,2012.0,M,F,95843804,31.7
AA0042-C,Takoradi_C7,GHcol,Ghana,Takoradi,David Weetman,,2012.0,M,F,107420666,35.65
AA0043-C,Takoradi_H8,GHcol,Ghana,Takoradi,David Weetman,,2012.0,M,F,95993752,29.46
AA0044-C,Takoradi_D10,GHcol,Ghana,Takoradi,David Weetman,,2012.0,M,F,103044262,33.67


In [87]:
# samples needs to have a numeric index, because the row labels are used
# further down as sample indices when building `subpops`
# TODO: this should probably be tested in future versions
# Derive from the source table rather than from `samples` itself, so that
# re-running this cell gives the same frame instead of adding another
# index column each time.
samples = phase2_ar1.df_samples.reset_index()

### Breakdown of the table-building code (scratch cells, not working)

In [71]:
# NOTE(review): `seqid` is not assigned anywhere in this notebook; unless
# setup.ipynb provides it, this cell fails with NameError. The
# tabulate_variants function below runs the same steps with seqid passed in.
variants = callset[seqid]['variants']
ann = snpf[seqid]['variants']['ANN']
# sorted index over variant positions, used to look up a genome region
pos = allel.SortedIndex(variants['POS'])

In [72]:
# bounds of the VGSC region
start, end = region_vgsc.start, region_vgsc.end

In [73]:
# locate the variants whose position falls in the start..end range
loc = pos.locate_range(start, end)
# load genotype calls for just those variants
genotype = allel.GenotypeArray(callset[seqid]['calldata/genotype'][loc])

In [75]:
# NOTE(review): `subpops` is only built further down (the cell that maps each
# population to sample indices), so on a top-to-bottom run this line raises
# NameError unless setup.ipynb defines it
acs = genotype.count_alleles_subpops(max_allele=3, subpops=subpops)

In [88]:
def tabulate_variants(callset, snpeff, seqid, start, end, pop_ids, subpops):
    """Build a table of variants for a given callset and genome region.

    Multi-allelic variants are broken down into one output row per alternate
    allele; the ``ALTIX`` column gives the index of that allele within ALT.

    Parameters
    ----------
    callset
        Callset hierarchy. ``callset[seqid]['variants']`` and
        ``callset[seqid]['calldata/genotype']`` are read.
    snpeff
        SNPEFF annotations. ``snpeff[seqid]['variants']['ANN']`` is read with
        the same index range as the callset variants, so it is assumed to be
        aligned row-for-row with them -- TODO confirm.
    seqid
        Chromosome/contig to extract from.
    start, end
        Bounds of the region, passed to ``locate_range`` on the sorted
        variant positions.
    pop_ids
        Population IDs to report allele frequencies for, in column order.
    subpops
        Mapping from population ID to a list of sample indices, passed to
        ``count_alleles_subpops``.

    Returns
    -------
    tbl
        Table (built with ``etl``) with the variant fields, ``ALTIX``, the
        annotation fields prefixed ``SNPEFF_``, one ``AF_<pop>`` column per
        population and a ``check_allele`` column, which is True when the
        SNPEFF allele is missing or equal to ALT.

    """

    variants = callset[seqid]['variants']
    pos = allel.SortedIndex(variants['POS'])
    loc = pos.locate_range(start, end)
    # read the annotation records for the region once, rather than once per
    # annotation field
    ann = snpeff[seqid]['variants']['ANN'][loc]
    genotype = allel.GenotypeArray(callset[seqid]['calldata/genotype'][loc])
    acs = genotype.count_alleles_subpops(max_allele=3, subpops=subpops)

    # extract columns
    variants_fields = [
        'CHROM',
        'POS',
        'num_alleles',
        'REF',
        'ALT',
        'AC',
        'FILTER_PASS',
        'NoCoverage',
        'LowCoverage',
        'HighCoverage',
        'LowMQ',
        'HighMQ0',
        'RepeatDUST',
        'RepeatMasker',
        'RepeatTRF',
        'FS',
        'HRun',
        'QD',
        'ReadPosRankSum',
    ]
    # the first six fields (CHROM..AC) are handled individually when rows are
    # split per allele; the remaining fields are copied through unchanged
    n_split_fields = 6
    split_fields = variants_fields[:n_split_fields]
    copied_fields = variants_fields[n_split_fields:]
    ann_fields = ['Allele', 'Annotation', 'HGVS_c', 'HGVS_p', 'Feature_ID', 'CDS_pos']
    pop_fields = list(pop_ids)
    cols = (
        [variants[f][loc] for f in variants_fields] +
        [ann[f] for f in ann_fields] +
        [acs[p].to_frequencies() for p in pop_ids]
    )

    def split_alleles(row):
        """Yield one output row per alternate allele of `row`."""
        for i in range(row.num_alleles - 1):
            # break down alleles
            out = [
                row['CHROM'],
                row['POS'],
                row['num_alleles'],
                row['REF'],
                row['ALT'][i],
                row['AC'][i],
                i,
            ]
            # add in remaining variant annotations
            out += [row[f] for f in copied_fields]
            # SNPEFF annotation only applies to first allele
            if i == 0:
                out += [row[f] for f in ann_fields]
            else:
                out += [None] * len(ann_fields)
            # add in population allele frequencies; index 0 is the reference
            # allele, so alternate allele i sits at i+1
            out += [row[p][i+1] for p in pop_ids]
            yield out

    def decode(v):
        """Decode bytes as ascii, passing through the None placeholders."""
        return None if v is None else str(v, 'ascii')

    header_in = variants_fields + ann_fields + pop_fields
    header_out = split_fields + ['ALTIX'] + copied_fields + ann_fields + pop_fields

    tbl = (
        etl
        .fromcolumns(cols, header=header_in)
        .rowmapmany(split_alleles, header=header_out, failonerror=True)
        .convert('CHROM REF ALT Allele Annotation HGVS_c HGVS_p Feature_ID'.split(), decode)
        .rename({f: 'SNPEFF_' + f for f in ann_fields})
        .rename({p: 'AF_%s' % p for p in pop_ids})
        .addfield('check_allele', lambda row: row['SNPEFF_Allele'] is None or row['SNPEFF_Allele'] == row['ALT'])
    )

    return tbl

In [89]:
# population identifiers; these determine the per-population allele
# frequency (AF_<pop>) columns of the variants table and their order
pop_ids = phase2_ar1.pop_ids
print(', '.join(pop_ids))

AOcol, GHcol, BFcol, CIcol, GNcol, GW, GM, CMgam, GHgam, BFgam, GNgam, GAgam, UGgam, GQgam, FRgam, KE


In [90]:
# map each population to the row labels of its samples; with the numeric
# index set on `samples` these labels are used as sample indices
subpops = {
    pop: samples.loc[samples.population == pop].index.tolist()
    for pop in pop_ids
}

In [92]:
# build a table of variants from phase 2
tbl_variants_phase2 = tabulate_variants(callset, snpf, 
                                        seqid=region_vgsc.seqid, start=region_vgsc.start, end=region_vgsc.end, 
                                        pop_ids=pop_ids, subpops=subpops)
tbl_variants_phase2

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|SNPEFF_CDS_pos,26|AF_AOcol,27|AF_GHcol,28|AF_BFcol,29|AF_CIcol,30|AF_GNcol,31|AF_GW,32|AF_GM,33|AF_CMgam,34|AF_GHgam,35|AF_BFgam,36|AF_GNgam,37|AF_GAgam,38|AF_UGgam,39|AF_GQgam,40|AF_FRgam,41|AF_KE,42|check_allele
2L,2358254,2,G,A,1,0,True,0,0,15,0,0,False,False,False,11.836,1,17.3,-0.022,A,missense_variant,n.97G>A,p.Asp33Asn,AGAP004707-RA,97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2L,2358309,2,A,G,1,0,True,0,0,20,0,0,False,False,False,2.266,0,16.39,-2.092,G,splice_region_variant&intron_varia,n.147+5A>G,.,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016835016835016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2L,2358316,2,T,G,81,0,True,0,0,20,0,0,False,False,False,2.404,0,16.11,1.204,G,intron_variant,n.147+12T>G,.,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1363636363636363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2L,2358328,2,T,C,8,0,True,0,0,18,0,0,False,False,False,3.373,0,14.76,-0.945,C,intron_variant,n.147+24T>C,.,AGAP004707-RA,-1,0.0,0.0,0.0066666666666666,0.0,0.0,0.0164835164835164,0.0307692307692307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2L,2358353,2,C,T,1,0,True,0,2,19,0,0,False,False,False,7.008,0,9.79,1.307,T,intron_variant,n.147+49C>T,.,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0054945054945054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True


In [None]:
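# TODO: the notebook has been updated up to here. The cells below still
# reference `tbl_variants_phase1...` tables, and their saved outputs show a
# different set of AF_ population columns from the table built above.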

## Annotate effects for all transcripts

In [19]:
# effect types reported for variants within coding sequence
cds_effects = ['NON_SYNONYMOUS_CODING', 'SYNONYMOUS_CODING']
# effect types reported for variants within introns
intron_effects = ['INTRONIC', 'SPLICE_CORE', 'SPLICE_REGION']
# everything else is dropped when effects are annotated
selected_effects = [*cds_effects, *intron_effects]

In [20]:
def lpop(l, default=None):
    """Return the first item of `l`, or `default` if `l` is empty.

    Despite the name, nothing is removed: `l` is left unmodified.
    """
    try:
        first = l[0]
    except IndexError:
        first = default
    return first



In [21]:
def transcript_effect(transcript_id):
    """Make a row function that summarises the effect on one transcript.

    The returned function reads the row's ``VEFF`` list, takes the first
    effect whose ``transcript_id`` matches, and returns:

    - ``(effect, aa_change)`` for effects listed in `cds_effects`;
    - ``(effect, intron_cds_5prime, intron_5prime_dist, intron_cds_3prime,
      intron_3prime_dist)`` for effects listed in `intron_effects`;
    - ``None`` if there is no matching effect or it is of any other type.
    """
    def get_effect(row):
        matching = [eff for eff in row.VEFF if eff.transcript_id == transcript_id]
        eff = lpop(matching)
        if not eff:
            return None
        if eff.effect in cds_effects:
            return (eff.effect, eff.aa_change)
        if eff.effect in intron_effects:
            return (eff.effect, eff.intron_cds_5prime, eff.intron_5prime_dist, eff.intron_cds_3prime, eff.intron_3prime_dist)
        return None
    return get_effect



In [22]:
def add_transcript_effects(tbl, transcript_ids):
    """Add one effect column per transcript ID, in the order given.

    Each column is named after the transcript and is filled by
    ``transcript_effect(transcript_id)``, which reads the row's VEFF field.
    Looping over the IDs keeps the columns in step with `transcript_ids`,
    whatever its length, instead of hard-coding a fixed number of positions.
    """
    for transcript_id in transcript_ids:
        tbl = tbl.addfield(transcript_id, transcript_effect(transcript_id))
    return tbl


# NOTE(review): `tbl_variants_phase1` is not built anywhere in this notebook
# (the table built above is `tbl_variants_phase2`) -- confirm which table is
# intended before running this cell
tbl_variants_phase1_eff = (
    add_transcript_effects(
        tbl_variants_phase1
        # join in Davies exon information
        .intervalleftjoin(
            # don't include shorter exon alternatives
            tbl_davies_exons.select('exon', lambda v: v[-1] != '-'),
            lkey='CHROM', rkey='exon_seqid', lstart='POS', rstart='exon_start', lstop='POS', rstop='exon_end', include_stop=True)
        .cutout('exon_seqid')
        # temporary column holding the selected effects for every transcript
        .addfield('VEFF', lambda row: [e for e in annotator.get_effects(chrom=row.CHROM, pos=row.POS, ref=row.REF, alt=row.ALT)
                                       if e.effect in selected_effects]),
        transcript_ids,
    )
    # VEFF is only needed while the per-transcript columns are derived
    .cutout('VEFF')
    # normalise missing-value placeholders to None
    .replaceall('.', None)
    .replaceall('', None)
    .cache()
)

In [23]:
# preview the first 20 rows of the table with per-transcript effects
tbl_variants_phase1_eff.display(20)

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|SNPEFF_CDS_pos,26|AF_AOM,27|AF_BFM,28|AF_GWA,29|AF_GNS,30|AF_BFS,31|AF_CMS,32|AF_GAS,33|AF_UGS,34|AF_KES,35|check_allele,36|exon_start,37|exon_end,38|exon,39|AGAP004707-RA,40|AGAP004707-RB,41|AGAP004707-RC,42|Davies-C1N2,43|Davies-C3N2,44|Davies-C5N2,45|Davies-C7N2,46|Davies-C8N2,47|Davies-C10N2,48|Davies-C11N2,49|Davies-C1N9,50|Davies-C8N9,51|Davies-C1N9ck
2L,2358254,2,G,A,1,0,True,0,0,10,0,0,False,False,False,9.8672,1,17.547,-0.049988,A,missense_variant,n.97G>A,p.Asp33Asn,AGAP004707-RA,97,0.0,0.0,0.0,0.0,0.0,0.00181818181818,0.0,0.0,0.0,True,2358158.0,2358304.0,1.0,"('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')"
2L,2358316,2,T,G,73,0,True,0,0,15,0,0,False,False,False,2.4844,0,16.438,1.4219,G,intron_variant,n.147+12T>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.132727272727,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 12, 'AGAP004707-PA', -3691)","('INTRONIC', 'AGAP004707-PB', 12, 'AGAP004707-PB', -3691)","('INTRONIC', 'AGAP004707-PC', 12, 'AGAP004707-PC', -3691)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '2j', -1324)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '2j', -1324)","('INTRONIC', '1', 12, '3', -3673)"
2L,2358328,2,T,C,2,0,True,0,0,14,0,0,False,False,False,2.7363,0,16.062,-0.646,C,intron_variant,n.147+24T>C,,AGAP004707-RA,-1,0.0,0.00724637681159,0.0108695652174,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 24, 'AGAP004707-PA', -3679)","('INTRONIC', 'AGAP004707-PB', 24, 'AGAP004707-PB', -3679)","('INTRONIC', 'AGAP004707-PC', 24, 'AGAP004707-PC', -3679)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '2j', -1312)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '2j', -1312)","('INTRONIC', '1', 24, '3', -3661)"
2L,2358353,2,C,T,1,0,True,0,1,15,0,0,False,False,False,1.9512,0,9.8594,1.1582,T,intron_variant,n.147+49C>T,,AGAP004707-RA,-1,0.0,0.0,0.0108695652174,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 49, 'AGAP004707-PA', -3654)","('INTRONIC', 'AGAP004707-PB', 49, 'AGAP004707-PB', -3654)","('INTRONIC', 'AGAP004707-PC', 49, 'AGAP004707-PC', -3654)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '2j', -1287)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '2j', -1287)","('INTRONIC', '1', 49, '3', -3636)"
2L,2358405,2,T,A,1,0,True,0,6,14,0,0,False,False,False,20.844,1,10.859,1.1562,A,intron_variant,n.147+101T>A,,AGAP004707-RA,-1,0.0,0.0,0.0108695652174,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 101, 'AGAP004707-PA', -3602)","('INTRONIC', 'AGAP004707-PB', 101, 'AGAP004707-PB', -3602)","('INTRONIC', 'AGAP004707-PC', 101, 'AGAP004707-PC', -3602)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '2j', -1235)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '2j', -1235)","('INTRONIC', '1', 101, '3', -3584)"
2L,2358441,2,A,T,78,0,False,0,6,17,0,0,False,False,False,2.4805,1,21.703,0.94385,T,intron_variant,n.147+137A>T,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0145454545455,0.625,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 137, 'AGAP004707-PA', -3566)","('INTRONIC', 'AGAP004707-PB', 137, 'AGAP004707-PB', -3566)","('INTRONIC', 'AGAP004707-PC', 137, 'AGAP004707-PC', -3566)","('INTRONIC', '1', 137, '3', -3548)","('INTRONIC', '1', 137, '3', -3548)","('INTRONIC', '1', 137, '3', -3548)","('INTRONIC', '1', 137, '3', -3548)","('INTRONIC', '1', 137, '2j', -1199)","('INTRONIC', '1', 137, '3', -3548)","('INTRONIC', '1', 137, '3', -3548)","('INTRONIC', '1', 137, '3', -3548)","('INTRONIC', '1', 137, '2j', -1199)","('INTRONIC', '1', 137, '3', -3548)"
2L,2358463,2,G,T,5,0,False,0,4,16,0,0,False,False,False,22.0,0,15.211,-0.42798,T,intron_variant,n.147+159G>T,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0568181818182,True,,,,"('INTRONIC', 'AGAP004707-PA', 159, 'AGAP004707-PA', -3544)","('INTRONIC', 'AGAP004707-PB', 159, 'AGAP004707-PB', -3544)","('INTRONIC', 'AGAP004707-PC', 159, 'AGAP004707-PC', -3544)","('INTRONIC', '1', 159, '3', -3526)","('INTRONIC', '1', 159, '3', -3526)","('INTRONIC', '1', 159, '3', -3526)","('INTRONIC', '1', 159, '3', -3526)","('INTRONIC', '1', 159, '2j', -1177)","('INTRONIC', '1', 159, '3', -3526)","('INTRONIC', '1', 159, '3', -3526)","('INTRONIC', '1', 159, '3', -3526)","('INTRONIC', '1', 159, '2j', -1177)","('INTRONIC', '1', 159, '3', -3526)"
2L,2358468,2,A,C,150,0,False,0,4,17,0,0,False,False,False,1.668,0,19.812,-0.198,C,intron_variant,n.147+164A>C,,AGAP004707-RA,-1,0.0,0.0,0.0978260869565,0.0,0.0,0.0509090909091,0.0,0.52427184466,0.0568181818182,True,,,,"('INTRONIC', 'AGAP004707-PA', 164, 'AGAP004707-PA', -3539)","('INTRONIC', 'AGAP004707-PB', 164, 'AGAP004707-PB', -3539)","('INTRONIC', 'AGAP004707-PC', 164, 'AGAP004707-PC', -3539)","('INTRONIC', '1', 164, '3', -3521)","('INTRONIC', '1', 164, '3', -3521)","('INTRONIC', '1', 164, '3', -3521)","('INTRONIC', '1', 164, '3', -3521)","('INTRONIC', '1', 164, '2j', -1172)","('INTRONIC', '1', 164, '3', -3521)","('INTRONIC', '1', 164, '3', -3521)","('INTRONIC', '1', 164, '3', -3521)","('INTRONIC', '1', 164, '2j', -1172)","('INTRONIC', '1', 164, '3', -3521)"
2L,2358501,2,A,T,5,0,False,0,4,22,0,0,False,False,False,11.672,0,14.359,-1.2432,T,intron_variant,n.147+197A>T,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0568181818182,True,,,,"('INTRONIC', 'AGAP004707-PA', 197, 'AGAP004707-PA', -3506)","('INTRONIC', 'AGAP004707-PB', 197, 'AGAP004707-PB', -3506)","('INTRONIC', 'AGAP004707-PC', 197, 'AGAP004707-PC', -3506)","('INTRONIC', '1', 197, '3', -3488)","('INTRONIC', '1', 197, '3', -3488)","('INTRONIC', '1', 197, '3', -3488)","('INTRONIC', '1', 197, '3', -3488)","('INTRONIC', '1', 197, '2j', -1139)","('INTRONIC', '1', 197, '3', -3488)","('INTRONIC', '1', 197, '3', -3488)","('INTRONIC', '1', 197, '3', -3488)","('INTRONIC', '1', 197, '2j', -1139)","('INTRONIC', '1', 197, '3', -3488)"
2L,2358536,2,T,G,4,0,False,0,3,25,0,0,False,False,False,4.3203,1,17.234,2.2852,G,intron_variant,n.147+232T>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.00727272727273,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 232, 'AGAP004707-PA', -3471)","('INTRONIC', 'AGAP004707-PB', 232, 'AGAP004707-PB', -3471)","('INTRONIC', 'AGAP004707-PC', 232, 'AGAP004707-PC', -3471)","('INTRONIC', '1', 232, '3', -3453)","('INTRONIC', '1', 232, '3', -3453)","('INTRONIC', '1', 232, '3', -3453)","('INTRONIC', '1', 232, '3', -3453)","('INTRONIC', '1', 232, '2j', -1104)","('INTRONIC', '1', 232, '3', -3453)","('INTRONIC', '1', 232, '3', -3453)","('INTRONIC', '1', 232, '3', -3453)","('INTRONIC', '1', 232, '2j', -1104)","('INTRONIC', '1', 232, '3', -3453)"


## Inspect missense variants

In [24]:
def simplify_missense_effect(v):
    """Reduce a transcript effect to its amino acid change.

    `v` is a per-transcript effect value: a tuple whose first item is the
    effect type, or an empty/None value. For a 'NON_SYNONYMOUS_CODING' effect
    the second item (the amino acid change) is returned; anything else gives
    the empty string.
    """
    is_missense = bool(v) and v[0] == 'NON_SYNONYMOUS_CODING'
    return v[1] if is_missense else ''

    
def _style_when(condition, css):
    """Make a cell-style function: `css` when `condition(value)` holds, else ''."""
    return lambda v: css if condition(v) else ''


_RED = 'background-color: red'
_ORANGE = 'background-color: orange'

# per-column cell styles: red flags a value past the threshold for that
# column, orange flags variants with more than two alleles
td_styles = {
    'FILTER_PASS': _style_when(lambda v: not v, _RED),
    'NoCoverage': _style_when(lambda v: v > 1, _RED),
    'LowCoverage': _style_when(lambda v: v > 76, _RED),
    'HighCoverage': _style_when(lambda v: v > 15, _RED),
    'LowMQ': _style_when(lambda v: v > 76, _RED),
    'HighMQ0': _style_when(lambda v: v > 1, _RED),
    'RepeatDUST': _style_when(lambda v: v, _RED),
    'FS': _style_when(lambda v: v > 60, _RED),
    'QD': _style_when(lambda v: v < 5, _RED),
    'ReadPosRankSum': _style_when(lambda v: v < -8, _RED),
    'HRun': _style_when(lambda v: v > 4, _RED),
    'num_alleles': _style_when(lambda v: v > 2, _ORANGE),
}


def tr_style(row):
    """Shade a row green in proportion to its alternate allele count.

    The opacity is AC/100, capped at 1.
    """
    opacity = min(1, row['AC']/100)
    return 'background-color:rgba(0, 255, 0, %.3f)' % opacity


def _has_missense_effect(row):
    """True if the row has a NON_SYNONYMOUS_CODING effect on any transcript."""
    for t in transcript_ids:
        effect = row[t]
        if effect and effect[0] == 'NON_SYNONYMOUS_CODING':
            return True
    return False


# keep variants that are missense in at least one transcript, then show just
# the amino acid change in each transcript column
tbl_variants_phase1_missense = (
    tbl_variants_phase1_eff
    .select(_has_missense_effect)
    .convert(transcript_ids, simplify_missense_effect)
)
tbl_variants_phase1_missense.displayall(td_styles=td_styles, tr_style=tr_style)

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|SNPEFF_CDS_pos,26|AF_AOM,27|AF_BFM,28|AF_GWA,29|AF_GNS,30|AF_BFS,31|AF_CMS,32|AF_GAS,33|AF_UGS,34|AF_KES,35|check_allele,36|exon_start,37|exon_end,38|exon,39|AGAP004707-RA,40|AGAP004707-RB,41|AGAP004707-RC,42|Davies-C1N2,43|Davies-C3N2,44|Davies-C5N2,45|Davies-C7N2,46|Davies-C8N2,47|Davies-C10N2,48|Davies-C11N2,49|Davies-C1N9,50|Davies-C8N9,51|Davies-C1N9ck
2L,2358254,2,G,A,1,0,True,0,0,10,0,0,False,False,False,9.8672,1,17.547,-0.049988,A,missense_variant,n.97G>A,p.Asp33Asn,AGAP004707-RA,97.0,0.0,0.0,0.0,0.0,0.0,0.00181818181818,0.0,0.0,0.0,True,2358158,2358304,1,D33N,D33N,D33N,D33N,D33N,D33N,D33N,D33N,D33N,D33N,D33N,D33N,D33N
2L,2359670,2,G,A,7,0,False,1,171,1,1,0,False,False,False,8.6641,6,14.406,-0.029007,A,intron_variant,n.147+1366G>,,AGAP004707-RA,-1.0,0.0,0.0,0.0,0.0,0.0,0.0109090909091,0.0,0.0,0.0113636363636,True,2359640,2359672,2j,,,,,,,,E60K,,,,E60K,
2L,2362002,2,A,T,2,0,True,0,1,3,0,0,False,False,False,0.5459,0,12.531,-0.55322,T,splice_region_variant&intron_varia,n.148-5A>T,,AGAP004707-RA,-1.0,0.0,0.0144927536232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2361989,2362144,3,,,,D54V,D54V,D54V,D54V,D65V,D54V,D54V,D54V,D65V,D54V
2L,2362019,2,G,T,2,0,True,0,0,6,0,0,False,False,False,3.9824,0,13.641,0.7749,T,missense_variant,n.160G>T,p.Gly54Cys,AGAP004707-RA,160.0,0.0,0.0144927536232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2361989,2362144,3,G54C,G54C,G54C,G60C,G60C,G60C,G60C,G71C,G60C,G60C,G60C,G71C,G60C
2L,2362023,2,C,T,1,0,True,0,1,3,0,0,False,False,False,0.0,0,13.477,-1.1611,T,missense_variant,n.164C>T,p.Pro55Leu,AGAP004707-RA,164.0,0.0,0.0,0.0,0.0,0.00617283950617,0.0,0.0,0.0,0.0,True,2361989,2362144,3,P55L,P55L,P55L,P61L,P61L,P61L,P61L,P72L,P61L,P61L,P61L,P72L,P61L
2L,2390168,2,A,G,2,0,True,0,2,10,0,0,False,False,False,0.56982,1,15.219,-0.026001,G,missense_variant,n.752A>G,p.Lys251Arg,AGAP004707-RA,752.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0178571428571,0.0,0.0,True,2390129,2390341,7,K251R,K251R,K251R,K257R,K214R,K257R,K257R,K268R,K257R,K257R,K257R,K268R,K257R
2L,2390177,2,G,A,198,0,True,0,3,8,0,0,False,False,False,0.12695,1,18.625,0.83496,A,missense_variant,n.761G>A,p.Arg254Lys,AGAP004707-RA,761.0,0.0,0.0,0.0,0.0,0.0,0.316363636364,0.214285714286,0.0,0.0,True,2390129,2390341,7,R254K,R254K,R254K,R260K,R217K,R260K,R260K,R271K,R260K,R260K,R260K,R271K,R260K
2L,2390311,2,G,A,1,0,True,0,0,10,0,0,False,False,False,0.0,3,14.07,-0.70996,A,missense_variant,n.895G>A,p.Glu299Lys,AGAP004707-RA,895.0,0.0,0.0,0.0,0.0,0.0,0.00181818181818,0.0,0.0,0.0,True,2390129,2390341,7,E299K,E299K,E299K,E305K,E262K,E305K,E305K,E316K,E305K,E305K,E305K,E316K,E305K
2L,2390448,2,G,A,6,0,True,0,0,15,0,0,False,False,False,0.71094,0,16.125,-0.65918,A,missense_variant,n.949G>A,p.Gly317Ser,AGAP004707-RA,949.0,0.0,0.0,0.0,0.0,0.0,0.0109090909091,0.0,0.0,0.0,True,2390425,2390485,8,G317S,G317S,G317S,G323S,G280S,G323S,G323S,G334S,G323S,G323S,G323S,G334S,G323S
2L,2391228,3,G,C,10,0,True,0,0,12,0,0,False,False,False,2.0352,0,14.867,-1.1777,C,missense_variant,n.1204G>C,p.Val402Leu,AGAP004707-RA,1204.0,0.0,0.0724637681159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2391156,2391320,10,V402L,V402L,V402L,V408L,V365L,,V408L,V419L,V408L,V408L,V408L,V419L,V408L


## Inspect splice site variants

In [25]:
def simplify_intron_effect(v):
    """Reduce a splice effect to the nearer flanking CDS and its distance.

    `v` is a per-transcript effect value. For 'SPLICE_REGION' or
    'SPLICE_CORE' effects it has the form
    ``(effect, cds_5prime, dist_5prime, cds_3prime, dist_3prime)``; the pair
    with the smaller absolute distance is returned, and a tie goes to the
    3' side. Anything else gives the empty string.
    """
    if not v or v[0] not in ['SPLICE_REGION', 'SPLICE_CORE']:
        return ''
    nearer_5prime = math.fabs(v[2]) < math.fabs(v[4])
    return (v[1], v[2]) if nearer_5prime else (v[3], v[4])

    
# per-column cell styles (same mapping as in the missense section): red flags
# a value past the threshold for that column, orange flags variants with more
# than two alleles
td_styles = dict(
    FILTER_PASS=lambda v: 'background-color: red' if not v else '',
    NoCoverage=lambda v: 'background-color: red' if v > 1 else '',
    LowCoverage=lambda v: 'background-color: red' if v > 76 else '',
    HighCoverage=lambda v: 'background-color: red' if v > 15 else '',
    LowMQ=lambda v: 'background-color: red' if v > 76 else '',
    HighMQ0=lambda v: 'background-color: red' if v > 1 else '',
    RepeatDUST=lambda v: 'background-color: red' if v else '',
    FS=lambda v: 'background-color: red' if v > 60 else '',
    QD=lambda v: 'background-color: red' if v < 5 else '',
    ReadPosRankSum=lambda v: 'background-color: red' if v < -8 else '',
    HRun=lambda v: 'background-color: red' if v > 4 else '',
    num_alleles=lambda v: 'background-color: orange' if v > 2 else '',
)


def tr_style(row):
    """Shade a table row green in proportion to its alternate allele count.

    Reads ``row['AC']`` and uses AC/100, capped at 1, as the alpha channel of
    a green background. Returns the CSS declaration as a string.

    """
    opacity = min(1, row['AC'] / 100)
    return 'background-color:rgba(0, 255, 0, %.3f)' % opacity


def _row_has_splice_effect(row):
    """Return True if any transcript column of `row` holds a splice effect.

    A transcript counts when its value is non-empty and its first element is
    'SPLICE_REGION' or 'SPLICE_CORE'.

    """
    for transcript_id in transcript_ids:
        effect = row[transcript_id]
        if effect and effect[0] in ['SPLICE_REGION', 'SPLICE_CORE']:
            return True
    return False


# keep only variants with a splice effect in at least one transcript, then
# collapse every transcript column to the nearer (id, offset) pair
tbl_variants_phase1_splice = (
    tbl_variants_phase1_eff
    .select(_row_has_splice_effect)
    .convert(transcript_ids, simplify_intron_effect)
)
tbl_variants_phase1_splice.displayall(td_styles=td_styles, tr_style=tr_style)

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|SNPEFF_CDS_pos,26|AF_AOM,27|AF_BFM,28|AF_GWA,29|AF_GNS,30|AF_BFS,31|AF_CMS,32|AF_GAS,33|AF_UGS,34|AF_KES,35|check_allele,36|exon_start,37|exon_end,38|exon,39|AGAP004707-RA,40|AGAP004707-RB,41|AGAP004707-RC,42|Davies-C1N2,43|Davies-C3N2,44|Davies-C5N2,45|Davies-C7N2,46|Davies-C8N2,47|Davies-C10N2,48|Davies-C11N2,49|Davies-C1N9,50|Davies-C8N9,51|Davies-C1N9ck
2L,2362002,2,A,T,2,0,True,0,1,3,0,0,False,False,False,0.5459,0,12.531,-0.55322,T,splice_region_variant&intron_varia,n.148-5A>T,,AGAP004707-RA,-1,0.0,0.0144927536232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2361989.0,2362144.0,3.0,"('AGAP004707-PA', -5)","('AGAP004707-PB', -5)","('AGAP004707-PC', -5)",,,,,,,,,,
2L,2362003,2,C,T,2,0,True,0,1,3,0,0,False,False,False,0.50195,0,14.062,0.024994,T,splice_region_variant&intron_varia,n.148-4C>T,,AGAP004707-RA,-1,0.0,0.0144927536232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,2361989.0,2362144.0,3.0,"('AGAP004707-PA', -4)","('AGAP004707-PB', -4)","('AGAP004707-PC', -4)",,,,,,,,,,
2L,2382263,2,A,G,166,0,True,0,45,1,0,0,False,False,False,5.957,0,25.375,-2.8809,G,splice_region_variant&intron_varia,n.492-7A>G,,AGAP004707-RA,-1,0.00833333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.47572815534,0.761363636364,True,,,,"('AGAP004707-PA', -7)","('AGAP004707-PB', -7)","('AGAP004707-PC', -7)","('5', -7)",,"('5', -7)","('5', -7)","('5', -7)","('5', -7)","('5', -7)","('5', -7)","('5', -7)","('5', -7)"
2L,2390126,2,C,T,2,0,True,0,2,11,0,0,False,False,False,3.4746,0,14.32,-1.0264,T,splice_region_variant&intron_varia,n.713-3C>T,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.00363636363636,0.0,0.0,0.0,True,,,,"('AGAP004707-PA', -3)","('AGAP004707-PB', -3)","('AGAP004707-PC', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)","('7', -3)"
2L,2400176,2,A,G,1,0,True,0,0,7,0,0,False,False,False,0.0,0,22.203,0.74316,G,splice_region_variant&intron_varia,n.1572+3A>G,,AGAP004707-RA,-1,0.0,0.0,0.0108695652174,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('AGAP004707-PA', 3)","('AGAP004707-PB', 3)","('AGAP004707-PC', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)","('11i+', 3)"
2L,2407888,2,T,C,4,0,True,0,2,9,0,0,False,False,False,5.7578,0,16.281,-0.76416,C,splice_region_variant&intron_varia,n.2017-6T>C,,AGAP004707-RA,-1,0.0,0.0,0.0434782608696,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('AGAP004707-PA', -6)","('AGAP004707-PB', -6)","('AGAP004707-PC', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)","('16', -6)"
2L,2417362,2,A,G,496,0,False,5,712,0,0,0,False,False,False,63.062,1,28.844,1.251,G,splice_region_variant&intron_varia,n.2637+4A>G,,AGAP004707-RA,-1,0.838983050847,0.840579710145,0.0666666666667,1.0,1.0,0.0967153284672,0.0,0.0,0.0,True,,,,"('AGAP004707-PA', 4)","('AGAP004707-PB', 4)","('AGAP004707-PC', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)","('19', 4)"
2L,2425766,2,T,A,79,0,True,0,1,9,0,0,False,False,False,9.9062,0,21.391,1.6143,A,intron_variant,n.4068+315T>,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.0145454545455,0.633928571429,0.0,0.0,True,,,,,,,,,,,,,,,,"('27k', -4)"
2L,2429868,2,C,A,2,0,True,0,0,14,0,0,False,False,False,8.5469,0,14.961,-0.014,A,splice_region_variant&intron_varia,n.4765-4C>A,,AGAP004707-RA,-1,0.0,0.0,0.0217391304348,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('AGAP004707-PA', -4)","('AGAP004707-PB', -4)","('AGAP004707-PC', -4)","('31', -4)","('31', -4)","('31', -4)","('31', -4)","('31', -4)","('31', -4)","('31', -4)","('31', -4)","('31', -4)","('31', -4)"


## Write out variants to file

In [26]:
def _effect_to_text(v):
    """Join the items of an effect tuple into one ':'-separated string."""
    return ':'.join(str(item) for item in v)


# Stage 1: tee the table, effect tuples intact, to a pickle file. Per the
# petl tee* semantics the file is written as rows are pulled through, i.e.
# when the CSV is written in the final stage.
_tbl_teed = tbl_variants_phase1_eff.teepickle('../data/tbl_variants_phase1.pkl')

# Stage 2: flatten the per-transcript effect tuples to text for the CSV.
# NOTE(review): non-iterable values (e.g. None) make the converter raise;
# this relies on petl's convert() substituting its error value rather than
# failing -- confirm.
_tbl_flat = _tbl_teed.convert(transcript_ids, _effect_to_text)

# Stage 3: mark missing values explicitly, then write the CSV.
_tbl_filled = _tbl_flat.replaceall(None, 'NA')
_tbl_filled.tocsv('../data/tbl_variants_phase1.csv')

In [27]:
# Sanity check: read back the pickle written by the previous cell and display
# it; the per-transcript effect columns should still hold tuples.
etl.frompickle('../data/tbl_variants_phase1.pkl')

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|SNPEFF_CDS_pos,26|AF_AOM,27|AF_BFM,28|AF_GWA,29|AF_GNS,30|AF_BFS,31|AF_CMS,32|AF_GAS,33|AF_UGS,34|AF_KES,35|check_allele,36|exon_start,37|exon_end,38|exon,39|AGAP004707-RA,40|AGAP004707-RB,41|AGAP004707-RC,42|Davies-C1N2,43|Davies-C3N2,44|Davies-C5N2,45|Davies-C7N2,46|Davies-C8N2,47|Davies-C10N2,48|Davies-C11N2,49|Davies-C1N9,50|Davies-C8N9,51|Davies-C1N9ck
2L,2358254,2,G,A,1,0,True,0,0,10,0,0,False,False,False,9.8672,1,17.547,-0.049988,A,missense_variant,n.97G>A,p.Asp33Asn,AGAP004707-RA,97,0.0,0.0,0.0,0.0,0.0,0.00181818181818,0.0,0.0,0.0,True,2358158.0,2358304.0,1.0,"('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')","('NON_SYNONYMOUS_CODING', 'D33N')"
2L,2358316,2,T,G,73,0,True,0,0,15,0,0,False,False,False,2.4844,0,16.438,1.4219,G,intron_variant,n.147+12T>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.132727272727,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 12, 'AGAP004707-PA', -3691)","('INTRONIC', 'AGAP004707-PB', 12, 'AGAP004707-PB', -3691)","('INTRONIC', 'AGAP004707-PC', 12, 'AGAP004707-PC', -3691)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '2j', -1324)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '3', -3673)","('INTRONIC', '1', 12, '2j', -1324)","('INTRONIC', '1', 12, '3', -3673)"
2L,2358328,2,T,C,2,0,True,0,0,14,0,0,False,False,False,2.7363,0,16.062,-0.646,C,intron_variant,n.147+24T>C,,AGAP004707-RA,-1,0.0,0.00724637681159,0.0108695652174,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 24, 'AGAP004707-PA', -3679)","('INTRONIC', 'AGAP004707-PB', 24, 'AGAP004707-PB', -3679)","('INTRONIC', 'AGAP004707-PC', 24, 'AGAP004707-PC', -3679)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '2j', -1312)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '3', -3661)","('INTRONIC', '1', 24, '2j', -1312)","('INTRONIC', '1', 24, '3', -3661)"
2L,2358353,2,C,T,1,0,True,0,1,15,0,0,False,False,False,1.9512,0,9.8594,1.1582,T,intron_variant,n.147+49C>T,,AGAP004707-RA,-1,0.0,0.0,0.0108695652174,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 49, 'AGAP004707-PA', -3654)","('INTRONIC', 'AGAP004707-PB', 49, 'AGAP004707-PB', -3654)","('INTRONIC', 'AGAP004707-PC', 49, 'AGAP004707-PC', -3654)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '2j', -1287)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '3', -3636)","('INTRONIC', '1', 49, '2j', -1287)","('INTRONIC', '1', 49, '3', -3636)"
2L,2358405,2,T,A,1,0,True,0,6,14,0,0,False,False,False,20.844,1,10.859,1.1562,A,intron_variant,n.147+101T>A,,AGAP004707-RA,-1,0.0,0.0,0.0108695652174,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,"('INTRONIC', 'AGAP004707-PA', 101, 'AGAP004707-PA', -3602)","('INTRONIC', 'AGAP004707-PB', 101, 'AGAP004707-PB', -3602)","('INTRONIC', 'AGAP004707-PC', 101, 'AGAP004707-PC', -3602)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '2j', -1235)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '3', -3584)","('INTRONIC', '1', 101, '2j', -1235)","('INTRONIC', '1', 101, '3', -3584)"


In [28]:
# Sanity check: read back the CSV export and display it; the per-transcript
# effect columns should now be ':'-joined strings.
etl.fromcsv('../data/tbl_variants_phase1.csv')

0|CHROM,1|POS,2|num_alleles,3|REF,4|ALT,5|AC,6|ALTIX,7|FILTER_PASS,8|NoCoverage,9|LowCoverage,10|HighCoverage,11|LowMQ,12|HighMQ0,13|RepeatDUST,14|RepeatMasker,15|RepeatTRF,16|FS,17|HRun,18|QD,19|ReadPosRankSum,20|SNPEFF_Allele,21|SNPEFF_Annotation,22|SNPEFF_HGVS_c,23|SNPEFF_HGVS_p,24|SNPEFF_Feature_ID,25|SNPEFF_CDS_pos,26|AF_AOM,27|AF_BFM,28|AF_GWA,29|AF_GNS,30|AF_BFS,31|AF_CMS,32|AF_GAS,33|AF_UGS,34|AF_KES,35|check_allele,36|exon_start,37|exon_end,38|exon,39|AGAP004707-RA,40|AGAP004707-RB,41|AGAP004707-RC,42|Davies-C1N2,43|Davies-C3N2,44|Davies-C5N2,45|Davies-C7N2,46|Davies-C8N2,47|Davies-C10N2,48|Davies-C11N2,49|Davies-C1N9,50|Davies-C8N9,51|Davies-C1N9ck
2L,2358254,2,G,A,1,0,True,0,0,10,0,0,False,False,False,9.8672,1,17.547,-0.049988,A,missense_variant,n.97G>A,p.Asp33Asn,AGAP004707-RA,97,0.0,0.0,0.0,0.0,0.0,0.00181818181818,0.0,0.0,0.0,True,2358158.0,2358304.0,1.0,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N,NON_SYNONYMOUS_CODING:D33N
2L,2358316,2,T,G,73,0,True,0,0,15,0,0,False,False,False,2.4844,0,16.438,1.4219,G,intron_variant,n.147+12T>G,,AGAP004707-RA,-1,0.0,0.0,0.0,0.0,0.0,0.132727272727,0.0,0.0,0.0,True,,,,INTRONIC:AGAP004707-PA:12:AGAP004707-PA:-3691,INTRONIC:AGAP004707-PB:12:AGAP004707-PB:-3691,INTRONIC:AGAP004707-PC:12:AGAP004707-PC:-3691,INTRONIC:1:12:3:-3673,INTRONIC:1:12:3:-3673,INTRONIC:1:12:3:-3673,INTRONIC:1:12:3:-3673,INTRONIC:1:12:2j:-1324,INTRONIC:1:12:3:-3673,INTRONIC:1:12:3:-3673,INTRONIC:1:12:3:-3673,INTRONIC:1:12:2j:-1324,INTRONIC:1:12:3:-3673
2L,2358328,2,T,C,2,0,True,0,0,14,0,0,False,False,False,2.7363,0,16.062,-0.646,C,intron_variant,n.147+24T>C,,AGAP004707-RA,-1,0.0,0.00724637681159,0.0108695652174,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,INTRONIC:AGAP004707-PA:24:AGAP004707-PA:-3679,INTRONIC:AGAP004707-PB:24:AGAP004707-PB:-3679,INTRONIC:AGAP004707-PC:24:AGAP004707-PC:-3679,INTRONIC:1:24:3:-3661,INTRONIC:1:24:3:-3661,INTRONIC:1:24:3:-3661,INTRONIC:1:24:3:-3661,INTRONIC:1:24:2j:-1312,INTRONIC:1:24:3:-3661,INTRONIC:1:24:3:-3661,INTRONIC:1:24:3:-3661,INTRONIC:1:24:2j:-1312,INTRONIC:1:24:3:-3661
2L,2358353,2,C,T,1,0,True,0,1,15,0,0,False,False,False,1.9512,0,9.8594,1.1582,T,intron_variant,n.147+49C>T,,AGAP004707-RA,-1,0.0,0.0,0.0108695652174,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,INTRONIC:AGAP004707-PA:49:AGAP004707-PA:-3654,INTRONIC:AGAP004707-PB:49:AGAP004707-PB:-3654,INTRONIC:AGAP004707-PC:49:AGAP004707-PC:-3654,INTRONIC:1:49:3:-3636,INTRONIC:1:49:3:-3636,INTRONIC:1:49:3:-3636,INTRONIC:1:49:3:-3636,INTRONIC:1:49:2j:-1287,INTRONIC:1:49:3:-3636,INTRONIC:1:49:3:-3636,INTRONIC:1:49:3:-3636,INTRONIC:1:49:2j:-1287,INTRONIC:1:49:3:-3636
2L,2358405,2,T,A,1,0,True,0,6,14,0,0,False,False,False,20.844,1,10.859,1.1562,A,intron_variant,n.147+101T>A,,AGAP004707-RA,-1,0.0,0.0,0.0108695652174,0.0,0.0,0.0,0.0,0.0,0.0,True,,,,INTRONIC:AGAP004707-PA:101:AGAP004707-PA:-3602,INTRONIC:AGAP004707-PB:101:AGAP004707-PB:-3602,INTRONIC:AGAP004707-PC:101:AGAP004707-PC:-3602,INTRONIC:1:101:3:-3584,INTRONIC:1:101:3:-3584,INTRONIC:1:101:3:-3584,INTRONIC:1:101:3:-3584,INTRONIC:1:101:2j:-1235,INTRONIC:1:101:3:-3584,INTRONIC:1:101:3:-3584,INTRONIC:1:101:3:-3584,INTRONIC:1:101:2j:-1235,INTRONIC:1:101:3:-3584
