In [1]:
import os
import gzip
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from utils import Interval, FastaGenome, WigTrack
from dgutils.pandas import add_column
# from bx.wiggle import IntervalReader

In [2]:
# Column layout of the headerless RefSeq annotation table, in file order.
refseq_columns = [
    'bin', 'name', 'chrom', 'strand', 'tx_start', 'tx_end',
    'cds_start', 'cds_end', 'exon_count', 'exon_starts', 'exon_ends',
    'score', 'name_2', 'cds_start_stat', 'cds_end_stat', 'exon_frames',
]

# header=None: the file carries no header row, so the names above are
# assigned to the tab-separated fields positionally.
df_anno = pd.read_csv(
    'raw_data/annotation/ncbiRefSeqCurated.txt.gz',
    sep='\t',
    header=None,
    names=refseq_columns,
)

In [3]:
# Debug switch: cap how many annotation rows every later cell works on.
# Set DEBUG_MAX_ROWS to None to process the complete annotation table.
DEBUG_MAX_ROWS = 100
if DEBUG_MAX_ROWS is not None:
    df_anno = df_anno.head(DEBUG_MAX_ROWS)

In [4]:
# Preview the first five rows of the annotation table.
df_anno.iloc[:5]

Unnamed: 0,bin,name,chrom,strand,tx_start,tx_end,cds_start,cds_end,exon_count,exon_starts,exon_ends,score,name_2,cds_start_stat,cds_end_stat,exon_frames
0,585,NM_001180043.1,chrI,-,1806,2169,1806,2169,1,1806,2169,0,PAU8,cmpl,cmpl,0
1,585,NM_001184582.1,chrI,+,2479,2707,2479,2707,1,2479,2707,0,YAL067W-A,cmpl,cmpl,0
2,585,NM_001178208.1,chrI,-,7234,9016,7234,9016,1,7234,9016,0,SEO1,cmpl,cmpl,0
3,585,NM_001179897.1,chrI,-,11564,11951,11564,11951,1,11564,11951,0,YAL065C,cmpl,cmpl,0
4,585,NM_001180042.1,chrI,+,12045,12426,12045,12426,1,12045,12426,0,YAL064W-B,cmpl,cmpl,0


In [11]:
def get_transcript_ditv(chrom, strand, exon_starts, exon_ends):
    """Build one ``Interval`` per exon of a transcript.

    Parameters
    ----------
    chrom
        Chromosome name, passed unchanged to every ``Interval``.
    strand
        Strand symbol, passed unchanged to every ``Interval``.
    exon_starts : str
        Comma-separated integer start coordinates; trailing commas are
        ignored.
    exon_ends : str
        Comma-separated integer end coordinates, parallel to
        ``exon_starts``.

    Returns
    -------
    list
        ``Interval(chrom, strand, start, end)`` objects, in the order the
        coordinates are listed.

    Raises
    ------
    ValueError
        If a coordinate is not an integer, or if the two fields hold a
        different number of coordinates.
    """
    starts = [int(x) for x in exon_starts.rstrip(',').split(',')]
    ends = [int(x) for x in exon_ends.rstrip(',').split(',')]
    # zip() would silently drop the unmatched exons, so fail loudly instead.
    if len(starts) != len(ends):
        raise ValueError(
            'exon_starts and exon_ends differ in length: %d vs %d'
            % (len(starts), len(ends)))
    return [Interval(chrom, strand, s, e) for s, e in zip(starts, ends)]

In [6]:
# Genome handle built from the FASTA directory; later cells read its
# chrom_sizes attribute and call its dna() method.
genome = FastaGenome('raw_data/fasta/')

In [28]:
# Display the chromosome-name -> size mapping held by the genome object
# (sizes are presumably sequence lengths in bases -- confirm in utils).
genome.chrom_sizes

{'chrI': 230218,
 'chrII': 813184,
 'chrIII': 316620,
 'chrIV': 1531933,
 'chrIX': 439888,
 'chrM': 85779,
 'chrV': 576874,
 'chrVI': 270161,
 'chrVII': 1090940,
 'chrVIII': 562643,
 'chrX': 745751,
 'chrXI': 666816,
 'chrXII': 1078177,
 'chrXIII': 924431,
 'chrXIV': 784333,
 'chrXV': 1091291,
 'chrXVI': 948066}

In [32]:
# Order the chromosome names from the smallest size to the largest.
chrom_sorted = sorted(genome.chrom_sizes, key=genome.chrom_sizes.get)
print(chrom_sorted)

# Deal the sorted names round-robin into K groups: group k takes every
# K-th name, starting at position k.
K = 5
for k in range(K):
    group = chrom_sorted[k::K]
    print('[%s]' % ', '.join(group))

['chrM', 'chrI', 'chrVI', 'chrIII', 'chrIX', 'chrVIII', 'chrV', 'chrXI', 'chrX', 'chrXIV', 'chrII', 'chrXIII', 'chrXVI', 'chrXII', 'chrVII', 'chrXV', 'chrIV']
[chrM, chrVIII, chrII, chrXV]
[chrI, chrV, chrXIII, chrIV]
[chrVI, chrXI, chrXVI]
[chrIII, chrX, chrXII]
[chrIX, chrXIV, chrVII]


In [37]:
# Scratch check: indexing with np.newaxis turns the length-4 vector into a
# (4, 1) column array.
x = np.asarray([1.0, 2, 3, 4])[:, np.newaxis]
# print() call form: same text on Python 2, and valid syntax on Python 3.
print(x.shape)

(4, 1)


In [7]:
# Keep only the transcripts whose CDS start status and CDS end status are
# both 'cmpl'.
cds_status = df_anno[['cds_start_stat', 'cds_end_stat']]
is_complete = (cds_status == 'cmpl').all(axis=1)
df_anno = df_anno[is_complete]

In [8]:
# Number of transcripts left after the filter.
df_anno.shape[0]

98

In [9]:
# Gzip-compressed wig files, one named PLUS and one named Minus.
plus_wig = 'raw_data/dms/GSE45803_Feb13_VivoAllextra_1_15_PLUS.wig.gz'
minus_wig = 'raw_data/dms/GSE45803_Feb13_VivoAllextra_1_15_Minus.wig.gz'

# np.nan is handed to WigTrack as its fourth argument (presumably the value
# used for positions that have no signal -- confirm in utils).
wig_track = WigTrack(genome.chrom_sizes, plus_wig, minus_wig, np.nan)

In [12]:
# get ditv: add a 'ditv' column holding the list of exon Intervals that
# get_transcript_ditv builds from the chrom / strand / exon_starts /
# exon_ends columns (add_column presumably calls it once per row -- confirm).
df_anno = add_column(df_anno, 'ditv', ['chrom', 'strand', 'exon_starts', 'exon_ends'], get_transcript_ditv)

Processing ditv: 100%|██████████| 98/98 [00:00<00:00, 12557.80it/s]


In [13]:
def _lookup_sequence(ditv):
    """Return whatever ``genome.dna`` yields for the given ``ditv`` value."""
    return genome.dna(ditv)


# add sequence: new 'sequence' column computed from the 'ditv' column.
df_anno = add_column(df_anno, 'sequence', ['ditv'], _lookup_sequence)

Processing sequence: 100%|██████████| 98/98 [00:00<00:00, 14308.55it/s]


In [14]:
def _lookup_signal(ditv):
    """Return whatever indexing ``wig_track`` with ``ditv`` yields."""
    return wig_track[ditv]


# add data: new 'data' column computed from the 'ditv' column.
df_anno = add_column(df_anno, 'data', ['ditv'], _lookup_signal)

Processing data: 100%|██████████| 98/98 [00:00<00:00, 15241.27it/s]


In [49]:
# For each of the first 50 transcripts, print the bases found at positions
# whose data value is not NaN, then the A+C count and the number of such
# positions, then the A+C fraction, then a blank separator line.
for _, row in df_anno.head(50).iterrows():
    seq = row['sequence']
    vals = row['data']
    # Positions that carry a value (i.e. are not NaN).
    idx = np.where(~np.isnan(vals))[0]
    if len(idx) > 0:
        seq_with_val = ''.join(seq[i] for i in idx)
        # Count once and reuse for both the summary line and the fraction.
        n_ac = seq_with_val.count('A') + seq_with_val.count('C')
        n_total = len(seq_with_val)
        # print() call form with explicit formatting emits the same text as
        # the Python 2 print statements and is also valid on Python 3.
        print(seq_with_val)
        print('%d %d' % (n_ac, n_total))
        print(n_ac / float(n_total))
        print('')
    
    

TGGCCTTGTACTTAAAAAAACAGATATTCACTTCTTTGTAATGTTTTATAGAGACGTCATAGACCTGACTACTAATTTGGGGCATGAGTTCACTTGTAGGTTATTTAAGCAGTTAA
52 116
0.448275862069

AAAGCCGGGCGTCGCTAAGCGAAATGAGAGAGAGAGAAGCAGTCGCTGCAATACAGTAAGAACTTTGAGCTAGAGCAACGTGAGGTGGACGGATTAACAGCCAGGTTGGAATTTGCGTTCAAGACTTACTAGAATCTTAGACGGGCTTCATTTTAACCAGGGAAACGCAATTCGAGATAATTAGTAAGTGGTTTGAGATATTGATCTGAAATGTTGCAAGTCGAAGGGAGTATCTCAGGAGTAGCGGACGCATCGATGACGCACCCCAAAACGGCGAGAGGTTGAGAGACATTCACGTTGACAGGCCATAAAGTGCACAAGAGTACCGGTCTAATGATAAAAAGACTCTGGGGTAAGACTAAGAGTATTAACTTAAGCTCAGCACAAGGCCAAATCAAACACCATATAAGAAATAGTGATTCGAATTCAGGCGG
226 434
0.520737327189

TGTGAACTCAAAGGCACTTCGTGTTTGATTGCCTAGCCAGCCAATCAAGATACCACCAGACGACGCAAAACTAAGCGCAGCTAGGCCGTCCCTGCACCAGATGTAAAATTACATATTGAGGTAGATAATACTCAAAGCCCTGCCGCGATAACCGTATAATGTGGAAAGTCGTTATATGCTACTCACGAGAGCAGGGGTAGAGAGGACTTAAAAATCAGCGCAACTGACCAGGGATTTTGGTTTGCGGTCACATTAGACATATTTAGACTAGGACGAGACTATTACATCAGTGTTACATATAAGAAGCACAGCATCACACACGTGGATGAGAGAGAGTCATTGGAAGCATTTACGAGAACAGCCTTATATAGTTAACGGAGTCAATATTAACAATACGGGAC

In [27]:
# tmp = df_anno[['name', 'name_2', 'sequence', 'data', 'chrom', 'strand', 'tx_start', 'tx_end']].iloc[20]
# print tmp
# print tmp['sequence']
# print tmp['data'].tolist()

In [None]:
# Probe the track with a single plus-strand interval on chrI (31155 to 31180).
wig_track[Interval('chrI', '+', 31155, 31180)]

In [None]:
# Probe with a list of two plus-strand intervals on chrI (31155 to 31158 and
# 31176 to 31180), i.e. the two ends of the 31155 to 31180 range
# (presumably to check how multi-interval lookups are combined -- confirm).
wig_track[[Interval('chrI', '+', 31155, 31158), Interval('chrI', '+', 31176, 31180)]]

In [None]:
# Probe the track with a single minus-strand interval on chrI (7339 to 7359).
wig_track[Interval('chrI', '-', 7339, 7359)]

In [None]:
# Probe with a list of two minus-strand intervals on chrI (7339 to 7342 and
# 7354 to 7359), i.e. the two ends of the 7339 to 7359 range
# (presumably to check ordering of minus-strand multi-interval lookups -- confirm).
wig_track[[Interval('chrI', '-', 7339, 7342), Interval('chrI', '-', 7354, 7359)]]