In [16]:
import os
import re
import pandas as pd
import time
import pdb
def getTime():
    return time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time()))

## Goal

 For a given chromosome *chr* and genomic distance d, this function maps conservation scores to all regions within d bp from the TSS of every gene in chromosome *chr*.
 * parse the conservation dataset to extract the relevant values according to genomic loci
 * do some validationt to verify that the anticipated output matches the actual output

## Alter the R code to python and fix bugs

* function1: parse wiggle headers
        - Parameters: chr
        - Output: a table containing header info of one .wig file ( 'start' col)

* function2: iterate through the whole wiggle for once, and fill in the result matrix 
    - save to files instead of only returning a dataFrame

* further: support BED

### wiggle parser
* 'readline' method: reads the entire file contents into memory first, so no
```
print open('xxx.txt').readlines()[:7][-1]
```

* Instead, use the original file iterator, which is more performant for large files.
    - using 'islice' is slower than 'with open()'
    
* deal with 'fixedStep' of wiggle only(for now)

In [107]:
# ParseError: Raised if the wiggle track has invalid content.
class ParseError(Exception):
    pass 

# state: wiggle track header info(dictionary) mode = 0 for fixed; 1 for variable
def createState():
    return dict(mode=0, span=1, start=None, step=None)

# ********wiggleHeaderReader*******
# parameter: a wiggle header line;   output: a dictionary
# TODO: put 'span' into consideration (an optional parameter for wiggle format)
def wiggleHeaderReader(line, state):
        if line[:12]=='variableStep':
            try:
                fields = dict(map(lambda field: field.split('='), line[len('variableStep'):].split()))
                state['mode'] = 1
                state['span'] = int(fields.get('span', 1))
                return fields.get('chrom')
            except (ValueError, KeyError):
                raise parseError('Could not parse header: %s'%line)
        elif line[:9]=='fixedStep':
            try:
                fields = dict(map(lambda field: field.split('='), line[len('fixedStep'):].split()))
                state['mode'] = 1
                state['step'] = int(fields.get('step', 1))
                state['span'] = int(fields.get('span', 1))
                state['start'] = int(fields.get('start', 1))
                return fields.get('chrom')
            except(ValueError, KeyError):
                raise ParseError('Could not parse header: %s'%line)
                
# ******wiggleParser()******
# parameter: wiggle file path;    output: a table containing line numbers of all headers and their info(start, step)
def wiggleParser(wigglepath):
    state = createState()
    df = pd.DataFrame(columns=['line', 'step'])
    with open(wigglepath) as f:
        for i, line in enumerate(f):
            if line[:9] == 'fixedStep':
                chrom = wiggleHeaderReader(line, state)
                # 'i+1' is because the score starts at next line
                temp = pd.DataFrame({'step':state.get('step'), 'line':i+1}, index=[state.get('start')])
                df = df.append(temp)
            # empty lines and comments may exist?
            elif line in ('\n', '\r\n', '\r')  or line[0] in '0123456789.-' or line[:7] == 'browser'  or line[:5] == 'track' or line[0] == '#':
                continue
            else:
                raise ParseError('Invalid content: %s, line number:%d'%(line, i))
    return  df,chrom

#********getWiggleLine(tss)**********
# parameter: tss of interest(array); wiggle dataframe  output: relevant line number in wiggle track 
def getWiggleLine(TSS, df):
    index = df.index.tolist() 
    cur = 0; line = 0; step = 1; TSS_map = []
    for i in TSS:
        # '>=' is vital ('>' makes the mapping from TSS to TSS_line_number_in_wig go wrong)
        while i>=index[cur+1] and cur<len(index)-2:
            cur+=1
            line = df.loc[index[cur]]['line']
            step = df.loc[index[cur]]['step']
        if cur==len(index)-1:
            print(i, index[cur], len(TSS), cur)
        l = line + (i-index[cur])/step+1 # '+1'
        TSS_map.append(l)
        #if i==6631798 or i==6630182:
            #import pdb; pdb.set_trace()
    return TSS_map
            
# ******get_cons_score()******
# Combine two R functions together. Since the TSS coordinates in a chromosome are in ascending order, the matrix can be 
# obtained by one iteration throughout the wiggle file.
# parameters: filepath, transcripts annotation file path(any standard gtf containing 'start' and 'seqname'), desired distance d
# TODO: deal with inconsistent 'step'
def getConsScore(wigpath, transPath, d):
    df, chrom = wiggleParser(wigpath)
    chrom = re.sub('\D','',chrom)  # 'chr1' -> '1'
    trans = pd.read_csv(transPath, sep='\t', usecols=['seqnames','transcript_id','gene_id','start','end'], dtype={'seqnames':object})
    trans = trans[trans.seqnames==chrom]
    trans = trans.sort_values(by="start" , ascending=True) # TSS is not ascending in the gtf
    TSS = trans['start'].tolist()
    headers = df.line.tolist()
    TSS_map = getWiggleLine(TSS, df)
    ntrans = len(TSS)
    # iterate TSS array: enumerate through wiggle track, for each TSS, return a list of length d
    # To fill in the matrix with one iteration, consider when 'd' is large or neighboring TSS are close, there will be intersections.
    mat = pd.DataFrame(columns = list(range(d+1)))
    mat = pd.concat(  [trans.reset_index(drop=True), mat], axis=1  )
    start, end = 0, 0
    with open(wigpath) as f:
        for i, line in enumerate(f):
            if i<TSS_map[0]-d or line.startswith('fix'):
                continue
            while start<=ntrans-1 and TSS_map[start]<i:
                start += 1
            if end != ntrans-1:
                while end<ntrans-1 and TSS_map[end+1]-d<=i: 
                    end += 1
            k = start
            gap = 0
            while k<=end:
                try:
                    # gap is for comment lines between the ith line and TSS[k](sometimes different transcripts have same TSS)                        
                    gap = len(  [x for x in headers if x>i and x<=TSS_map[k] ])
                    mat.set_value(k, TSS_map[k]-i-gap, float(line))
                    #print('start ',start, ' end ' ,end, ' i ',  i, ' k ', k, 'TSS[k]: ',TSS[k], 'TSS_map[k] ', TSS_map[k],' column ', TSS_map[k]-i-gap, gap, float(line), sep=', ')
                    k+=1
                except 'ValueError':
                    print('An error has occured: wiggle track line:%s, line number: %s'%(line, str(i)))
    return mat

In [108]:
print(getTime())
df = getConsScore(transPath='transcript.GRCh38.92.gtf', wigpath='wig/chr16.phyloP30way.wigFix', d=30)
print(getTime())

2018-08-24 23:04:22
2018-08-24 23:09:18


In [5]:
print(getTime())
df2, chrom = wiggleParser('wig/chr1.phyloP30way.wigFix')
print(getTime())

2018-08-24 03:34:28
2018-08-24 03:35:47


In [111]:
df
#df.to_csv('chr16_cons_score.csv', sep = '\t')

Unnamed: 0,seqnames,start,end,gene_id,transcript_id,0,1,2,3,4,...,21,22,23,24,25,26,27,28,29,30
0,16,11555,14090,ENSG00000233614,ENST00000513886,0.311,-0.521,0.381,0.336,0.336,...,0.311,0.338,0.338,-2.784,-0.649,0.338,-1.558,0.338,-0.537,-0.657
1,16,11861,13351,ENSG00000233614,ENST00000430178,0.338,-2.32,-0.763,0.266,0.255,...,-0.861,-0.843,-0.651,-0.752,0.275,0.337,0.298,0.337,-0.892,0.298
2,16,14381,18068,ENSG00000234769,ENST00000564273,0.238,0.238,0.238,0.238,-0.841,...,-0.709,0.268,0.268,0.268,-0.764,0.259,0.238,0.259,0.268,0.412
3,16,17052,17119,ENSG00000278739,ENST00000615957,0.342,0.342,0.422,0.372,0.386,...,0.422,0.342,0.342,0.372,0.386,-1.559,0.386,-0.489,0.386,0.422
4,16,17514,35195,ENSG00000260803,ENST00000568710,-0.534,0.386,-0.545,0.342,0.372,...,0.342,0.342,0.342,0.331,-0.42,0.342,-0.547,0.386,0.372,0.342
5,16,22910,25123,ENSG00000231439,ENST00000527434,-1.789,-1.009,-0.56,0.375,0.305,...,0.375,0.305,-0.56,0.305,0.375,0.305,0.341,-1.195,-0.921,0.305
6,16,29336,38321,ENSG00000226942,ENST00000442856,-0.005,-0.058,-0.546,-0.724,0.013,...,-0.34,0.057,0.138,0.028,0.03,1.048,0.827,1.049,0.943,-0.297
7,16,46407,53628,ENSG00000161980,ENST00000293860,-0.434,0.452,0.452,0.342,0.374,...,0.384,0.421,0.342,0.374,-0.894,0.349,-0.903,0.349,0.341,-0.94
8,16,47236,52142,ENSG00000161980,ENST00000481810,0.856,-1.487,0.769,-0.034,0.767,...,0.615,0.679,-0.154,0.548,0.476,0.588,-0.139,-0.828,-1.282,-0.099
9,16,53010,57669,ENSG00000161981,ENST00000293861,0.433,0.488,-0.345,0.488,-3.315,...,0.338,0.537,-0.441,0.433,0.488,0.474,0.31,-0.28,0.537,-0.478


In [98]:
l = list(range(1,23))
l = l+['X','Y']
#trans = pd.read_csv('Homo_sapiens.GRCh38.92.gtf', sep='\t', usecols=['seqnames','transcript_id','gene_id','start','end'])
trans = pd.read_csv('transcript.GRCh38.92.gtf', sep='\t', dtype={'seqnames':object})
chr_trans = [len(trans[trans.seqnames == str(x)]) for x in l]
for i in range(0,len(l)):
    print(l[i], chr_trans[i])

1 18062
2 14754
3 12391
4 8103
5 9417
6 8917
7 9722
8 7919
9 6823
10 6707
11 12642
12 11643
13 3365
14 7574
15 7506
16 10137
17 12760
18 3729
19 12889
20 4471
21 2446
22 4578
X 6345
Y 738
