In [1]:
import os
import re
import pandas as pd
import time
def getTime():
    return time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time()))

## Goal

 For a given chromosome *chr* and genomic distance d, this function maps conservation scores to all regions within d bp from the TSS of every gene in chromosome *chr*.
 * parse the conservation dataset to extract the relevant values according to genomic loci
 * do some validationt to verify that the anticipated output matches the actual output

## Alter the R code to python and fix bugs

* function1: parse wiggle headers
        - Parameters: chr
        - Output: a table containing header info of one .wig file ( 'start' col)

* function2: iterate through the whole wiggle for once, and fill in the result matrix 
    - save to files instead of only returning a dataFrame

* further: support BED

### wiggle parser
* 'readline' method: reads the entire file contents into memory first, so no
```
print open('xxx.txt').readlines()[:7][-1]
```

* Instead, use the original file iterator, which is more performant for large files.
    - using 'islice' is slower than 'with open()'
    
* deal with 'fixedStep' of wiggle only(for now)

In [11]:
# ParseError: Raised if the wiggle track has invalid content.
class ParseError(Exception):
    pass 

# state: wiggle track header info(dictionary) mode = 0 for fixed; 1 for variable
def createState():
    return dict(mode=0, span=1, start=None, step=None)

# ********wiggleHeaderReader*******
# parameter: a wiggle header line;   output: a dictionary
# TODO: put 'span' into consideration (an optional parameter for wiggle format)
def wiggleHeaderReader(line, state):
        if line[:12]=='variableStep':
            try:
                fields = dict(map(lambda field: field.split('='), line[len('variableStep'):].split()))
                state['mode'] = 1
                state['span'] = int(fields.get('span', 1))
                return fields.get('chrom')
            except (ValueError, KeyError):
                raise parseError('Could not parse header: %s'%line)
        elif line[:9]=='fixedStep':
            try:
                fields = dict(map(lambda field: field.split('='), line[len('fixedStep'):].split()))
                state['mode'] = 1
                state['step'] = int(fields.get('step', 1))
                state['span'] = int(fields.get('span', 1))
                state['start'] = int(fields.get('start', 1))
                return fields.get('chrom')
            except(ValueError, KeyError):
                raise ParseError('Could not parse header: %s'%line)

                
# ******wiggleParser()******
# parameter: wiggle file path;    output: a table containing line numbers of all headers and their info(start, step)
def wiggleParser(wigglepath):
    state = createState()
    df = pd.DataFrame(columns=['line', 'step'])
    with open(wigglepath) as f:
        for i, line in enumerate(f):
            if line[:9] == 'fixedStep':
                chrom = wiggleHeaderReader(line, state)
                temp = pd.DataFrame({'step':state.get('step'), 'line':i}, index=[state.get('start')])
                df = df.append(temp)
            # empty lines and comments may exist?
            elif line in ('\n', '\r\n', '\r')  or line[0] in '0123456789.-' or line[:7] == 'browser'  or line[:5] == 'track' or line[0] == '#':
                pass
            else:
                raise ParseError('Invalid content: %s, line number:%d'%(line, i))
    return  df,chrom

#********getWiggleLine(tss)**********
# parameter: tss of interest(array); wiggle dataframe  output: relevant line number in wiggle track 
def getWiggleLine(TSS, df):
    index = df.index.tolist() 
    cur = 0; line = 0; step = 1; TSS_map = []
    for i in TSS:
        # TODO:
        while i>index[cur+1]:
            cur+=1
            line = df.loc[index[cur]]['line']
            step = df.loc[index[cur]]['step']
        l = line + (i-index[cur])/step
        TSS_map.append(l)
    return TSS_map
            
# ******get_cons_score()******
# Combine two R functions together. Since the TSS coordinates in a chromosome are in ascending order, the matrix can be 
# obtained by one iteration throughout the wiggle file.
# parameters: filepath, transcripts annotation file path(any standard gtf containing 'start' and 'seqname'), desired distance d
# TODO: deal with inconsistent 'step'
def getConsScore(wigpath, transPath, d):
    df, chrom = wiggleParser(wigpath)
    chrom = int( re.sub('\D','',chrom) ) # 'chr1' -> 1
    headers = df.index.tolist() # each start site in wiggle track
#    trans = pd.read_csv('transcript.GRCh38.92.gtf', sep='\t', usecols=['seqnames','transcript_id','gene_id','start','end'])
    trans = pd.read_csv(transPath, sep='\t', usecols=['seqnames','transcript_id','gene_id','start','end'])
    trans = trans[trans['seqnames'==chrom]]
    TSS = trans['start'].tolist()
    TSS_map = getWiggleLine(TSS, df)
    nTSS = len(TSS)
    # iterate TSS array: enumerate through wiggle track, for each TSS, return a list of length d
    # To fill in the matrix with 1 iteration, consider when 'd' is large or neighboring TSS are close, there will be intersections.
    df = pd.DataFrame(columns = ['chr', 'gene_id', 'transcript_id', 'start', 'end'] + list(range(d+1)))
    with open(wigpath) as f:
        start, end = 0, 0
        for i, line in enumerate(f):
            if i<TSS_map[0]-d-1:
                continue
            while TSS_map[start]<i:
                start+=1
            while TSS_map[end+1]-d<i:
                end+=1
        k = start
        # initiate dataframe, col:0-d
        while k<=end:
            ind = df.set_value('row', 'col', value)
            k+=1
            
    

In [8]:
print(getTime())
#df, chrom = wiggleParser('chr21')
df, chrom = wiggleParser('wig/chr21.phyloP30way.wigFix')
print(getTime())

2018-08-21 11:05:54
2018-08-21 11:06:07


In [9]:
state = createState()
wiggleHeaderReader('wig/chr1.phyloP30way.wigFix', state)
state

{'mode': 0, 'span': 1, 'start': None, 'step': None}

In [3]:
trans = pd.read_csv('transcript.GRCh38.92.gtf', sep='\t', usecols=['seqnames','transcript_id','gene_id','start','end'])

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
type(trans.iloc[:,0])
len(trans.iloc[:,0])
# the seqnames problem is probably due to multiple datatype in one columns
# record bugs from now on as a part of work diary

203742

In [2]:
# print(df.head(10))
# chrom = 21
# index = df.index.tolist()  #1
# trans = pd.read_csv('transcript.GRCh38.92.gtf', sep='\t', usecols=['seqnames','transcript_id','gene_id','start','end'])
# trans = trans[trans.seqnames=='21']
# TSS = trans['start'].tolist() #2
# #df.loc[index[2]]['line']
# cur = 0
# line = 0
# step = 1
# k = 0
# for i in TSS:
#     # TODO:
#     while i>index[cur]:
#         cur+=1
#         line = df.loc[index[cur]]['line']
#         step = df.loc[index[cur]]['step']
#     l = line + (i-index[cur])/step
#     print(l)
#     k += 1
#     if k>10: break
# print(cur)
# print(index[cur])
# print(i)