In [49]:
import subprocess
import pandas as pd
import numpy as np
import re
import itertools

# Part I, add GC related parameters

#### Step 0, create the GC 30-60 model files from the uniform base model

In [50]:
baseModel = '/home/richard/research/tests/Oct29AssembleModel/models/baseModelV1.txt'
modelSetPath = '/home/richard/research/tests/Oct29AssembleModel/modelSet/'

# Seed one model file per GC value 30..60 (inclusive) with a copy of the base model.
for i in range(30,61):
    target = '{}MetaFungal_model_{}.txt'.format(modelSetPath, i)
    # Pass an argument list instead of a shell string: no shell is spawned, so the
    # paths reach `cp` verbatim and are never re-parsed (spaces, globs, ';' ...).
    # check_call still raises CalledProcessError when `cp` exits non-zero.
    subprocess.check_call(['cp', baseModel, target])

In [51]:
# File-name template for the per-GC model files; the '{}' slot is filled with the
# GC value through filePattern.format(gc) in the cells below.
filePattern = 'MetaFungal_model_{}.txt'

#### Step 1, add markov matrix

In [52]:
## prepare gc_matrixTable pair
# Builds markov_gc_table_pair: GC value (int) -> text table that later replaces
# the ##MARKOV## placeholder of the matching model file.

# The 64 trinucleotides in 'ACGT' product order; used as the row labels.
# A real list (not a lazy map object) so it works on Python 2 and Python 3.
letters = [''.join(triplet) for triplet in itertools.product('ACGT', repeat=3)]

with open('/home/richard/research/tests/Oct29AssembleModel/partsForModel/EukMGM.mod') as f:
    EukMGMtext = f.read()
splitedEukMGMtext = EukMGMtext.split('__GC')

markov_gc_table_pair = {}
# NOTE(review): the [10:-10] slice skips the first 10 and last 10 pieces of the
# split (piece 0 is whatever precedes the first '__GC'); presumably this keeps
# the sections for GC 30-60 -- confirm against the EukMGM.mod layout.
for item in splitedEukMGMtext[10:-10]:

    # The first whitespace-separated token after '__GC' is the GC value.
    gc = int(item.split()[0])

    # Coding matrix: the numbers between the 'COD1' and 'NONC' markers.
    coding_mat = [float(value) for value in item.split('COD1')[1].split('NONC')[0].split()]
    assert not any(value < 0 for value in coding_mat)  ## check non-negative

    # Three values per row; reshape raises ValueError if the count is not a multiple of 3.
    table = pd.DataFrame(np.array(coding_mat).reshape(-1, 3))

    # Non-coding matrix: the numbers between the 'NONC' and 'COD2' markers.
    # The same column is appended twice (as columns '3' and '4').
    non_coding_mat = [float(value) for value in item.split('NONC')[1].split('COD2')[0].split()]
    assert not any(value < 0 for value in non_coding_mat)
    table['3'] = non_coding_mat
    table['4'] = non_coding_mat

    # Complete the table and render it as a string to substitute into the model file.
    # Assigning the index requires exactly len(letters) == 64 rows.
    table.index = letters
    table_string = table.to_string(header=False)

    markov_gc_table_pair[gc] = table_string

In [53]:
## substitute table into model file
# For every GC model file, replace the line carrying the ##MARKOV## placeholder
# with a '$MARKOV' header followed by the Markov table prepared for that GC.
markov_tag = '##MARKOV##'
for gc in range(30,61):
    filename = filePattern.format(gc)

    with open(modelSetPath+filename) as f:
        lines = f.readlines()

    ## locate position (the last line that contains the tag)
    tagged_lines = [line for line in lines if markov_tag in line]
    if not tagged_lines:
        # Fail loudly instead of substituting with an empty or stale pattern:
        # an empty regex pattern would insert the table between every character.
        raise ValueError('{} not found in {}'.format(markov_tag, modelSetPath+filename))
    markov_line = tagged_lines[-1]

    # Literal replacement: the placeholder line is plain text, not a regex, and
    # the table must be inserted verbatim.
    param = '$MARKOV\n'
    text_sub = ''.join(lines).replace(markov_line, param+markov_gc_table_pair[gc]+'\n')

    with open(modelSetPath+filename,'w') as f:
        f.write(text_sub)

#### Step 2, add donor

In [54]:
## prepare gc_matrixTable_pair
# Builds donor_gc_table_pair: GC value (int) -> donor matrix rendered as text.
donorMatrixLocation = '/home/richard/research/tests/Oct29AssembleModel/partsForModel/trend_donor/'
donor_gc_table_pair = {}
for gc in range(30,61):
    # dtype=float: np.float was only an alias of the builtin float and no longer
    # exists in NumPy >= 1.24.
    table = pd.read_csv('{}gc{}.mat'.format(donorMatrixLocation, gc), dtype=float, header=None)
    # One row per nucleotide; the assignment raises unless the file has exactly 4 rows.
    table.index = list('TCAG')
    # Clamp negative entries to 0.
    table = table.applymap(lambda x: max(x, 0))
    donor_gc_table_pair[gc] = table.to_string(header=False)

In [55]:
## substitute table into model file
# For every GC model file, replace the three ##DONOR_<i>_MAT## placeholder lines
# (i = 0, 1, 2) with a '$DONOR_<i>_MAT' header followed by the donor table.
for gc in range(30,61):
    filename = filePattern.format(gc)

    with open(modelSetPath+filename) as f:
        lines = f.readlines()
    text_sub = ''.join(lines)

    for i in range(3):
        ## locate position (the last line that contains the tag)
        donor_tag = '##DONOR_{}_MAT##'.format(i)
        tagged_lines = [line for line in lines if donor_tag in line]
        if not tagged_lines:
            # Fail loudly before anything is written: an empty or stale pattern
            # would otherwise corrupt or silently skip the file.
            raise ValueError('{} not found in {}'.format(donor_tag, modelSetPath+filename))
        donor_line = tagged_lines[-1]

        # Literal replacement: the placeholder line is plain text, not a regex.
        param = '$DONOR_{}_MAT\n'.format(i)
        text_sub = text_sub.replace(donor_line, param+donor_gc_table_pair[gc]+'\n')

    # Write once, after all three placeholders have been resolved.
    with open(modelSetPath+filename,'w') as f:
        f.write(text_sub)

#### Step 3, add long acceptor

In [56]:
## prepare gc_matrixTable_pair
# Builds acceptor_gc_table_pair: GC value (int) -> long acceptor matrix rendered as text.
acceptorMatrixLocation = '/home/richard/research/tests/Oct29AssembleModel/partsForModel/trend_acceptor_long/'
acceptor_gc_table_pair = {}
for gc in range(30,61):
    # dtype=float: np.float was only an alias of the builtin float and no longer
    # exists in NumPy >= 1.24.
    table = pd.read_csv('{}gc{}.mat'.format(acceptorMatrixLocation, gc), dtype=float, header=None)
    # One row per nucleotide; the assignment raises unless the file has exactly 4 rows.
    table.index = list('TCAG')
    # Clamp negative entries to 0.
    table = table.applymap(lambda x: max(x, 0))
    acceptor_gc_table_pair[gc] = table.to_string(header=False)

In [57]:
## substitute table into model file
# For every GC model file, replace the three ##ACCEPTOR_<i>_MAT## placeholder
# lines (i = 0, 1, 2) with a '$ACCEPTOR_<i>_MAT' header followed by the table.
for gc in range(30,61):
    filename = filePattern.format(gc)

    with open(modelSetPath+filename) as f:
        lines = f.readlines()
    text_sub = ''.join(lines)

    for i in range(3):
        ## locate position (the last line that contains the tag)
        acceptor_tag = '##ACCEPTOR_{}_MAT##'.format(i)
        tagged_lines = [line for line in lines if acceptor_tag in line]
        if not tagged_lines:
            # Fail loudly before anything is written: an empty or stale pattern
            # would otherwise corrupt or silently skip the file.
            raise ValueError('{} not found in {}'.format(acceptor_tag, modelSetPath+filename))
        acceptor_line = tagged_lines[-1]

        # Literal replacement: the placeholder line is plain text, not a regex.
        param = '$ACCEPTOR_{}_MAT\n'.format(i)
        text_sub = text_sub.replace(acceptor_line, param+acceptor_gc_table_pair[gc]+'\n')

    # Write once, after all three placeholders have been resolved.
    with open(modelSetPath+filename,'w') as f:
        f.write(text_sub)

#### Step 4, add short acceptor

In [58]:
## prepare gc_matrixTable_pair
# Builds shortAcceptor_gc_table_pair: GC value (int) -> short acceptor matrix rendered as text.
shortAcceptorMatrixLocation = '/home/richard/research/tests/Oct29AssembleModel/partsForModel/trend_acceptor/'
shortAcceptor_gc_table_pair = {}
for gc in range(30,61):
    # dtype=float: np.float was only an alias of the builtin float and no longer
    # exists in NumPy >= 1.24.
    table = pd.read_csv('{}gc{}.mat'.format(shortAcceptorMatrixLocation, gc), dtype=float, header=None)
    # One row per nucleotide; the assignment raises unless the file has exactly 4 rows.
    table.index = list('TCAG')
    # Clamp negative entries to 0.
    table = table.applymap(lambda x: max(x, 0))
    shortAcceptor_gc_table_pair[gc] = table.to_string(header=False)

In [59]:
## substitute table into model file
# For every GC model file, replace the three ##ACC_BP_<i>_MAT## placeholder lines
# (i = 0, 1, 2) with a '$ACC_BP_<i>_MAT' header followed by the short acceptor table.
for gc in range(30,61):
    filename = filePattern.format(gc)

    with open(modelSetPath+filename) as f:
        lines = f.readlines()
    text_sub = ''.join(lines)

    for i in range(3):
        ## locate position (the last line that contains the tag)
        shortAcceptor_tag = '##ACC_BP_{}_MAT##'.format(i)
        tagged_lines = [line for line in lines if shortAcceptor_tag in line]
        if not tagged_lines:
            # Fail loudly before anything is written: an empty or stale pattern
            # would otherwise corrupt or silently skip the file.
            raise ValueError('{} not found in {}'.format(shortAcceptor_tag, modelSetPath+filename))
        shortAcceptor_line = tagged_lines[-1]

        # Literal replacement: the placeholder line is plain text, not a regex.
        param = '$ACC_BP_{}_MAT\n'.format(i)
        text_sub = text_sub.replace(shortAcceptor_line, param+shortAcceptor_gc_table_pair[gc]+'\n')

    # Write once, after all three placeholders have been resolved.
    with open(modelSetPath+filename,'w') as f:
        f.write(text_sub)

#### Step 5, add branch point

In [60]:
## prepare gc_matrixTable_pair
# Builds bp_gc_table_pair: GC value (int) -> branch point matrix rendered as text.
bpMatrixLocation = '/home/richard/research/tests/Oct29AssembleModel/partsForModel/trend_bp/'
bp_gc_table_pair = {}
for gc in range(30,61):
    # dtype=float: np.float was only an alias of the builtin float and no longer
    # exists in NumPy >= 1.24.
    table = pd.read_csv('{}gc{}.mat'.format(bpMatrixLocation, gc), dtype=float, header=None)
    # One row per nucleotide; the assignment raises unless the file has exactly 4 rows.
    table.index = list('TCAG')
    # Clamp negative entries to 0.
    table = table.applymap(lambda x: max(x, 0))
    bp_gc_table_pair[gc] = table.to_string(header=False)

In [61]:
## substitute table into model file
# For every GC model file, replace the line carrying the ##BRANCH_MAT## placeholder
# with a '$BRANCH_MAT' header followed by the branch point table for that GC.
bp_tag = '##BRANCH_MAT##'
for gc in range(30,61):
    filename = filePattern.format(gc)

    with open(modelSetPath+filename) as f:
        lines = f.readlines()

    ## locate position (the last line that contains the tag)
    tagged_lines = [line for line in lines if bp_tag in line]
    if not tagged_lines:
        # Fail loudly instead of substituting with an empty or stale pattern:
        # an empty regex pattern would insert the table between every character.
        raise ValueError('{} not found in {}'.format(bp_tag, modelSetPath+filename))
    bp_line = tagged_lines[-1]

    # Literal replacement: the placeholder line is plain text, not a regex.
    param = '$BRANCH_MAT\n'
    text_sub = ''.join(lines).replace(bp_line, param+bp_gc_table_pair[gc]+'\n')

    with open(modelSetPath+filename,'w') as f:
        f.write(text_sub)

#### Step 6, add spacer emission probability

In [62]:
## prepare gc_matrixTable_pair
# Builds spacer_gc_table_pair: GC value (int) -> spacer emission column rendered
# as text, one "<label> <value>" row per input row.
spacerMatrixLocation = '/home/richard/research/tests/Oct29AssembleModel/partsForModel/spacer_emission/'
spacer_gc_table_pair = {}
for gc in range(30,61):
    # First line of the file is skipped; remaining lines are space-separated, no header.
    table = pd.read_csv('{}spacer_emission_{}.txt'.format(spacerMatrixLocation, gc),
                        header=None, sep=' ', skiprows=1)
    # Column 0 holds the row labels (text); strip their leading whitespace.
    labels = table[0].str.lstrip()
    # Clamp negatives to 0 on the emitted value column only. Applying max(x, 0) to
    # the text labels as well only worked through Python 2's mixed-type ordering
    # and raises TypeError on Python 3.
    values = table.iloc[:, 1].map(lambda x: max(x, 0))
    values.index = labels
    spacer_gc_table_pair[gc] = values.to_string(header=False)

In [63]:
## substitute table into model file
# For every GC model file, replace the line carrying the ##MARKOV_BP_SPACER##
# placeholder with a '$MARKOV_BP_SPACER' header followed by the spacer table.
spacer_tag = '##MARKOV_BP_SPACER##'
for gc in range(30,61):
    filename = filePattern.format(gc)

    with open(modelSetPath+filename) as f:
        lines = f.readlines()

    ## locate position (the last line that contains the tag)
    tagged_lines = [line for line in lines if spacer_tag in line]
    if not tagged_lines:
        # Fail loudly instead of substituting with an empty or stale pattern:
        # an empty regex pattern would insert the table between every character.
        raise ValueError('{} not found in {}'.format(spacer_tag, modelSetPath+filename))
    spacer_line = tagged_lines[-1]

    # Literal replacement: the placeholder line is plain text, not a regex.
    param = '$MARKOV_BP_SPACER\n'
    text_sub = ''.join(lines).replace(spacer_line, param+spacer_gc_table_pair[gc]+'\n')

    with open(modelSetPath+filename,'w') as f:
        f.write(text_sub)

#### Step 7, add start 

In [67]:
## prepare gc_matrixTable_pair
# Builds start_gc_table_pair: GC value (int) -> start matrix rendered as text.
startMatrixLocation = '/home/richard/research/tests/Oct29AssembleModel/partsForModel/trend_start/'
start_gc_table_pair = {}
for gc in range(30,61):
    # dtype=float: np.float was only an alias of the builtin float and no longer
    # exists in NumPy >= 1.24.
    table = pd.read_csv('{}gc{}_start.mat'.format(startMatrixLocation, gc), dtype=float, header=None)
    # One row per nucleotide; the assignment raises unless the file has exactly 4 rows.
    table.index = list('TCAG')
    # Clamp negative entries to 0.
    table = table.applymap(lambda x: max(x, 0))
    start_gc_table_pair[gc] = table.to_string(header=False)

In [68]:
## substitute table into model file
# For every GC model file, replace the line carrying the ##INI_MAT## placeholder
# with a '$INI_MAT' header followed by the start table for that GC.
# There is a single, un-numbered ##INI_MAT## tag, so no inner loop over i is
# needed (the tag and header contain no '{}' slot for .format to fill).
start_tag = '##INI_MAT##'
for gc in range(30,61):
    filename = filePattern.format(gc)

    with open(modelSetPath+filename) as f:
        lines = f.readlines()

    ## locate position (the last line that contains the tag)
    tagged_lines = [line for line in lines if start_tag in line]
    if not tagged_lines:
        # Fail loudly instead of substituting with an empty or stale pattern:
        # an empty regex pattern would insert the table between every character.
        raise ValueError('{} not found in {}'.format(start_tag, modelSetPath+filename))
    start_line = tagged_lines[-1]

    # Literal replacement: the placeholder line is plain text, not a regex.
    param = '$INI_MAT\n'
    text_sub = ''.join(lines).replace(start_line, param+start_gc_table_pair[gc]+'\n')

    with open(modelSetPath+filename,'w') as f:
        f.write(text_sub)

# Part II, add GC and intron density related parameters

### I, low intron density

In [80]:
# File-name template for the low-intron-density model files. This rebinds
# filePattern, so any earlier cell that uses filePattern will pick up the '_low'
# template if it is re-run after this cell.
filePattern = 'MetaFungal_model_{}_low.txt'
# Directory that holds the '_low' model files.
modelSetPathLow = '/home/richard/research/tests/Oct29AssembleModel/modelSetLow/'
# Directory holding the low-density length-distribution text files read by the cells below.
targetLow = '/home/richard/research/tests/Oct29AssembleModel/partsForModel/length_distributions/low/'

In [78]:
# prepare new set of parameters
# Copy each GC 30..60 model file from modelSetPath into modelSetPathLow under its '_low' name.
for i in range(30,61):
    source = '{}MetaFungal_model_{}.txt'.format(modelSetPath, i)
    target = '{}MetaFungal_model_{}_low.txt'.format(modelSetPathLow, i)
    # Pass an argument list instead of a shell string: no shell is spawned, so the
    # paths reach `cp` verbatim and are never re-parsed (spaces, globs, ';' ...).
    # check_call still raises CalledProcessError when `cp` exits non-zero.
    subprocess.check_call(['cp', source, target])

In [79]:
## write initial length distribution to model files
# Replace the ##INITIAL_DISTR## placeholder line of every low-density model file
# with a '$INITIAL_DISTR' header followed by the contents of initial_exon_low.txt.
with open('{}initial_exon_low.txt'.format(targetLow)) as f:
    initial_distr = f.read()

initial_tag = '##INITIAL_DISTR##'
for gc in range(30,61):
    filename = filePattern.format(gc)

    with open(modelSetPathLow+filename) as f:
        lines = f.readlines()

    ## locate position (the last line that contains the tag)
    tagged_lines = [line for line in lines if initial_tag in line]
    if not tagged_lines:
        # Fail loudly: without this check tag_line would be undefined (NameError)
        # or left over from a previous file or cell.
        raise ValueError('{} not found in {}'.format(initial_tag, modelSetPathLow+filename))
    tag_line = tagged_lines[-1]

    # Literal replacement: neither the placeholder line nor the inserted file
    # contents should be interpreted as regex syntax or escape sequences.
    param = '$INITIAL_DISTR\n'
    text_sub = ''.join(lines).replace(tag_line, param+initial_distr+'\n')

    with open(modelSetPathLow+filename,'w') as f:
        f.write(text_sub)

In [81]:
## write internal length distribution to model files
# Replace the ##EXON_DISTR## placeholder line of every low-density model file
# with a '$EXON_DISTR' header followed by the contents of internal_exon_low.txt.
with open('{}internal_exon_low.txt'.format(targetLow)) as f:
    internal_distr = f.read()

internal_tag = '##EXON_DISTR##'
for gc in range(30,61):
    filename = filePattern.format(gc)

    with open(modelSetPathLow+filename) as f:
        lines = f.readlines()

    ## locate position (the last line that contains the tag)
    tagged_lines = [line for line in lines if internal_tag in line]
    if not tagged_lines:
        # Fail loudly: without this check tag_line would be undefined (NameError)
        # or left over from a previous file or cell.
        raise ValueError('{} not found in {}'.format(internal_tag, modelSetPathLow+filename))
    tag_line = tagged_lines[-1]

    # Literal replacement: neither the placeholder line nor the inserted file
    # contents should be interpreted as regex syntax or escape sequences.
    param = '$EXON_DISTR\n'
    text_sub = ''.join(lines).replace(tag_line, param+internal_distr+'\n')

    with open(modelSetPathLow+filename,'w') as f:
        f.write(text_sub)

In [82]:
## write terminal length distribution to model files
# Replace the ##TERMINAL_DISTR## placeholder line of every low-density model file
# with a '$TERMINAL_DISTR' header followed by the contents of terminal_exon_low.txt.
with open('{}terminal_exon_low.txt'.format(targetLow)) as f:
    terminal_distr = f.read()

terminal_tag = '##TERMINAL_DISTR##'
for gc in range(30,61):
    filename = filePattern.format(gc)

    with open(modelSetPathLow+filename) as f:
        lines = f.readlines()

    ## locate position (the last line that contains the tag)
    tagged_lines = [line for line in lines if terminal_tag in line]
    if not tagged_lines:
        # Fail loudly: without this check tag_line would be undefined (NameError)
        # or left over from a previous file or cell.
        raise ValueError('{} not found in {}'.format(terminal_tag, modelSetPathLow+filename))
    tag_line = tagged_lines[-1]

    # Literal replacement: neither the placeholder line nor the inserted file
    # contents should be interpreted as regex syntax or escape sequences.
    param = '$TERMINAL_DISTR\n'
    text_sub = ''.join(lines).replace(tag_line, param+terminal_distr+'\n')

    with open(modelSetPathLow+filename,'w') as f:
        f.write(text_sub)

In [83]:
## write intron length distribution to model files
# Replace the ##INTRON_DISTR## placeholder line of every low-density model file
# with a '$INTRON_DISTR' header followed by the contents of intron_low.txt.
with open('{}intron_low.txt'.format(targetLow)) as f:
    intron_distr = f.read()

intron_tag = '##INTRON_DISTR##'
for gc in range(30,61):
    filename = filePattern.format(gc)

    with open(modelSetPathLow+filename) as f:
        lines = f.readlines()

    ## locate position (the last line that contains the tag)
    tagged_lines = [line for line in lines if intron_tag in line]
    if not tagged_lines:
        # Fail loudly: without this check tag_line would be undefined (NameError)
        # or left over from a previous file or cell.
        raise ValueError('{} not found in {}'.format(intron_tag, modelSetPathLow+filename))
    tag_line = tagged_lines[-1]

    # Literal replacement: neither the placeholder line nor the inserted file
    # contents should be interpreted as regex syntax or escape sequences.
    param = '$INTRON_DISTR\n'
    text_sub = ''.join(lines).replace(tag_line, param+intron_distr+'\n')

    with open(modelSetPathLow+filename,'w') as f:
        f.write(text_sub)

In [84]:
## write spacer length distribution to model files
# Replace the ##BP_ACC_DISTR## placeholder line of every low-density model file
# with a '$BP_ACC_DISTR' header followed by the contents of spacer_low.txt.
with open('{}spacer_low.txt'.format(targetLow)) as f:
    spacer_distr = f.read()

bp_acc_tag = '##BP_ACC_DISTR##'
for gc in range(30,61):
    filename = filePattern.format(gc)

    with open(modelSetPathLow+filename) as f:
        lines = f.readlines()

    ## locate position (the last line that contains the tag)
    tagged_lines = [line for line in lines if bp_acc_tag in line]
    if not tagged_lines:
        # Fail loudly: without this check tag_line would be undefined (NameError)
        # or left over from a previous file or cell.
        raise ValueError('{} not found in {}'.format(bp_acc_tag, modelSetPathLow+filename))
    tag_line = tagged_lines[-1]

    # Literal replacement: neither the placeholder line nor the inserted file
    # contents should be interpreted as regex syntax or escape sequences.
    param = '$BP_ACC_DISTR\n'
    text_sub = ''.join(lines).replace(tag_line, param+spacer_distr+'\n')

    with open(modelSetPathLow+filename,'w') as f:
        f.write(text_sub)

In [85]:
## write donor to bp length distribution to model files
# Replace the ##DON_BP_DISTR## placeholder line of every low-density model file
# with a '$DON_BP_DISTR' header followed by the contents of donorToBp_low.txt.
with open('{}donorToBp_low.txt'.format(targetLow)) as f:
    donorToBp_distr = f.read()

don_bp_tag = '##DON_BP_DISTR##'
for gc in range(30,61):
    filename = filePattern.format(gc)

    with open(modelSetPathLow+filename) as f:
        lines = f.readlines()

    ## locate position (the last line that contains the tag)
    tagged_lines = [line for line in lines if don_bp_tag in line]
    if not tagged_lines:
        # Fail loudly: without this check tag_line would be undefined (NameError)
        # or left over from a previous file or cell.
        raise ValueError('{} not found in {}'.format(don_bp_tag, modelSetPathLow+filename))
    tag_line = tagged_lines[-1]

    # Literal replacement: neither the placeholder line nor the inserted file
    # contents should be interpreted as regex syntax or escape sequences.
    param = '$DON_BP_DISTR\n'
    text_sub = ''.join(lines).replace(tag_line, param+donorToBp_distr+'\n')

    with open(modelSetPathLow+filename,'w') as f:
        f.write(text_sub)

#### Aux Programs

In [64]:
# def automateTest(modelFile, fastaFile, )