In [3]:
'''
This notebook examines spacer emission probabilities to get a better understanding of how they are distributed.

1. plot all emission probabilities on the same scale
2. compare linear and polynomial regression on the emission probabilities
3. build a new set of parameters for the spacer emission probabilities
4. build a function that assembles the spacer emission probabilities in the model-file format
'''

## use ipython parallel programming
# Connect to an already-running IPython cluster. Client() raises IOError when
# no controller connection file is found, so start a cluster first
# (e.g. `ipcluster start`).
# NOTE(review): IPython.parallel was split out into the separate `ipyparallel`
# package in IPython 4 - confirm which one this environment provides.
from IPython.parallel import Client
rc = Client()
dview = rc[:]  # view over all engines of the cluster


# pandas is imported both locally and on every engine, because code sent to
# the engines refers to `pd`.
import pandas as pd; dview.execute("import pandas as pd")

import numpy as np

import matplotlib.pylab as plt
%matplotlib inline
import matplotlib


import os
# Directory that holds the GeneMark-ES 4th-run model files.
modelFilePath = "/home/richard/largeDataSet/ES_Run_modfileAug2015/" 
# os.listdir returns entries in arbitrary order; sort them so that
# modelFileNames[0] and the row order of every table built from this list
# are the same on each run.
modelFileNames = sorted(os.listdir(modelFilePath))

# Full annotation table, indexed by 'shortName' so that a row can be looked
# up by name with .loc.
fulldata = pd.read_csv("../1_DataSmall/fullTableInfoGff3GffRNAESwithDensity20150829.csv")
fulldata.index = fulldata['shortName']


# Make the model directory and the file list available on every engine.
dview.push({'modelFilePath':modelFilePath,'modelFileNames':modelFileNames})

IOError: Connection file u'~/.ipython/profile_default/security/ipcontroller-client.json' not found.
You have attempted to connect to an IPython Cluster but no Controller could be found.
Please double-check your configuration and ensure that a cluster is running.

In [None]:
class ESModelSpacer():
    '''
    Reader that extracts one parameter section of a model file as a table.

    The file is opened on construction. Call close() when finished, or use
    the instance as a context manager (``with ESModelSpacer(path) as m:``)
    so the handle is released even when parsing fails.
    '''
    _GC_PARAM = { # each param is paired with an ending signal
        '$MARKOV_BP_SPACER':'TT'
    }
    
    def __init__(self, filename):
        self.f = open(filename)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never swallow an exception raised inside the with-block

    def close(self):
        '''close file connection'''
        self.f.close()
        
    def extract_to_table(self, param, cols = ('X','Y')):
        '''
        Capture the lines below `param` and return them as a DataFrame.
        Right now, only been tested on spacer.

        Collection starts on the line after the first line containing
        `param` and stops after the first collected line that contains the
        ending signal registered for `param` in _GC_PARAM (that line is
        included). Each collected line is split on whitespace into one row;
        values are kept as strings. Blank lines are skipped.

        param : section header to look for, must be a key of _GC_PARAM
        cols  : column labels for the resulting DataFrame

        Returns an empty DataFrame (with `cols` as columns) when `param`
        does not occur in the file.
        Raises KeyError when `param` has no ending signal in _GC_PARAM.
        '''
        end_signal = self._GC_PARAM[param]

        # Always parse from the top so the method can be called repeatedly
        # on the same instance and give the same answer.
        self.f.seek(0)

        rows = []
        in_section = False
        for line in self.f:
            if in_section:
                fields = line.split()
                if fields:
                    rows.append(fields)
                if end_signal in line:
                    break
            elif param in line:
                in_section = True

        return pd.DataFrame(rows, columns=list(cols))
    
# Ship the class definition to every engine under the same name.
dview.push(dict(ESModelSpacer=ESModelSpacer))

In [None]:
# Parse a single model file locally (its X column supplies the row labels
# used later). The handle is closed explicitly instead of being left to the
# garbage collector.
_first_model = ESModelSpacer(modelFilePath+modelFileNames[0])
try:
    test1 = _first_model.extract_to_table("$MARKOV_BP_SPACER")
finally:
    _first_model.close()

In [None]:
# The X column of the parsed table holds the dinucleotide labels; they become
# the column names of the combined table.
cols = test1['X']

In [None]:
def _read_spacer_table(filename):
    '''
    Parse the $MARKOV_BP_SPACER section of one model file.

    Runs on an engine, so it relies on ESModelSpacer and modelFilePath having
    been pushed there. The file is always closed, even if parsing fails, so a
    long list of files does not accumulate open handles on an engine.
    '''
    model = ESModelSpacer(modelFilePath+filename)
    try:
        return model.extract_to_table("$MARKOV_BP_SPACER")
    finally:
        model.close()

# Distribute the file names over the engines; one table per file comes back.
spacer_prob_map = dview.map(_read_spacer_table, modelFileNames)

In [None]:
# Wait for the parallel map to finish and collect its results
# (one parsed table per entry of modelFileNames).
spacer_probs = spacer_prob_map.get()

In [None]:
# One row per model file: the Y column (probabilities) of each parsed table,
# converted from strings to floats.
df = pd.DataFrame([table.Y for table in spacer_probs], dtype=float)

In [None]:
# Label the probability columns with the values captured in `cols`
# (the X column of the first parsed table).
df.columns = cols

In [None]:
## get GC
# Look up the GC value of every model file in `fulldata`. The key is the file
# name without its last 11 characters - presumably a fixed-length suffix that
# leaves the shortName; TODO confirm against the actual file names.
gc = [fulldata.loc[name[:-11]].gc for name in modelFileNames]

In [None]:
# Append the GC values as an extra column. `gc` is expected to be the list
# built from modelFileNames, aligned positionally with the rows of df.
df['gc'] = gc

In [None]:
# adjust font size for every figure drawn from here on
matplotlib.rcParams['font.size'] = 20

In [None]:
# Scatter the first probability column against GC.
# .iloc[:, 0] selects the same column by position as DataFrame.icol(0) did;
# icol is deprecated and no longer exists in later pandas releases.
plt.plot(df.gc, df.iloc[:, 0],'ro')
plt.title(df.columns[0])
plt.ylim([0,.5])
plt.xlim([10,70])
plt.xlabel('GC')
plt.ylabel('Probability')

In [None]:
# Persist the combined table; row labels are not written to the file.
df.to_csv('../1_DataSmall/SpacerEmissionProbTable.csv', index=False)

In [None]:
# plot path
# Output directory for the per-column scatter plots. The trailing slash is
# required because file names are appended by plain string concatenation.
plot_path = "/home/richard/research/1_DataSmall/Plots/spacer_emission/"

In [None]:
# One scatter plot (probability vs GC) per column, for the first 16 columns
# of df - presumably the 16 dinucleotides, with the trailing 'gc' column
# left out. Each figure is saved under the column's name and then closed so
# figures do not pile up in memory.
for i in range(16):
    # .iloc[:, i] is the positional replacement for the deprecated icol(i).
    plt.plot(df.gc, df.iloc[:, i],'ro')
    plt.title(df.columns[i])
    plt.ylim([0,.3])
    plt.xlim([10,70])
    plt.xlabel('GC')
    plt.ylabel('Probability')
    plt.savefig(plot_path+df.columns[i],bbox_inches='tight')
    plt.close()

In [None]:
### start poly regression 

In [None]:
# Second-degree polynomial fit of every df column against GC, each wrapped in
# np.poly1d so that calling the model with a GC value returns a probability.
# Every column currently in df is fitted, in column order.
regressModelSet = [
    np.poly1d(np.polyfit(df.gc, df[name], deg=2))
    for name in df.columns
]

In [None]:
# GC values at which the fitted models are evaluated: the integers 20..64
# (range excludes the upper bound 65).
gc_range = range(20,65)

In [None]:
# Evaluate every fitted polynomial over gc_range; one array of predictions
# per model, in the same order as regressModelSet.
regressResult = [fit(gc_range) for fit in regressModelSet]

In [None]:
# Coefficients of the quadratic and of the linear fit of the AA column on GC.
AA = np.polyfit(df['gc'], df['AA'], deg=2)
linearAA = np.polyfit(df['gc'], df['AA'], deg=1)

In [None]:
# Wrap both coefficient sets as callable polynomials.
p, linearp = np.poly1d(AA), np.poly1d(linearAA)

In [None]:
# Compare the quadratic and the linear AA fit with the observed values.
gc_axis = range(10,70)
plt.plot(gc_axis,p(gc_axis), lw = 3, label = '2nd degree poly regression')
plt.plot(gc_axis,linearp(gc_axis), lw = 3, label = 'linear regression')
# Plot df.AA - the column both fits were computed from - rather than
# df.icol(0): icol is deprecated in pandas, and selecting by position only
# showed the right data as long as AA happened to be the first column.
plt.plot(df.gc, df.AA,'ro',label = 'real data',alpha = 0.15)
plt.legend(fontsize = 10)

In [None]:
# plot path
# Output directory for the plots that overlay regression curves on the data;
# rebinds plot_path, so every savefig below this cell writes here. The
# trailing slash is required because file names are appended by concatenation.
plot_path = "/home/richard/research/1_DataSmall/Plots/spacer_emission_regression/"

In [None]:
# For each of the first 16 columns: observed values plus the quadratic fit,
# saved under the column's name and closed afterwards.
for i in range(16):
    # .iloc[:, i] is the positional replacement for the deprecated icol(i).
    plt.plot(df.gc, df.iloc[:, i],'ro',label = 'real data')
    plt.plot(gc_range, regressResult[i], lw = 4, label = '2nd degree poly regression')
    plt.title(df.columns[i])
    plt.ylim([0,.3])
    plt.xlim([10,70])
    plt.xlabel('GC')
    plt.ylabel('Probability')
    plt.legend()
    plt.savefig(plot_path+df.columns[i],bbox_inches='tight')
    plt.close()

In [None]:
## add comparison linear regression
# Same procedure as the quadratic fits, with degree 1: fit every df column
# against GC, then evaluate each model over gc_range.
gc_range = range(20,65)
linearRegressModelSet = [
    np.poly1d(np.polyfit(df.gc, df[name], deg=1))
    for name in df.columns
]
linearRegressResult = [fit(gc_range) for fit in linearRegressModelSet]

In [None]:
# For each of the first 16 columns: observed values with both the quadratic
# and the linear fit. Files are written to plot_path under the column name,
# so they replace any plot of the same name already saved there.
matplotlib.rcParams.update({'font.size': 20})
_alpha = 1
lineWidth = 2
for i in range(16):
    # .iloc[:, i] is the positional replacement for the deprecated icol(i).
    plt.plot(df.gc, df.iloc[:, i],'ro',label = 'real data', alpha = _alpha)
    plt.plot(gc_range, regressResult[i], lw = lineWidth, label = '2nd degree poly regression')
    plt.plot(gc_range, linearRegressResult[i], lw = lineWidth, label = 'linear regression')
    plt.title(df.columns[i])
    plt.ylim([0,.3])
    plt.xlim([10,70])
    plt.xlabel('GC')
    plt.ylabel('Probability')
    plt.legend(fontsize = 13)
    plt.savefig(plot_path+df.columns[i],bbox_inches='tight')
    plt.close()

In [None]:
## fit TT legend
# Redraw the TT plot with a smaller legend. The position of the TT column is
# looked up by name: the original indexed the fit results with the loop
# variable `i` left over from an earlier cell, which only pointed at TT as
# long as that loop had just run and TT was its last column.
tt_idx = list(df.columns).index('TT')
plt.plot(df.gc, df.TT,'ro',label = 'real data',alpha = _alpha)
plt.plot(gc_range, regressResult[tt_idx], lw = lineWidth, label = '2nd degree poly regression')
plt.plot(gc_range, linearRegressResult[tt_idx], lw = lineWidth, label = 'linear regression')
plt.title('TT')
plt.ylim([0,.3])
plt.xlim([10,70])
plt.xlabel('GC')
plt.ylabel('Probability')
plt.legend(fontsize = 8)
plt.savefig(plot_path+df.columns[tt_idx],bbox_inches='tight')
plt.close()

In [None]:
###
### Build a new set of parameters for the spacer emission probability:
### a table of fitted values that can be referenced in the future
### (one row per GC value in gc_range, one column per df column).
fitted_frames = [pd.DataFrame(fitted) for fitted in regressResult]
df_poly_regress = pd.concat(fitted_frames, axis = 1)
df_poly_regress.columns = df.columns
df_poly_regress.index = gc_range
df_poly_regress.head()

In [None]:
# Save the fitted table; row labels (the index) are not written to the file.
df_poly_regress.to_csv("../1_DataSmall/heuristic_model_params/spacer_emission.csv", index=False)

In [None]:
# Show the fitted value of every column except the last one at GC = 20.
for col in df_poly_regress.columns[:-1]:
    print("{} {}".format(col, df_poly_regress.loc[20, col]))

In [None]:
## assemble parameters for the model file
# Write one '$MARKOV_BP_SPACER' section per GC value in gc_range, using the
# fitted values of every column except the last one.
#
# Fixes relative to the first version of this cell:
#  * `path` was defined but never used, so the files landed in the current
#    working directory; they are now written inside `path` (created if needed).
#  * the loop variable was called `gc`, which overwrote the list of GC values
#    that an earlier cell stores under that name.
path  = '../1_DataSmall/heuristic_model_params/spacer_emission/'
if not os.path.isdir(path):
    os.makedirs(path)
for gc_value in gc_range:
    section = ['$MARKOV_BP_SPACER']
    for col in df_poly_regress.columns[:-1]:
        section.append("{} {}".format(col,df_poly_regress.loc[gc_value,col]))
    base = '\n'.join(section) + '\n'
    with open(os.path.join(path, 'spacer_emission_{}.txt'.format(gc_value)),'w') as f:
        f.write(base)

In [None]:
####################################
## Oct 28 rerun code for model assembly
# Print the '$MARKOV_BP_SPACER' section for every GC value in gc_range
# (fitted values of every column except the last one).
# The loop variable is named gc_value rather than gc so that it does not
# overwrite the list of GC values an earlier cell stores under `gc`.
for gc_value in gc_range:
    section = ['$MARKOV_BP_SPACER']
    for col in df_poly_regress.columns[:-1]:
        section.append("{} {}".format(col,df_poly_regress.loc[gc_value,col]))
    base = '\n'.join(section) + '\n'
    print(gc_value)
    print(base)