In [3]:
'''
1. Visualize intron and exon length distributions.
    1) Exon: initial, internal, terminal
    2) Intron: general intron, donor to branch point, branch point to acceptor

2. Fit Gamma distribution to exon
'''
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline

import os
# Directory holding all GeneMark-ES 4th-run model files.
modelFilePath = "/home/richard/largeDataSet/ES_Run_modfileAug2015/"
# os.listdir returns entries in arbitrary order; sort them so that batch runs,
# and the "first N files" sample used when test-plotting, are reproducible.
modelFileNames = sorted(os.listdir(modelFilePath))

# Per-organism table, indexed by short name so rows can be looked up with
# fulldata.loc[shortName]. The 'shortName' column is kept as a regular column too.
fulldata = pd.read_csv("../1_DataSmall/fullTableInfoGff3GffRNAESwithDensity20150829.csv")
fulldata.index = fulldata['shortName']

In [4]:
class ESModelDistribution(object):
    """Extract and plot length distributions from a GeneMark-ES 4th-run model file.

    The model file is opened on construction and stays open until close() is
    called. The instance can also be used as a context manager, in which case
    the file is closed on exit.
    """

    # distribution name -> (section tag searched for in the model file,
    #                       range of lengths covered by that section).
    # Only the last element of each range is used: it identifies the final
    # line of the section.
    _DISTR_TYPE = {
        'INITIAL'  : ('$INITIAL_DISTR', range(3, 10001)),
        'EXON'     : ('$EXON_DISTR', range(3, 10001)),
        'TERMINAL' : ('$TERMINAL_DISTR', range(3, 10001)),
        'SINGLE'   : ('$SINGLE_DISTR', range(300, 10000)),
        'INTRON'   : ('$INTRON_DISTR', range(20, 3001)),
        'BP_ACC'   : ('$BP_ACC_DISTR', range(2, 41)),
        'DON_BP'   : ('$DON_BP_DISTR', range(5, 3001))
    }

    def __init__(self, filename):
        """Open the model file at `filename` for reading."""
        self.f = open(filename)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # never suppress an exception raised inside the with-block
        return False

    def close(self):
        '''close file connection'''
        self.f.close()

    def getLengthDistribution(self, probAcc = 0.95, distrType = 'EXON'):
        '''
        Read one length distribution and keep only its most probable lengths.

        input: Optional
        probAcc, probability accumulation threshold, default = 0.95
        distrType, one of the keys of _DISTR_TYPE
                   (INITIAL, EXON, TERMINAL, SINGLE, INTRON, BP_ACC, DON_BP)

        return: dataframe with columns 'position' and 'probability', ordered by
        position. Lengths are ranked by decreasing probability and kept while
        the running sum of probabilities stays strictly below probAcc.
        The frame is empty when the section tag is not found in the file.

        raises: KeyError for an unknown distrType; ValueError if a non-blank
        line of the section is not "<int>\\t<float>".
        '''
        tag, lengthRange = self._DISTR_TYPE[distrType]
        # text that marks the last line of the section: "<max length><TAB>"
        endMarker = "{}\t".format(lengthRange[-1])

        # Rewind first: the file object keeps its position between calls, so
        # without this a second call would start after the previously read
        # section (or at EOF) and silently return nothing.
        self.f.seek(0)

        rows = []
        inSection = False
        for line in self.f:
            if inSection:
                stripped = line.rstrip()
                if stripped:  # tolerate blank lines inside the section
                    position, probability = stripped.split("\t")
                    rows.append((int(position), float(probability)))
                if endMarker in line:
                    break
            elif tag in line:
                inSection = True

        df = pd.DataFrame(rows, columns=['position', 'probability'])
        if df.empty:
            return df

        # Rank by decreasing probability. A stable sort keeps the choice among
        # equally probable lengths deterministic. (sort_values replaces the
        # removed sort_index(by=...); it needs pandas >= 0.17.)
        ranked = df.sort_values(by='probability', ascending=False, kind='mergesort')
        accumulated = ranked['probability'].cumsum()
        kept = ranked[accumulated < probAcc]

        # back to increasing length for plotting
        return kept.sort_values(by='position')[['position', 'probability']]

    def plotLengthDistrbution(self, df, longName = '', lengthType = '',  path = None, x_lim = None, y_lim = None, showPlot = True, gc=None, intronDensity=None, saveName = None):
        '''
        Scatter-plot probability against length for one distribution.

        (The misspelled method name is kept as-is because callers use it.)

        input:
        df, dataframe with 'position' and 'probability' columns
        longName, lengthType, gc, intronDensity: only used in the title and
            axis labels
        path, saveName: the figure is saved to path + saveName (plain string
            concatenation) only when both are given
        x_lim, y_lim: passed to set_xlim / set_ylim
        showPlot: when False the figure is closed after the optional save
        '''
        fig = plt.figure()
        ax = fig.add_subplot(1,1,1)
        ax.plot(df.position, df.probability, 'r.',markersize=6)
        ax.set_title('{} Length Distribution\nOrganism: {}\nGC: {},  Intron Density: {}'.format(lengthType,longName,gc,intronDensity))
        ax.set_xlabel('{} Length'.format(lengthType))
        ax.set_ylabel('Probability')
        ax.set_xlim(x_lim)
        ax.set_ylim(y_lim)
        if path and saveName:
            # save this figure explicitly rather than whatever pyplot considers current
            fig.savefig("{}{}".format(path, saveName), bbox_inches='tight')
        if not showPlot:
            plt.close(fig)

In [5]:
def batchPlot(probAcc,x_lim,y_lim, targetPath, lengthType, distrType, testPlot = False, sampleSize = 10):
    """Extract and plot one length distribution for each model file.

    For every file name in the global `modelFileNames`, the organism's row is
    looked up in the global `fulldata` table, the `distrType` distribution is
    read from the model file, and a figure named '<distrType>_<shortName>.png'
    is saved under `targetPath`.

    probAcc:    probability threshold passed to getLengthDistribution
    x_lim, y_lim: axis limits for every plot
    targetPath: output location; the file name is appended to it directly
    lengthType: human-readable name used in the plot title and x label
    distrType:  key of ESModelDistribution._DISTR_TYPE
    testPlot:   when True only the first `sampleSize` files are processed and
                the figures are left open for display; when False all files
                are processed and each figure is closed after saving
    sampleSize: number of files used when testPlot is True
    """
    if testPlot:
        max_range = sampleSize
    else:
        max_range = len(modelFileNames)

    for modelFileName in modelFileNames[0:max_range]:
        shortName = modelFileName.split('.ES_C_4.mod')[0]

        # look the organism up once instead of once per field
        organism = fulldata.loc[shortName]
        # keep only the first two words of the long name for the title
        longName = " ".join(organism['longName'].split()[:2])
        gc = organism['gc']
        # fall back to the Gff value when the Gff3 intron density is missing
        intronDensity = organism['intronDensityGff3']
        if pd.isnull(intronDensity):
            intronDensity = organism['intronDensityGff']
        saveName = '{}_{}.png'.format(distrType, shortName)

        model = ESModelDistribution(os.path.join(modelFilePath, modelFileName))
        try:
            df = model.getLengthDistribution(probAcc, distrType)
            # testPlot is passed as showPlot
            model.plotLengthDistrbution(df,longName,lengthType,targetPath,x_lim,y_lim,testPlot,gc,intronDensity,saveName)
        finally:
            # release the file handle even if extraction or plotting fails
            model.close()

In [6]:
# Exon-specific x / y axis limits, passed to batchPlot by the exon cells below.
x_lim = [0,1501]
y_lim = [0,0.005]

probAcc = 0.95 # how much probability do we want to capture

In [8]:
## plot internal exon length distribution
# targetPath is where the figures are saved; lengthType is the label used in the
# plot title and x axis; distrType selects the '$EXON_DISTR' section.
targetPath = '/home/richard/research/1_DataSmall/Plots/lengthDistribution/internalExon/'
lengthType = 'Internal Exon'
distrType = 'EXON'

# turn testPlot = True to test plotting (only the first sampleSize model files are processed)
batchPlot(probAcc,x_lim,y_lim, targetPath, lengthType, distrType, testPlot = False, sampleSize = 5)

In [6]:
## plot initial exon length distribution
# Uses the x_lim / y_lim / probAcc globals currently bound in the kernel.
targetPath = '/home/richard/research/1_DataSmall/Plots/lengthDistribution/initialExon/'
lengthType = 'Initial Exon'
distrType = 'INITIAL'
batchPlot(probAcc,x_lim,y_lim, targetPath, lengthType, distrType, testPlot = False, sampleSize = 20)

In [7]:
## plot terminal exon length distribution
# Uses the x_lim / y_lim / probAcc globals currently bound in the kernel.
targetPath = '/home/richard/research/1_DataSmall/Plots/lengthDistribution/terminalExon/'
lengthType = 'Terminal Exon'
distrType = 'TERMINAL'
batchPlot(probAcc,x_lim,y_lim, targetPath, lengthType, distrType, testPlot = False, sampleSize = 20)

In [8]:
# Intron-specific x / y axis limits and captured probability.
# NOTE(review): this rebinds the same x_lim / y_lim / probAcc globals that the
# exon cells read, so re-running an exon cell after this one uses these values.
x_lim = [0,150]
y_lim = [0,0.15]
probAcc = 0.99 # how much probability do we want to capture

In [9]:
## plot intron length distribution
# Uses the x_lim / y_lim / probAcc globals currently bound in the kernel.
targetPath = '/home/richard/research/1_DataSmall/Plots/lengthDistribution/intron/'
lengthType = 'Intron'
distrType = 'INTRON'
batchPlot(probAcc,x_lim,y_lim, targetPath, lengthType, distrType, testPlot = False, sampleSize = 20)

In [10]:
## plot spacer (branch point to acceptor) length distribution
# Uses the x_lim / y_lim / probAcc globals currently bound in the kernel;
# no spacer-specific limits are set in this cell.
targetPath = '/home/richard/research/1_DataSmall/Plots/lengthDistribution/bp_to_acc/'
lengthType = 'Spacer'
distrType = 'BP_ACC'
batchPlot(probAcc,x_lim,y_lim, targetPath, lengthType, distrType, testPlot = False, sampleSize = 20)

In [11]:
## plot donor to branch point length distribution
# Uses the x_lim / y_lim / probAcc globals currently bound in the kernel;
# no donor-to-branch-point-specific limits are set in this cell.
targetPath = '/home/richard/research/1_DataSmall/Plots/lengthDistribution/don_to_bp/'
lengthType = 'Donor To Branch Point'
distrType = 'DON_BP'
batchPlot(probAcc,x_lim,y_lim, targetPath, lengthType, distrType, testPlot = False, sampleSize = 20)