# Generating the input-output function $P(g\mid R, c)$ for varying repressor copy number $R$.

In [1]:
import pickle
import os
import glob
import datetime

# Our numerical workhorses
import numpy as np
from sympy import mpmath
import scipy.optimize
import scipy.special
import scipy.integrate
import pandas as pd
import itertools

# Import libraries to parallelize processes
from joblib import Parallel, delayed

# Import the utils for this project
import chann_cap_utils as chann_cap

# Pre-computing analytical distributions of gene expession.

Since the computation of the mRNA and protein steady-state probability distributions are computationally expensive, we can pre-compute the distribution for different repressor copy number and save the results as a lookup table to compute any desired quantity out of these distributions including the channel capacity and the variability in gene expression due to the stochasticity of the allosteric molecules.

This notebook achieves the simple task of computing the mRNA and protein distribution for different repressor copy numbers saving the result into csv files that we can read with `numpy`.

The matrices are arranged such that each row's index is given by the number of repressors and each column index indicates either the mRNA or protein count.

## Pre-computing the mRNA distribution

Let's start by saving the distribution for mRNA molecules.

In [None]:
# Define the parameters
k0 = 2.7E-3 # Used by Jones and Brewster

# The MWC parameters come from the global fit to the O2 data
mRNA_params = dict(ka=0.199, ki=0.00064, omega=np.exp(-4.5), 
                   k0=k0, gamma=0.00284, r_gamma=15.7)

In [None]:
# Define the mRNA copy numbers to evaluate
# It is break up in blocks to run the process in parallel
mRNA_grid = np.reshape(np.arange(0, 50), [-1, 10])

# define the array of repressor copy numbers to evaluate the function in
R_array = np.arange(0, 1001)

kon_array = [chann_cap.kon_fn(-17, mRNA_params['k0']),
             chann_cap.kon_fn(-15.3, mRNA_params['k0']),
             chann_cap.kon_fn(-13.9, mRNA_params['k0']),
             chann_cap.kon_fn(-9.7, mRNA_params['k0'])]
kon_operators = ['Oid', 'O1', 'O2', 'O3']

compute_matrix = True
if compute_matrix:
    for j, kon in enumerate(kon_array):
        print('operator : ' + kon_operators[j])
        # Set the value for the kon
        mRNA_params['kon'] = kon
        # Initialize transition matrix
        QmR = np.zeros([mRNA_grid.size, len(R_array)])
        for i, r in enumerate(R_array):
            if r%100==0:
                print('repressors : {:d}'.format(r))
            mRNA_params['rep'] = r * 1.66
            # -- Parallel computation of distribution -- #
            lnm_list = list()
            # loop through the concentrations
            # define a function to run in parallel the computation
            def lnm_parallel(m):
                lnm = chann_cap.log_p_m_mid_C(C=0, mRNA=m, **mRNA_params)
                return lnm
            lnm_list.append(Parallel(n_jobs=7)(delayed(lnm_parallel)(m) \
                                               for m in mRNA_grid))
            # -- Building and cleaning the transition matrix -- #
            for k, lnm in enumerate(lnm_list):
                # Initialize the matrix of zeros where the normalized
                # distribution will live
                p_norm = np.zeros_like(lnm)
                p = np.exp(lnm)
                # Compute the cumulative sum of the protein copy number
                p_sum = np.cumsum(np.sum(p, axis=1))
                # Find the first block that is already normalized given 
                # the tolerance value
                norm_idx = np.where((p_sum <= 1 + 1E-5) & \
                                    (p_sum >= 1 - 1E-5))[0][-1]
                # add all the probability values of these blocks to our matrix
                p_norm[0:norm_idx, :] = p[0:norm_idx, :]
            QmR[:, i] = p_norm.ravel()
            # Check that all distributions for each concentration are normalized
        np.savetxt('../../tmp/QmR_' + kon_operators[j] +\
                   '_0_1000_literature_param.csv', QmR, delimiter=",")

### Pre-computing the protien distribution



In [2]:
# Protein parameters
k0 = 2.7E-3 # From Jones & Brewster
prot_params = dict(ka=141.52, ki=0.56061, epsilon=4.5,
                   kon=chann_cap.kon_fn(-9.7, k0),
                   k0=k0,
                   gamma_m=0.00284, r_gamma_m=15.7,
                   gamma_p=0.000277, r_gamma_p=100)

In [None]:
# Define the protein blocks to evaluate in parallel
# Break into blocks to compute the distributions in parallel
prot_grid = np.reshape(np.arange(0, 4000), [-1, 50])

# define the array of repressor copy numbers to evaluate the function in
R_array = np.arange(0, 1050)

# Setting the kon parameter based on k0 and the binding energies form stat. mech.
kon_array = [chann_cap.kon_fn(-13.9, prot_params['k0']),
             chann_cap.kon_fn(-15.3, prot_params['k0']),
             chann_cap.kon_fn(-9.7, prot_params['k0']),
             chann_cap.kon_fn(-17, prot_params['k0'])]
kon_operators = ['O2', 'O1', 'O3', 'Oid']
kon_dict = dict(zip(kon_operators, kon_array))

compute_matrix = True
if compute_matrix:
    for kon, op in enumerate(kon_operators):
        print('operator : ' + op)
        # Set the value for the kon
        prot_params['kon'] = kon_dict[op]
        # Define filename
        file = '../../data/csv_protein_dist/lnp_' + op + '_DJ_RB.csv'
    # If the file exists read the file, find the maximum number of repressors
    # And compute from this starting point.
        if os.path.isfile(file): 
            df = pd.read_csv(file, index_col=0)
            max_rep = df.repressor.max()
            df = df[df.repressor != max_rep]
            df.to_csv(file)
            r_array = np.arange(max_rep, np.max(R_array) + 1)
        else:
            r_array = R_array

        # Loop through repressor copy numbers
        for i, r in enumerate(r_array):
            if r%50==0:
                print('repressors : {:d}'.format(r))
            prot_params['rep'] = r * 1.66
            # -- Parallel computation of distribution -- #
            # define a function to run in parallel the computation
            def lnp_parallel(p):
                lnp = chann_cap.log_p_p_mid_C(C=0, protein=p, **prot_params)
                df = pd.DataFrame([r] * len(p), index=p, columns=['repressor'])
                df.loc[:, 'protein'] = pd.Series(p, index=df.index)
                df.loc[:, 'lnp'] = lnp
                
                # if file does not exist write header 
                if not os.path.isfile(file): 
                    df.to_csv(file) 
                else: # else it exists so append without writing the header
                    df.to_csv(file, mode='a', header=False)
            Parallel(n_jobs=40)(delayed(lnp_parallel)(p) for p in prot_grid)

operator : O2
repressors : 0


# Cleaning up the lookup tables

These calculations can sometimes be numerically unstable due to the complicated confluent hypergeometric function. What can happen is that by the time the probability is basically zero (i.e. the $\ln P \ll 0$) there can be some "jumps" where the calcualtion overshoots. But this happens for probability values that should be very close to zero, so it is very easy to discard these values.

We will define a function to pre-process these lookup tables.

In [2]:
def pre_process_lnp(df, group_col='repressor', lnp_col='lnp',
                    output_col='prob', tol=-20):
    '''
    Pre-processes the lookup tables containing the log probability of a protein
    copy number for different repressor copy numbers eliminating the values
    that were numerically unstable, and returning the data frame with a column
    containing the processed probability.
    Parameters
    ----------
    filename : df
        Data frame containing the log probabilities.
    group_col : str.
        Name of the column in the data frame to be used to group the distributions
    lnp_col : str.
        Name of the column containing the log probability
    output_col : str.
        Name of the column that will contain the processed probability
    tol : float.
        log probability under which to consider values as probability zero.
        This is important since some of the calculations goe to < -300
        
    Returns
    -------
    Pandas dataframe containing the processed probability.
    '''
    # Remove duplicated rows
    df = df[[not x for x in df.duplicated()]]
    
    # Group by group_col
    df_group = df.groupby(group_col)
    
    # Initialize data frame where to save the processed data
    df_clean = pd.DataFrame(columns=df.columns)
    # Loop through each group, computing the log probability making sure that
    # There is no numerical overshoot and that the very small lnp are set to 0
    # probability
    for group, data in df_group:
        data.sort(columns='protein', inplace=True)
        # Set the new column to be all probability zero
        data.loc[:, output_col] = [0.0] * len(data)
        # Exponentiate the good log probabilities
        data.loc[(data.lnp > tol) & (data.lnp < 0), output_col] =\
        pd.Series(np.exp(data.loc[(data.lnp > tol) & (data.lnp < 0), lnp_col]))
        # Make sure cumulative sum still adds to zero
        cumsum = np.cumsum(data[output_col])
        data.loc[cumsum > 1, output_col] = 0
        # Append to the clean data frame
        df_clean = pd.concat([df_clean, data])
    
    return df_clean

Having defined the function let's pre-process the matrices we generated.

In [None]:
files = glob.glob('../../data/csv_protein_dist/*O3_all*.csv')
for f in files:
    print(f)
    df = pd.read_csv(f, header=0, index_col=0, comment='#')
    df_clean = pre_process_lnp(df)
    df_clean.to_csv(f)

../../data/csv_protein_dist/lnp_O3_all_RBS1027_fit.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  na_position=na_position)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
