In [32]:
import numpy as np
import pandas as pd
import os
import random
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

%load_ext autoreload
%autoreload 2

# Loading the data

In [2]:
file = "../Data/result_full_factorial_pgm.csv"

In [42]:
data = pd.read_csv(file)

In [43]:
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,enzyme_complex_concentration,enzyme_concentration,k1_bwd_effective,k1_bwd_relative,k1_fwd_effective,k1_fwd_relative,k2_bwd_effective,k2_bwd_relative,k2_fwd_effective,k2_fwd_relative,mu_mass,product_concentration,realization,sigma_mass,substrate_concentration,volume_fraction
0,0,0,5.8e-05,6e-06,4.503135,0.450314,262554.802451,1.723016,713913.8,2.164051,9.907347,0.450334,12.1,9.8e-05,2.0,0.0,4.9e-05,0.2
1,1,0,6e-06,5.8e-05,4.491493,0.449149,285144.59841,1.871261,622742.9,1.887689,9.879449,0.449066,12.1,0.000147,3.0,0.0,0.000245,0.2
2,2,0,1.9e-05,4.5e-05,7.636543,0.763654,262351.891789,1.721684,659729.9,1.999806,16.79977,0.763626,31.9,1.6e-05,5.0,0.825,2.4e-05,0.2
3,3,0,1.3e-05,5.1e-05,0.634308,0.063431,484409.307429,3.178936,1063666.0,3.224238,1.395557,0.063434,12.1,0.000196,8.0,0.0,1.2e-05,0.4
4,4,0,6e-06,5.8e-05,4.416002,0.4416,281281.994713,1.845913,587160.5,1.77983,9.719614,0.441801,12.1,4.9e-05,4.0,0.0,0.000147,0.2


In [45]:
data.shape

(151956, 18)

In [46]:
data.dtypes

Unnamed: 0                        int64
Unnamed: 0.1                      int64
enzyme_complex_concentration    float64
enzyme_concentration            float64
k1_bwd_effective                float64
k1_bwd_relative                 float64
k1_fwd_effective                float64
k1_fwd_relative                 float64
k2_bwd_effective                float64
k2_bwd_relative                 float64
k2_fwd_effective                float64
k2_fwd_relative                 float64
mu_mass                         float64
product_concentration           float64
realization                     float64
sigma_mass                      float64
substrate_concentration         float64
volume_fraction                 float64
dtype: object

In [47]:
data.drop('Unnamed: 0.1', axis=1, inplace=True)
data.drop("Unnamed: 0",axis = 1, inplace = True)
data.head()

Unnamed: 0,enzyme_complex_concentration,enzyme_concentration,k1_bwd_effective,k1_bwd_relative,k1_fwd_effective,k1_fwd_relative,k2_bwd_effective,k2_bwd_relative,k2_fwd_effective,k2_fwd_relative,mu_mass,product_concentration,realization,sigma_mass,substrate_concentration,volume_fraction
0,5.8e-05,6e-06,4.503135,0.450314,262554.802451,1.723016,713913.8,2.164051,9.907347,0.450334,12.1,9.8e-05,2.0,0.0,4.9e-05,0.2
1,6e-06,5.8e-05,4.491493,0.449149,285144.59841,1.871261,622742.9,1.887689,9.879449,0.449066,12.1,0.000147,3.0,0.0,0.000245,0.2
2,1.9e-05,4.5e-05,7.636543,0.763654,262351.891789,1.721684,659729.9,1.999806,16.79977,0.763626,31.9,1.6e-05,5.0,0.825,2.4e-05,0.2
3,1.3e-05,5.1e-05,0.634308,0.063431,484409.307429,3.178936,1063666.0,3.224238,1.395557,0.063434,12.1,0.000196,8.0,0.0,1.2e-05,0.4
4,6e-06,5.8e-05,4.416002,0.4416,281281.994713,1.845913,587160.5,1.77983,9.719614,0.441801,12.1,4.9e-05,4.0,0.0,0.000147,0.2


# From the paper, theory

Linear regression to estimate $\beta$ and $\alpha_j$ in the following model:

\begin{equation*}
log(\frac{k_j}{k_{j,0}}) = \beta_j + \alpha_{E,j}log(\frac{[E]}{[E]_0}) + \alpha_{ES,j}log(\frac{[ES]}{[ES]_0})  + \alpha_{P,j}log(\frac{[P]}{[P]_0})  + \alpha_{S,j}log(\frac{[S]}{[S]_0}) 
\end{equation*}

where $j\in$ [1_bwd, 1_fwd, 2_bwd, 2_fwd],

$k$ is the effective rate constant, and $k_0$ is the relative rate constant

The given concentration is (??) $[X]/[X]_0$

knowing that be modify our data set in order to have the $log(\frac{k_j}{k_{j,0}})$ as a feature we will try to predict

In [48]:
data.dtypes

enzyme_complex_concentration    float64
enzyme_concentration            float64
k1_bwd_effective                float64
k1_bwd_relative                 float64
k1_fwd_effective                float64
k1_fwd_relative                 float64
k2_bwd_effective                float64
k2_bwd_relative                 float64
k2_fwd_effective                float64
k2_fwd_relative                 float64
mu_mass                         float64
product_concentration           float64
realization                     float64
sigma_mass                      float64
substrate_concentration         float64
volume_fraction                 float64
dtype: object

In [50]:
# create transformed dataframe
data_trans = data.copy()
data_trans['y1_bwd'] = np.log(data_trans['k1_bwd_effective']/data_trans['k1_bwd_relative'])
data_trans['y1_fwd'] = np.log(data_trans['k1_fwd_effective']/data_trans['k1_fwd_relative'])
data_trans['y2_bwd'] = np.log(data_trans['k2_bwd_effective']/data_trans['k2_bwd_relative'])
data_trans['y2_fwd'] = np.log(data_trans['k2_fwd_effective']/data_trans['k2_fwd_relative'])
data_trans['E'] = np.log(data_trans['enzyme_concentration'])
data_trans['ES'] = np.log(data_trans['enzyme_complex_concentration'])
data_trans['P'] = np.log(data_trans['product_concentration'])/49
data_trans['S'] = np.log(data_trans['substrate_concentration'])/49
data_trans.drop(['k1_bwd_effective','k1_bwd_relative', 'k1_fwd_effective',
                'k1_fwd_relative', 'k2_bwd_effective', 'k2_bwd_relative',
                'k2_fwd_effective', 'k2_fwd_relative', 'enzyme_concentration',
                'enzyme_complex_concentration', 'product_concentration',
                'substrate_concentration'], axis=1, inplace=True)
data_trans.head()

Unnamed: 0,mu_mass,realization,sigma_mass,volume_fraction,y1_bwd,y1_fwd,y2_bwd,y2_fwd,E,ES,P,S
0,12.1,2.0,0.0,0.2,2.302585,11.934139,12.706535,3.091042,-11.959213,-9.761988,-0.188378,-0.202524
1,12.1,3.0,0.0,0.2,2.302585,11.934139,12.706535,3.091042,-9.761988,-11.959213,-0.180104,-0.169679
2,31.9,5.0,0.825,0.2,2.302585,11.934139,12.706535,3.091042,-10.013302,-10.8606,-0.224945,-0.21667
3,12.1,8.0,0.0,0.4,2.302585,11.934139,12.706535,3.091042,-9.879771,-11.266065,-0.174233,-0.230816
4,12.1,4.0,0.0,0.2,2.302585,11.934139,12.706535,3.091042,-9.761988,-11.959213,-0.202524,-0.180104


In [57]:
x = data_trans[['E', 'ES', 'P', 'S']].values
X = np.c_[np.ones(len(x)), x]
y = data_trans['y1_bwd']

reg = LinearRegression()
reg.fit(X,y)
print("beta ",reg.intercept_)
print("alpha ", reg.coef_)

beta  -16672.173596747387
alpha  [0.         0.01628486 0.01509131 0.00394474 0.00410995]


From paper we expected to find: -1.48e-02, 1.9e-04, 1.89e-05, -2.94e-03, -2.94e-03

In [54]:
def standardize(x):
    """Standardize a column."""
    mean_x = np.mean(x)
    x = x - mean_x
    std_x = np.std(x)
    if std_x != 0:
        x = x / std_x
    return x, mean_x, std_x

def standardize_data(data):
    '''
    normalization of a data_set
    '''
    for i in range(0,data.shape[1]):
        data.iloc[:,i],mean,variance = standardize(data.iloc[:,i])

    return data

In [55]:
data_test = standardize_data(data_trans)

In [56]:
x = data_test[['E', 'ES', 'P', 'S']].values
X = np.c_[np.ones(len(x)), x]
y = data_test['y1_bwd']

reg = LinearRegression()
reg.fit(X,y)
print("beta ",reg.intercept_)
print("alpha ", reg.coef_)

beta  -16672.173596747387
alpha  [0.         0.01628486 0.01509131 0.00394474 0.00410995]
