In [1]:
# imports and loadings
import os
import sys
sys.path.insert(0, '../functions/')
import create_data_for_single_gene as cdg
import interface_GAMS as iG
import parameter_optimization as po
import conversion_equations as ce
import multiprocessing
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import animation
from matplotlib.colors import Normalize
import mplcursors  # Import mplcursors library
import numpy as np
import pickle
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN
import ast
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# settings
max_run = 99999999 # 16 should be the normal, high number if running all at first but otherwise generate results/plot 16 at a time as it runs on 8 processors
pull_best_paras = True # pull best results from parameter_optimization, else use defaults

# pull in flags_df and limit
# built as a single chain ending in .assign so flags_df is a fresh frame; assigning a
# column onto a filtered/sliced frame is what triggers pandas' SettingWithCopyWarning
flags_df = (
    pd.read_csv('../data/saved_flags_expanded_filtered.csv', index_col = 0)
    .loc[lambda df: df['checked'] == False] # keep only genes not yet marked as checked
    .iloc[0:max_run] # cap how many genes are processed in this run
    .assign(sanity_plots = False) # sanity plots are switched off for every gene
)

# below are the default flags used if nothing is pre-set
# set flags by editing the "saved_flags.csv" in the ../data folder
t_half_life_deg = 300
# buffer applied to every GAMs bound below: lower bounds are divided by it and
# upper bounds are multiplied by it, so the reference values sit inside a wide range
bound_buffer = 1000000000
stable_flags = { # these do not change gene by gene
    # overall
    'run_basal_calculations' : False, # it is very slow and should only be necessary to run if something has changed basal conditions
    'only_create_ratios' : False,
    'only_check_KdRNAPCrp' : False, # if True, quit out of code after generating KdRNAPCrp, done to see if it is generating valid values through sanity check plots
    'save_results' : False, # saves resulting figures and cAct/cInh values of the previous run to the save_results_run folder
    'include_Amy_samples' : True, # append on Amy's stationary phase samples to analysis
    'remove_outliers' : True, # removes samples that do not correlate well with others, see ../data_cleaning/1_locate_outliers_to_drop.ipynb
    'case' : False, # only used for remove_outliers right now, if False, it's multi-iM
    'drop_basal_conds' : True, # if True, removes basal conditions from sample after they're used to calculate ratios (useful when they're outliers)
    
    # KdRNAPCrp optimization
    'KdRNAPCrp_sanity' : True, # if True, return sanity plots from this optimization
    
    # GAMs
    'limit_TF_conc_by_actual' : False, # limits the TF concentrations for the model by the actual values, otherwise lets it be a very wide range
    'supress_output' : False,
    'use_greedy' : True, # use the greedy algo values (if False, uses the results of the GA)
    'run_on_all' : False, # run on all genes that are in the saved output folder
    'limit_samples' : flags_df.index.to_list(), # if run_on_all is False, limit to these samples (or which of them are available)
    'delete_old' : True,
    'run_seperate' : False, # run cActivator and cInhibitor solvers separately
    
    # input constants for GAMs (all get logged inside GAMs so pass in un-logged)
    'act_TF_conc_lo' : 2.902870141566294e-13 / bound_buffer, # minimum TF conc found in Heineman data, div the buffer
    'act_TF_conc_up' : 0.00014190659526601638 * bound_buffer, # max of ^, mult the buffer
    'act_Kd_lo' : 11e-9 / bound_buffer, # 11 - 35 nM (1e-9) is the answer here - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4646316/
    'act_Kd_up' : 35e-9 * bound_buffer, # from above
    'inh_TF_conc_lo' : 2.902870141566294e-13 / bound_buffer, # minimum TF conc found in Heineman data, div the buffer
    'inh_TF_conc_up' : 0.00014190659526601638 * bound_buffer, # max of ^, mult the buffer
    'inh_Kd_lo' : 11e-9 / bound_buffer, # 11 - 35 nM (1e-9) is the answer here - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4646316/
    'inh_Kd_up' : 35e-9 * bound_buffer, # from above
    'inh_metab_Total_lo' : 0.000038 / bound_buffer, # minimum of arginine concentration in stationary phase samples, div a buffer
    'inh_metab_Total_up' : 0.000408 * bound_buffer, # maximum of arginine concentration in stationary phase samples, mult a buffer
    'act_metab_Total_lo' : 0.000038 / bound_buffer, # minimum of arginine concentration in stationary phase samples, div a buffer
    'act_metab_Total_up' : 0.000408 * bound_buffer, # maximum of arginine concentration in stationary phase samples, mult a buffer
    
    # NOTE: the settings previously recorded as "best for argR" used the same
    # reference values with a buffer of 100 instead of 1000000000 (and a single
    # metab_Total_lo / metab_Total_up pair rather than separate act_/inh_ keys)
    
    # objective function weightings
    'weight_act_obj1' : 1,
    'weight_inh_obj1' : 1,
    'weight_act_obj2' : 0,
    'weight_inh_obj2' : 0,
    'weight_mRNA_match' : 1.0001,
    'weight_act_corr' : 0.00000000000000001,
    'weight_inh_corr' : 0.00000000000000001,
    
    
    # misc
    # NOTE: the backslash continuations sit inside the string literal, so the
    # leading spaces of the continued lines are part of eq_str
    'eq_str' : 'Eq(mRNARatio,((cActivator*KdRNAP + KdRNAPCrp)*(KdRNAP + RNAP + \
            KeqOpening*RNAP))/((1 + cActivator + cInhibitor)*KdRNAP*KdRNAPCrp + \
            cActivator*KdRNAP*(1 + KeqOpening)*RNAP + KdRNAPCrp*(1 + \
            KeqOpening)*RNAP))',
    
    # cell_constants
    'cell_constants_RNAP': 10**-6,
    'cell_constants_mRNA_total': 1800, # Total mRNA/cell from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3554401
    'cell_constants_cell_volume': 10**-15, # Liters from https://bionumbers.hms.harvard.edu/bionumber.aspx?id=100004&ver=19
    'cell_constants_kDeg': np.log(2)/t_half_life_deg, # Rate of degradation
    'cell_constants_promoterConcVal': 10**-9, # Promoter concentration
    'cell_constants_u': 1/3600, # Growth rate
}

# pick best parameters from the optimization results
best_paras = None
if pull_best_paras:
    # not set up at all right now for multi-iM, unclear what this will look like
    # NOTE: pickle.load can execute arbitrary code, only point this at a file produced by this project
    # the with-block closes the file even if unpickling raises
    with open('../data/case_to_best_paras.pkl', 'rb') as pickle_in:
        case_to_best_paras = pickle.load(pickle_in)
    best_paras = dict(case_to_best_paras['multi_iM']['Value'])
    
def show_figure(fig):
    """Attach ``fig`` to the canvas of a newly created pyplot figure.

    A throwaway figure is created through pyplot, and its canvas manager is
    re-pointed at ``fig`` so that ``fig`` is the figure held by that canvas.

    Parameters
    ----------
    fig : figure object to attach; it must provide ``set_canvas``.
    """
    placeholder = plt.figure()
    manager = placeholder.canvas.manager

    # swap the placeholder out of the canvas and hand the canvas to fig
    manager.canvas.figure = fig
    fig.set_canvas(manager.canvas)

In [None]:
# multiprocess run

# overall setup: when saving, create the next numbered run folder
if stable_flags['save_results']:
    saved_runs_dir = '../data/saved_run_results'
    os.makedirs(saved_runs_dir, exist_ok = True)
    # only entries named exactly run_<integer> count as runs; any other entry
    # containing "run" would otherwise crash the int() conversion below
    folders = [
        val for val in os.listdir(saved_runs_dir)
        if val.startswith('run_') and val[len('run_'):].isdecimal()
    ]
    run_ct = max([int(val[len('run_'):]) for val in folders], default = 0) + 1
    new_run_folder = saved_runs_dir+'/run_'+str(run_ct)
    os.mkdir(new_run_folder)
    
    # put the flags df in there
    flags_df.to_csv(new_run_folder+'/saved_flags.csv')

# setup inputs
# the cell constants and the flags copied from stable_flags are identical for
# every gene, so build them once instead of once per gene
cell_constants = {
    name : stable_flags['cell_constants_'+name]
    for name in ['RNAP', 'mRNA_total', 'cell_volume', 'kDeg', 'promoterConcVal', 'u']
}
shared_flags = {}
for key in ['run_basal_calculations', 'eq_str', 'save_results', 'include_Amy_samples',
            'only_check_KdRNAPCrp', 'only_create_ratios', 'KdRNAPCrp_sanity',
            'remove_outliers', 'case', 'drop_basal_conds']:
    shared_flags[key] = stable_flags[key]
    if key == 'save_results' and stable_flags['save_results']:
        # tell each worker where this run's output folder is
        shared_flags['save_results_folder'] = new_run_folder

gene_flags = []
for gene in flags_df.index:
    # start from the gene's own row of flags
    temp_flags = dict(flags_df.loc[gene])
    temp_flags['central_gene'] = gene
    
    # need to convert some flags from strings to lists
    for col in ['basal_conditions', 'target_range', 'cActivator', 'cInhibitor']:
        temp_flags[col] = ast.literal_eval(temp_flags[col])
    
    # convert cell constants into a dictionary (a separate copy per gene)
    temp_flags['cell_constants'] = dict(cell_constants)
    
    # convert some additional flags over
    temp_flags.update(shared_flags)

    # add flags to run directory
    gene_flags.append(temp_flags)

# run pool
pool = multiprocessing.Pool(processes = 8)
try:
    results = pool.map(cdg.create_data_for_gene, gene_flags)
finally:
    # always release the worker processes, even when a gene raises inside map
    pool.close()
    pool.join()

In [None]:
# display gene specific results
# display an overall plot first
if False: # this figure only works correctly in some cases, remove for now
    # NOTE: `genes` and `flags` are not defined in the visible cells; they need to
    # be defined before this branch is re-enabled
    fig = plt.figure(figsize = (3, 3))
    for gene in genes:
        file = gene+'_zerod'+str(flags['use_zerod_A_matrix'])+'_cAct_cInh_vals.csv'
        vals_df = pd.read_csv('../data/save_for_GAMs/'+file, index_col = 0)
        plt.scatter(vals_df['cAct'], vals_df['cInh'], label = gene, alpha = 0.3)
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel('cActivator')
    plt.ylabel('cInhibitor')
    plt.legend()
    plt.title('All cActivator and cInhibitor Values')
    plt.show()

# locate the most recent saved run folder (the highest run_<integer>)
saved_runs_dir = '../data/saved_run_results'
# a missing results directory is treated as "no runs yet" so this cell still
# works when nothing has ever been saved; entries that are not run_<integer> are ignored
folders = [
    val for val in (os.listdir(saved_runs_dir) if os.path.isdir(saved_runs_dir) else [])
    if val.startswith('run_') and val[len('run_'):].isdecimal()
]
run_ct = max([int(val[len('run_'):]) for val in folders], default = 1)
new_run_folder = saved_runs_dir+'/run_'+str(run_ct)
    
# display gene specific plot
if stable_flags['save_results']:
    # reload the figures pickled by the run; otherwise `results` from the
    # multiprocess run cell is used as-is
    results = []
    for gene in flags_df.index.to_list():
        # NOTE: pickle.load can execute arbitrary code, only load files written by this project
        with open(new_run_folder+'/'+gene+'/figures.pkl', 'rb') as pickle_in:
            results.append(pickle.load(pickle_in))
for gene, result in zip(flags_df.index.to_list(), results):
    # title the first figure with the gene and its act_iM / inh_iM flags
    result[0].suptitle(gene+' : '+str(flags_df.loc[gene, 'act_iM'])+', '+str(flags_df.loc[gene, 'inh_iM']), fontsize = 24)
    result[0].tight_layout()
    for res in result:
        show_figure(res)