In [46]:
# imports and loadings
import os
import sys
sys.path.insert(0, '../functions/')
import mRNA_ratios as mr
import create_data_for_single_gene as cdg

import math
import pandas as pd
import ast
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# settings
test_case = 'Lrp___nan'
    
# below are the default flags used if nothing is pre-set
# set flags by editing the "saved_flags.csv" in the ../data folder
flags_df = pd.read_csv('../data/saved_flags_expanded_curated.csv', index_col = 0)
flags_df = flags_df[flags_df['include'] == True]
keep = []
for index, row in flags_df.iterrows():
    case = str(row['act_iM'])+'___'+str(row['inh_iM'])
    if case == test_case:
        keep.append(index)
flags_df = flags_df.loc[keep]
genes = flags_df.index.to_list()

t_half_life_deg = 300
stable_flags = { # these do not change gene by gene
    # overall
    'only_create_ratios' : True,
    'only_check_KdRNAPCrp' : True, # if True, quit out of code after generating KdRNAPCrp, done to see if it is generating valid values through sanity check plots
    'save_results' : True, # saves resulting figures and cAct/cInh values of the previous run to the save_results_run folder
    'include_Amy_samples' : True, # append on Amy's stationary phase samples to analysis
    'remove_outliers' : True, # removes samples that do not correlate well with others, see ../data_cleaning/1_locate_outliers_to_drop.ipynb
    'case' : False, # only used for remove_outliers right now
    'drop_basal_conds' : False,
    
    # KdRNAPCrp optimization
    'KdRNAPCrp_sanity' : True, # if True, return sanity plots from this optimization
    # GAMs
    'supress_output' : False,
    'use_greedy' : True, # use the greedy algo values (if False, uses the results of the GA)
    'run_on_all' : False, # run on all genes that are in the saved output folder
    'limit_samples' : genes, #['b1101', 'b1817', 'b1818', 'b1819'], # if run_on_all is False, limit to these samples (or which of them are available)
    'delete_old' : True,
    'run_seperate' : False, # run cActivator and cInhibitor solvers seperately
    
    # input constants for GAMs (all get logged inside GAMs so pass in un-logged)
    'act_TF_conc_lo' : 1e-10,
    'act_TF_conc_up' : 1e-5,
    'act_Kd_lo' : 1e-10,
    'act_Kd_up' : 1e-6,
    'inh_TF_conc_lo' : 1e-10,
    'inh_TF_conc_up' : 1e-5,
    'inh_Kd_lo' : 1e-10,
    'inh_Kd_up' : 1e-6,
    # objective function weightings
    'weight_act_obj1' : 1,
    'weight_inh_obj1' : 1,
    'weight_act_obj2' : 0,
    'weight_inh_obj2' : 0,
    'weight_mRNA_match' : .1,
    'weight_act_corr' : 0.00000000000000001,
    'weight_inh_corr' : 0.00000000000000001,
    
    
    # misc
    'eq_str' : 'Eq(mRNARatio,((cActivator*KdRNAP + KdRNAPCrp)*(KdRNAP + RNAP + \
            KeqOpening*RNAP))/((1 + cActivator + cInhibitor)*KdRNAP*KdRNAPCrp + \
            cActivator*KdRNAP*(1 + KeqOpening)*RNAP + KdRNAPCrp*(1 + \
            KeqOpening)*RNAP))',
    
    # cell_constants'
    'cell_constants_RNAP': 10**-6,
    'cell_constants_mRNA_total': 1800, # Total mRNA/cell from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3554401
    'cell_constants_cell_volume': 10**-15, # Liters from https://bionumbers.hms.harvard.edu/bionumber.aspx?id=100004&ver=19
    'cell_constants_kDeg': np.log(2)/t_half_life_deg, # Rate of degradation
    'cell_constants_promoterConcVal': 10**-9, # Promoter concentration
    'cell_constants_u': 1/3600, # Growth rate
}


def show_figure(fig):

    # create a dummy figure and use its
    # manager to display "fig"

    dummy = plt.figure()
    new_manager = dummy.canvas.manager
    new_manager.canvas.figure = fig
    fig.set_canvas(new_manager.canvas)

# fixing up saved_flags to work for new values
for index, row in flags_df.iterrows():
    if type(row['basal_conditions']) == float and math.isnan(row['basal_conditions']):
        flags_df.at[index, 'basal_conditions'] = "[\'p1k_00001\', \'p1k_00002\']"
    if type(row['target_range']) == float and math.isnan(row['target_range']):
        flags_df.at[index, 'target_range'] = "[-1, 3]"
    if type(row['cActivator']) == float and math.isnan(row['cActivator']):
        flags_df.at[index, 'cActivator'] = "[-4, 2]"
    if type(row['cInhibitor']) == float and math.isnan(row['cInhibitor']):
        flags_df.at[index, 'cInhibitor'] = "[-4, 2]"
    flags_df.at[index, 'force_rerun'] = True
    flags_df.at[index, 'basal_or_hard_val'] = 'basal'
if False: # I'm not sure what this was for, but it errors now
    # for now, drop samples with differing act_inh_combos
    combos = [(row['act_iM'], row['inh_iM']) for _, row in flags_df.iterrows()]
    combo_to_ct = {combo : combos.count(combo) for combo in combos}
    top_combo = [k for k, _ in sorted(combo_to_ct.items(), key = lambda k : k[1])][-1]
    keep = []
    for index, row in flags_df.iterrows():
        if (row['act_iM'], row['inh_iM']) == top_combo:
            keep.append(index)
    flags_df = flags_df.loc[keep]

# new way - automatic

In [2]:
# settings
input_file = '../data/saved_flags_expanded_curated.csv'

In [3]:
# load log tpm
# loading
log_tpm_df = pd.read_csv('../data/precise_1k/log_tpm.csv', index_col = 0)
starve_log_tpm = pd.read_csv('../data/validation_data_sets/stationary_phase/cleaned_log_tpm_qc.csv', index_col = 0)
to_blank_inds = list(set(log_tpm_df.index) - set(starve_log_tpm.index))
# need to create zero rows for missing values
zeros_data = {col : 0 for col in starve_log_tpm.columns}
zeros_df = pd.DataFrame(zeros_data, index = to_blank_inds)
starve_log_tpm = pd.concat([starve_log_tpm, zeros_df])
starve_log_tpm = starve_log_tpm.loc[log_tpm_df.index]
log_tpm_df = pd.concat([starve_log_tpm, log_tpm_df], axis = 1)

input_df = pd.read_csv(input_file, index_col = 0)

# loop through each case, find the best basal
act_inh_combos = list(set([(row['act_iM'], row['inh_iM']) for _, row in input_df.iterrows()]))
case_to_basal = {}
for act, inh in act_inh_combos:
    case = str(act)+'___'+str(inh)
    result_df = input_df[(input_df['act_iM'] == act) | (pd.isna(input_df['act_iM']) & pd.isna(act))]
    result_df = result_df[(result_df['inh_iM'] == inh) | (pd.isna(result_df['inh_iM']) & pd.isna(inh))]
    genes = result_df.index.to_list()

    # scale and normalize it
    bby = log_tpm_df.loc[genes].copy()
    to_drop = []
    for col in bby.columns:
        if (bby[col] == 0).any():
            to_drop.append(col)
    bby = bby.drop(to_drop, axis=1)
    df = 2**bby.T
    normalized_df=(df-df.mean())/df.std()
    normalized_df = normalized_df
    normalized_df['avg_exp'] = normalized_df.mean(axis = 1)
    normalized_df['abs_avg_exp'] = abs(normalized_df['avg_exp'])
    
    # check if activator, inhibitor, or both
    if pd.isna(act) and not pd.isna(inh):
        # inhibitor case, pick the top expressed example to be basal
        basal = [normalized_df.sort_values(by = 'avg_exp').index[-1]]
    elif pd.isna(inh) and not pd.isna(act):
        # activator case, pick the least expressed example to be basal
        basal = [normalized_df.sort_values(by = 'avg_exp').index[0]]
    elif not pd.isna(inh) and not pd.isna(act):
        # both case, pick the most average expressed example to be basal
        basal = [normalized_df.sort_values(by = 'abs_avg_exp').index[0]]
    case_to_basal.update({(act, inh) : [str(basal[0])]})

# now add the new values to the dataframe
basals = []
for _, row in input_df.iterrows():
    case = str(act)+'___'+str(inh)
    basals.append(case_to_basal[case])
input_df['basal_conditions'] = basals

In [7]:
# save it off
input_df.to_csv(input_file)

# old way - manually for each

In [2]:
# DONT RUN THIS NORMALLY multiprocess run
gene = genes[0]

# overall setup
folders = [val for val in os.listdir('../data/saved_run_results') if 'run' in val]
if len(folders) == 0:
    run_ct = 1
else:
    run_ct = max([int(val.split('_')[1]) for val in folders]) + 1
new_run_folder = '../data/saved_run_results/run_'+str(run_ct)
os.mkdir(new_run_folder)

# put the flags df in there
flags_df.to_csv(new_run_folder+'/saved_flags.csv')

# setup inputs
temp_flags = dict(flags_df.loc[gene])

# need to convert some flags from strings to lists
for col in ['basal_conditions', 'target_range', 'cActivator', 'cInhibitor']:
    temp_flags[col] = ast.literal_eval(temp_flags[col])

# convert cell constants into a dictionary
temp_flags.update({'cell_constants' : {
    'RNAP' : stable_flags['cell_constants_RNAP'],
    'mRNA_total' : stable_flags['cell_constants_mRNA_total'],
    'cell_volume' : stable_flags['cell_constants_cell_volume'],
    'kDeg' : stable_flags['cell_constants_kDeg'],
    'promoterConcVal' : stable_flags['cell_constants_promoterConcVal'],
    'u' : stable_flags['cell_constants_u'],
    'mRNA_total' : stable_flags['cell_constants_mRNA_total'],
}})
temp_flags.update({'eq_str' : stable_flags['eq_str']})
temp_flags.update({'save_results' : stable_flags['save_results']})
temp_flags.update({'save_results_folder' : new_run_folder})
temp_flags.update({'include_Amy_samples' : stable_flags['include_Amy_samples']})
temp_flags.update({'only_check_KdRNAPCrp' : stable_flags['only_check_KdRNAPCrp']})
temp_flags.update({'only_create_ratios' : stable_flags['only_create_ratios']})
temp_flags.update({'KdRNAPCrp_sanity' : stable_flags['KdRNAPCrp_sanity']})
temp_flags.update({'remove_outliers' : stable_flags['remove_outliers']})
temp_flags.update({'case' : stable_flags['case']})
temp_flags.update({'central_gene' : gene})
temp_flags.update({'drop_basal_conds' : stable_flags['drop_basal_conds']})

ret_figs = cdg.create_data_for_gene(temp_flags)
ratios_df = pd.read_pickle(new_run_folder+'/'+gene+'/ratios_df.pkl')

In [47]:
# load log tpm
log_tpm_df = pd.read_csv('../data/precise_1k/log_tpm.csv', index_col = 0)
starve_log_tpm = pd.read_csv('../data/validation_data_sets/stationary_phase/cleaned_log_tpm_qc.csv', index_col = 0)
to_blank_inds = list(set(log_tpm_df.index) - set(starve_log_tpm.index))
# need to create zero rows for missing values
zeros_data = {col : 0 for col in starve_log_tpm.columns}
zeros_df = pd.DataFrame(zeros_data, index = to_blank_inds)
starve_log_tpm = pd.concat([starve_log_tpm, zeros_df])
starve_log_tpm = starve_log_tpm.loc[log_tpm_df.index]
log_tpm_df = pd.concat([starve_log_tpm, log_tpm_df], axis = 1)

In [48]:
# scale and normalize it
bby = log_tpm_df.loc[genes].copy()
to_drop = []
for col in bby.columns:
    if (bby[col] == 0).any():
        to_drop.append(col)
bby = bby.drop(to_drop, axis=1)
df = 2**bby.T
normalized_df=(df-df.mean())/df.std()
normalized_df = normalized_df
normalized_df['avg_exp'] = normalized_df.mean(axis = 1)
normalized_df['abs_avg_exp'] = abs(normalized_df['avg_exp'])
normalized_df.sort_values(by = 'avg_exp')

Unnamed: 0,b1798,b2209,b2845,b3673,avg_exp,abs_avg_exp
p1k_00273,-1.284231,-0.650153,-0.395246,-0.259056,-0.647171,0.647171
p1k_00274,-1.236331,-0.618431,-0.381028,-0.229766,-0.616389,0.616389
p1k_00126,-1.270021,-0.577635,-0.366929,-0.243714,-0.614575,0.614575
p1k_00110,-1.167677,-0.574620,-0.397905,-0.245379,-0.596396,0.596396
p1k_00856,-1.334414,-0.260384,-0.529418,-0.238728,-0.590736,0.590736
...,...,...,...,...,...,...
p1k_00865,5.376918,1.550833,6.297124,2.458298,3.920793,3.920793
p1k_00864,5.475190,1.523514,6.926638,2.442729,4.092018,4.092018
p1k_00103,1.166757,1.655468,5.341748,9.004250,4.292056,4.292056
p1k_00104,1.117076,1.915905,6.045714,9.375887,4.613646,4.613646


In [8]:
# save it off to manually add to the normal saved_flags.csv
flags_df.to_csv('../data/exported_flags_df.csv')

# greatly expanded and more automated search

In [31]:
# load in 
ext_flags_df = pd.read_csv('../data/saved_flags_expanded.csv', index_col = 0)

# load log tpm
log_tpm_df = pd.read_csv('../data/precise_1k/corrected/PRECISE_1K_log_tpm_basal.csv', index_col = 0)
starve_log_tpm = pd.read_csv('../data/validation_data_sets/stationary_phase/cleaned_log_tpm_qc.csv', index_col = 0)
to_blank_inds = list(set(log_tpm_df.index) - set(starve_log_tpm.index))
# need to create zero rows for missing values
zeros_data = {col : 0 for col in starve_log_tpm.columns}
zeros_df = pd.DataFrame(zeros_data, index = to_blank_inds)
starve_log_tpm = pd.concat([starve_log_tpm, zeros_df])
starve_log_tpm = starve_log_tpm.loc[log_tpm_df.index]
log_tpm_df = pd.concat([starve_log_tpm, log_tpm_df], axis = 1)

gene_to_basal_conds = {}
for act_iM in set(ext_flags_df.act_iM):
    # first let's filter out genes with activator and inhibitor
    baby_df = ext_flags_df[ext_flags_df['act_iM'] == act_iM]
    baby_df = baby_df[[type(val) == float for val in baby_df['inh_iM']]]

    # scale and normalize it
    df = 2**log_tpm_df.loc[baby_df.index].T
    normalized_df=(df-df.mean())/df.std()
    normalized_df = normalized_df
    normalized_df['avg_exp'] = normalized_df.mean(axis = 1)
    normalized_df = normalized_df.sort_values(by = 'avg_exp')
    
    # it's an activator, so we want the lowest expression as basal
    basal = normalized_df.index[0]
    
    for gene in baby_df.index:
        gene_to_basal_conds.update({gene : [basal]})
for inh_iM in set(ext_flags_df.inh_iM):
    # first let's filter out genes with activator and inhibitor
    baby_df = ext_flags_df[ext_flags_df['inh_iM'] == inh_iM]
    baby_df = baby_df[[type(val) == float for val in baby_df['act_iM']]]

    # scale and normalize it
    df = 2**log_tpm_df.loc[baby_df.index].T
    normalized_df=(df-df.mean())/df.std()
    normalized_df = normalized_df
    normalized_df['avg_exp'] = normalized_df.mean(axis = 1)
    normalized_df = normalized_df.sort_values(by = 'avg_exp')
    
    # it's an inhibitor, so we want the highest expression as basal
    basal = normalized_df.index[-1]
    
    for gene in baby_df.index:
        gene_to_basal_conds.update({gene : [basal]})
        
missing = list(set(ext_flags_df.index) - set(gene_to_basal_conds.keys()))
act_inh_iMs = set([(row['act_iM'], row['inh_iM']) for _, row in ext_flags_df.loc[missing].iterrows()])
for act_iM, inh_iM in act_inh_iMs:
    # gather all matching
    baby_df = ext_flags_df[ext_flags_df['act_iM'] == act_iM]
    baby_df = baby_df[baby_df['inh_iM'] == inh_iM]
    
    # scale and normalize it
    df = 2**log_tpm_df.loc[baby_df.index].T
    normalized_df=(df-df.mean())/df.std()
    normalized_df = normalized_df
    normalized_df['avg_exp'] = normalized_df.mean(axis = 1)
    normalized_df = normalized_df.sort_values(by = 'avg_exp')
    
    # this is both activator and inhibitor, for now let's take the median expression value
    basal = normalized_df.loc[normalized_df['avg_exp'] == normalized_df['avg_exp'].median()].index[0]
    
    for gene in baby_df.index:
        gene_to_basal_conds.update({gene : [basal]})

baby_ext_flags_df = ext_flags_df.loc[gene_to_basal_conds.keys()]
baby_ext_flags_df['basal_conditions'] = gene_to_basal_conds.values()
baby_ext_flags_df.to_csv('../data/saved_flags_expanded_filtered.csv')

In [38]:
missing = list(set(ext_flags_df.index) - set(gene_to_basal_conds.keys()))
ext_flags_df.loc[missing].sort_values(by = 'inh_iM')

# eh - let's deal with these later, picking basal conditions for them is much more complex than worth dealing with right now

Unnamed: 0,include,force_rerun,sanity_plots,act_iM,inh_iM,basal_conditions,grid_use,basal_bool,basal_or_hard_val,hard_val,...,lambda_,cxpb,cx_prob,mutpb,mt_prob,n_gen,verbose,n_iter,max_steps,n_rounds
b1298,False,True,True,Putrescine,ArcA,,8,False,basal,10,...,100,0.6,0.6,0.4,0.1,100,False,5,30,100
b0313,False,True,True,minicoli KOs,ArcA,,8,False,basal,10,...,100,0.6,0.6,0.4,0.1,100,False,5,30,100
b2980,False,True,True,Crp-1,ArcA,,8,False,basal,10,...,100,0.6,0.6,0.4,0.1,100,False,5,30,100
b1276,False,True,True,SoxS,ArcA,,8,False,basal,10,...,100,0.6,0.6,0.4,0.1,100,False,5,30,100
b0314,False,True,True,minicoli KOs,ArcA,,8,False,basal,10,...,100,0.6,0.6,0.4,0.1,100,False,5,30,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
b0970,False,True,True,CpxR,nquinone ALE 3,,8,False,basal,10,...,100,0.6,0.6,0.4,0.1,100,False,5,30,100
b2343,False,True,True,Fnr-3,nquinone ALE 3,,8,False,basal,10,...,100,0.6,0.6,0.4,0.1,100,False,5,30,100
b1641,False,True,True,Magnesium,nquinone ALE 3,,8,False,basal,10,...,100,0.6,0.6,0.4,0.1,100,False,5,30,100
b0168,False,True,True,SoxS,nquinone ALE 3,,8,False,basal,10,...,100,0.6,0.6,0.4,0.1,100,False,5,30,100


In [29]:
normalized_df.index[0]

'p1k_00844'

In [10]:
float('nan')

nan

In [14]:
type(baby_df['inh_iM'].values[0])

float