In [4]:
# imports and loadings
import os
import sys
sys.path.insert(0, '../functions/')
import mRNA_ratios as mr
import create_data_for_single_gene as cdg

import math
import pandas as pd
import ast
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# settings
testing = True

# load in settings flags: the test flags file is used while `testing` is True
flags_filepath = '../options/test_gene_flags.csv' if testing else '../options/gene_flags.csv'
flags_df = pd.read_csv(flags_filepath, index_col = 0)

# keep only the genes that are flagged for inclusion
flags_df = flags_df[flags_df['include'] == True]

# keep only the genes whose "<act_iM>___<inh_iM>" label equals test_case
# NOTE(review): `test_case` is not defined anywhere in the visible cells, so on a
# fresh kernel this raises NameError as soon as flags_df has a row — confirm where
# it is meant to be set.
keep = [
    index
    for index, row in flags_df.iterrows()
    if str(row['act_iM'])+'___'+str(row['inh_iM']) == test_case
]
flags_df = flags_df.loc[keep]
genes = flags_df.index.to_list()

# below are the default flags used if nothing is pre-set
# t_half_life_deg feeds 'cell_constants_kDeg' below as ln(2) / t_half_life_deg.
# NOTE(review): units are not stated here — presumably seconds, TODO confirm.
t_half_life_deg = 300
stable_flags = { # these do not change gene by gene
    # overall
    'only_create_ratios' : True,
    'only_check_KdRNAPCrp' : True, # if True, quit out of code after generating KdRNAPCrp, done to see if it is generating valid values through sanity check plots
    'save_results' : True, # saves resulting figures and cAct/cInh values of the previous run to the save_results_run folder
    'include_Amy_samples' : True, # append on Amy's stationary phase samples to analysis
    'remove_outliers' : True, # removes samples that do not correlate well with others, see ../data_cleaning/1_locate_outliers_to_drop.ipynb
    'case' : False, # only used for remove_outliers right now
    'drop_basal_conds' : False,
    
    # KdRNAPCrp optimization
    'KdRNAPCrp_sanity' : True, # if True, return sanity plots from this optimization
    # GAMs
    'supress_output' : False,
    'use_greedy' : True, # use the greedy algorithm values (if False, uses the results of the GA)
    'run_on_all' : False, # run on all genes that are in the saved output folder
    'limit_samples' : genes, #['b1101', 'b1817', 'b1818', 'b1819'], # if run_on_all is False, limit to these samples (or which of them are available); `genes` is the index of the filtered flags_df
    'delete_old' : True,
    'run_seperate' : False, # run cActivator and cInhibitor solvers separately
    
    # input constants for GAMs (all get logged inside GAMs so pass in un-logged)
    'act_TF_conc_lo' : 1e-10,
    'act_TF_conc_up' : 1e-5,
    'act_Kd_lo' : 1e-10,
    'act_Kd_up' : 1e-6,
    'inh_TF_conc_lo' : 1e-10,
    'inh_TF_conc_up' : 1e-5,
    'inh_Kd_lo' : 1e-10,
    'inh_Kd_up' : 1e-6,
    # objective function weightings
    # (the two *_corr weights are 1e-17, i.e. negligible next to the others —
    #  presumably kept non-zero on purpose, TODO confirm)
    'weight_act_obj1' : 1,
    'weight_inh_obj1' : 1,
    'weight_act_obj2' : 0,
    'weight_inh_obj2' : 0,
    'weight_mRNA_match' : .1,
    'weight_act_corr' : 0.00000000000000001,
    'weight_inh_corr' : 0.00000000000000001,
    
    
    # misc
    # equation relating mRNARatio to cActivator / cInhibitor, stored as a string.
    # The backslash line continuations sit inside the string literal, so the
    # leading spaces of the continued lines are part of the stored string.
    'eq_str' : 'Eq(mRNARatio,((cActivator*KdRNAP + KdRNAPCrp)*(KdRNAP + RNAP + \
            KeqOpening*RNAP))/((1 + cActivator + cInhibitor)*KdRNAP*KdRNAPCrp + \
            cActivator*KdRNAP*(1 + KeqOpening)*RNAP + KdRNAPCrp*(1 + \
            KeqOpening)*RNAP))',
    
    # cell constants
    'cell_constants_RNAP': 10**-6,
    'cell_constants_mRNA_total': 1800, # Total mRNA/cell from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3554401
    'cell_constants_cell_volume': 10**-15, # Liters from https://bionumbers.hms.harvard.edu/bionumber.aspx?id=100004&ver=19
    'cell_constants_kDeg': np.log(2)/t_half_life_deg, # Rate of degradation, ln(2) / half-life
    'cell_constants_promoterConcVal': 10**-9, # Promoter concentration
    'cell_constants_u': 1/3600, # Growth rate
}

# fixing up saved_flags to work for new values
# Per-gene flags that may be blank in the flags CSV, mapped to the string default
# written in their place.
missing_flag_defaults = {
    'basal_conditions' : "[\'p1k_00001\', \'p1k_00002\']",
    'target_range' : "[-1, 3]",
    'cActivator' : "[-4, 2]",
    'cInhibitor' : "[-4, 2]",
}

# work on an independent frame so the column assignments below never write
# through to (or warn about) the filtered selection flags_df was built from
flags_df = flags_df.copy()

for flag_name, default_value in missing_flag_defaults.items():
    # Cast to object first: a column that is blank for every gene is read as
    # float64, and storing a string default in it would be a dtype mismatch.
    # isna() catches every NaN flavour, including numpy.float64 values that a
    # `type(x) == float` test does not match.
    flag_values = flags_df[flag_name].astype(object)
    flags_df[flag_name] = flag_values.mask(flag_values.isna(), default_value)

# these two flags are forced for every gene, overriding whatever the file held
flags_df['force_rerun'] = True
flags_df['basal_or_hard_val'] = 'basal'

# function to enable display of pickled figures
def show_figure(fig):
    """Attach `fig` to the canvas manager of a freshly created dummy figure.

    A new pyplot figure is created only to obtain a canvas manager; that
    manager's canvas is then pointed at `fig`, and `fig` is told to use the
    same canvas.

    Parameters
    ----------
    fig : the figure object to hook up (e.g. one loaded from a pickle).

    Returns
    -------
    None
    """
    manager = plt.figure().canvas.manager
    manager.canvas.figure = fig
    fig.set_canvas(manager.canvas)

# set basal conditions

In [14]:
# load log tpm
# loading
log_tpm_df = pd.read_csv('../data/external/imodulon_info/log_tpm.csv', index_col = 0)
starve_log_tpm = pd.read_csv('../data/external/validation_data_sets/stationary_phase/cleaned_log_tpm_qc.csv', index_col = 0)
# align the stationary-phase table to the main gene index; genes it does not
# contain become all-zero rows (zero is the "missing" marker filtered on below)
starve_log_tpm = starve_log_tpm.reindex(log_tpm_df.index, fill_value = 0)
log_tpm_df = pd.concat([starve_log_tpm, log_tpm_df], axis = 1)

input_df = pd.read_csv(flags_filepath, index_col = 0)


def _regulator_or_none(value):
    """Return None for a missing (NaN) iModulon entry, otherwise `value` unchanged."""
    return None if pd.isna(value) else value


# One (activator, inhibitor) key per gene. Missing regulators are stored as None
# rather than NaN: NaN never compares equal to itself, so NaN-bearing tuples are
# unreliable as set members / dict keys.
row_cases = [
    (_regulator_or_none(act), _regulator_or_none(inh))
    for act, inh in zip(input_df['act_iM'], input_df['inh_iM'])
]

# loop through each case, find the best basal
# case_to_basal maps (act, inh) -> one-element list holding the basal sample name;
# a missing regulator appears as None in the key.
case_to_basal = {}
for act, inh in dict.fromkeys(row_cases): # unique cases, in first-seen order
    if act is None and inh is None:
        # no regulator at all: none of the rules below applies, so these genes
        # keep whatever basal_conditions the flags file already had
        continue

    # NOTE: `genes` is rebound on every iteration and is left holding the last
    # processed case after the loop.
    genes = [gene for gene, case in zip(input_df.index, row_cases) if case == (act, inh)]

    # scale and normalize it
    bby = log_tpm_df.loc[genes].copy()
    # discard every sample (column) in which any gene of this case has a zero
    bby = bby.loc[:, ~(bby == 0).any()]
    df = 2**bby.T # undo the log2: rows are samples, columns are genes
    normalized_df = (df-df.mean())/df.std() # z-score each gene across samples
    normalized_df['avg_exp'] = normalized_df.mean(axis = 1)
    normalized_df['abs_avg_exp'] = abs(normalized_df['avg_exp'])

    # check if activator, inhibitor, or both
    if act is None:
        # inhibitor case, pick the top expressed example to be basal
        basal = [normalized_df.sort_values(by = 'avg_exp').index[-1]]
    elif inh is None:
        # activator case, pick the least expressed example to be basal
        basal = [normalized_df.sort_values(by = 'avg_exp').index[0]]
    else:
        # both case, pick the most average expressed example to be basal
        basal = [normalized_df.sort_values(by = 'abs_avg_exp').index[0]]
    case_to_basal[(act, inh)] = [str(basal[0])]

# now add the new values to the dataframe
if 'basal_conditions' in input_df.columns:
    previous_basals = input_df['basal_conditions'].to_list()
else:
    previous_basals = [np.nan] * len(input_df)
# genes whose case was skipped above fall back to their previous value
basals = [
    case_to_basal.get(case, previous)
    for case, previous in zip(row_cases, previous_basals)
]
input_df['basal_conditions'] = basals

In [17]:
# save it off
# Writes input_df, including its reassigned basal_conditions column, back to
# flags_filepath — the same file input_df was read from, so that file is
# overwritten.
input_df.to_csv(flags_filepath)

# manual investigation

In [55]:
# load log tpm
# NOTE(review): an earlier cell reads these two tables from '../data/external/...'
# paths instead — confirm which location is current.
log_tpm_df = pd.read_csv('../data/precise_1k/log_tpm.csv', index_col = 0)
starve_log_tpm = pd.read_csv('../data/validation_data_sets/stationary_phase/cleaned_log_tpm_qc.csv', index_col = 0)

# genes present in the main table but absent from the stationary-phase table
missing_genes = list(set(log_tpm_df.index).difference(starve_log_tpm.index))
# give those genes an all-zero row so both tables cover the same genes
zero_rows = pd.DataFrame(dict.fromkeys(starve_log_tpm.columns, 0), index = missing_genes)
# append the zero rows, then select rows in the main table's gene order
starve_log_tpm = pd.concat([starve_log_tpm, zero_rows]).loc[log_tpm_df.index]
# stationary-phase samples first, then the main samples, side by side
log_tpm_df = pd.concat([starve_log_tpm, log_tpm_df], axis = 1)

In [56]:
# scale and normalize it
# NOTE(review): `genes` is not set in this cell; it is whatever an earlier cell
# last bound it to (the basal-selection loop rebinds it on every iteration).
bby = log_tpm_df.loc[genes].copy()

# drop every sample (column) in which any of these genes has a zero value
zero_samples = [sample for sample in bby.columns if (bby[sample] == 0).any()]
bby = bby.drop(columns = zero_samples)

# undo the log2 and transpose: rows become samples, columns become genes
tpm_by_sample = 2**bby.T
# z-score each gene across samples
normalized_df = (tpm_by_sample - tpm_by_sample.mean()) / tpm_by_sample.std()
normalized_df['avg_exp'] = normalized_df.mean(axis = 1)
normalized_df['abs_avg_exp'] = normalized_df['avg_exp'].abs()
normalized_df.sort_values(by = 'avg_exp')

Unnamed: 0,b3212,b3213,avg_exp,abs_avg_exp
starve_series__t09_starve,-1.337435,-1.490348,-1.413892,1.413892
starve_series__t10_starve,-1.337266,-1.488756,-1.413011,1.413011
starve_series__t12_starve,-1.335823,-1.489076,-1.412449,1.412449
starve_series__t08_starve,-1.336562,-1.486377,-1.411469,1.411469
starve_series__t11_starve,-1.337509,-1.484775,-1.411142,1.411142
...,...,...,...,...
p1k_00036,3.780571,3.320629,3.550600,3.550600
p1k_00116,3.997203,3.307144,3.652173,3.652173
p1k_00117,4.421709,3.606013,4.013861,4.013861
p1k_00034,4.158309,4.109520,4.133914,4.133914
