In [17]:
# imports and loading
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import os
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
import sys
sys.path.insert(0, '../functions/')
import interface_GAMS as iG
import conversion_equations as ce
import pickle
import os
from matplotlib.colors import Normalize
import ast
import seaborn as sns

# settings
# Thresholds for the per-case sample filter applied further down: a sample is
# kept when it correlates above `corr_minimum` with more than `filter_minimum`
# (expressed as a fraction) of all samples.
filter_minimum = 0.05 # minimum fraction of samples that must clear the correlation cutoff
corr_minimum = 0.5 # correlation cutoff used together with filter_minimum

# load in settings flags
# settings.csv is indexed by setting name; the value is read from its 'Setting' column
settings_df = pd.read_csv('../options/settings.csv', index_col = 0)
flags_filepath = settings_df.loc['gene_flags_filepath', 'Setting']
TF_flags_filepath = settings_df.loc['TF_flags_filepath', 'Setting']

# load in a bunch of potentially useful files
# loading merged log_tpm_file
# merge together log_tpm_df files
log_tpm_df = pd.read_csv('../data/external/imodulon_info/log_tpm.csv', index_col = 0)
starve_log_tpm = pd.read_csv('../data/external/validation_data_sets/stationary_phase/cleaned_log_tpm_qc.csv', index_col = 0)
# align the stationary-phase table to the rows (and row order) of log_tpm_df:
# rows missing from the stationary-phase data become all-zero rows, and rows
# that do not appear in log_tpm_df are dropped
# NOTE(review): the 0-filled rows feed into the correlations computed later —
# confirm 0 (rather than NaN) is the intended placeholder
starve_log_tpm = starve_log_tpm.reindex(log_tpm_df.index, fill_value = 0)
# stationary-phase columns go first, followed by the original log_tpm columns
log_tpm_df = pd.concat([starve_log_tpm, log_tpm_df], axis = 1)
starve_metabs = pd.read_excel('../data/external/validation_data_sets/stationary_phase/metabolites_data.xlsx', index_col = 0)

# load in various other files
# NOTE: pickle.load can execute arbitrary code from the file; only load pickles
# that were generated locally / come from a trusted source
# the context manager closes the file even if unpickling raises
with open('../data/interim/misc_dictionaries/b_to_gene.pkl', 'rb') as pickle_in:
    b_to_gene = pickle.load(pickle_in)
# reverse lookup of b_to_gene (value -> key); if two keys share a value, the later one wins
gene_to_b = {v : k for k, v in b_to_gene.items()}
# flag tables, both indexed by their first column; paths come from settings.csv
flags_df = pd.read_csv(flags_filepath, index_col = 0)
TF_flags_df = pd.read_csv(TF_flags_filepath, index_col = 0)

In [8]:
# go case by case, look at expression of genes within said cases' genes
# A "case" is one (act_iM, inh_iM) pairing found in flags_df. For every case the
# columns of log_tpm_df (samples) are correlated with each other across that
# case's genes; a sample passes when it correlates above corr_minimum with more
# than filter_minimum (a fraction) of all samples.

# group the flagged genes by case label in a single pass over flags_df.
# Keying on the string label also merges rows whose iM entry is NaN: NaN != NaN,
# so a set of (act_iM, inh_iM) tuples is not guaranteed to de-duplicate them.
case_to_genes = {}
for gene, row in flags_df.iterrows():
    case = '__'.join([str(iM) for iM in (row['act_iM'], row['inh_iM'])]).replace(' ', '_').replace('/', '_')
    case_to_genes.setdefault(case, []).append(gene)

n_samples = len(log_tpm_df.columns)
case_to_mRNA_passed = {}
for case, genes in case_to_genes.items():
    # DataFrame.corr() correlates columns pairwise, so this is sample x sample
    sample_corr = log_tpm_df.loc[genes].corr()
    # per sample: how many samples it correlates with above the cutoff
    # (the diagonal counts too whenever the self-correlation is defined)
    n_correlated = (sample_corr > corr_minimum).sum()
    samples_keep = log_tpm_df.columns[n_correlated > filter_minimum * n_samples].to_list()
    print(f'{case} : dropping {n_samples - len(samples_keep)}')

    case_to_mRNA_passed[case] = samples_keep

Crp-2__DhaR : dropping 0
Fatty_Acid__nan : dropping 15
nan__Arginine : dropping 0


In [9]:
# save it off
# write the case -> passing-samples dictionary; the context manager flushes and
# closes the file even if pickling raises
with open('../data/interim/misc_dictionaries/case_to_mRNA_passed.pkl', 'wb') as pickle_out:
    pickle.dump(case_to_mRNA_passed, pickle_out)