In [4]:
# imports and loadings
import pandas as pd
import numpy as np

# --- run mode -----------------------------------------------------------
# True  -> read the test flags table
# False -> read the full flags table
testing = True

# per-gene settings flags; the first CSV column becomes the index
flags_filepath = '../options/test_gene_flags.csv' if testing else '../options/gene_flags.csv'
flags_df = pd.read_csv(flags_filepath, index_col = 0)

# --- expression data ----------------------------------------------------
# Two log-TPM tables are read and joined column-wise (one column per sample).
log_tpm_df = pd.read_csv(
    '../data/external/imodulon_info/log_tpm.csv',
    index_col = 0,
)
starve_log_tpm = pd.read_csv(
    '../data/external/validation_data_sets/stationary_phase/cleaned_log_tpm_qc.csv',
    index_col = 0,
)

# Rows present in log_tpm_df but absent from the stationary-phase table get
# an all-zero row (a log value of 0 becomes a TPM of 1 after the 2** below).
to_blank_inds = list(set(log_tpm_df.index).difference(starve_log_tpm.index))
zeros_data = dict.fromkeys(starve_log_tpm.columns, 0)
zeros_df = pd.DataFrame(zeros_data, index = to_blank_inds)

# Append the filler rows, then keep exactly the rows of log_tpm_df, in its
# order, so both tables line up before the column-wise concat.
starve_log_tpm = pd.concat([starve_log_tpm, zeros_df]).loc[log_tpm_df.index]
log_tpm_df = pd.concat([starve_log_tpm, log_tpm_df], axis = 1)

# undo the log2 transform
tpm_df = 2**log_tpm_df

# --- outlier cut-offs ---------------------------------------------------
# fractional rank positions used as the lower / upper clipping bounds
bott_pct = 0.01
top_pct = 0.99

# RNAP Concentration per sample

In [5]:
# Estimate a per-sample RNAP concentration from the expression of each gene
# in `genes`:
#   1. clip the gene's TPM values to the range between the bott_pct and
#      top_pct rank positions (outliers are set to those bounds),
#   2. express every sample as a fold change relative to the gene's median,
#   3. exaggerate that fold change by `fold_modify` (divide when below the
#      median, multiply when above), leaving a sample exactly at the median
#      untouched so the median maps onto RNAP_default,
#   4. scale by RNAP_default.
# Each gene yields one column of RNAP_df (one row per sample).
RNAP_default = 10**-6
fold_modify = 2 # modify the fold amount by this factor (divided by if below, multiplied by if above)

genes = ['b3987', 'b3988']
RNAP_df = pd.DataFrame(index = tpm_df.columns)
n_samples = len(tpm_df.columns)
for gene in genes:
    gene_tpm = tpm_df.loc[gene]

    # clipping bounds: values at the bott_pct / top_pct rank positions
    # (position clamped to the last element so a pct of 1.0 cannot run off the end)
    sorted_tpm = gene_tpm.sort_values()
    bott = sorted_tpm.iloc[min(int(bott_pct*n_samples), n_samples - 1)]
    top = sorted_tpm.iloc[min(int(top_pct*n_samples), n_samples - 1)]
    middle = np.median(gene_tpm)

    # set outliers to the min/max
    clipped = gene_tpm.clip(lower = bott, upper = top)

    # scale to fold change around the median, then widen it by fold_modify;
    # strict comparisons on both sides keep the median sample at a fold change of 1
    fold_change = clipped / middle
    fold_change = fold_change.mask(clipped < middle, fold_change / fold_modify)
    fold_change = fold_change.mask(clipped > middle, fold_change * fold_modify)

    # positional assignment (same order as tpm_df.columns), then scale by RNAP value
    RNAP_df[gene] = (RNAP_default * fold_change).to_numpy()

In [6]:
# Average the per-gene estimates across columns to get one RNAP value per
# sample, label that single column 'RNAP', and write it to disk.
RNAP_mean_df = RNAP_df.mean(axis = 1).to_frame(name = 'RNAP')
RNAP_mean_df.to_csv('../data/interim/sample_constants/RNAP_conc.csv')