In [1]:
# import and loading
import pandas as pd

# Switches for optional inputs to this notebook.
stable_flags = {
    'include_Amy_samples' : True,  # also merge in the stationary-phase samples
}

# Per-gene measurements (rows) under several conditions (columns).
# NOTE(review): the file name suggests protein concentrations — confirm units.
metab_df = pd.read_excel('../data/validation_data_sets/heinemann_protein_conc.xlsx', index_col = 0)
# 'Galactose ' keeps its trailing space: it is used verbatim as a column label below.
metab_cols = ['Glucose', 'Acetate', 'Glycerol', 'Pyruvate', 'Galactose ', 'Succinate', 'Fructose']

# Load log-scale TPM (it is un-logged further down via 2**log_tpm_df).
log_tpm_df = pd.read_csv('../data/precise_1.0/log_tpm.csv', index_col = 0)
if stable_flags['include_Amy_samples']:
    starve_log_tpm = pd.read_csv('../data/validation_data_sets/stationary_phase/cleaned_log_tpm_qc.csv', index_col = 0)
    # Align the extra samples to the reference gene index: genes missing from
    # the extra data become all-zero rows, genes absent from the reference are
    # dropped.
    # NOTE(review): a log value of 0 becomes 2**0 == 1 after un-logging —
    # confirm this is the intended placeholder for missing genes.
    starve_log_tpm = starve_log_tpm.reindex(log_tpm_df.index, fill_value = 0)
    log_tpm_df = pd.concat([starve_log_tpm, log_tpm_df], axis = 1)

# Undo the log transform.
tpm_df = 2**log_tpm_df

# Keep only genes present in both tables. Index.intersection gives a
# deterministic order (unlike set iteration), so the row order of the
# downstream frames and of the exported CSV is reproducible between runs.
overlap = list(tpm_df.index.intersection(metab_df.index))
metab_df = metab_df.loc[overlap]
tpm_df = tpm_df.loc[overlap]

In [2]:
# Min-max scale each gene's TPM profile onto that gene's own [min, max] range
# in metab_df[metab_cols]:
#     new = (tpm - tpm_min) / (tpm_max - tpm_min) * (max_val - min_val) + min_val
# Done with row-wise vectorised arithmetic instead of a per-cell Python loop.
# pandas' min/max skip NaN, whereas the builtin min()/max() give
# order-dependent results when a row contains NaN.
metab_vals = metab_df.loc[tpm_df.index, metab_cols]
metab_min = metab_vals.min(axis = 1)
metab_max = metab_vals.max(axis = 1)

tpm_min = tpm_df.min(axis = 1)
tpm_max = tpm_df.max(axis = 1)

# Genes whose TPM is constant across all samples give 0/0 here and therefore
# end up as NaN rows in the output.
scaled_tpm = tpm_df.sub(tpm_min, axis = 0).div(tpm_max - tpm_min, axis = 0)
new_df = scaled_tpm.mul(metab_max - metab_min, axis = 0).add(metab_min, axis = 0)

# Genes with a flat target range carry no usable information: drop them.
to_rem = list(metab_min.index[metab_min == metab_max])
new_df = new_df.drop(index = to_rem)
new_df.to_csv('../data/validation_data_sets/converted_log_tpm_in_M.csv')