In [45]:
# import and loading
import pandas as pd

stable_flags = {
    'include_Amy_samples' : True,
}

metab_df = pd.read_excel('../data/validation_data_sets/heinemann_protein_conc.xlsx', index_col = 0)
metab_cols = ['Glucose', 'Acetate', 'Glycerol', 'Pyruvate', 'Galactose ', 'Succinate', 'Fructose']

# load in un-logged tpm
if stable_flags['include_Amy_samples']:
    # merge together log_tpm_df files
    log_tpm_df = pd.read_csv('../data/precise_1.0/log_tpm.csv', index_col = 0)
    starve_log_tpm = pd.read_csv('../data/validation_data_sets/stationary_phase/cleaned_log_tpm_qc.csv', index_col = 0)
    to_blank_inds = list(set(log_tpm_df.index) - set(starve_log_tpm.index))
    # need to create zero rows for missing values
    zeros_data = {col : 0 for col in starve_log_tpm.columns}
    zeros_df = pd.DataFrame(zeros_data, index = to_blank_inds)
    starve_log_tpm = pd.concat([starve_log_tpm, zeros_df])
    starve_log_tpm = starve_log_tpm.loc[log_tpm_df.index]
    log_tpm_df = pd.concat([starve_log_tpm, log_tpm_df], axis = 1)
else:
    log_tpm_df = pd.read_csv('../data/precise_1.0/log_tpm.csv', index_col = 0)

tpm_df = 2**log_tpm_df
overlap = list(set(tpm_df.index).intersection(set(metab_df.index)))
metab_df = metab_df.loc[overlap]
tpm_df = tpm_df.loc[overlap]

In [46]:
# scale tpm_df by the minimum and maximum
new_df = pd.DataFrame(index = tpm_df.index, columns = tpm_df.columns)

to_rem = []
for gene in overlap:
    min_val = min(metab_df[metab_cols].loc[gene])
    max_val = max(metab_df[metab_cols].loc[gene])
    if min_val == max_val: # no good data, skip
        to_rem.append(gene)
    
    tpm_min_val = min(tpm_df.loc[gene])
    tpm_max_val = max(tpm_df.loc[gene])
    
    for sample in tpm_df.columns:
        scaled_tpm = (tpm_df.loc[gene][sample] - tpm_min_val) / (tpm_max_val - tpm_min_val)
        new_val = scaled_tpm*(max_val - min_val) + min_val
        new_df.at[gene, sample] = new_val
new_df = new_df.drop(index = to_rem)
new_df.to_csv('../data/validation_data_sets/converted_log_tpm_in_M.csv')

In [47]:
new_df

Unnamed: 0,starve_series__t00_growth1__1,starve_series__t00_growth1__2,starve_series__t00_growth1__3,starve_series__t00_growth1__4,starve_series__t01_starve__1,starve_series__t01_starve__2,starve_series__t02_starve__1,starve_series__t02_starve__2,starve_series__t03_starve__1,starve_series__t03_starve__2,...,efeU__menFentC_ale29__1,efeU__menFentC_ale29__2,efeU__menFentC_ale30__1,efeU__menFentC_ale30__2,efeU__menFentCubiC_ale36__1,efeU__menFentCubiC_ale36__2,efeU__menFentCubiC_ale37__1,efeU__menFentCubiC_ale37__2,efeU__menFentCubiC_ale38__1,efeU__menFentCubiC_ale38__2
b1258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
b0811,0.000006,0.000006,0.000006,0.000005,0.000009,0.000009,0.000008,0.000008,0.000008,0.000008,...,0.000006,0.000006,0.000007,0.000006,0.000007,0.000007,0.000007,0.000007,0.000006,0.000006
b1725,0.0,0.0,0.0,0.0,0.0,0.0,0.000001,0.000001,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
b2701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
b4302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
b3297,0.000009,0.000009,0.000009,0.000009,0.000007,0.000006,0.000005,0.000005,0.000005,0.000005,...,0.000007,0.000007,0.000009,0.000009,0.000007,0.000007,0.000007,0.000007,0.000007,0.000007
b1379,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
b2027,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,...,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001
b1602,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,...,0.000002,0.000002,0.000002,0.000002,0.000003,0.000003,0.000003,0.000003,0.000002,0.000002


In [50]:
new_df.loc['b3237']

starve_series__t00_growth1__1    0.000001
starve_series__t00_growth1__2    0.000001
starve_series__t00_growth1__3    0.000001
starve_series__t00_growth1__4         0.0
starve_series__t01_starve__1          0.0
                                   ...   
efeU__menFentCubiC_ale36__2      0.000001
efeU__menFentCubiC_ale37__1      0.000001
efeU__menFentCubiC_ale37__2      0.000001
efeU__menFentCubiC_ale38__1      0.000001
efeU__menFentCubiC_ale38__2      0.000001
Name: b3237, Length: 328, dtype: object

In [49]:
tpm_df.loc['b3237']

starve_series__t00_growth1__1    166.210056
starve_series__t00_growth1__2    149.613575
starve_series__t00_growth1__3    189.266385
starve_series__t00_growth1__4    128.481169
starve_series__t01_starve__1     100.962668
                                    ...    
efeU__menFentCubiC_ale36__2      370.654289
efeU__menFentCubiC_ale37__1      428.337489
efeU__menFentCubiC_ale37__2      425.032929
efeU__menFentCubiC_ale38__1      346.350252
efeU__menFentCubiC_ale38__2      299.562733
Name: b3237, Length: 328, dtype: float64

In [48]:
metab_df.loc['b3237']

gene_name                                                                                            argR
Synonyms                                                                                        argR;xerA
OLN 2                                                                                              JW3206
Swiss-Prot entry name                                                                          ARGR_ECOLI
Swiss-Prot primary accession number                                                                P0A6D0
Length of the sequence before post--translational processing (AAs)                                  156.0
start                                                                                           3384702.0
stop                                                                                            3385173.0
strand                                                                                                  +
product                                       