In [1]:
# import and loading
import pandas as pd
import numpy as np

# Notebook-level switches controlling which data sets are loaded.
stable_flags = {
    'include_Amy_samples' : True,
}

# Per-gene table indexed by the first spreadsheet column; the columns listed in
# metab_cols hold the numeric values that the rescaling cell reads.
metab_df = pd.read_excel('../data/validation_data_sets/heinemann_protein_conc.xlsx', index_col = 0)
# NOTE: 'Galactose ' carries a trailing space. It is a column label, so it is kept verbatim.
metab_cols = ['Glucose', 'Acetate', 'Glycerol', 'Pyruvate', 'Galactose ', 'Succinate', 'Fructose']

# Log-scale TPM matrix (genes as rows, samples as columns). Read once; both
# settings of the flag start from this file.
log_tpm_df = pd.read_csv('../data/precise_1.0/log_tpm.csv', index_col = 0)
if stable_flags['include_Amy_samples']:
    starve_log_tpm = pd.read_csv('../data/validation_data_sets/stationary_phase/cleaned_log_tpm_qc.csv', index_col = 0)
    # Put the extra samples on the same gene index as log_tpm_df: genes missing
    # from the extra data become all-zero rows, genes absent from log_tpm_df are
    # dropped, and the row order follows log_tpm_df.
    starve_log_tpm = starve_log_tpm.reindex(log_tpm_df.index, fill_value = 0)
    # Append the samples column-wise, extra samples first.
    log_tpm_df = pd.concat([starve_log_tpm, log_tpm_df], axis = 1)

# Undo the log transform.
# NOTE(review): zero-filled rows become 2**0 == 1 here, and if the files store
# log2(TPM + 1) this yields TPM + 1 rather than TPM -- confirm against the data source.
tpm_df = 2**log_tpm_df

# Genes present in both tables. Index.intersection gives a reproducible order,
# unlike iterating a Python set of strings, so downstream frames and the CSV
# written later have the same row order on every run.
overlap = list(tpm_df.index.intersection(metab_df.index))
metab_df = metab_df.loc[overlap]
tpm_df = tpm_df.loc[overlap]

In [4]:
# Spot-check: show every column of the metab_df row labelled 'b1594'.
metab_df.loc['b1594']

gene_name                                                                                                   mlc
Synonyms                                                                                                    mlc
OLN 2                                                                                                    JW1586
Swiss-Prot entry name                                                                                 MLC_ECOLI
Swiss-Prot primary accession number                                                                      P50456
Length of the sequence before post--translational processing (AAs)                                        406.0
start                                                                                                 1667343.0
stop                                                                                                  1668564.0
strand                                                                                                  

In [10]:
# Quick look at every metab_cols entry laid out as one flat 1-D array.
metab_df[metab_cols].to_numpy().flatten()

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
       2.64805776e-08, 2.46818046e-08, 2.47402382e-08])

In [11]:
import numpy as np

In [19]:
# Gather every metab_cols entry into one flat array, then discard exact zeros.
vals = metab_df[metab_cols].to_numpy().ravel()
vals = vals[vals != 0]

In [21]:
# Smallest entry of `vals` (the flattened metab_cols values with exact zeros removed).
min(vals)

1.844334747746874e-17

In [6]:
# Spot-check: smallest metab_cols value in the row labelled 'b2818' (zeros are not excluded here).
min(metab_df[metab_cols].loc['b2818'])

1.50258904437862e-07

In [22]:
# Rescale each gene's TPM profile linearly onto that gene's measured range:
#     new = (tpm - tpm_min) / (tpm_max - tpm_min) * (max_val - min_val) + min_val
# tpm_min/tpm_max are taken across samples; min_val/max_val across metab_cols.
# The arithmetic is done row-wise on whole frames, so new_df is a float frame
# rather than an object frame filled one cell at a time.

# Global fallback range: every metab_cols entry except exact zeros.
# nanmin/nanmax ignore missing entries; the builtin min/max give
# order-dependent answers when NaN is present.
vals = metab_df[metab_cols].values.flatten()
vals = vals[vals != 0]
global_min = np.nanmin(vals)
global_max = np.nanmax(vals)

# Per-gene measured range (pandas skips NaN). Zeros are NOT excluded here,
# so a gene whose smallest measured value is 0 maps its lowest TPM to 0.
measured = metab_df.loc[overlap, metab_cols]
min_vals = measured.min(axis = 1)
max_vals = measured.max(axis = 1)

# A gene with identical min and max has no usable range of its own: fall back
# to the global range. to_rem records those genes (it was previously used to
# drop them from the result and is kept in case that filter is reinstated).
no_range = min_vals == max_vals
to_rem = no_range[no_range].index.tolist()
min_vals = min_vals.mask(no_range, global_min)
max_vals = max_vals.mask(no_range, global_max)

# Per-gene TPM range across samples. At this point tpm_df has already been
# restricted to `overlap`, so this selection keeps every row.
expr = tpm_df.loc[overlap]
tpm_min_vals = expr.min(axis = 1)
tpm_span = expr.max(axis = 1) - tpm_min_vals

# A gene whose TPM is constant across samples has tpm_span == 0, so its row
# is 0/0 and comes out as NaN (the per-cell loop produced the same NaN).
scaled_tpm = expr.sub(tpm_min_vals, axis = 0).div(tpm_span, axis = 0)
new_df = scaled_tpm.mul(max_vals - min_vals, axis = 0).add(min_vals, axis = 0)

# NOTE(review): the file name says "log_tpm", but the values written are
# rescaled from un-logged TPM (tpm_df).
new_df.to_csv('../data/validation_data_sets/converted_log_tpm_in_M.csv')

In [48]:
# Limit to regulators: keep rows whose 'product' annotation contains the
# substring 'regulator'. str() turns missing annotations into 'nan', which
# never matches.
keep = ['regulator' in str(prod) for prod in metab_df['product'].values]
baby = metab_df.loc[keep]

# Matching rows of the rescaled frame. Index.intersection yields a
# reproducible row order, unlike iterating a Python set.
baby_new = new_df.loc[new_df.index.intersection(baby.index)]
# Treat exact zeros as missing. Build a new float frame instead of mutating a
# .loc selection in place.
baby_new = baby_new.astype(float).replace(0, np.nan)

# Report the range of the remaining values. nanmin/nanmax skip the NaNs just
# introduced; the builtin min/max return NaN or a number depending on where
# the first NaN happens to sit in the flattened array.
print(np.nanmin(baby_new.values))
print(np.nanmax(baby_new.values))

2.902870141566294e-13
0.00014190659526601638


In [47]:
# Mask exact zeros in baby_new as NaN, mutating the frame in place.
# NOTE(review): the "limit to regulators" cell already applies the same
# zero -> NaN replacement when it builds baby_new, so this cell looks like a
# leftover; it also requires baby_new to exist already (hidden kernel state).
baby_new.replace(0, np.nan, inplace = True)

In [34]:
# Display the subset of metab_df whose 'product' annotation contains 'regulator'.
baby

Unnamed: 0_level_0,gene_name,Synonyms,OLN 2,Swiss-Prot entry name,Swiss-Prot primary accession number,Length of the sequence before post--translational processing (AAs),start,stop,strand,product,...,Fructose,Unnamed: 21,Unnamed: 22,Genes_iML1515,Glucose.1,gene_name.1,Number of complexes,At least 1 complex?,Unnamed: 28,Unique Complexes
bnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b1217,chaB,chaB,JW1208,CHAB_ECOLI,P0AE63,76.0,1272118.0,1272349.0,+,predicted cation transport regulator,...,6.169295e-08,,,b0589,0.000000e+00,fepG,1.0,1.0,,
b2698,recX,recX;oraA,JW2668,RECX_ECOLI,P33596,166.0,2822138.0,2822639.0,-,regulatory protein RecX; inhibitor of RecA,...,0.000000e+00,,,,,,,,,
b1892,flhD,flhD;flbB,JW1881,FLHD_ECOLI,P0A8S9,116.0,1977846.0,1978197.0,-,FlhD transcriptional dual regulator,...,0.000000e+00,,,,,,,,,
b3674,yidF,yidF,JW3650,YIDF_ECOLI,P31443,165.0,3855113.0,3855611.0,-,predicted DNA-binding transcriptional regulator,...,0.000000e+00,,,,,,,,,
b4401,arcA,arcA;cpxC;dye;fexA;msp;seg;sfrA,JW4364,ARCA_ECOLI,P0A9Q1,238.0,4639589.0,4640306.0,-,ArcA transcriptional dual regulator,...,3.187421e-06,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
b0316,yahB,yahB,JW0308,YAHB_ECOLI,P77700,310.0,333500.0,334433.0,-,predicted DNA-binding transcriptional regulato...,...,0.000000e+00,,,b2750,6.184798e-07,cysC,1.0,1.0,,b1764(1)
b2852,ygeH,ygeH,JW2820,YGEH_ECOLI,P76639,458.0,2992093.0,2993470.0,+,predicted transcriptional regulator,...,0.000000e+00,,,,,,,,,
b4479,dgoR,dgoR;yidW,JW5627,DGOR_ECOLI,P31460,229.0,3874470.0,3875160.0,-,predicted DNA-binding transcriptional regulator,...,1.842217e-08,,,,,,,,,
b2839,lysR,lysR,JW2807,LYSR_ECOLI,P03030,311.0,2979020.0,2979956.0,+,LysR DNA-binding transcriptional dual regulator,...,0.000000e+00,,,,,,,,,


In [17]:
# Spot-check: smallest rescaled value across samples for the row labelled 'b3934'.
min(new_df.loc['b3934'])

2.863891910657938e-09

In [18]:
# Spot-check: largest rescaled value across samples for the row labelled 'b3934'.
max(new_df.loc['b3934'])

1.5207185658878135e-08

In [10]:
# Spot-check: smallest rescaled value across samples for the row labelled 'b3237'.
min(new_df.loc['b3237'])

4.12710634068676e-07

In [11]:
# Spot-check: largest rescaled value across samples for the row labelled 'b3237'.
max(new_df.loc['b3237'])

9.982167238476515e-07