In [29]:
import pandas as pd
import re
import os
import numpy as np

In [4]:
from models import av, av2

exchanges = set(i.id for i in av.exchanges)

# Converting biolog plate data into something that can be used with cobra

Basically mapping biolog ids to bigg ids. There is a reference file (biolog_from_jer) that has the mappings. I still don't know the original source. Will find.

In [21]:
# parsing original file and saving to standard
def create_biolog_to_bigg():
    # read and cleanup mapping
    biolog_map = pd.read_csv('biolog_from_jer.csv')
    biolog_map.dropna(subset=['EX_ rxn ?'], inplace=True)
    # yucky spaces
    biolog_map['ex_rxn'] = biolog_map['EX_ rxn ?']
    biolog_map['compound_name'] = biolog_map['cmpd name']
    del biolog_map['EX_ rxn ?']
    del biolog_map['cmpd name']
    # standard IDs
    biolog_map['plate'] = 'PM' + biolog_map['Identifier'].str[:1] + '-' \
                        + biolog_map['Identifier'].str[1] \
                        + biolog_map['Identifier'].str[2:].astype(int).astype(str)
    biolog_map = biolog_map[['plate', 'ex_rxn', 'compound_name']]
    biolog_map.sort_values('plate', inplace=True)
    # fixing some errors where two underscores are used
    biolog_map['exchange'] = biolog_map.ex_rxn.apply(parse)
    
    # save file for loading later
    biolog_map.to_csv('plate_to_bigg.csv', index=False)
    return biolog_map

def parse(row):
    rate = row

    pattern = r'\((.*?)\)'
    matches = re.findall(pattern, rate)
    for m in matches:
        repl = f'_{m}'
        rate = re.sub(pattern, repl, rate)
    pattern = r'\_(.\w?)\_'

    matches = re.findall(pattern, rate)
    for m in matches:
        repl = f'__{m}_'
        rate = re.sub(pattern, repl, rate)
    return rate

if os.path.exists('plate_to_bigg.csv'):
    biolog_map = pd.read_csv('plate_to_bigg.csv', index_col=False)
else:
    biolog_map = create_biolog_to_bigg()
biolog_map.head()

Unnamed: 0,plate,ex_rxn,compound_name,exchange
0,PM1-A10,EX_tre(e),D-Trehalose,EX_tre_e
1,PM1-A11,EX_man(e),D-Mannose,EX_man_e
2,PM1-A12,EX_galt(e),Dulcitol,EX_galt_e
3,PM1-A2,EX_arab_L(e),L-Arabinose,EX_arab__L_e
4,PM1-A3,EX_acgam(e),N-Acetyl-DGlucosamine,EX_acgam_e


In [23]:
biolog_map['exchange'].sort_values()

36     EX_12ppd__S_e
138      EX_23camp_e
145      EX_23ccmp_e
142      EX_23cgmp_e
151      EX_23cump_e
           ...      
84       EX_val__L_e
112      EX_val__L_e
135         EX_xan_e
136        EX_xtsn_e
21       EX_xyl__D_e
Name: exchange, Length: 158, dtype: object

In [25]:

# AV biolog data
# Supplementary table 2 BIOLOG PLATES.xlsx
# source: https://pubmed.ncbi.nlm.nih.gov/32551229/

av_biolog = pd.read_excel('data_biolog_av/Supplementary table 2 BIOLOG PLATES.xlsx')
av_biolog.head()

Unnamed: 0,PM1,Mean PM1,Error PM1,Adjusted growth PM1,Conclusion PM1,PM2,Mean PM2,Error PM2,Adjusted growth PM2,Conclusion PM2,PM3,Mean PM3,Error PM3,Adjusted growth PM3,Conclusion PM3
0,A1,0.000119,9.2e-05,,Control,A1,0.002058,6e-05,,,A1,0.015827,0.000304,,
1,A2,-0.00023,9.4e-05,0.000578,Non-Growth,A2,0.000174,0.000161,-0.00128,Non-Growth,A2,0.020432,0.000276,0.007646,Growth
2,A3,0.001083,0.000116,0.001889,Growth,A3,0.001942,6e-05,0.000488,Non-Growth,A3,0.013132,0.000347,0.000346,Growth
3,A4,-0.00014,0.000128,0.00067,Non-Growth,A4,0.001746,7.7e-05,0.000292,Non-Growth,A4,0.019363,0.000255,0.006577,Growth
4,A5,0.000121,0.000556,0.000927,Growth,A5,0.001393,0.000161,-6.1e-05,Non-Growth,A5,0.017973,0.000221,0.005187,Growth


In [34]:
def get_plate(pm):
    pm_sub = av_biolog[[pm, f'Conclusion {pm}']].copy()
    pm_sub['plate']  = pm + '-' + pm_sub[pm]
    pm_sub['outcome'] = pm_sub[f'Conclusion {pm}']
    
    pm_sub['growth']= np.nan
    pm_sub.loc[pm_sub.outcome == 'Growth', 'growth']  = True
    pm_sub.loc[pm_sub.outcome == 'Non-Growth', 'growth']  = False
    pm_sub.dropna(subset=['growth'], inplace=True)
    pm_sub['growth'] = pm_sub['growth'].astype(bool)
    return pm_sub[['plate', 'growth']]

pm1 = get_plate('PM1')
pm2 = get_plate('PM2')
pm3 = get_plate('PM3')
carbon_sources = pd.concat([pm1, pm2])
pm1

Unnamed: 0,plate,growth
1,PM1-A2,False
2,PM1-A3,True
3,PM1-A4,False
4,PM1-A5,True
5,PM1-A6,True
...,...,...
91,PM1-H8,True
92,PM1-H9,False
93,PM1-H10,False
94,PM1-H11,False


In [36]:
def merge_with_chem_ids(plate):
    merged = pd.concat(
        [plate.set_index('plate'), biolog_map.set_index('plate')],
        axis=1,
        ignore_index=False
    )
    merged.dropna(subset=['growth', 'ex_rxn'], inplace=True)
    
    merged = merged.reset_index()[['exchange', 'growth']]

    # define uptake, arbitrary for now
    merged['uptake'] = 10
    # reorder to match memote
    merged = merged[['exchange', 'uptake', 'growth']]
    
    return merged

def subset_to_in_model(plate, model_ex=exchanges):
    return plate.loc[plate.exchange.isin(model_ex)]


carbon_w_names = merge_with_chem_ids(carbon_sources)
carbon_w_names_in_model = subset_to_in_model(carbon_w_names, exchanges)
carbon_w_names_in_model.to_csv('pm1_pm2_biolog.csv', index=False)


In [37]:
pm3_w_names = merge_with_chem_ids(pm3)
pm3_w_names_in_model = subset_to_in_model(pm3_w_names)
pm3_w_names_in_model.to_csv('pm3_biolog.csv', index=False)


In [None]:
biolog_ex_pm1 = set(pm1_w_names.exchange.values)
biolog_ex_pm3 = set(pm3_w_names.exchange.values)
biolog_ex = biolog_ex_pm1.union(biolog_ex_pm3)

# TODO
Figure out why there are some missing

In [39]:
print(f'{len(exchanges)} exchange metabolites in model')
print(f'{len(biolog_ex)} exchange metabolites in biolog')

323 exchange metabolites in model
107 exchange metabolites in biolog


In [40]:
print(f'{len(biolog_ex.difference(exchanges))} missing in model')
print(f'{len(exchanges.difference(biolog_ex))} missing in biolog')
print(f'{len(exchanges.intersection(biolog_ex))} from model found in biolog')

26 missing in model
242 missing in biolog
81 from model found in biolog


In [41]:
print(f'{len(biolog_ex.difference(exchanges))} missing in model')
print(f'{len(exchanges.difference(biolog_ex))} missing in biolog')
print(f'{len(exchanges.intersection(biolog_ex))} from model found in biolog')

26 missing in model
242 missing in biolog
81 from model found in biolog


<img src="https://ars.els-cdn.com/content/image/1-s2.0-S2214030120300043-gr3.jpg" alt="Experimental" /> 

In [42]:
for i in sorted(exchanges.difference(biolog_ex)):
    print(i)

EX_12ppd__R_e
EX_14glucan_e
EX_15dap_e
EX_23camp_e
EX_23ccmp_e
EX_23cgmp_e
EX_23cump_e
EX_23dappa_e
EX_26dap__M_e
EX_2ddglcn_e
EX_2dglc_e
EX_2hb_e
EX_2obut_e
EX_34dhpac_e
EX_3amp_e
EX_3cmp_e
EX_3gmp_e
EX_3hcinnm_e
EX_3hpp_e
EX_3hpppn_e
EX_3ump_e
EX_4abzglu_e
EX_4hoxpacd_e
EX_5aptn_e
EX_5mtr_e
EX_LalaDgluMdapDala_e
EX_LalaDgluMdap_e
EX_LalaDglu_e
EX_LalaLglu_e
EX_abt__D_e
EX_ac_e
EX_acald_e
EX_acgal1p_e
EX_acgam1p_e
EX_acmum_e
EX_acon_C_e
EX_acser_e
EX_adocbl_e
EX_ag_e
EX_ala_B_e
EX_alaala_e
EX_alg_e
EX_amp_e
EX_anhgm_e
EX_arbt_e
EX_arbtn_e
EX_arbtn_fe3_e
EX_ascb__L_e
EX_aso3_e
EX_btn_e
EX_butso3_e
EX_bz_e
EX_ca2_e
EX_cbi_e
EX_cbl1_e
EX_cd2_e
EX_cellb_e
EX_cgly_e
EX_ch4_e
EX_chol_e
EX_chtbs_e
EX_cl_e
EX_cm_e
EX_cmp_e
EX_co2_e
EX_cobalt2_e
EX_cpgn_e
EX_cpgn_un_e
EX_crn__D_e
EX_crn_e
EX_cs1_e
EX_cu2_e
EX_cu_e
EX_cyan_e
EX_cynt_e
EX_cys__D_e
EX_dad_2_e
EX_damp_e
EX_dcmp_e
EX_dcyt_e
EX_ddca_e
EX_dgmp_e
EX_dgsn_e
EX_dhps_e
EX_dimp_e
EX_din_e
EX_dms_e
EX_dmso_e
EX_dopa_e
EX_doxrbcn_e
EX_dtmp_


Found case where extra underscores made items missing. Fixed.

<strike>Some have '__\' in model, but only 1 in biolog. EX_12ppd_S_e vs EX_12ppd__S_e. There is also a EX_12ppd__R_e and EX_12ppd__S_e (R and S) difference.

<strike>
    
Fixed

## Data from Biolog that is not in the model.

In [43]:
for i in sorted(biolog_ex.difference(exchanges)):
    print(i)

EX__ac_e
EX_acgal_e
EX_acgam_e
EX_acmana_e
EX_agm_e
EX_arg__L_e
EX_cys__L_e
EX_dad__2_e
EX_etha_e
EX_gam_e
EX_glc_e
EX_glu__L_e
EX_gly_e
EX_hom__L_e
EX_ile__L_e
EX_leu__L_e
EX_mal__L_e
EX_met__L_e
EX_phe__L_e
EX_pro__L_e
EX_ptrc_e
EX_tre_e
EX_trp__L_e
EX_tym_e
EX_tyr__L_e
EX_val__L_e
