# EDA of Compiled Datasets

In [21]:
import numpy as np
import pandas as pd
import bokeh.io
import bokeh.plotting
import bokeh.models
import bokeh.transform
import bokeh.palettes
import prot.viz
# Fetch the project's plotting theme; the call returns two values, kept as
# `colors` and `palette`.
# NOTE(review): presumably this also applies the theme to Bokeh globally —
# confirm in prot.viz.
colors, palette = prot.viz.bokeh_theme()
# Render Bokeh figures inline in the notebook rather than in a separate file.
bokeh.io.output_notebook()

In [87]:
data = pd.read_csv('../../data/compiled_datasets.csv')

# For each (dataset, condition, growth rate) group, total the proteome mass
# (`fg_per_cell`) and copy number (`tot_per_cell`), then record how much of each
# falls in the 'Information Storage and Processing' and 'Metabolism' COG
# categories, both as absolute values and as fractions of the group total.
class_records = []
for (dataset, condition, growth_rate), d in data.groupby(
        ['dataset', 'condition', 'growth_rate_hr']):

    # Totals over every protein reported for this group.
    tot_mass = d['fg_per_cell'].sum()
    tot_count = d['tot_per_cell'].sum()

    # Restrict to the two COG categories of interest.
    _info = d[d['cog_category'] == 'Information Storage and Processing']
    _metab = d[d['cog_category'] == 'Metabolism']
    info_mass = _info['fg_per_cell'].sum()
    metab_mass = _metab['fg_per_cell'].sum()
    info_count = _info['tot_per_cell'].sum()
    metab_count = _metab['tot_per_cell'].sum()

    class_records.append({'dataset': dataset,
                          'condition': condition,
                          'growth_rate_hr': growth_rate,
                          'proteome_mass': tot_mass,
                          'proteome_size': tot_count,
                          'info_mass': info_mass,
                          'info_count': info_count,
                          'metab_mass': metab_mass,
                          'metab_count': metab_count,
                          'info_frac_mass': info_mass / tot_mass,
                          'info_frac_count': info_count / tot_count,
                          'metab_frac_mass': metab_mass / tot_mass,
                          'metab_frac_count': metab_count / tot_count})

# Build the frame in one step. DataFrame.append was removed in pandas 2.0, and
# appending inside the loop re-copied the whole frame on every iteration.
class_fraction = pd.DataFrame(class_records)


In [100]:
# Color every point by the dataset it came from.
# NOTE(review): Spectral5 supplies five colors — confirm there are at most five
# datasets, otherwise switch to a larger palette.
factors = class_fraction['dataset'].unique()
cmap = bokeh.transform.factor_cmap('dataset', palette=bokeh.palettes.Spectral5,
                                    factors=factors)

# Set up the figure canvases. `tools`, `tooltips`, and `cmap` are reused by the
# growth-rate plot in a later cell.
tools = ['wheel_zoom', 'pan', 'hover']
tooltips = [('dataset', '@dataset'),
            ('condition', '@condition'),
            ('growth rate [hr^-1]', '@growth_rate_hr')]

mass_fraction = bokeh.plotting.figure(width=600, height=450,
                        x_axis_label='metabolic mass fraction',
                        y_axis_label='information, storage, and processing mass fraction',
                        tools=tools, tooltips=tooltips,
                        title='occupancy by mass')

count_fraction = bokeh.plotting.figure(width=600, height=450,
                        x_axis_label='metabolic number fraction',
                        y_axis_label='information, storage, and processing number fraction',
                        tools=tools, tooltips=tooltips,
                        title='occupancy by copy number')

# Populate both canvases with identically styled glyphs. The copy-number canvas
# was previously created but never drawn on or shown.
glyph_style = dict(line_width=0.75, line_color='black', size=6,
                   source=class_fraction, color=cmap, legend_field='dataset')
mass_fraction.circle(x='metab_frac_mass', y='info_frac_mass', **glyph_style)
count_fraction.circle(x='metab_frac_count', y='info_frac_count', **glyph_style)

bokeh.io.show(mass_fraction)
bokeh.io.show(count_fraction)

In [94]:
# Information-storage-and-processing mass fraction as a function of growth rate,
# overlaid with a linear reference curve.
info_gr = bokeh.plotting.figure(width=600, height=450,
                        x_axis_label='growth_rate_hr',
                        y_axis_label='information, storage, and processing mass fraction',
                        tools=tools, tooltips=tooltips,
                        title='occupancy by mass')

# Linear reference line r = intercept + growth_rate / slope_denominator.
# NOTE(review): 0.087 and 4.5 are presumably the fitted parameters reported by
# Scott et al. 2010 (as the legend label suggests) — confirm values and units.
scott_intercept = 0.087
scott_slope_denominator = 4.5
lambda_ = np.linspace(0, 2.2)
r = scott_intercept + (lambda_ / scott_slope_denominator)

# Populate. The y column is the mass *fraction*, which is what the axis label,
# the title, and the reference line all describe; the absolute copy number
# (`info_count`) lives on a completely different scale.
info_gr.circle(x='growth_rate_hr', y='info_frac_mass', line_width=0.75,
                     line_color='black', size=6, source=class_fraction, color=cmap,
                     legend_field='dataset')
info_gr.line(lambda_, r, legend_label='Scott et al. 2010')
info_gr.legend.location = 'top_left'
bokeh.io.show(info_gr)

In [74]:
# For each (dataset, condition, growth rate) group, compute the mass and copy
# number (absolute and as a fraction of the group total) of three individual
# COG classes. Keys of this mapping become the column-name prefixes.
cog_classes = {'ribo': 'Translation, ribosomal structure and biogenesis',
               'carbo': 'Carbohydrate transport and metabolism',
               'amino': 'Amino acid transport and metabolism'}

cat_records = []
for (dataset, condition, growth_rate), d in data.groupby(
        ['dataset', 'condition', 'growth_rate_hr']):

    # Totals over every protein reported for this group.
    tot_mass = d['fg_per_cell'].sum()
    tot_count = d['tot_per_cell'].sum()

    record = {'dataset': dataset,
              'condition': condition,
              'growth_rate_hr': growth_rate,
              'proteome_mass': tot_mass,
              'proteome_size': tot_count}

    # Each class is summed from its own selection, so a copy-pasted selector can
    # no longer end up in the wrong column.
    for prefix, cog_class in cog_classes.items():
        _selected = d[d['cog_class'] == cog_class]
        class_mass = _selected['fg_per_cell'].sum()
        class_count = _selected['tot_per_cell'].sum()
        record[prefix + '_mass'] = class_mass
        record[prefix + '_count'] = class_count
        record[prefix + '_frac_mass'] = class_mass / tot_mass
        record[prefix + '_frac_count'] = class_count / tot_count

    cat_records.append(record)

# Results go to `cat_fraction`; `class_fraction` is left untouched so the
# category-level frame is not polluted with extra rows. The frame is built once
# because DataFrame.append was removed in pandas 2.0.
cat_fraction = pd.DataFrame(cat_records)


Unnamed: 0,cog_class,condition,dataset,frac_count,frac_mass,growth_rate_hr,proteome_mass,proteome_size,sector_mass,sector_size
51,Metabolism,stationary_3day,schmidt_2016,0.360831,0.457024,-0.01,4479.602866,2687127.0,2047.287507,969598.0
49,Metabolism,stationary_1day,schmidt_2016,0.353078,0.453836,-0.01,4484.488699,2755171.0,2035.224492,972789.0
17,Metabolism,chemostat_u0.12,schmidt_2016,0.476598,0.591067,0.12,5419.41009,2808060.0,3203.23524,1338316.0
19,Metabolism,chemostat_u0.2,schmidt_2016,0.466726,0.581587,0.2,5944.960147,3117255.0,3457.512858,1454904.0
29,Metabolism,galactose,schmidt_2016,0.463056,0.578363,0.26,6396.568094,3479634.0,3699.537702,1611267.0
15,Metabolism,acetate,schmidt_2016,0.485289,0.597952,0.3,6654.923361,3488900.0,3979.326399,1693125.0
21,Metabolism,chemostat_u0.35,schmidt_2016,0.457004,0.568445,0.35,6893.7478,3662639.0,3918.719552,1673842.0
47,Metabolism,pyruvate,schmidt_2016,0.429884,0.544287,0.4,7141.46384,3883895.0,3887.007294,1669624.0
27,Metabolism,fumarate,schmidt_2016,0.42663,0.539799,0.42,7354.484371,4049815.0,3969.945297,1727772.0
53,Metabolism,succinate,schmidt_2016,0.42358,0.535351,0.44,7519.33194,4160705.0,4025.481539,1762393.0


In [38]:
# List the distinct values in the `cog_class` column of `class_fraction`.
# NOTE(review): the cells that build `class_fraction` in this notebook do not
# create a `cog_class` column, so this presumably relies on an earlier version
# of that frame — confirm it still runs after Restart & Run All.
class_fraction['cog_class'].unique()

array(['Cellular Processes and Signaling',
       'Information Storage and Processing', 'Metabolism',
       'Poorly Characterized'], dtype=object)

In [79]:
# Show the rows whose dataset name contains the fragment 'tangui'.
# NOTE(review): this cell was left mid-edit and did not parse; a literal
# substring match on the typed fragment is assumed — confirm the intended
# dataset name and comparison.
class_fraction[class_fraction['dataset'].str.contains('tangui', regex=False, na=False)]

Unnamed: 0,cog_class,condition,dataset,frac_count,frac_mass,growth_rate_hr,proteome_mass,proteome_size,sector_mass,sector_size
