# EDA of Compiled Datasets

In [21]:
import numpy as np
import pandas as pd
import bokeh.io
import bokeh.plotting
import bokeh.models
import bokeh.transform
import bokeh.palettes
import prot.viz
colors, palette = prot.viz.bokeh_theme()
bokeh.io.output_notebook()

In [87]:
data = pd.read_csv('../../data/compiled_datasets.csv')

# Group by each dataset, condition, and cog class and compute the mass and count fraction. 
class_fraction = pd.DataFrame([])
for g, d in data.groupby(['dataset', 'condition', 'growth_rate_hr']):

    # Get the total mass and size of the proteome.  
    tot_mass = d['fg_per_cell'].sum()
    tot_count = d['tot_per_cell'].sum()
    
 
    # Group by metabolism and info 
    _info = d[d['cog_category']=='Information Storage and Processing']
    _metab = d[d['cog_category']=='Metabolism']
    info_mass = _info['fg_per_cell'].sum()
    metab_mass = _metab['fg_per_cell'].sum()
    info_count = _info['tot_per_cell'].sum()
    metab_count = _metab['tot_per_cell'].sum()
    df_dict = {'dataset':g[0],
                   'condition':g[1],
                   'growth_rate_hr':g[2],
                   'proteome_mass':tot_mass,
                   'proteome_size':tot_count,
                   'info_mass': info_mass,
                   'info_count': info_count,
                   'metab_mass': metab_mass,
                   'metab_count': metab_count,
                   'info_frac_mass': info_mass / tot_mass,
                   'info_frac_count': info_count / tot_count,
                   'metab_frac_mass': metab_mass / tot_mass,
                   'metab_frac_count': metab_count / tot_count} 
    class_fraction = class_fraction.append(df_dict, ignore_index=True)


In [100]:
factors = class_fraction['dataset'].unique()
cmap = bokeh.transform.factor_cmap('dataset', palette=bokeh.palettes.Spectral5, 
                                    factors=factors)

# Set up the figure canvas. 
tools = ['wheel_zoom', 'pan', 'hover']
tooltips = [('dataset', '@dataset'),
            ('condition', '@condition'),
            ('growth rate [hr^-1]', '@growth_rate_hr')]
            

mass_fraction = bokeh.plotting.figure(width=600, height=450, 
                        x_axis_label='metabolic mass fraction',
                        y_axis_label='information, storage, and processing mass fraction',
                        tools=tools, tooltips=tooltips,
                        title='occupancy by mass')

count_fraction = bokeh.plotting.figure(width=600, height=450, 
                        x_axis_label='metabolic number fraction',
                        y_axis_label='information, storage, and processing number fraction',
                        tools=tools, tooltips=tooltips,
                        title='occupancy by copy number')

# Populate
mass_fraction.circle(x='metab_frac_mass', y='info_frac_mass', line_width=0.75, 
                     line_color='black', size=6, source=class_fraction, color=cmap, 
                     legend_field='dataset')

bokeh.io.show(mass_fraction)

In [94]:
info_gr = bokeh.plotting.figure(width=600, height=450, 
                        x_axis_label='growth_rate_hr',
                        y_axis_label='information, storage, and processing mass fraction',
                        tools=tools, tooltips=tooltips,
                        title='occupancy by mass')

lambda_ = np.linspace(0,2.2)
r = 0.087 + (lambda_/ 4.5)
# Populate
info_gr.circle(x='growth_rate_hr', y='info_count', line_width=0.75, 
                     line_color='black', size=6, source=class_fraction, color=cmap, 
                     legend_field='dataset')
info_gr.line(lambda_, r, legend_label='Scott et al. 2010')
info_gr.legend.location = 'top_left'
bokeh.io.show(info_gr)

In [122]:
# Group by each dataset, condition, and cog class and compute the mass and count fraction. 
cat_fraction = pd.DataFrame([])
for g, d in data.groupby(['dataset', 'condition', 'growth_rate_hr']):

    # Get the total mass and size of the proteome.  
    tot_mass = d['fg_per_cell'].sum()
    tot_count = d['tot_per_cell'].sum()
    
 
    # Group by metabolism and info 
    _ribo = d[d['cog_class']=='Translation, ribosomal structure and biogenesis']
    _carbo = d[d['cog_class']=='Carbohydrate transport and metabolism']
    _amino = d[d['cog_class']=='Amino acid transport and metabolism']
    
    ribo_mass = _ribo['fg_per_cell'].sum()
    carbo_mass = _carbo['fg_per_cell'].sum()
    amino_mass = _amino['fg_per_cell'].sum()
    ribo_count = _ribo['tot_per_cell'].sum() 
    carbo_count = _carbo['tot_per_cell'].sum()
    amino_count = _amino['tot_per_cell'].sum()
    df_dict = {'dataset':g[0],
                   'condition':g[1],
                   'growth_rate_hr':g[2],
                   'proteome_mass':tot_mass,
                   'proteome_size':tot_count, 
                   'ribo_frac_mass': ribo_mass / tot_mass,
                   'ribo_frac_count': ribo_count / tot_count,
                   'carbo_frac_mass': carbo_mass / tot_mass,
                   'carbo_frac_count': carbo_count / tot_count,
                   'amino_frac_mass': amino_mass / tot_mass,
                   'amino_frac_count': amino_count / tot_count} 
    cat_fraction = cat_fraction.append(df_dict, ignore_index=True)


In [123]:
cat_fraction

Unnamed: 0,amino_frac_count,amino_frac_mass,carbo_frac_count,carbo_frac_mass,condition,dataset,growth_rate_hr,proteome_mass,proteome_size,ribo_frac_count,ribo_frac_mass
0,0.152058,0.150031,0.094088,0.093673,NaCl_stress,calgar_2017,0.664398,196.431766,66237.67,0.158943,0.135918
1,0.179953,0.180998,0.092847,0.093883,gluconate_growth,calgar_2017,0.663012,617.54083,217293.5,0.192648,0.166264
2,0.182545,0.182891,0.087519,0.087222,glucose_time_course,calgar_2017,0.774755,680.967563,238275.0,0.196922,0.172265
3,0.020861,0.041413,0.032566,0.055579,MOPS complete,li_2014,1.934364,21425.395165,16745300.0,0.510559,0.402678
4,0.072956,0.169015,0.03424,0.051929,MOPS complete without methionine,li_2014,1.56939,14179.488538,9454056.0,0.434702,0.301777
5,0.092785,0.176425,0.054351,0.077344,MOPS minimal,li_2014,0.7387,7690.036243,5028793.0,0.282054,0.190345
6,0.100918,0.150372,0.070104,0.086237,42C,schmidt_2016,0.66,8682.335323,4930009.0,0.332423,0.260777
7,0.095382,0.128015,0.095745,0.096992,acetate,schmidt_2016,0.3,6654.923361,3488900.0,0.216907,0.171423
8,0.092836,0.11723,0.099029,0.099641,chemostat_u0.12,schmidt_2016,0.12,5419.41009,2808060.0,0.194187,0.153989
9,0.095628,0.125319,0.100409,0.099857,chemostat_u0.2,schmidt_2016,0.2,5944.960147,3117255.0,0.204753,0.161071


In [124]:
class_fractionddddjk[class_fraction['dataset']tangui]

SyntaxError: invalid syntax (<ipython-input-124-b3f65d68e7d2>, line 1)

In [132]:
info_gr = bokeh.plotting.figure(width=600, height=450, 
                        x_axis_label='growth_rate_hr',
                        y_axis_label='information, storage, and processing mass fraction',
                        tools=tools, tooltips=tooltips,
                        title='occupancy by mass')

lambda_ = np.linspace(0,2.2)
r = 0.087 + (lambda_/ 4.5)
# Populate
info_gr.circle(x='growth_rate_hr', y='ribo_frac_count', line_width=0.75, 
                     line_color='black', size=6, source=cat_fraction, color=cmap, 
                     legend_field='dataset')
info_gr.line(lambda_, 0.76*r, legend_label='Scott et al. 2010')

info_gr.legend.location = 'top_left'
bokeh.io.show(info_gr)

In [128]:
factors = class_fraction['dataset'].unique()
cmap = bokeh.transform.factor_cmap('dataset', palette=bokeh.palettes.Spectral5, 
                                    factors=factors)

# Set up the figure canvas. 
tools = ['wheel_zoom', 'pan', 'hover']
tooltips = [('dataset', '@dataset'),
            ('condition', '@condition'),
            ('growth rate [hr^-1]', '@growth_rate_hr')]
            

mass_fraction = bokeh.plotting.figure(width=600, height=450, 
                        y_axis_label='metabolic subgroup mass fraction',
                        x_axis_label='information, storage, and processing mass fraction',
                        tools=tools, tooltips=tooltips,
                        title='occupancy by mass')

# Populate
mass_fraction.circle(x='ribo_frac_mass', y='amino_frac_mass', line_width=0.75, 
                     line_color='black', size=6, source=cat_fraction, color=cmap, 
                     legend_field='dataset')
mass_fraction.square(x='ribo_frac_mass', y='carbo_frac_mass', line_width=0.5, 
                     size=6, source=cat_fraction, line_color=cmap, color=None, 
                     legend_field='dataset')
# mass_fraction.triangle(x='ribo_fra', y='ribo_frac_mass', line_width=0.5, 
#                      size=6, source=class_fraction, line_color=cmap, color=None, 
#                      legend_field='dataset')


bokeh.io.show(mass_fraction)

ERROR:bokeh.core.validation.check:E-1006 (NON_MATCHING_DATA_SOURCES_ON_LEGEND_ITEM_RENDERERS): LegendItem.label is a field, but renderer data sources don't match: LegendItem(id='22911', ...)
