# All Activity Bar Graphs

This notebook generates the data behind the activity bar graphs for every i-modulon in PRECISE 1.0 so that they can be plotted on the ModulomeVis site.

In [1]:
import sys
# be sure that this points to a clone of github.com/SBRG/ICA
sys.path.append('../../../ica/') 
from icaviz.plotting import *
# Input locations, relative to this notebook, inside the ICA clone that was
# put on sys.path above.
DATA_DIR = '../../../ica/data/precise_data/'
GENE_DIR = '../../../ica/data/annotation/'

# Curated enrichment table, indexed by the first column of the CSV.
enrich = pd.read_csv(DATA_DIR + 'curated_enrichments.csv', index_col=0)
names = enrich.loc[:, 'name'].tolist()

# Bundle the input files into the project's data object.
# NOTE(review): `cutoff` is forwarded to load_data unchanged; its meaning is
# defined there -- confirm before changing the value.
ica_data = load_data(
    X=DATA_DIR + 'log_tpm.csv',
    S=DATA_DIR + 'S.csv',
    A=DATA_DIR + 'A.csv',
    metadata=DATA_DIR + 'metadata.csv',
    annotation=GENE_DIR + 'gene_info.csv',
    trn=GENE_DIR + 'TRN.csv',
    fasta=GENE_DIR + 'NC_000913.3.fasta',
    cutoff=550,
)

# Lift pandas' display limits so frames are rendered without truncation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
from activity_bar import *

In [3]:
# public metadata table

# drop the columns the public doesn't need to see
sample_meta = ica_data.metadata.drop(
    columns=['contact', 'library creator', 'run_date', 'R1', 'R2',
             'BAM', 'alignment', 'Public'])

# fix a minor issue with the number of biological replicates in a condition:
# every ica / wt_glc sample should report 6. The scalar is broadcast to
# however many rows match, so the assignment no longer raises a length
# mismatch if the number of samples in that condition changes.
is_ica_wt_glc = ((sample_meta.project_id == 'ica')
                 & (sample_meta.condition_id == 'wt_glc'))
sample_meta.loc[is_ica_wt_glc, 'Biological Replicates'] = 6

# reset index so that javascript can deal more easily (as numerically indexed arrays);
# the previous index is kept as a regular column
sample_meta = sample_meta.reset_index()

# the new numeric index is written out as the first, unnamed CSV column
sample_meta.to_csv('data/sample_metadata.csv')

sample_meta

Unnamed: 0,sample_id,study,project_id,condition_id,rep_id,Strain Description,Strain,Base Media,Carbon Source (g/L),Nitrogen Source (g/L),Electron Acceptor,Trace Element Mixture,Supplement,Temperature (C),pH,Antibiotic,Culture Type,Growth Rate (1/hr),Evolved Sample,Isolate Type,Sequencing Machine,Additional Details,Biological Replicates,DOI,GEO
0,control__wt_glc__1,Control,control,wt_glc,1,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),O2,sauer trace element mixture,,37,7.0,,Batch,,No,,MiSeq,,2,doi.org/10.1101/080929,GSE65643
1,control__wt_glc__2,Control,control,wt_glc,2,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),O2,sauer trace element mixture,,37,7.0,,Batch,,No,,MiSeq,,2,doi.org/10.1101/080929,GSE65643
2,fur__wt_dpd__1,Fur,fur,wt_dpd,1,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),O2,,DPD (0.2mM),37,7.0,,Batch,0.0,No,,MiSeq,,2,doi.org/10.1038/ncomms5910,GSE54900
3,fur__wt_dpd__2,Fur,fur,wt_dpd,2,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),O2,,DPD (0.2mM),37,7.0,,Batch,0.0,No,,MiSeq,,2,doi.org/10.1038/ncomms5910,GSE54900
4,fur__wt_fe__1,Fur,fur,wt_fe,1,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),O2,,FeCl2 (0.1mM),37,7.0,,Batch,1.06,No,,MiSeq,,2,doi.org/10.1038/ncomms5910,GSE54900
5,fur__wt_fe__2,Fur,fur,wt_fe,2,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),O2,,FeCl2 (0.1mM),37,7.0,,Batch,1.06,No,,MiSeq,,2,doi.org/10.1038/ncomms5910,GSE54900
6,fur__delfur_dpd__1,Fur,fur,delfur_dpd,1,Escherichia coli K-12 MG1655 del_fur,MG1655,M9,glucose(2),NH4Cl(1),O2,,DPD (0.2mM),37,7.0,Kanamycin (50 ug/mL),Batch,0.0,No,,MiSeq,,2,doi.org/10.1038/ncomms5910,GSE54900
7,fur__delfur_dpd__2,Fur,fur,delfur_dpd,2,Escherichia coli K-12 MG1655 del_fur,MG1655,M9,glucose(2),NH4Cl(1),O2,,DPD (0.2mM),37,7.0,Kanamycin (50 ug/mL),Batch,0.0,No,,MiSeq,,2,doi.org/10.1038/ncomms5910,GSE54900
8,fur__delfur_fe2__1,Fur,fur,delfur_fe2,1,Escherichia coli K-12 MG1655 del_fur,MG1655,M9,glucose(2),NH4Cl(1),O2,,FeCl2 (0.1mM),37,7.0,Kanamycin (50 ug/mL),Batch,0.62,No,,MiSeq,,2,doi.org/10.1038/ncomms5910,GSE54900
9,fur__delfur_fe2__2,Fur,fur,delfur_fe2,2,Escherichia coli K-12 MG1655 del_fur,MG1655,M9,glucose(2),NH4Cl(1),O2,,FeCl2 (0.1mM),37,7.0,Kanamycin (50 ug/mL),Batch,0.62,No,,MiSeq,,2,doi.org/10.1038/ncomms5910,GSE54900


In [4]:
# Columns left out of the disagreement check.
ignored_cols = [
    'sample_id',
    'rep_id',
    'condition_id',
    'Sequencing Machine',
    'Growth Rate (1/hr)',
    'Additional Details',
]

# Judging by the output below, this reports each project/condition whose
# samples carry differing values in one of the remaining columns and shows
# the affected rows.
metadata_disagreement_check(sample_meta, ignored_cols)

Disagreement: minspan__wt_glc DOI
Disagreement: minspan__wt_glc GEO


Unnamed: 0,sample_id,study,project_id,condition_id,rep_id,Strain Description,Strain,Base Media,Carbon Source (g/L),Nitrogen Source (g/L),Electron Acceptor,Trace Element Mixture,Supplement,Temperature (C),pH,Antibiotic,Culture Type,Growth Rate (1/hr),Evolved Sample,Isolate Type,Sequencing Machine,Additional Details,Biological Replicates,DOI,GEO
68,minspan__wt_glc__1,MinSpan,minspan,wt_glc,1,Escherichia coli K-12 MG1655,MG1655,M9,glucose(4),NH4Cl(1),O2,sauer trace element mixture,,37,7.0,,Batch,,No,,GAIIX,,4,doi.org/10.15252/msb.20145243,GSE48324
69,minspan__wt_glc__2,MinSpan,minspan,wt_glc,2,Escherichia coli K-12 MG1655,MG1655,M9,glucose(4),NH4Cl(1),O2,sauer trace element mixture,,37,7.0,,Batch,,No,,GAIIX,,4,doi.org/10.15252/msb.20145243,GSE48324
70,minspan__wt_glc__3,MinSpan,minspan,wt_glc,3,Escherichia coli K-12 MG1655,MG1655,M9,glucose(4),NH4Cl(1),O2,sauer trace element mixture,,37,7.0,,Batch,,No,,GAIIX,,4,doi.org/10.15252/msb.20145243,GSE48324
71,minspan__wt_glc__4,MinSpan,minspan,wt_glc,4,Escherichia coli K-12 MG1655,MG1655,M9,glucose(4),NH4Cl(1),O2,sauer trace element mixture,,37,7.0,,Batch,,No,,MiSeq,Bridging Study between MiSeq & GAIIX,4,,GSE122211


Disagreement: minspan__bw_delcra_glc DOI
Disagreement: minspan__bw_delcra_glc GEO


Unnamed: 0,sample_id,study,project_id,condition_id,rep_id,Strain Description,Strain,Base Media,Carbon Source (g/L),Nitrogen Source (g/L),Electron Acceptor,Trace Element Mixture,Supplement,Temperature (C),pH,Antibiotic,Culture Type,Growth Rate (1/hr),Evolved Sample,Isolate Type,Sequencing Machine,Additional Details,Biological Replicates,DOI,GEO
75,minspan__bw_delcra_glc__1,MinSpan,minspan,bw_delcra_glc,1,Escherichia coli BW25113 del_cra,BW25113,M9,glucose(4),NH4Cl(1),O2,sauer trace element mixture,,37,7.0,Kanamycin,Batch,,No,,GAIIX,,2,doi.org/10.15252/msb.20145243,GSE48324
76,minspan__bw_delcra_glc__2,MinSpan,minspan,bw_delcra_glc,2,Escherichia coli BW25113 del_cra,BW25113,M9,glucose(4),NH4Cl(1),O2,sauer trace element mixture,,37,7.0,Kanamycin,Batch,,No,,MiSeq,Bridging Study between MiSeq & GAIIX,2,,GSE122211


Disagreement: ica__wt_glc GEO


Unnamed: 0,sample_id,study,project_id,condition_id,rep_id,Strain Description,Strain,Base Media,Carbon Source (g/L),Nitrogen Source (g/L),Electron Acceptor,Trace Element Mixture,Supplement,Temperature (C),pH,Antibiotic,Culture Type,Growth Rate (1/hr),Evolved Sample,Isolate Type,Sequencing Machine,Additional Details,Biological Replicates,DOI,GEO
168,ica__wt_glc__1,ICA,ica,wt_glc,1,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),O2,sauer trace element mixture,,37,7.0,,Batch,0.58,No,,HiSeq 4000,,6,,GSE122295
169,ica__wt_glc__2,ICA,ica,wt_glc,2,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),O2,sauer trace element mixture,,37,7.0,,Batch,0.58,No,,HiSeq 4000,,6,,GSE122295
170,ica__wt_glc__3,ICA,ica,wt_glc,3,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),O2,sauer trace element mixture,,37,7.0,,Batch,0.66,No,,HiSeq 4000,,6,,GSE122295
171,ica__wt_glc__4,ICA,ica,wt_glc,4,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),O2,sauer trace element mixture,,37,7.0,,Batch,0.66,No,,HiSeq 4000,,6,,GSE122295
192,ica__wt_glc__5,ICA,ica,wt_glc,5,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),O2,sauer trace element mixture,,37,7.0,,Batch,0.63,No,,HiSeq 4000,,6,,
193,ica__wt_glc__6,ICA,ica,wt_glc,6,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),O2,sauer trace element mixture,,37,7.0,,Batch,0.63,No,,HiSeq 4000,,6,,


Based on the output above, the only columns that disagree within a condition are DOI and GEO, so the DOI and GEO of replicate #1 can be treated as representative of that condition.

## Generate all CSVs

In [5]:
data_folder = 'data/'

# Write one activity-bar CSV per row of the curated enrichment table, named
# after that row's index label. Only the label is used, so iterate over the
# index directly instead of building an unused Series per row via iterrows().
for k in enrich.index:
    this_fig = activity_bar_df(ica_data, k, sample_meta)
    this_fig.to_csv(data_folder + '%i_activity_bar.csv' % k)
    