In [1]:
import pickle

import pandas as pd
from clustergrammer_widget import *
from omics_tools import differential_expression, annotate_GO_KEGG, utils, comparison_generator

In [2]:
# CSV file that the loading cell below reads with pd.read_csv; the path is
# relative, so it is resolved against the notebook's working directory.
ginkgo_data = 'Reordered_ReadCountMatrix_preCAD.csv'

### Fix some aspects of the dataframe, like category type and value

In [3]:
# Read the matrix without a header row and transpose it, so the labels in the
# file's first column become the DataFrame's column names.
ginkgo_df = pd.read_csv(ginkgo_data, index_col=0, header=None, low_memory=False).T

# Each strain entry is a '/'-separated path: keep its second-to-last component
# and strip the underscores from it.
ginkgo_df['strain'] = [
    path.split('/')[-2].replace('_', '')
    for path in ginkgo_df['strain'].values
]
ginkgo_df = ginkgo_df.rename(columns={'timepoint': 'Timepoint', 'temp': 'Temp'})

# Inducer columns: ' NA' becomes 0 and the column is then cast to bool.
# NOTE(review): every cell was read as text, and astype(bool) maps any
# non-empty string to True -- confirm the non-NA values are meant to be True.
for inducer in ['IPTG', 'Arabinose']:
    ginkgo_df[inducer] = ginkgo_df[inducer].replace(' NA', 0)
    ginkgo_df[inducer] = ginkgo_df[inducer].astype(bool)

# Strip the unit suffixes so the remaining text can be parsed as numbers.
for column, unit_suffix in [('Timepoint', ':hour'), ('Temp', ':celsius')]:
    ginkgo_df[column] = ginkgo_df[column].replace(regex=unit_suffix, value='')

# Drop the bookkeeping columns that are not used downstream in this notebook.
ginkgo_df = ginkgo_df.drop(
    ['GinkgoID', 'R1', 'R2', 'filename', 'flags', 'library', 'gene_id'],
    axis=1,
)

# Temp and Timepoint are still strings at this point; make them int64.
for column in ['Temp', 'Timepoint']:
    ginkgo_df[column] = pd.to_numeric(ginkgo_df[column]).astype('int64')

In [4]:
# Show the first four rows to sanity-check the cleaned metadata columns.
ginkgo_df.head(4)

Unnamed: 0,strain,Temp,replicate,IPTG,Arabinose,Timepoint,Actuator_YFP,CamR,Sensor_AraC,Sensor_LacI,...,insI1,insH1,insE1,insF1,insC1,insD1,insQ,insJ,insK,insG
1,Strain3MG1655GenomicIcaRGate,37,0,False,False,18,2214,302,36,24,...,84,659,43,71,153,134,29,1,0,21
2,Strain3MG1655GenomicIcaRGate,37,1,False,False,18,3333,532,48,37,...,104,1026,72,121,179,193,41,4,1,42
3,Strain3MG1655GenomicIcaRGate,37,2,False,False,18,3762,559,76,56,...,133,1393,79,206,206,199,58,3,0,34
4,Strain3MG1655GenomicIcaRGate,37,3,False,False,18,3059,484,46,42,...,98,1006,54,141,189,172,48,2,1,32


### Set up the two main components which determine the comparison logic
#### the sub factors are the names of columns in the dataframe that define metadata for each sample
#### the DE_tests list has each comparison we want to construct differential tests for

In [4]:
# Metadata columns of ginkgo_df that describe the condition of each sample.
sub_factors = ['Timepoint', 'Temp', 'Arabinose', 'IPTG']

# Strain labels, spelled exactly as they appear in the cleaned `strain` column.
WT_STRAIN = 'Strain1MG1655WT'
PHLF_STRAIN = 'Strain2MG1655GenomicPhlFGate'
ICAR_STRAIN = 'Strain3MG1655GenomicIcaRGate'
NAND_STRAIN = 'Strain4MG1655GenomicNANDCircuit'

# Each entry is one pair of strains to build differential tests for.
DE_tests = [
    # wild type vs every strain (itself included)
    [WT_STRAIN, WT_STRAIN],
    [WT_STRAIN, PHLF_STRAIN],
    [WT_STRAIN, ICAR_STRAIN],
    [WT_STRAIN, NAND_STRAIN],
    # NAND circuit vs the IcaR gate, the PhlF gate, and itself
    [NAND_STRAIN, ICAR_STRAIN],
    [NAND_STRAIN, PHLF_STRAIN],
    [NAND_STRAIN, NAND_STRAIN],
]

### This sets up the commands to run in R by one of the two options further below

In [5]:
# The strain column is the primary factor; the sub factors refine it.
strain_factor = ['strain']

# Group the samples by strain plus every sub factor.
groups_array = utils.group_by_factors(ginkgo_df, strain_factor + sub_factors)

# NOTE(review): the meaning of the trailing positional 1 is not visible in this
# notebook -- confirm it against comparison_generator.generate_comparisons.
comparison_indices = comparison_generator.generate_comparisons(
    ginkgo_df, DE_tests, strain_factor, sub_factors, 1
)

# Combine the comparison indices with the groups to get the contrast strings.
contrast_strings = differential_expression.make_contrast_strings(
    comparison_indices, groups_array
)

### Option 1: run everything in python multiprocessed (will take hours)

In [7]:
# Build one command per differential test from the base strain comparisons and
# the sub factors, then report how many were generated.
r_cmds = differential_expression.make_DE_cmds(
    dataframe=ginkgo_df,
    base_comparisons=DE_tests,
    sub_factors=sub_factors,
)
n_tests = len(r_cmds)
print('Created {0} differential tests'.format(n_tests))

Created 112 differential tests


In [None]:
# Option 1: execute the generated commands from this notebook (the heading
# above warns that this takes hours).
# NOTE(review): cores=8 presumably sets the number of parallel workers --
# confirm against differential_expression.run_edgeR and the host machine.
deg_results = differential_expression.run_edgeR(r_cmds, cores=8)

### Option 2: generate script files in a directory ./scripts/ to transfer and run on HPC

In [15]:
# Option 2: write script files for an HPC run instead of running the tests here.
# NOTE(review): run_dir is a hard-coded absolute path -- presumably the
# directory on the cluster; adjust it for your own environment.
hpc_job_args = {
    'dataframe': ginkgo_df,
    'base_comparisons': DE_tests,
    'sub_factors': sub_factors,
    'run_dir': '/btl/foundry/users/alex/20190228_novel_chassis/run_DGE/',
}
differential_expression.make_hpc_de_files(**hpc_job_args)

0

In [6]:
# Bring the differential-expression results back into Python from a results
# directory on disk. This rebinds deg_results, the same name the Option 1 cell
# assigns from run_edgeR.
# NOTE(review): the directory is an absolute path on one user's machine --
# point it at your own results folder before running.
deg_results = utils.load_DE_results('/Users/acristo/build/omics_tools/examples/results/')

### Run the annotations

In [None]:
# Compute KEGG annotations from the loaded differential-expression results.
# The result is pickled to 'kegg_results.pkl' in the following cell.
KEGG_df = annotate_GO_KEGG.run_kegg(deg_results)

In [25]:
# Cache the KEGG annotations on disk so they can be reloaded (see the next
# cell) without calling run_kegg again.
# `pickle` comes from the notebook's import cell rather than a per-cell import.
with open('kegg_results.pkl', 'wb') as kegg_cache:
    pickle.dump(KEGG_df, kegg_cache)

In [7]:
# Reload the KEGG annotations cached in 'kegg_results.pkl'.
# `pickle` comes from the notebook's import cell rather than a per-cell import.
# NOTE: unpickling can execute arbitrary code; only load a cache file that you
# produced yourself.
with open('kegg_results.pkl', 'rb') as kegg_cache:
    KEGG_df = pickle.load(kegg_cache)

In [7]:
# Compute GO annotations from the loaded differential-expression results.
# The recorded output shows this call downloading gene2go, bacteria gene info
# and the GO graph, and logging one enrichment summary per study set, so expect
# network access and a long log.
GO_df = annotate_GO_KEGG.run_go(deg_results)

INFO: downloading gene2go
INFO: extracting
INFO: complete
INFO: Downloading bacteria gene info
INFO: extracting
INFO: complete
INFO: downloading GO graph
INFO: complete
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
 78%  3,553 of  4,567 population items found in association
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
 78%  3,553 of  4,567 population items found in association
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
/Users/acristo/anaconda2/envs/test/l

/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
 78%  3,553 of  4,567 population items found in association
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
 78%  3,553 of  4,567 population items found in association
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
 78%  3,553 of  4,567 population items found in association
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
 78%  3,553 of  4,567 population items found in association
 98%     53 of     54 study items found in association
100%     54 of     54 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,56

/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
 78%  3,553 of  4,567 population items found in association
100%      4 of      4 study items found in association
100%      4 of      4 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
      26 GO terms are associated with      4 of      4 study items
       1 GO terms found significant (< 0.05=alpha) (  1 enriched +   0 purified): statsmodels fdr_bh
100%      5 of      5 study items found in association
100%      5 of      5 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
      36 GO terms are associated with      5 of      5 study items
       0 GO terms found significant (< 0.05=alpha) (  0 enriched +   0 purified): statsmodel

 90%    415 of    460 study items found in association
100%    460 of    460 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
     779 GO terms are associated with    415 of    460 study items
       9 GO terms found significant (< 0.05=alpha) (  9 enriched +   0 purified): statsmodels fdr_bh
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
 78%  3,553 of  4,567 population items found in association
 83%     10 of     12 study items found in association
100%     12 of     12 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
      52 GO terms are associated with     10 of     12 study items
       0 GO terms found significant (< 0.05=alpha) (  0 enriched +   0 purified): statsmodel

 78%  3,553 of  4,567 population items found in association
 88%     65 of     74 study items found in association
100%     74 of     74 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
     235 GO terms are associated with     65 of     74 study items
       0 GO terms found significant (< 0.05=alpha) (  0 enriched +   0 purified): statsmodels fdr_bh
 98%     81 of     83 study items found in association
100%     83 of     83 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
     262 GO terms are associated with     81 of     83 study items
       0 GO terms found significant (< 0.05=alpha) (  0 enriched +   0 purified): statsmodels fdr_bh
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 

Calculating 3,408 uncorrected p-values using fisher
 84%     82 of     98 study items found in association
100%     98 of     98 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
      87 GO terms are associated with     29 of     33 study items
   3,408 GO terms are associated with  3,553 of  4,567 population items
     177 GO terms are associated with     82 of     98 study items
       0 GO terms found significant (< 0.05=alpha) (  0 enriched +   0 purified): statsmodels fdr_bh
       4 GO terms found significant (< 0.05=alpha) (  4 enriched +   0 purified): statsmodels fdr_bh
 93%    178 of    192 study items found in association
100%    192 of    192 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
     479 GO terms are associated with    178 of    192 study i

100%    207 of    207 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
     487 GO terms are associated with    184 of    207 study items
       0 GO terms found significant (< 0.05=alpha) (  0 enriched +   0 purified): statsmodels fdr_bh
 98%    127 of    130 study items found in association
100%    130 of    130 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
     426 GO terms are associated with    127 of    130 study items
      10 GO terms found significant (< 0.05=alpha) ( 10 enriched +   0 purified): statsmodels fdr_bh
 78%    200 of    256 study items found in association
100%    256 of    256 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
  

 78%      7 of      9 study items found in association
100%      9 of      9 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
      34 GO terms are associated with      7 of      9 study items
       2 GO terms found significant (< 0.05=alpha) (  2 enriched +   0 purified): statsmodels fdr_bh
 93%     26 of     28 study items found in association
100%     28 of     28 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
      60 GO terms are associated with     26 of     28 study items
       4 GO terms found significant (< 0.05=alpha) (  4 enriched +   0 purified): statsmodels fdr_bh
 94%  1,179 of  1,258 study items found in association
100%  1,258 of  1,258 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms a

Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
     552 GO terms are associated with    203 of    208 study items
      34 GO terms found significant (< 0.05=alpha) ( 34 enriched +   0 purified): statsmodels fdr_bh
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
 78%  3,553 of  4,567 population items found in association
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
 78%  3,553 of  4,567 population items found in association
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
 78%  3,553 of  4,567 population items found in association
 92%  1,082 of  1,175 study items found in association
100%  1,175 of  1,175 study items found in population(4567)
Calculat

100%    124 of    124 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
     334 GO terms are associated with    117 of    124 study items
       8 GO terms found significant (< 0.05=alpha) (  8 enriched +   0 purified): statsmodels fdr_bh
 94%    263 of    279 study items found in association
100%    279 of    279 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
   3,408 GO terms are associated with  3,553 of  4,567 population items
     672 GO terms are associated with    263 of    279 study items
      18 GO terms found significant (< 0.05=alpha) ( 18 enriched +   0 purified): statsmodels fdr_bh
 78%  3,553 of  4,567 population items found in association
 96%     71 of     74 study items found in ass

/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
 78%  3,553 of  4,567 population items found in association
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
 78%  3,553 of  4,567 population items found in association
100%      1 of      1 study items found in association
100%      1 of      1 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
       4 GO terms are associated with      1 of      1 study items
       0 GO terms found significant (< 0.05=alpha) (  0 enriched +   0 purified): statsmodels fdr_bh
 91%     10 of     11 study items found in association
100%     11 of     11 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
 82%    252 of    308 study items fo

   3,408 GO terms are associated with  3,553 of  4,567 population items
      58 GO terms are associated with     14 of     14 study items
       4 GO terms found significant (< 0.05=alpha) (  4 enriched +   0 purified): statsmodels fdr_bh
 67%      8 of     12 study items found in association
100%     12 of     12 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
      26 GO terms are associated with      8 of     12 study items
       0 GO terms found significant (< 0.05=alpha) (  0 enriched +   0 purified): statsmodels fdr_bh
 80%    148 of    186 study items found in association
100%    186 of    186 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
     341 GO terms are associated with    148 of    186 study items
       0 GO terms found significant (< 0.05=alp

Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
      20 GO terms are associated with      2 of      2 study items
       0 GO terms found significant (< 0.05=alpha) (  0 enriched +   0 purified): statsmodels fdr_bh
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
 78%  3,553 of  4,567 population items found in association
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
 78%  3,553 of  4,567 population items found in association
 97%     28 of     29 study items found in association
100%     29 of     29 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
     104 GO terms are associated with     28 of     29 study items
       1 GO 

      10 GO terms found significant (< 0.05=alpha) ( 10 enriched +   0 purified): statsmodels fdr_bh
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
 78%  3,553 of  4,567 population items found in association
/Users/acristo/anaconda2/envs/test/lib/python3.6/site-packages/omics_tools/data/go-basic.obo: fmt(1.2) rel(2019-04-17) 47,398 GO Terms
 78%  3,553 of  4,567 population items found in association
 89%    148 of    167 study items found in association
100%    167 of    167 study items found in population(4567)
Calculating 3,408 uncorrected p-values using fisher
   3,408 GO terms are associated with  3,553 of  4,567 population items
     318 GO terms are associated with    148 of    167 study items
       0 GO terms found significant (< 0.05=alpha) (  0 enriched +   0 purified): statsmodels fdr_bh
 92%    153 of    167 study items found in association
100%    167 of    167 study items found in popu

In [13]:
# Cache the GO annotations on disk so they can be reloaded (see the next cell)
# without calling run_go again.
# `pickle` comes from the notebook's import cell rather than a per-cell import.
with open('go_results.pkl', 'wb') as go_cache:
    pickle.dump(GO_df, go_cache)

In [8]:
# Reload the GO annotations cached in 'go_results.pkl'.
# `pickle` comes from the notebook's import cell rather than a per-cell import.
# NOTE: unpickling can execute arbitrary code; only load a cache file that you
# produced yourself.
with open('go_results.pkl', 'rb') as go_cache:
    GO_df = pickle.load(go_cache)

### Combine annotations

In [10]:
# Merge the GO and KEGG annotation results into a single object; it is passed
# on as `annotated_dataframe` when the clustergrammer matrix file is written.
anno_df = annotate_GO_KEGG.combine_annotations(GO_df, KEGG_df)

### Map descriptions of GO/KEGG annotations and create clustergrammer dataframe

In [11]:
# Base name of the matrix file; the viewing cell below loads cg_file + '.txt'.
cg_file = 'new_results'

# Collect the two lookups the matrix writer needs: the categories of each sub
# factor, and the functional annotations derived from the GO/KEGG results.
factor_to_categories = utils.get_factor_categories(sub_factors, ginkgo_df)
terms = annotate_GO_KEGG.functional_annotations(GO_df, KEGG_df)

utils.create_clustergrammer_matrix_file(
    fname=cg_file,
    annotated_dataframe=anno_df,
    factor_to_categories=factor_to_categories,
    terms=terms,
)

### View clustergrammer matrix

In [12]:
# Load the matrix file written by the previous cell into a clustergrammer network.
net = Network(clustergrammer_widget)
matrix_path = cg_file + '.txt'
net.load_file(matrix_path)

# Export the matrix, convert it with utils.log10_conv, and load it back.
# The warnings recorded under this cell suggest the helper applies -log10 to
# positive entries and log10 of the negated value to negative entries.
cgdf = utils.log10_conv(net.export_df())
net.load_df(cgdf)

# Cluster with Euclidean distance, then render the interactive widget.
net.cluster(dist_type='euclidean')
net.widget()

  tmp_df['mat'] = pd.read_table(file_buffer, index_col=row_arr)
  x[x > 0] = -1*np.log10(x)
  x[x > 0] = -1*np.log10(x)
  x[x < 0] = np.log10(-1*x)
  x[x < 0] = np.log10(-1*x)


clustergrammer_widget(network='{"row_nodes": [{"name": "Strain1MG1655WT_False_False_30_18-vs-Strain1MG1655WT_F…