# PRECISE 1.0 File Generator

This notebook generates all files for the Modulytics web page of E. coli, based on ICA of the PRECISE RNA-seq database.

In [1]:
import sys

# pandas is used as `pd` throughout this notebook; import it explicitly instead of
# relying on the star import below to leak the name into the namespace
import pandas as pd

# be sure that this points to a clone of github.com/SBRG/ICA
# which is the source of data in this case
sys.path.append('../../../../ica/')
from icaviz.plotting import *

# locations of the PRECISE data tables and the gene annotation files inside the ICA clone
DATA_DIR = '../../../../ica/data/precise_data/'
GENE_DIR = '../../../../ica/data/annotation/'

# curated enrichment table: one row per iModulon, indexed by its first column
enrich = pd.read_csv(DATA_DIR+'curated_enrichments.csv', index_col = 0)
names = enrich['name'].tolist()

# bundle the expression (X), S and A matrices with metadata and annotation
# (load_data comes from the icaviz star import above)
ica_data = load_data(X=DATA_DIR+'log_tpm.csv',
                     S=DATA_DIR+'S.csv',
                     A=DATA_DIR+'A.csv',
                     metadata=DATA_DIR+'metadata.csv',
                     annotation=GENE_DIR+'gene_info.csv',
                     trn=GENE_DIR+'TRN.csv',
                     fasta=GENE_DIR+'NC_000913.3.fasta',
                     cutoff = 550,
                     organism='ecoli')

# show every row whenever a DataFrame/Series is displayed
pd.set_option('display.max_rows', None)

In [2]:
# get functions from py files
sys.path.append('../../../py')
from gene_table import *
from gene_histogram import *
from gene_scatter import *
from activity_bar import *
from regulon_venn import *
from regulon_scatter import *

# gene dashboards
from gene_activity_bar import *

import os

In [3]:
# read in other necessary annotation files

# tab-separated (despite the .csv extension), indexed by its second column
locus_to_db = pd.read_csv('../annotation/locus_to_db.csv',sep='\t', index_col = 1)
# headerless file: first column is the key, column 1 is the value -> plain dict
links = pd.read_csv('../annotation/gene_links.csv', index_col = 0, header = None).to_dict()[1]
# NOTE: `sample_meta` is reassigned further down from ica_data.metadata
sample_meta = pd.read_csv('sample_metadata.csv', index_col = 0)
# headerless file: column 1 is the key, column 2 is the value -> plain dict
tf_links = pd.read_csv('../annotation/tf_links.csv', index_col = 1, header=None).to_dict()[2]
# gene annotation and TRN tables, each indexed by its first column
gene_info = pd.read_csv(GENE_DIR+'gene_info.csv', index_col = 0)
trn = pd.read_csv(GENE_DIR+'TRN.csv', index_col = 0)

## Generate Sample Meta and TF_links, if necessary

Sample_metadata is simply the metadata table with all non-public columns removed.

In [4]:
# public metadata table

# columns the public doesn't need to see
non_public_cols = ['contact', 'library creator', 'run_date', 'R1', 'R2', 'BAM', 'alignment', 'Public']
sample_meta = ica_data.metadata.drop(columns=non_public_cols)

# fix a minor issue with the number of biological replicates in a condition:
# the six ica / wt_glc samples are all labelled as having 6 replicates
is_ica_wt_glc = (sample_meta['project_id'] == 'ica') & (sample_meta['condition_id'] == 'wt_glc')
sample_meta.loc[is_ica_wt_glc, 'Biological Replicates'] = [6]*6

# move the index into a regular column so that javascript can treat the rows
# as a numerically indexed array
sample_meta = sample_meta.reset_index()

#sample_meta.to_csv('data/sample_metadata.csv')

I created the following CSV and then **manually** annotated each TF with a relevant RegulonDB link. Auto-generating these links for all TFs would require another mapping file, similar to locus_to_db.

Note that RegulonDB is often very slow, so these links may be replaced with EcoCyc links in the future.

In [5]:
# collect every individual TF name mentioned in the curated enrichments
all_tfs = []

for tf_field in enrich['TF']:
    # rows without a TF hold a non-string (missing) value; skip them
    if type(tf_field) != str:
        continue
    if '/' in tf_field:
        # "A/B": alternative regulators
        all_tfs.extend(tf_field.split('/'))
    elif '+' in tf_field:
        # "A+B": combined regulators
        all_tfs.extend(tf_field.split('+'))
    else:
        all_tfs.append(tf_field)

all_tfs = pd.Series(all_tfs).drop_duplicates()
#all_tfs.to_csv('../annotation/tf_links.csv')

## Generate Files

In [6]:
# number of distinct (condition_id, project_id) groups in the metadata
num_conds = sum(1 for _ in ica_data.metadata.groupby(['condition_id', 'project_id']))
num_conds

163

In [13]:
# Mbin: boolean gene x iModulon matrix, True where |gene weight| exceeds the
# iModulon's threshold
Mbin = pd.DataFrame(index = ica_data.S.index, columns = ica_data.S.columns)

# collect (iModulon, gene) membership pairs in a plain list and build the frame
# once at the end; DataFrame.append in a loop copies the whole frame every
# iteration and no longer exists in pandas >= 2.0
membership_records = []
for k in Mbin.columns:
    Mbin[k] = ica_data.S[k].abs()>ica_data.thresholds[k]
    membership_records.extend({'iModulon': k, 'Gene': g} for g in Mbin.index[Mbin[k]])

# long-format version of the same information: one row per (iModulon, gene) pair
g_im_list = pd.DataFrame(membership_records, columns = ['iModulon', 'Gene'])

Mbin.to_csv('data_files/gene_presence_matrix.csv')
g_im_list.to_csv('data_files/gene_presence_list.csv')

In [11]:
# display the iModulon/gene membership table (display.max_rows is set to None
# in the setup cell, so every row is rendered)
g_im_list

Unnamed: 0,iModulon,Gene
0,0,b0306
1,0,b0307
2,0,b0308
3,0,b0447
4,0,b1826
5,0,b2141
6,0,b2142
7,0,b2143
8,0,b3755
9,0,b4116


In [7]:
# dataset_meta: Stores information for the header of the dataset page

# X is genes x samples
num_genes, num_samps = ica_data.X.shape
# one "unique condition" per (condition_id, project_id) pair
num_conds = sum(1 for _ in ica_data.metadata.groupby(['condition_id', 'project_id']))
# one column of S per iModulon
num_ims = ica_data.S.shape[1]

header_fields = {
    'Title': '<i>E. coli</i> PRECISE',
    'Organism': '<i>Escherichia coli</i> K-12',
    'Strain': 'MG1655 and BW25113',
    'Publication': '<a href="https://doi.org/10.1038/s41467-019-13483-w">Sastry, et al., 2019</a>',
    'Number of Samples': num_samps,
    'Number of Unique Conditions': num_conds,
    'Number of Genes': num_genes,
    'Number of iModulons': num_ims,
}
dataset_meta = pd.Series(header_fields)
dataset_meta.to_csv('dataset_meta.csv')



In [8]:
# iM_table: this is the main component of the dataset page

# display order of the iModulon categories; the position in this list becomes
# the numeric sort key written to the table
cat_order = [
    'Carbon Source Utilization',
    'Amino Acid and Nucleotide Biosynthesis',
    'Energy Metabolism',
    'Metal Homeostasis',
    'Miscellaneous Metabolism',
    'Structural Components',
    'Stress Response',
    'Regulator Discovery',
    'Biological Enrichment',
    'Genomic Alterations',
    'Uncharacterized',
]

cat_dict = {category: rank for rank, category in enumerate(cat_order)}

table_columns = ['name', 'Regulator', 'Function', 'Category', 'n_genes', 'precision', 'recall']
im_table = enrich[table_columns].rename(columns={'name': 'Name'})
im_table.index.name = 'k'
# a category missing from cat_order raises KeyError here
im_table['category_num'] = [cat_dict[category] for category in im_table['Category']]
im_table.to_csv('iM_table.csv')

In [9]:
# NOTE: this rebinds num_ims to (number of rows in im_table) - 1, and that value
# is what gets written to num_ims.txt
num_ims = im_table.shape[0]-1
# the context manager closes the file even if the write fails
with open('num_ims.txt', 'w') as file:
    file.write(str(num_ims))

In [10]:
# This function is needed to create the header for each iModulon
def tf_with_links(k, row, tf_links):
    """Render an iModulon's TF field as an HTML string with hyperlinked TF names.

    Parameters
    ----------
    k : unused; kept so the call signature matches the other per-iModulon helpers
    row : object exposing a ``TF`` attribute (e.g. a row of the enrichment table)
    tf_links : mapping from TF name to a link; a non-string value means "no link"

    Returns
    -------
    The TF value unchanged when it is not a ``str``; otherwise the TF names
    joined by ' or ' (for '/'-separated fields) or ' and ' (for '+'-separated
    fields), each wrapped in an ``<a>`` tag when a string link is available.
    """
    regulator = row.TF

    # non-string TF values (e.g. missing entries) are passed through untouched
    if type(regulator) is not str:
        return regulator

    # '/' takes precedence over '+' when both characters are present
    if '/' in regulator:
        joiner = ' or '
        tf_names = regulator.split('/')
    elif '+' in regulator:
        joiner = ' and '
        tf_names = regulator.split('+')
    else:
        joiner = ''
        tf_names = [regulator]

    def as_html(tf_name):
        # fall back to the bare name when the TF is unknown or its link is not a string
        url = tf_links[tf_name] if tf_name in tf_links.keys() else None
        if type(url) is str:
            return '<a href="' + url + '" target="_blank">' + tf_name + '</a>'
        return tf_name

    return joiner.join([as_html(tf_name) for tf_name in tf_names])


In [11]:
# This function generates all necessary files for a single iModulon
def make_directory(ica_data, k, row, links, sample_meta, tf_links):
    """Write every data file for iModulon ``k`` into ``iModulon_files/<k>/``.

    Parameters
    ----------
    ica_data : loaded ICA data object; its ``S`` and ``A`` tables are read here
        and it is passed on to the plotting-data helpers
    k : iModulon identifier (a column of ``ica_data.S`` / row of ``ica_data.A``)
    row : the curated-enrichment row for this iModulon (``name``, ``TF``,
        ``Function``, ``Category``, ``precision`` and ``recall`` are read)
    links : gene -> link mapping, forwarded to the gene table/scatter helpers
    sample_meta : public sample metadata, forwarded to the activity bar helper
    tf_links : TF name -> link mapping used to build the ``Regulator`` HTML

    Side effects
    ------------
    Creates the output folder if needed and writes the ``*_meta``,
    ``*_gene_table``, ``*_gene_hist``, ``*_gene_scatter``, ``*_activity_bar``,
    ``*_gene_weights`` and ``*_activity`` CSVs, plus ``*_reg_venn`` and
    ``*_reg_scatter`` when the corresponding helper returns a table.
    """
    # generate the plot files
    gene_table = gene_table_df(ica_data, k, row, links = links)
    gene_hist = gene_hist_df(ica_data, k, row)
    base_conds = ['control__wt_glc__1','control__wt_glc__2']
    gene_scatter = gene_scatter_df(ica_data, k, base_conds, links)
    act_bar = activity_bar_df(ica_data, k, sample_meta)

    # both regulon helpers may return None, in which case no file is written
    reg_venn = regulon_venn_df(ica_data, k, row)
    reg_scatter = regulon_scatter_df(ica_data, k, row)

    # generate a basic data df
    # Built in one go with an explicit object dtype: a dtype-less empty Series
    # starts out as float and only works by implicit upcasting when strings are
    # assigned, which recent pandas versions warn about.
    res = pd.Series({
        'name': row.loc['name'],
        'TF': row.TF,
        'Regulator': tf_with_links(k, row, tf_links),
        'Function': row.Function,
        'Category': row.Category,
        'has_venn': reg_venn is not None,
        # number of columns in the regulon scatter table minus one; 0 when absent
        'scatter': 0 if reg_scatter is None else reg_scatter.shape[1] - 1,
        'has_meme': False, # update later
        'precision': row.precision,
        'recall': row.recall,
    }, dtype=object)

    # save output
    folder = 'iModulon_files/'+str(k)+'/'
    # makedirs also creates a missing 'iModulon_files/' parent, which os.mkdir would not
    os.makedirs(folder, exist_ok=True)
    res.to_csv(folder+str(k)+'_meta.csv')
    gene_table.to_csv(folder+str(k)+'_gene_table.csv')
    gene_hist.to_csv(folder+str(k)+'_gene_hist.csv')
    gene_scatter.to_csv(folder+str(k)+'_gene_scatter.csv')
    act_bar.to_csv(folder+str(k)+'_activity_bar.csv')
    if reg_venn is not None:
        reg_venn.to_csv(folder+str(k)+'_reg_venn.csv')
    if reg_scatter is not None:
        reg_scatter.to_csv(folder+str(k)+'_reg_scatter.csv')
    ica_data.S[k].to_csv(folder+str(k)+'_gene_weights.csv')
    ica_data.A.loc[k].to_csv(folder+str(k)+'_activity.csv')

In [12]:
# write the per-iModulon files for every row of the curated enrichment table
# (k is the row's index label, row the corresponding Series)
for k, row in enrich.iterrows():
    make_directory(ica_data, k, row, links, sample_meta, tf_links)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]


# Gene Files

In [32]:
# Information for Gene Metadata

def make_gene_files(ica_data, k, row, links, sample_meta, tf_links):
    """Write the metadata CSV for gene ``k`` to ``gene_page_files/<k>/<k>_meta.csv``.

    Parameters
    ----------
    ica_data, sample_meta, tf_links : unused here; kept so the call signature
        stays unchanged for existing callers
    k : gene identifier (compared against ``trn.gene_id`` and used as key in ``links``)
    row : gene annotation row; ``gene_name``, ``operon``, ``product`` and ``cog`` are read
    links : mapping from gene identifier to a link; ``links[k]`` must exist

    Notes
    -----
    Reads the notebook-level ``trn`` table (not passed as an argument) to list
    the gene's regulators. Missing (NaN) fields are replaced by
    ``<i> Not Available </i>``.
    """
    # every TF listed for this gene in the TRN table
    regulators = trn[trn.gene_id == k].TF.to_list()

    # generate metadata df
    # Built in one go with an explicit object dtype: a dtype-less empty Series
    # starts out as float and only works by implicit upcasting when strings are
    # assigned, which recent pandas versions warn about.
    res = pd.Series({
        'gene_id': k,
        'name': row.gene_name,
        'operon': row.operon,
        'product': row['product'],
        'cog': row.cog,
        # NOTE(review): a gene without TRN rows yields '' here, which fillna
        # below does not replace -- confirm that a blank field is intended
        'regulator(s)': ", ".join(regulators),
        'link': '<a href="' + str(links[k]) + '">'+ 'EcoCyc' + '</a>',
    }, dtype=object)
    res = res.fillna(value="<i> Not Available </i>")

    # save output
    folder = 'gene_page_files/'+str(k)+'/'
    # makedirs also creates a missing 'gene_page_files/' parent, which os.mkdir would not
    os.makedirs(folder, exist_ok=True)
    res.to_csv(folder+str(k)+'_meta.csv')

In [11]:
# Keep only the annotated genes that also have a row in the expression matrix X
has_ica_data = gene_info.index.isin(ica_data.X.index)
reduced_gene_info = gene_info.loc[has_ica_data].copy()

In [33]:
# generate metadata csv for each gene
# (k is the gene's index label in reduced_gene_info, row its annotation Series;
# this also creates each gene's output folder)
for k, row in reduced_gene_info.iterrows():
    make_gene_files(ica_data, k, row, links, sample_meta, tf_links)



In [26]:
# activity plot + expression for each gene
for gene_id in ica_data.X.index:
    # every file of a gene lives in its own folder and is prefixed with the gene id;
    # the folder is not created here, so it must already exist
    folder = 'gene_page_files/'+str(gene_id)+'/'
    file_prefix = folder + gene_id

    this_fig = gene_activity_bar_df(ica_data, gene_id, sample_meta)
    this_fig.to_csv(file_prefix + '_activity_bar.csv')

    # the gene's row of the expression matrix
    ica_data.X.loc[gene_id].to_csv(file_prefix + '_expression.csv')

In [13]:
# table for each gene

# generate df with all iM genes indicated T/F
# (rows: iModulons, columns: genes; True where |weight| exceeds the iModulon's threshold)
im_genes= ica_data.S.T.copy()
for k,row in ica_data.S.T.iterrows():
    im_genes.loc[k,:] = abs(row) > abs(ica_data.thresholds[k])

# iM_table: start with main component of iM page
# NOTE: this rebinds `im_table` to a narrower table than the one written to iM_table.csv
im_table = enrich[['name', 'Regulator', 'Function', 'Category']]
im_table = im_table.rename(columns={'name':'Name'})
im_table.index.name = 'k'

#loop through genes:
for gene_id in ica_data.S.index:
    perGene_table = im_table.copy()
    # membership flag and raw weight of this gene in every iModulon
    perGene_table.insert(column ='in_iM', value = im_genes.loc[:, gene_id], loc = 1)
    perGene_table.insert(column ='gene_weight', value = ica_data.S.loc[gene_id, :], loc = 2)

    #sort: iModulons containing the gene first, then by decreasing |weight|;
    # 'A' is a temporary sort key. drop(columns=...) replaces the positional
    # axis argument, which pandas >= 2.0 no longer accepts.
    perGene_table = (perGene_table.assign(A=perGene_table['gene_weight'].abs())
            .sort_values(['in_iM','A'],ascending=[False, False])
            .drop(columns='A'))
    folder = 'gene_page_files/'+str(gene_id)+'/'
    perGene_table.to_csv(folder + gene_id + '_perGene_table.csv')

### Make JSON files for Search

#### Genes

In [12]:
# Get df in correct format: gene name, gene id and product, sorted by gene name,
# with missing values spelled out
gene_df = (
    reduced_gene_info
    .assign(gene_id=reduced_gene_info.index)
    .loc[:, ['gene_name', "gene_id", "product"]]
    .sort_values(by="gene_name")
    .fillna(value = "not available")
)

#create gene info json
# gene_df.to_json('./gene_page_files/gene_list.json', orient="records")

#### iModulons

In [14]:
# searchable iModulon list: component id, name, regulator and function,
# sorted by name, with missing values spelled out
enrich_df = (
    enrich
    .assign(component=enrich.index)
    .loc[:, ['component', 'name', 'Regulator', 'Function']]
    .sort_values(by="name")
    .fillna(value = "N/A")
)

enrich_df.to_json('./iModulon_files/im_list.json', orient="records")