# Nicolas File Generator

This notebook generates all files for the Modulytics web page of *Bacillus subtilis*, based on ICA of the Nicolas et al. (2012) dataset.

In [1]:
import sys
# make sure that this points to a clone of github.com/SBRG/ICA
sys.path.append('../../../../ica/')
from icaviz.plotting import *
# Input locations, written relative to the notebook's working directory.
DATA_DIR = 'data_files/'
GENE_DIR = '../annotation/gene_files/'
# NOTE(review): `pd` is never imported explicitly in this notebook; presumably it is
# provided by the star import from icaviz.plotting above -- confirm.
# Curated iModulon enrichments; the first CSV column becomes the index.
enrich = pd.read_csv(DATA_DIR + 'curated_enrichments.csv', index_col=0)
names = enrich['name'].tolist()
# NOTE(review): `metadata` is the only path passed without a DATA_DIR/GENE_DIR prefix,
# so it is resolved against the working directory -- confirm this is intended.
ica_data = load_data(X=DATA_DIR+'log_tpm.csv',
                     S=DATA_DIR+'M.csv',
                     A=DATA_DIR+'A.csv',
                     metadata='sample_metadata.csv',
                     annotation=GENE_DIR+'gene_info.csv',
                     trn=GENE_DIR+'TRN.csv',
                     cutoff=1300, 
                     organism='bsubtilis')
# Set ica_data.thresholds to the curated `threshold` column, keyed by the index of `enrich`.
ica_data.thresholds = enrich.threshold.to_dict()
# Do not truncate rows when a DataFrame is displayed.
pd.set_option('display.max_rows', None)

In [2]:
# get functions from py files
sys.path.append('../../../py')
from gene_table import *
from gene_histogram import *
from gene_scatter import *
from activity_bar import *
from regulon_venn import *
from regulon_scatter import *

# gene dashboards
from gene_activity_bar import *

import os

In [3]:
# gene -> link lookup: the file has no header row, so the first column is the
# index and the link sits in the column labelled 1
link_table = pd.read_csv('../annotation/gene_links.csv', index_col=0, header=None)
links = link_table[1].to_dict()

# sample metadata with its index moved into ordinary columns
sample_meta = ica_data.metadata.reset_index()

# gene annotation and regulatory network tables
gene_info = pd.read_csv(GENE_DIR + 'gene_info.csv', index_col=0)
trn = pd.read_csv(GENE_DIR + 'TRN.csv', index_col=0)

In [4]:
# Bacillus has overly complicated strings for regulation
# E coli and future datasets: see any other file generator
# for a different version of this function
def tf_with_links(k, row):
    """Turn the regulator string of one iModulon into HTML with SubtiWiki links.

    Every run of characters between the separators ``[``, space, ``]``, ``+``
    and ``/`` is treated as one transcription factor name and wrapped in an
    ``<a>`` tag; the separators themselves are copied through unchanged.

    Parameters
    ----------
    k : not used by this function; kept for interface compatibility.
    row : object with a ``TF`` attribute holding the regulator string.

    Returns
    -------
    The HTML string, or ``row.TF`` unchanged when it is not a string
    (for example a missing value).
    """
    tf_str = row.TF

    # missing regulators are passed through untouched
    if not isinstance(tf_str, str):
        return tf_str

    separators = '[ ]+/'
    pre = 'http://subtiwiki.uni-goettingen.de/v3/gene/search/exact/'

    def convert_to_link(tf):
        return '<a href="' + pre + tf + '" target="_blank">' + tf + '</a>'

    pieces = []
    i = 0
    n = len(tf_str)
    while i < n:
        if tf_str[i] in separators:
            pieces.append(tf_str[i])
            i += 1
        else:
            # a name ends at ANY separator, so "A+B" and "A/B" yield two links
            # instead of a single link to the non-existent gene "A+B"
            j = i
            while j < n and tf_str[j] not in separators:
                j += 1
            pieces.append(convert_to_link(tf_str[i:j]))
            i = j
    return ''.join(pieces)

## Generate Files

In [5]:
# dataset_meta: Stores information for the header of the dataset page
# rows of X are counted as genes, columns as samples
num_genes, num_samps = ica_data.X.shape
# one condition per (condition_id, project_id) group in the sample metadata
num_conds = sum(1 for _ in ica_data.metadata.groupby(['condition_id', 'project_id']))
num_ims = ica_data.S.shape[1]

header_fields = {
    'Title': '<i>B. subtilis</i> Microarray',
    'Organism': '<i>Bacillus subtilis</i>',
    'Strain': '168/BSB1',
    'Publication': '<a href="https://doi.org/10.1101/2020.04.26.062638">Rychel, et al., 2020</a>',
    'Number of Samples': num_samps,
    'Number of Unique Conditions': num_conds,
    'Number of Genes': num_genes,
    'Number of iModulons': num_ims,
}
dataset_meta = pd.Series(header_fields)
dataset_meta.to_csv('dataset_meta.csv')



In [6]:
# iM_table: one row per iModulon for the dataset overview page
table_columns = ['name', 'Regulator', 'Function', 'Category',
                 'n_genes', 'precision', 'recall']
im_table = enrich[table_columns].rename(columns={'name': 'Name'})
im_table.index.name = 'k'
# ordering key taken from the curated table
im_table['category_num'] = enrich['new_idx']
im_table.to_csv('iM_table.csv')

In [7]:
# NOTE(review): this rebinds `num_ims` (computed from ica_data.S earlier) to the
# number of rows in im_table minus one -- confirm the "- 1" is intended
# (e.g. the largest 0-based iModulon index rather than a count).
num_ims = im_table.shape[0]-1
# the context manager closes the file even if the write raises
with open('num_ims.txt', 'w') as file:
    file.write(str(num_ims))

In [8]:
# Display the curated enrichment table as the cell output
enrich

Unnamed: 0,name,TF,high_conf,precision,recall,reg_type,Function,Category,category2,category3,new_idx,Regulator,threshold,n_genes
1,FadR – Fatty Acids,FadR,1,1.0,0.73913,well defined,Fatty acid degradation,Carbon,Carbon Source,Fatty Acids,0,FadR,0.08736,17.0
6,MalR – Malate,MalR,1,1.0,1.0,well defined,Malate uptake and utilization,Carbon,Carbon Source,Malate,1,MalR,0.144234,5.0
38,GlpP – Glycerol,GlpP,1,1.0,1.0,well defined,Glycerol uptake and utilization,Carbon,Carbon Source,Glycerol,2,GlpP,0.106678,10.0
59,FruR – Fructose,FruR,1,1.0,1.0,well defined,Fructose uptake and utilization,Carbon,Carbon Source,Fructose,3,FruR,0.206403,4.0
72,LevR – Levan and Fructose,LevR,1,1.0,1.0,well defined,"Fuctose uptake and phosphorylation, degradatio...",Carbon,Carbon Source,Fructose,4,LevR,0.142297,6.0
61,FrlR – Amino Sugars,FrlR,1,1.0,1.0,well defined,Amino sugar uptake and metabolism,Carbon,Carbon Source,Amino Sugars,5,FrlR,0.127036,7.0
30,ManR – Mannose,ManR,1,1.0,1.0,well defined,Utilization of mannose for cell wall synthesis...,Carbon,Carbon Source,Mannose,6,ManR,0.119812,6.0
3,AcoR – Acetoin,AcoR,1,1.0,0.75,well defined,"Utilization of acetoin, extracellular energy s...",Carbon,Carbon Source,Acetoin,7,AcoR,0.153858,3.0
80,LicR – Lichenan,LicR,1,0.444444,0.8,contains unknown,"Uptake, phosphorylation, and utilization of le...",Carbon,Carbon Source,Lichenan,8,LicR,0.076635,9.0
82,CcpA-1 – Low Glucose 1,CcpA + SigA,0,0.794872,0.126531,subset,Uptake and utilization of alternative carbon s...,Carbon,General Carbon,Low glucose response,9,CcpA + SigA,0.057071,39.0


In [9]:
def make_directory(ica_data, k, row, links, sample_meta):
    """Write every data file for the page of iModulon ``k``.

    Parameters
    ----------
    ica_data : data object; ``S`` and ``A`` are read here and the object is
        forwarded to the plotting-data helpers.
    k : iModulon identifier (column label of ``ica_data.S`` and row label of
        ``ica_data.A``).
    row : enrichment-table entry for ``k``; must provide ``name``, ``TF``,
        ``Function``, ``Category``, ``precision`` and ``recall``.
    links : gene links, forwarded to the gene table and gene scatter helpers.
    sample_meta : sample metadata, forwarded to the activity bar helper.

    All files are written below ``iModulon_files/<k>/``; nothing is returned.
    """
    # generate the plot files
    gene_table = gene_table_df(ica_data, k, row, links = links, operon_commas=False)
    gene_hist = gene_hist_df(ica_data, k, row)
    base_conds = ['M9exp_1', 'M9exp_2', 'M9exp_3']
    gene_scatter = gene_scatter_df(ica_data, k, base_conds, links)
    act_bar = activity_bar_df(ica_data, k, sample_meta)

    # these two helpers may return None when there is nothing to plot
    reg_venn = regulon_venn_df(ica_data, k, row)
    reg_scatter = regulon_scatter_df(ica_data, k, row)

    # generate a basic data df
    # Built in one go with dtype=object: an empty Series created from an index
    # alone starts as float64 and has to be implicitly upcast on the first
    # string assignment, which newer pandas versions warn about.
    res = pd.Series({
        # row.name is the Series label, so the 'name' field needs .loc
        'name': row.loc['name'],
        'TF': row.TF,
        'Regulator': tf_with_links(k, row),
        'Function': row.Function,
        'Category': row.Category,
        'has_venn': reg_venn is not None,
        # one less than the number of columns in the regulon scatter table
        'scatter': 0 if reg_scatter is None else reg_scatter.shape[1] - 1,
        'has_meme': False,  # update later
        'precision': row.precision,
        'recall': row.recall,
    }, dtype=object)

    # save output
    folder = 'iModulon_files/'+str(k)+'/'
    # makedirs also creates a missing 'iModulon_files/' parent and does not
    # fail when the folder already exists
    os.makedirs(folder, exist_ok=True)
    prefix = folder + str(k)
    res.to_csv(prefix + '_meta.csv')
    gene_table.to_csv(prefix + '_gene_table.csv')
    gene_hist.to_csv(prefix + '_gene_hist.csv')
    gene_scatter.to_csv(prefix + '_gene_scatter.csv')
    act_bar.to_csv(prefix + '_activity_bar.csv')
    if reg_venn is not None:
        reg_venn.to_csv(prefix + '_reg_venn.csv')
    if reg_scatter is not None:
        reg_scatter.to_csv(prefix + '_reg_scatter.csv')
    ica_data.S[k].to_csv(prefix + '_gene_weights.csv')
    ica_data.A.loc[k].to_csv(prefix + '_activity.csv')

In [10]:
# Write the page files for every iModulon (one per row of the enrichment table)
for k, row in enrich.iterrows():
    make_directory(ica_data, k, row, links, sample_meta)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

MISSING TF: stringentresponse
MISSING TF: stringentresponse


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

# Gene Files

In [6]:
# Filter out genes with no ICA data
# (keep only annotation rows whose index also appears in the index of X)
has_ica_data = gene_info.index.isin(ica_data.X.index)
reduced_gene_info = gene_info.copy()[has_ica_data]

In [10]:
# Information for Gene Metadata

def make_gene_files(ica_data, k, row, links, sample_meta):
    """Write the metadata CSV for the page of gene ``k``.

    Parameters
    ----------
    ica_data : not used by this function; kept for interface compatibility.
    k : gene identifier; used for the ``gene_id`` field, the output folder
        name and the lookups in ``links`` and ``trn``.
    row : gene annotation entry providing ``gene_name``, ``operon``,
        ``product`` and ``cog``.
    links : mapping from gene identifier to an external URL.
    sample_meta : not used by this function; kept for interface compatibility.

    Reads the notebook-level ``trn`` table to list the regulators of the gene.
    Writes ``gene_page_files/<k>/<k>_meta.csv``; nothing is returned.
    """
    # regulators of this gene; missing TF entries are dropped so join cannot
    # fail on a non-string value
    regulators = trn[trn.gene_id == k].TF.dropna().to_list()

    # only a missing key means "no link"; other errors should surface
    try:
        link = '<a href="' + str(links[k]) + '">'+ 'SubtiWiki' + '</a>'
    except KeyError:
        link = None

    # generate metadata df
    # Built in one go with dtype=object instead of filling an empty float64
    # Series field by field.
    res = pd.Series({
        'gene_id': k,
        'name': row.gene_name,
        'operon': row.operon,
        'product': row['product'],
        'cog': row.cog,
        'regulator(s)': ", ".join(regulators),
        'link': link,
    }, dtype=object)
    # every missing field is shown as a placeholder on the page
    res = res.fillna(value="<i> Not Available </i>")

    # save output
    folder = 'gene_page_files/'+str(k)+'/'
    # makedirs also creates a missing 'gene_page_files/' parent and does not
    # fail when the folder already exists
    os.makedirs(folder, exist_ok=True)
    res.to_csv(folder+str(k)+'_meta.csv')

In [11]:
# generate metadata csv for each gene
# (only genes kept in reduced_gene_info, i.e. those that have ICA data)
for k, row in reduced_gene_info.iterrows():
    make_gene_files(ica_data, k, row, links, sample_meta)



In [12]:
# activity plot + expression for each gene
for gene_id in reduced_gene_info.index:
    this_fig = gene_activity_bar_df(ica_data, gene_id, sample_meta)
    folder = 'gene_page_files/'+str(gene_id)+'/'
    # create the folder if needed so this cell does not depend on the
    # metadata cell having been run first
    os.makedirs(folder, exist_ok=True)
    # str() is applied consistently so non-string gene ids do not raise
    # a TypeError when building the file names
    this_fig.to_csv(folder + str(gene_id) + '_activity_bar.csv')
    ica_data.X.loc[gene_id].to_csv(folder + str(gene_id) + '_expression.csv')

In [None]:
# table for each gene

# generate df with all iM genes indicated T/F 
# (a gene is flagged for iModulon k when |weight| exceeds |threshold of k|)
im_genes= ica_data.S.T.copy()
for k,row in ica_data.S.T.iterrows():
    im_genes.loc[k,:] = abs(row) > abs(ica_data.thresholds[k])
    
# iM_table: start with main component of iM page
# NOTE: this rebinds the notebook-level `im_table` to a narrower table than
# the one written to iM_table.csv earlier.
im_table = enrich[['name', 'Regulator', 'Function', 'Category']]
im_table = im_table.rename(columns={'name':'Name'})
im_table.index.name = 'k'

#loop through genes:
for gene_id in reduced_gene_info.index:
    perGene_table = im_table.copy()
    perGene_table.insert(column ='in_iM', value = im_genes.loc[:, gene_id], loc = 1)
    perGene_table.insert(column ='gene_weight', value = ica_data.S.loc[gene_id, :], loc = 2)

    #sort: member iModulons first, then by decreasing absolute gene weight
    # ('A' is a temporary sort key; it is dropped by keyword because passing
    # the axis positionally is no longer accepted by current pandas)
    perGene_table = (perGene_table.assign(A=perGene_table['gene_weight'].abs())
            .sort_values(['in_iM','A'],ascending=[False, False])
            .drop(columns='A'))
    folder = 'gene_page_files/'+str(gene_id)+'/'
    # create the folder if needed so this cell can run on its own
    os.makedirs(folder, exist_ok=True)
    perGene_table.to_csv(folder + str(gene_id) + '_perGene_table.csv')

### Make JSON files for search

#### Genes

In [7]:
# Get df in correct format: one record per gene with its name, id and product,
# ordered by gene name, with placeholders for missing names/products
search_columns = ['gene_name', "gene_id", "product"]
placeholders = {"gene_name": "Unnamed Gene", "product": "not available"}
gene_df = (
    reduced_gene_info
    .assign(gene_id=reduced_gene_info.index)[search_columns]
    .sort_values(by="gene_name")
    .fillna(value=placeholders)
)

#create gene info json
gene_df.to_json('./gene_page_files/gene_list.json', orient="records")

#### iModulons

In [5]:
# one record per iModulon for the search index, ordered by name;
# the table index is exposed as the 'component' field
im_search_columns = ['component', 'name', 'Regulator', 'Function']
enrich_df = (
    enrich
    .assign(component=enrich.index)[im_search_columns]
    .sort_values(by="name")
    .fillna(value="N/A")
)

enrich_df.to_json('./iModulon_files/im_list.json', orient="records")