# PRECISE 1.0 File Generator

This notebook generates all files for the Modulytics web page of E. coli, based on ICA of the PRECISE RNA-seq database.

In [1]:
import sys
# be sure that this points to a clone of github.com/SBRG/ICA
# which is the source of data in this case
sys.path.append('../../../../ica/') 
from icaviz.plotting import *
DATA_DIR = '../../../../ica/data/precise_data/'
GENE_DIR = '../../../../ica/data/annotation/'
enrich = pd.read_csv(DATA_DIR+'curated_enrichments.csv', index_col = 0)
names = enrich['name'].tolist()
ica_data = load_data(X=DATA_DIR+'log_tpm.csv',
                     S=DATA_DIR+'S.csv',
                     A=DATA_DIR+'A.csv',
                     metadata=DATA_DIR+'metadata.csv',
                     annotation=GENE_DIR+'gene_info.csv',
                     trn=GENE_DIR+'TRN.csv',
                     fasta=GENE_DIR+'NC_000913.3.fasta',
                     cutoff = 550,
                     organism='ecoli')
pd.set_option('display.max_rows', None)

In [3]:
# get functions from py files
sys.path.append('../../../py')
from gene_table import *
from gene_histogram import *
from gene_scatter import *
from activity_bar import *
from regulon_venn import *
from regulon_scatter import *

# gene dashboards
# from perGene_table import *
from gene_activity_bar import *

import os

In [4]:
# read in other necessary annotation files
locus_to_db = pd.read_csv('../annotation/locus_to_db.csv',sep='\t', index_col = 1)
links = pd.read_csv('../annotation/gene_links.csv', index_col = 0, header = None).to_dict()[1]
sample_meta = pd.read_csv('sample_metadata.csv', index_col = 0)
tf_links = pd.read_csv('../annotation/tf_links.csv', index_col = 1, header=None).to_dict()[2]
gene_info = pd.read_csv(GENE_DIR+'gene_info.csv', index_col = 0)
trn = pd.read_csv(GENE_DIR+'TRN.csv', index_col = 0)

## Generate Sample Meta and TF_links, if necessary

Sample_metadata is simply the metadata table with all non-public columns removed.

In [5]:
# public metadata table

# drop the stuff the public doesn't need to see
sample_meta = ica_data.metadata.drop(['contact', 'library creator', 'run_date', 'R1', 'R2', 'BAM', 'alignment', 'Public'], axis=1)

# fix a minor issue with the number of biological replicates in a condition
sample_meta.loc[(sample_meta.project_id == 'ica') & (sample_meta.condition_id == 'wt_glc'), 'Biological Replicates'] = [6]*6

# reset index so that javascript can deal more easily (as numerically indexed arrays)
sample_meta = sample_meta.reset_index()

#sample_meta.to_csv('data/sample_metadata.csv')

I created the following csv and then **manually** annotated each TF with a relevant link in RegulonDB. I would need another mapping file, like locus_to_db, in order to auto generate this for all TFs.

Note that RegulonDB is often very slow, so these may in the future be replaced with ecocyc links.

In [None]:
all_tfs = []

for k, row in enrich.iterrows():
    if type(row.TF) == str:
        if '/' in row.TF:
            all_tfs += row.TF.split('/')
        elif '+' in row.TF:
            all_tfs += row.TF.split('+')
        else:
            all_tfs += [row.TF]
all_tfs = pd.Series(all_tfs).drop_duplicates()
#all_tfs.to_csv('../annotation/tf_links.csv')

## Generate Files

In [9]:
# dataset_meta: Stores information for the header of the dataset page
dataset_meta = pd.Series({'Title':'<i>E. Coli</i> PRECISE',
                          'Organism': '<i>Escherichia Coli</i> K-12',
                          'Strain': 'MG1655 and BW25113',
                          'Publication':'<a href="https://www.nature.com/articles/s41467-019-13483-w">Sastry, et al., 2019</a>'})
dataset_meta.to_csv('dataset_meta.csv')

  


In [11]:
# iM_table: this is the main component of the dataset page

cat_order = ['Carbon Source Utilization', 
             'Amino Acid and Nucleotide Biosynthesis',
             'Energy Metabolism',
             'Metal Homeostasis',
             'Miscellaneous Metabolism',
             'Structural Components',
             'Stress Response',
             'Regulator Discovery',
             'Biological Enrichment',
             'Genomic Alterations',
             'Uncharacterized']

cat_dict = {cat_order[i]:i for i in range(len(cat_order))}

im_table = enrich[['name', 'Regulator', 'Function', 'Category', 'n_genes', 'precision', 'recall']]
im_table = im_table.rename(columns={'name':'Name'})
im_table.index.name = 'k'
im_table['category_num'] = [cat_dict[im_table.Category[k]] for k in im_table.index]
im_table.to_csv('iM_table.csv')

In [12]:
num_ims = im_table.shape[0]-1
file = open('num_ims.txt', 'w')
file.write(str(num_ims))
file.close()

In [4]:
# This function is needed to create the header for each iModulon
def tf_with_links(k, row, tf_links):
    tf_str = row.TF

    if not(type(tf_str) == str):
        return tf_str

    # get a list of transcription factors
    and_or = ''
    if '/' in tf_str:
        and_or = ' or '
        tfs = tf_str.split('/')
    elif '+' in tf_str:
        and_or = ' and '
        tfs = tf_str.split('+')
    else:
        tfs = [tf_str]

    # start building an html string
    tfs_html = []
    for tf in tfs:
        if tf in tf_links.keys():
            link = tf_links[tf]
            if type(link)==str:# this tf has a link
                tf_ = '<a href="' + link + '" target="_blank">'+ tf + '</a>'
                tfs_html += [tf_]
            else: # this tf has no link
                tfs_html += [tf]
        # this tf isn't in the tf_links file
        else:
            tfs_html += [tf]
    res = and_or.join(tfs_html)
    return res


In [6]:
# This function generates all necessary files for a single iModulon
def make_directory(ica_data, k, row, links, sample_meta, tf_links):
    # generate the plot files
    gene_table = gene_table_df(ica_data, k, row, links = links)
    gene_hist = gene_hist_df(ica_data, k, row)
    base_conds = ['control__wt_glc__1','control__wt_glc__2']
    gene_scatter = gene_scatter_df(ica_data, k, base_conds, links)
    act_bar = activity_bar_df(ica_data, k, sample_meta)

    reg_venn = regulon_venn_df(ica_data, k, row)
    reg_scatter = regulon_scatter_df(ica_data, k, row)

    # generate a basic data df
    res = pd.Series(index=['name', 'TF', 'Regulator',
                           'Function', 'Category', 
                           'has_venn', 'scatter', 'has_meme'])
    res.loc['name'] = row.loc['name']
    res.loc['TF'] = row.TF
    res.loc['Regulator'] = tf_with_links(k, row, tf_links)
    res.loc['Function'] = row.Function
    res.loc['Category'] = row.Category
    res.loc['has_venn'] = not(reg_venn is None)
    if reg_scatter is None:
        res.loc['scatter'] = 0
    else:
        res.loc['scatter'] = reg_scatter.shape[1] - 1
    res.loc['has_meme'] = False # update later
    # may also want to add the stats from enrich

    # save output
    folder = 'iModulon_files/'+str(k)+'/'
    if not(os.path.isdir(folder)):
        os.mkdir(folder)
    res.to_csv(folder+str(k)+'_meta.csv')
    gene_table.to_csv(folder+str(k)+'_gene_table.csv')
    gene_hist.to_csv(folder+str(k)+'_gene_hist.csv')
    gene_scatter.to_csv(folder+str(k)+'_gene_scatter.csv')
    act_bar.to_csv(folder+str(k)+'_activity_bar.csv')
    if not(reg_venn is None):
        reg_venn.to_csv(folder+str(k)+'_reg_venn.csv')
    if not(reg_scatter is None):
        reg_scatter.to_csv(folder+str(k)+'_reg_scatter.csv')
    ica_data.S[k].to_csv(folder+str(k)+'_gene_weights.csv')
    ica_data.A.loc[k].to_csv(folder+str(k)+'_activity.csv')

In [7]:
for k, row in enrich.iterrows():
    make_directory(ica_data, k, row, links, sample_meta, tf_links)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.link[g] = links[g]


# Gene Files

In [6]:
# Information for Gene Metadata

def make_gene_files(ica_data, k, row, links, sample_meta, tf_links):

    # generate metadata df
    res = pd.Series(index=['bNum', 'name', 'operon', 'product',
                           'cog', 'regulator(s)'])
    res.loc['bNum'] = k
    res.loc['name'] = row.gene_name
    res.loc['operon'] = row.operon
    res.loc['product'] = row['product']
    res.loc['cog'] = row.cog
    res.loc['regulator(s)'] = ", ".join(trn[trn.gene_id == k].TF.to_list())
    
    # save output
    folder = 'gene_page_files/'+str(k)+'/'
    if not(os.path.isdir(folder)):
        os.mkdir(folder)
    res.to_csv(folder+str(k)+'_meta.csv')
    
    #activity plot
    this_fig = gene_activity_bar_df(ica_data, k, sample_meta)
    this_fig.to_csv(data_folder + k + '_activity_bar.csv')

In [7]:
for k, row in gene_info.iterrows():
    make_gene_files(ica_data, k, row, links, sample_meta, tf_links)

  import sys


KeyError: 'product'

### make json file for Search

In [6]:
# Get df in correct format
gene_df = gene_info.copy()
gene_df["bNum"] = gene_info.index
gene_df = gene_df[['gene_name', "bNum", "product"]]
gene_df = gene_df.sort_values(by="gene_name").fillna(value = "unknown gene product")

gene_df.to_json('./gene_page_files/gene_list.json', orient="records")

In [8]:
gene_info

Unnamed: 0,start,stop,strand,gene_name,length,operon,cog
b0001,189,255,+,thrL,66,thrLABC,No COG Annotation
b0002,336,2799,+,thrA,2463,thrLABC,Amino acid transport and metabolism
b0003,2800,3733,+,thrB,933,thrLABC,Amino acid transport and metabolism
b0004,3733,5020,+,thrC,1287,thrLABC,Amino acid transport and metabolism
b0005,5233,5530,+,yaaX,297,yaaX,Function unknown
b0006,5682,6459,-,yaaA,777,yaaA,Function unknown
b0007,6528,7959,-,yaaJ,1431,yaaJ,Amino acid transport and metabolism
b0008,8237,9191,+,talB,954,talB,Carbohydrate transport and metabolism
b0009,9305,9893,+,mog,588,mog,Coenzyme transport and metabolism
b0010,9927,10494,-,satP,567,satP,Function unknown
