# StaphPRECISE File Generator

This notebook generates all files for the Modulytics web page of *Staphylococcus aureus*, based on the ICA decomposition of the StaphPRECISE dataset.

In [6]:
import sys
# make sure that this points to a clone of github.com/SBRG/ICA
sys.path.append('../../../../ica/')
from icaviz.plotting import *
# Folders holding the ICA inputs and the gene annotation tables.
DATA_DIR = 'data_files/'
GENE_DIR = '../annotation/gene_files/'

# Curated enrichment table; its first CSV column becomes the index.
enrich = pd.read_csv(DATA_DIR + 'curated_enrichments.csv', index_col=0)
names = enrich['name'].tolist()

# NOTE(review): the recorded output of this cell is a KeyError listing sample
# names "not in index". The cells below report 108 columns in log_tpm.csv but
# only 98 rows in sample_metadata.csv, so presumably the metadata file is
# missing samples -- confirm before re-running.
ica_data = load_data(
    X=DATA_DIR + 'log_tpm.csv',
    S=DATA_DIR + 'M.csv',
    A=DATA_DIR + 'A.csv',
    metadata='sample_metadata.csv',  # read from the working directory, not DATA_DIR
    annotation=GENE_DIR + 'gene_info.csv',
    trn=GENE_DIR + 'TRN.csv',
    cutoff=280,
    organism='saureus',
)

# Replace the thresholds on the loaded object with the curated values.
ica_data.thresholds = enrich['threshold'].to_dict()

# Show every row when a DataFrame is displayed.
pd.set_option('display.max_rows', None)

KeyError: "['USA300_TCH1516_BK-Set001_TSB_2.5ug/mLNafcillin_2', 'USA300_TCH1516_BK-Set001_TSB_Control_1', 'USA300_TCH1516_BK-Set001_TSB_0.25ug/mLLinezolid_2', 'USA300_TCH1516_BK-Set001_TSB_0.5ug/mLVancomycin_2', 'USA300_TCH1516_BK-Set001_TSB_0.5ug/mLVancomycin_1', 'USA300_TCH1516_BK-Set001_TSB_2.5ug/mLNafcillin_1', 'USA300_TCH1516_BK-Set001_TSB_0.25ug/mLLinezolid_1', 'USA300_TCH1516_BK-Set001_TSB_Control_2'] not in index"

In [9]:
# Read the sample metadata and the expression matrix directly so their
# sizes can be inspected in the following cells.
metadata = pd.read_csv('sample_metadata.csv', index_col=0)
X = pd.read_csv(DATA_DIR + 'log_tpm.csv', index_col=0)

In [13]:
# Number of columns in log_tpm.csv
X.shape[1]

108

In [14]:
# Number of rows in sample_metadata.csv
metadata.shape[0]

98

In [15]:
# get functions from py files
sys.path.append('../../../py')
from gene_table import *
from gene_histogram import *
from gene_scatter import *
from activity_bar import *
from regulon_venn import *
from regulon_scatter import *

import os

In [17]:
# Headerless CSV: the first column is the key, the second column (label 1) the value.
links = pd.read_csv('../annotation/gene_links.csv', header=None, index_col=0)[1].to_dict()

# Sample metadata with its index moved into a regular column.
sample_meta = ica_data.metadata.reset_index()

NameError: name 'ica_data' is not defined

In [None]:
# NOTE: this function currently works for Bacillus (it links to SubtiWiki); it needs to be updated for Staph.
def tf_with_links(k, row, base_url='http://subtiwiki.uni-goettingen.de/v3/gene/search/exact/'):
    """Turn the regulator string of an iModulon into HTML with one link per name.

    The characters ``[``, ``]``, space, ``+`` and ``/`` are treated as
    separators and copied to the output unchanged. Every maximal run of other
    characters is taken to be one regulator name and wrapped in an
    ``<a href=... target="_blank">`` tag pointing at ``base_url + name``.

    Parameters
    ----------
    k : any
        iModulon identifier. Unused; kept so existing callers keep working.
    row : object with a ``TF`` attribute
        Typically one row of the enrichment table.
    base_url : str, optional
        Prefix of the generated links. The default is the SubtiWiki search URL.
        NOTE(review): SubtiWiki is a Bacillus resource; pass a different prefix
        once the right target for Staph is decided.

    Returns
    -------
    str or the original ``row.TF``
        The HTML string, or ``row.TF`` unchanged when it is not a string
        (e.g. NaN for iModulons without a regulator).
    """
    tf_str = row.TF

    # Missing regulators are not strings; hand them back untouched.
    if not isinstance(tf_str, str):
        return tf_str

    separators = '[ ]+/'

    def convert_to_link(tf):
        link = base_url + tf
        return '<a href="' + link + '" target="_blank">' + tf + '</a>'

    pieces = []
    token = ''
    for ch in tf_str:
        if ch in separators:
            # A separator ends the current name. Ending on *every* separator
            # (not just ']' and ' ') keeps 'A+B' and 'A/B' as two links.
            if token:
                pieces.append(convert_to_link(token))
                token = ''
            pieces.append(ch)
        else:
            token += ch
    if token:
        pieces.append(convert_to_link(token))
    return ''.join(pieces)

## Generate Files

In [18]:
# dataset_meta: header information for the dataset page, written as a
# labelled Series (one entry per header field).
dataset_meta = pd.Series(
    data=[
        '<i>S. aureus</i> StaphPRECISE',
        '<i>Staphylococcus aureus</i> USA 300',
        'TCH1516 and LAC',
        '<a href="https://www.biorxiv.org/content/10.1101/2020.03.18.997296v1">Poudel, et al., 2020</a>',
    ],
    index=['Title', 'Organism', 'Strain', 'Publication'],
)
dataset_meta.to_csv('dataset_meta.csv')

  


In [20]:
# Distinct values of the Category column, in order of first appearance
enrich['Category'].unique()

array(['Carbon Source Utilization', 'Virulence', 'Uncharacterized',
       'Amino Acid and Nucleotide Metabolism', 'Stress Response',
       'Energy Metabolism', 'Biological Enrichment', 'Metal Homeostasis',
       'Miscellaneous Metabolism', 'Structural Components',
       'Mobile Elements'], dtype=object)

In [25]:
# Order in which the categories are numbered; the position in this list
# becomes the value of the `category_num` column.
cat_order = [
    'Virulence',
    'Carbon Source Utilization',
    'Amino Acid and Nucleotide Metabolism',
    'Energy Metabolism',
    'Metal Homeostasis',
    'Miscellaneous Metabolism',
    'Structural Components',
    'Stress Response',
    'Biological Enrichment',
    'Mobile Elements',
    'Uncharacterized',
]
cat_dict = {category: position for position, category in enumerate(cat_order)}

# Select the columns written to the table and capitalise the `name` column.
im_table = enrich[
    ['name', 'Regulator', 'Function', 'Category', 'n_genes', 'precision', 'recall']
].rename(columns={'name': 'Name'})
im_table.index.name = 'k'

# Look each row's category up by index label; an unknown category raises KeyError.
im_table['category_num'] = [
    cat_dict[im_table.loc[k, 'Category']] for k in im_table.index
]
im_table.to_csv('iM_table.csv')

In [None]:
# Value written to num_ims.txt: the row count of im_table minus one.
# NOTE(review): presumably this is the largest zero-based iModulon index
# rather than the count -- confirm against whatever reads num_ims.txt.
num_ims = im_table.shape[0] - 1

# The context manager closes the file even if the write raises.
with open('num_ims.txt', 'w') as num_ims_file:
    num_ims_file.write(str(num_ims))

In [None]:
def make_directory(ica_data, k, row, links, sample_meta, base_conds=None):
    """Write all per-iModulon CSV files for iModulon ``k`` into ``iModulon_files/<k>/``.

    Parameters
    ----------
    ica_data : object
        The loaded ICA data; its ``S`` and ``A`` attributes are indexed here
        by column ``k`` and row ``k`` respectively.
    k : any
        iModulon identifier; used as folder name and file-name prefix.
    row : pandas.Series
        The enrichment-table row for ``k``; ``name``, ``TF``, ``Function`` and
        ``Category`` are read from it.
    links : dict
        Passed through to ``gene_table_df`` and ``gene_scatter_df``.
    sample_meta : pandas.DataFrame
        Passed through to ``activity_bar_df``.
    base_conds : list of str, optional
        Baseline conditions passed to ``gene_scatter_df``. Defaults to the
        previously hard-coded ``['M9exp_1', 'M9exp_2', 'M9exp_3']``.
        NOTE(review): these names do not resemble the sample names shown in
        this notebook's KeyError output (``USA300_TCH1516_...``) -- confirm
        the right baseline for this dataset.

    Returns
    -------
    None
        All results are written to disk.
    """
    if base_conds is None:
        base_conds = ['M9exp_1', 'M9exp_2', 'M9exp_3']

    # Build the data frames behind each plot (helpers come from the
    # star-imported project modules).
    gene_table = gene_table_df(ica_data, k, row, links=links, operon_commas=False)
    gene_hist = gene_hist_df(ica_data, k, row)
    gene_scatter = gene_scatter_df(ica_data, k, base_conds, links)
    act_bar = activity_bar_df(ica_data, k, sample_meta)

    # These two may be None; the matching files are then skipped below.
    reg_venn = regulon_venn_df(ica_data, k, row)
    reg_scatter = regulon_scatter_df(ica_data, k, row)

    # Basic metadata for this iModulon. Built in one step with an explicit
    # object dtype, because the values mix strings, booleans and integers.
    res = pd.Series(
        {
            'name': row.loc['name'],
            'TF': row.TF,
            'Regulator': tf_with_links(k, row),
            'Function': row.Function,
            'Category': row.Category,
            'has_venn': reg_venn is not None,
            'scatter': 0 if reg_scatter is None else reg_scatter.shape[1] - 1,
            'has_meme': False,  # update later
        },
        dtype=object,
    )
    # may also want to add the stats from enrich

    # Create the output folder, including the parent 'iModulon_files/' if it
    # does not exist yet; an already existing folder is fine.
    folder = 'iModulon_files/' + str(k) + '/'
    os.makedirs(folder, exist_ok=True)

    prefix = folder + str(k)
    res.to_csv(prefix + '_meta.csv')
    gene_table.to_csv(prefix + '_gene_table.csv')
    gene_hist.to_csv(prefix + '_gene_hist.csv')
    gene_scatter.to_csv(prefix + '_gene_scatter.csv')
    act_bar.to_csv(prefix + '_activity_bar.csv')
    if reg_venn is not None:
        reg_venn.to_csv(prefix + '_reg_venn.csv')
    if reg_scatter is not None:
        reg_scatter.to_csv(prefix + '_reg_scatter.csv')
    ica_data.S[k].to_csv(prefix + '_gene_weights.csv')
    ica_data.A.loc[k].to_csv(prefix + '_activity.csv')

In [None]:
# Write the output folder for every row of the enrichment table.
for k, row in enrich.iterrows():
    make_directory(ica_data=ica_data, k=k, row=row,
                   links=links, sample_meta=sample_meta)