# All Gene Scatter Plots

This generates the data for all gene scatter plots for i-modulons in PRECISE 1.0 so that they can be plotted in the ModulomeVis site.

In [1]:
import sys
# be sure that this points to a clone of github.com/SBRG/ICA
sys.path.append('../../../ica/') 
from icaviz.plotting import *
DATA_DIR = '../../../ica/data/precise_data/'
GENE_DIR = '../../../ica/data/annotation/'
enrich = pd.read_csv(DATA_DIR+'curated_enrichments.csv', index_col = 0)
names = enrich['name'].tolist()
ica_data = load_data(X=DATA_DIR+'log_tpm.csv',
                     S=DATA_DIR+'S.csv',
                     A=DATA_DIR+'A.csv',
                     metadata=DATA_DIR+'metadata.csv',
                     annotation=GENE_DIR+'gene_info.csv',
                     trn=GENE_DIR+'TRN.csv',
                     fasta=GENE_DIR+'NC_000913.3.fasta',
                     cutoff = 550)
pd.set_option('display.max_rows', None)

In [2]:
from gene_scatter import * #replace py file name

In [21]:
# this contains the EG numbers for generating URLs to the database
# indexed the same as genes, with "ID" column for EG numbers
locus_to_db = pd.read_csv('locus_to_db.csv',sep='\t', index_col = 1)
import requests
from matplotlib.colors import to_hex

def get_db_link(gene, locus_to_db):
    '''
    input: gene, the b number
           locus_to_db, a df indexed by b number with "ID" column of EG numbers
    output: link to the gene on EcoCyc database
    '''
    # skip gene if it's not in locus_to_db
    if not(gene in locus_to_db.index):
        #print('Gene missing from DB:', gene)
        return np.nan
    
    # generate link
    new_id = locus_to_db.ID[gene]
    link = 'https://ecocyc.org/gene?orgid=ECOLI&id='+new_id
    
    # test link
    request = requests.get(link)
    if request.status_code == 200:
        return link
    else:
        #print('Web site does not exist:', gene, new_id) 
        return np.nan

## Working Here

In [15]:
links = pd.Series(index = ica_data.S.index)
for g in links.index:
    links[g] = get_db_link(g, locus_to_db)

In [36]:
def gene_scatter_df(ica_data, k, links):
    columns = ['name', 'x', 'y', 'cog', 'color', 'link']
    res = pd.DataFrame(columns = columns, index = ica_data.S.index)
    res.index.name = 'locus'

    cutoff = ica_data.thresholds[k]

    # Draw scatterplot
    res.x = ica_data.X[['control__wt_glc__1','control__wt_glc__2']].mean(axis=1)
    res.y = ica_data.S[k]

    # add other data
    res.name = [ica_data.num2name[l] for l in res.index]
    res.cog = ica_data.gene_info.cog[res.index]
    res.color = [to_hex(ica_data.gene_colors[gene]) for gene in res.index]

    # if the gene is in the i-modulon, it is clickable
    in_im = res.index[res.y.abs()>cutoff]
    for g in in_im:
        res.link[g] = links[g]

    # add a row to store the threshold
    cutoff_row = pd.DataFrame([cutoff] + [np.nan]*5, columns=['thresh'], index = columns).T
    res = pd.concat([cutoff_row, res])

    return res

## Generate all CSVs

In [39]:
data_folder = 'data/'
for k, row in enrich.iterrows():
    this_fig = gene_scatter_df(ica_data, k, links)
    this_fig.to_csv(data_folder + '%i_gene_scatter.csv'%(k)) 
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [42]:
enrich

Unnamed: 0_level_0,name,TF,log_odds,f1score,pvalue,precision,recall,TP,n_genes,n_tf,Regulator,Function,Category,threshold
component,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,YieP,,,,,,,,11,1,YieP,Unknown,Regulator Discovery,0.089767
1,GlpR,glpR,inf,4.0,0.0,1.0,1.0,9.0,9,1,GlpR,Glycerol catabolism,Carbon Source Utilization,0.066801
2,YgbI,,,,,,,,9,1,YgbI,Unknown Function,Regulator Discovery,0.08224
3,efeU-repair,,,,,,,,8,0,,Accounts for repair and expression of efeU operon,Genomic Alterations,0.150925
4,ydcI-KO,,,,,,,,3,0,,Accounts for ydcI knock-out,Genomic Alterations,0.099229
5,BW25113,,,,,,,,17,0,,Transcriptional difference between BW25113 and...,Genomic Alterations,0.085425
6,proVWX,,,,,,,,4,0,,Glycine betaine transport,Biological Enrichment,0.117523
7,DhaR/Mlc,dhaR/mlc,7.459339,6.25,3.5699999999999996e-19,0.727273,0.571429,8.0,11,2,DhaR or Mlc,Dihydroxyacetone kinase and phosphotransferase...,Carbon Source Utilization,0.104547
8,ArgR,argR,6.027183,22.5,6.03e-18,0.923077,0.098361,12.0,13,1,ArgR,Arginine biosynthesis,Amino Acid and Nucleotide Biosynthesis,0.080441
9,YneJ,,,,,,,,5,1,YneJ,Unknown Function,Regulator Discovery,0.114203
