In [1]:
# @name transcriptomics
# @description notebook to build data-driven networks from various rna-seq edges datasets
# @author Núria Queralt Rosinach
# @date 01-17-2019

# gene names in ensembl
# http://uswest.ensembl.org/info/genome/genebuild/gene_names.html

# prevent the csv reader from interpreting commas inside quoted text as field delimiters
# https://stackoverflow.com/questions/21527057/python-parse-csv-ignoring-comma-with-double-quotes?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html

In [2]:
# to do:
#     * 

In [3]:
import pandas as pd
from biothings_client import get_client
import os, datetime
# create the output directory for the graph files; exist_ok makes this a
# single call that is safe to re-run and has no check-then-create race
os.makedirs('./graph', exist_ok=True)
# date stamp used to version the names of the saved edges/nodes files
today = datetime.date.today()

### Load NGLY1 knock out and expressed gene edges from fly

#### what sets should i load?

* Chow fly transcriptome subsets:
    * all: 15,000 genes
    * FC1.5, FDR 5%: 386 genes
    * FC2.0, FDR 5%: 87 genes

**Conclusion**: I am going to load Chow_FC1.5 (paper analysis).

#### read datasets

In [4]:
# load the Chow fly transcriptome subset selected above (FC1.5, FDR 5%)
chow_subset_path = './ngly1-fly-chow-2018/out/fc1.5_fdr5_transcriptome_fly.csv'
chow = pd.read_csv(chow_subset_path)
print(chow.shape)
chow.head(2)

(386, 6)


Unnamed: 0,FlyBase ID,Symbol,log2FoldChange,pvalue,padj,Regulation
0,FBgn0035904,GstO3,0.576871,2.13e-08,2e-06,Upregulated
1,FBgn0051469,CG31469,0.575625,0.002022103,0.035791,Upregulated


### prepare and save individual dataset edges

In [5]:
# normalise the column names coming from the source file
chow = chow.rename(columns={'FlyBase ID': 'flybase_id', 'Symbol': 'symbol', 'Regulation': 'regulation'})

# constant edge attributes shared by every row: the subject is always Pngl,
# the property and the reference are the same for the whole dataset
edge_constants = [
    ('source', 'Chow'),
    ('subject_id', 'FlyBase:FBgn0033050'),
    ('subject_label', 'Pngl'),
    ('property_id', 'RO:0002434'),
    ('property_label', 'interacts with'),
    ('reference_id', 'PMID:29346549'),
]
for column_name, constant in edge_constants:
    chow[column_name] = constant

# object curie: 'FlyBase:' prefix + the gene id of the row
chow['object_id'] = 'FlyBase:' + chow['flybase_id'].astype(str)

# save the individual dataset edges
chow.to_csv('./ngly1-fly-chow-2018/out/chow_fc1.5_fdr5_transcriptome_fly_edges.csv', index=False)
chow.head(2)

Unnamed: 0,flybase_id,symbol,log2FoldChange,pvalue,padj,regulation,source,subject_id,subject_label,property_id,property_label,reference_id,object_id
0,FBgn0035904,GstO3,0.576871,2.13e-08,2e-06,Upregulated,Chow,FlyBase:FBgn0033050,Pngl,RO:0002434,interacts with,PMID:29346549,FlyBase:FBgn0035904
1,FBgn0051469,CG31469,0.575625,0.002022103,0.035791,Upregulated,Chow,FlyBase:FBgn0033050,Pngl,RO:0002434,interacts with,PMID:29346549,FlyBase:FBgn0051469


### Prepare RNA edges to build the graph
* concat edges in case you have more than one expression dataset
* give graph format
* build edges and nodes files

#### concat datasets in case you have more than one expression dataset

In [6]:
# chow: read the individual dataset edges back from the saved csv
chow_edges_path = './ngly1-fly-chow-2018/out/chow_fc1.5_fdr5_transcriptome_fly_edges.csv'
chow = pd.read_csv(chow_edges_path)
print(chow.shape)
print(chow.columns)
chow.head(2)

(386, 13)
Index(['flybase_id', 'symbol', 'log2FoldChange', 'pvalue', 'padj',
       'regulation', 'source', 'subject_id', 'subject_label', 'property_id',
       'property_label', 'reference_id', 'object_id'],
      dtype='object')


Unnamed: 0,flybase_id,symbol,log2FoldChange,pvalue,padj,regulation,source,subject_id,subject_label,property_id,property_label,reference_id,object_id
0,FBgn0035904,GstO3,0.576871,2.13e-08,2e-06,Upregulated,Chow,FlyBase:FBgn0033050,Pngl,RO:0002434,interacts with,PMID:29346549,FlyBase:FBgn0035904
1,FBgn0051469,CG31469,0.575625,0.002022103,0.035791,Upregulated,Chow,FlyBase:FBgn0033050,Pngl,RO:0002434,interacts with,PMID:29346549,FlyBase:FBgn0051469


#### select gene ID

In [7]:
# keep only the columns needed for the graph (the bare flybase_id is dropped,
# object_id already carries it as a curie) and rename them to the edge vocabulary
key_columns = [
    'symbol', 'log2FoldChange', 'pvalue', 'padj',
    'regulation', 'source', 'subject_id', 'subject_label', 'property_id',
    'property_label', 'reference_id', 'object_id',
]
graph_names = {'symbol': 'object_label', 'padj': 'fdr'}
chow = chow[key_columns].rename(columns=graph_names)
print(chow.shape)
print(chow.columns)

(386, 12)
Index(['object_label', 'log2FoldChange', 'pvalue', 'fdr', 'regulation',
       'source', 'subject_id', 'subject_label', 'property_id',
       'property_label', 'reference_id', 'object_id'],
      dtype='object')


In [8]:
# reorder columns: subject, property, object first, then statistics and provenance
ordered_columns = [
    'subject_id', 'subject_label',
    'property_id', 'property_label',
    'object_id', 'object_label',
    'log2FoldChange', 'pvalue', 'fdr', 'regulation',
    'source', 'reference_id',
]
chow = chow[ordered_columns]
print(chow.shape)
print(chow.columns)

(386, 12)
Index(['subject_id', 'subject_label', 'property_id', 'property_label',
       'object_id', 'object_label', 'log2FoldChange', 'pvalue', 'fdr',
       'regulation', 'source', 'reference_id'],
      dtype='object')


In [9]:
# edges: concat edges
# list every expression dataset to merge here; append further frames to this
# list when more datasets are available (an empty placeholder frame is not
# needed and concatenating one is deprecated in recent pandas)
edge_datasets = [chow]
edges = pd.concat(edge_datasets, ignore_index=True)
print(edges.shape)

# see duplicates: number of rows that have at least one exact copy
print(len(edges[edges.duplicated(keep=False)]))

# drop duplicates (returns a new frame, so re-running the cell is safe)
edges = edges.drop_duplicates()
print(len(edges))

(386, 12)
0
386


#### build edges and nodes files
file format: csv
fill nulls with 'NA'; this can be done with a Python function before saving
##### edges
1. `subject_id` str curie required
2. `object_id` str curie required
3. `property_id` str curie NA
4. `property_label` str NA
5. `property_description` str NA
6. `property_uri` str NA
7. `reference_uri` str NA
8. `reference_supporting_text` str NA
9. `reference_date` str NA | yyyy-mm-dd

In [10]:
# preview the first two rows of the concatenated, de-duplicated edges
edges.head(2)

Unnamed: 0,subject_id,subject_label,property_id,property_label,object_id,object_label,log2FoldChange,pvalue,fdr,regulation,source,reference_id
0,FlyBase:FBgn0033050,Pngl,RO:0002434,interacts with,FlyBase:FBgn0035904,GstO3,0.576871,2.13e-08,2e-06,Upregulated,Chow,PMID:29346549
1,FlyBase:FBgn0033050,Pngl,RO:0002434,interacts with,FlyBase:FBgn0051469,CG31469,0.575625,0.002022103,0.035791,Upregulated,Chow,PMID:29346549


In [11]:
# give graph format
print(edges.columns)

# namespace (lower-cased curie prefix) -> base uri
curie_dct = {
    'ro': 'http://purl.obolibrary.org/obo/',
    'pmid': 'https://www.ncbi.nlm.nih.gov/pubmed/',
    'encode': 'https://www.encodeproject.org/search/?searchTerm='
}

# provenance text attached to each edge, chosen by the 'source' column
chow_supporting_text = 'To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.'
default_supporting_text = 'This edge comes from the RNA-seq profile dataset extracted by the XXX Lab YYYY.'


def _property_uri(property_id):
    """Expand a property curie into its uri.

    Example: 'RO:0002434' -> 'http://purl.obolibrary.org/obo/RO_0002434'.
    Returns 'NA' when the id is null, is not a curie, or uses a namespace
    that is missing from curie_dct (a message is printed in the last case).
    """
    if not isinstance(property_id, str) or ':' not in property_id:
        return 'NA'
    namespace = property_id.split(':', 1)[0].lower()
    try:
        return curie_dct[namespace] + property_id.replace(':', '_')
    except KeyError:
        print('There is a property curie with an unrecognized namespace:', property_id)
        return 'NA'


def _reference_uri(reference_id):
    """Expand a reference curie into its uri.

    Example: 'PMID:25416956' -> 'https://www.ncbi.nlm.nih.gov/pubmed/25416956'.
    Null ids (None, NaN) give 'NA'; ids that are not curies, or whose
    namespace is missing from curie_dct, are returned unchanged.
    """
    # capture nan or None values, i.e. all possible nulls
    if reference_id is None or pd.isna(reference_id):
        return 'NA'
    reference_id = str(reference_id)
    if ':' not in reference_id:
        return reference_id
    # split on the first colon only, so local ids containing ':' are kept whole
    namespace, local_id = reference_id.split(':', 1)
    try:
        return curie_dct[namespace.lower()] + local_id
    except KeyError:
        print('There is a reference curie with and unrecognized namespace:', reference_id)
        return reference_id


# build list of edges as list of dict, i.e a df, where a dict is an edge
edges_l = list()
for _, row in edges.iterrows():
    from_chow = row['source'] == 'Chow'
    edges_l.append({
        'subject_id': row['subject_id'],
        'object_id': row['object_id'],
        'property_id': row['property_id'],
        'property_label': row['property_label'],
        'property_description': 'NA',
        'property_uri': _property_uri(row['property_id']),
        'reference_uri': _reference_uri(row['reference_id']),
        'reference_supporting_text': chow_supporting_text if from_chow else default_supporting_text,
        'reference_date': '2018-03-15' if from_chow else 'NA',
    })

# save edges file
pd.DataFrame(edges_l).fillna('NA').to_csv('./graph/rna_edges_v{}.csv'.format(today), index=False)

Index(['subject_id', 'subject_label', 'property_id', 'property_label',
       'object_id', 'object_label', 'log2FoldChange', 'pvalue', 'fdr',
       'regulation', 'source', 'reference_id'],
      dtype='object')


##### nodes
1. `id` str curie required
2. `semantic_groups` str required
3. `preflabel` str label required
4. `synonyms` str NA
5. `description` str NA
6. **new** `name` str NA

In [12]:
# node attributes are retrieved from biothings further below: name (new attribute
# for short description), alias (synonyms), summary (description).
# symbols in this case come from the original source; otherwise they would be
# retrieved from biothings as well.
# build concept dict: {id: {'preflabel': symbol}}
concept_dct = {}
edge_ends = zip(
    edges['subject_id'], edges['subject_label'],
    edges['object_id'], edges['object_label'],
)
for subject_id, subject_label, object_id, object_label in edge_ends:
    # node for subject
    concept_dct[subject_id] = {'preflabel': subject_label}
    # node for object
    concept_dct[object_id] = {'preflabel': object_label}
len(concept_dct)

386

In [13]:
# biothings api + dictionaries
# input list for the api: ids can be flybase, hgnc/entrez or ensembl,
# so the query is done by symbol (the preflabel of every concept)
symbols = [attributes['preflabel'] for attributes in concept_dct.values()]

print(symbols[0:5])
len(symbols)

['Pngl', 'GstO3', 'CG31469', 'LpR2', 'TwdlG']


386

In [14]:
# api call
mg = get_client('gene')
# every queried symbol is a fly gene, so restrict the search to
# Drosophila melanogaster (NCBI taxonomy id 7227); without a species filter
# the top-scoring hit for a symbol/alias can belong to another organism
df = mg.querymany(symbols, scopes='symbol,alias', fields='alias,name,summary',
                  species=7227, size=1, as_dataframe=True)
# guarantee the requested fields exist as columns even when no hit returned them,
# so the attribute lookups on the result do not fail
for field in ('alias', 'name', 'summary'):
    if field not in df.columns:
        df[field] = float('nan')
df.head(2)

querying 1-386...done.
Finished.


Unnamed: 0_level_0,_id,_score,alias,name,summary
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Pngl,35527,71.66328,"[CG7865, Dmel\CG7865, PNGase, png1]",PNGase-like,
GstO3,732973,71.75126,GSTo,glutathione S-transferase omega 3,


In [15]:
# compare the number of annotation rows returned by the api
# with the number of concepts that need attributes
print(df.shape)
print(len(concept_dct))

(386, 5)
386


In [16]:
# dictionaries {id: {name:, alias:, summary:}}
# index the concepts by symbol once, instead of rescanning every concept for
# every api row; also give each concept null defaults so that concepts without
# an api row still carry all the node attributes
concepts_by_symbol = dict()
for concept, attributes in concept_dct.items():
    concepts_by_symbol.setdefault(attributes['preflabel'], []).append(concept)
    for attribute in ('name', 'synonyms', 'description'):
        attributes.setdefault(attribute, None)

i = 0
print(len(concept_dct))
for symbol, row in df.iterrows():
    # associate concept to symbol
    for concept in concepts_by_symbol.get(symbol, []):
        i += 1
        # add attributes (.get gives None when the api returned no such field)
        concept_dct[concept]['name'] = row.get('name')
        concept_dct[concept]['synonyms'] = row.get('alias')
        concept_dct[concept]['description'] = row.get('summary')
print(i)
print(len(concept_dct))
# i counts (api row, concept) matches: it is larger than the number of concepts
# when one symbol is associated to more than one concept

386
386
386


In [17]:
# build a list of nodes as list of dict, i.e a df, where a dict is a node
nodes_l = []
for node_id, attributes in concept_dct.items():
    # synonyms arrive either as a list of aliases or as a single value;
    # lists are flattened into one '|'-separated string
    synonyms = attributes['synonyms']
    if isinstance(synonyms, list):
        synonyms = '|'.join(synonyms)
    nodes_l.append({
        'id': node_id,
        'semantic_groups': 'GENE',
        'preflabel': attributes['preflabel'],
        'name': attributes['name'],
        'synonyms': synonyms,
        'description': attributes['description'],
    })

# save nodes file, writing nulls as 'NA'
nodes_path = './graph/rna_nodes_v{}.csv'.format(today)
pd.DataFrame(nodes_l).fillna('NA').to_csv(nodes_path, index=False)
len(nodes_l)

386

In [18]:
# number of edge records built for the edges file
len(edges_l)

386

In [19]:
# read the saved edges file back to check its content
# (with the default read_csv settings the 'NA' placeholders show up as missing values)
saved_edges_path = './graph/rna_edges_v{}.csv'.format(today)
df = pd.read_csv(saved_edges_path)
df.head(5)

Unnamed: 0,object_id,property_description,property_id,property_label,property_uri,reference_date,reference_supporting_text,reference_uri,subject_id
0,FlyBase:FBgn0035904,,RO:0002434,interacts with,http://purl.obolibrary.org/obo/RO_0002434,2018-03-15,To understand how loss of NGLY1 contributes to...,https://www.ncbi.nlm.nih.gov/pubmed/29346549,FlyBase:FBgn0033050
1,FlyBase:FBgn0051469,,RO:0002434,interacts with,http://purl.obolibrary.org/obo/RO_0002434,2018-03-15,To understand how loss of NGLY1 contributes to...,https://www.ncbi.nlm.nih.gov/pubmed/29346549,FlyBase:FBgn0033050
2,FlyBase:FBgn0051092,,RO:0002434,interacts with,http://purl.obolibrary.org/obo/RO_0002434,2018-03-15,To understand how loss of NGLY1 contributes to...,https://www.ncbi.nlm.nih.gov/pubmed/29346549,FlyBase:FBgn0033050
3,FlyBase:FBgn0037225,,RO:0002434,interacts with,http://purl.obolibrary.org/obo/RO_0002434,2018-03-15,To understand how loss of NGLY1 contributes to...,https://www.ncbi.nlm.nih.gov/pubmed/29346549,FlyBase:FBgn0033050
4,FlyBase:FBgn0039711,,RO:0002434,interacts with,http://purl.obolibrary.org/obo/RO_0002434,2018-03-15,To understand how loss of NGLY1 contributes to...,https://www.ncbi.nlm.nih.gov/pubmed/29346549,FlyBase:FBgn0033050


In [20]:
# read the saved nodes file back to check its content
# (with the default read_csv settings the 'NA' placeholders show up as missing values)
saved_nodes_path = './graph/rna_nodes_v{}.csv'.format(today)
df = pd.read_csv(saved_nodes_path)
df.head(5)

Unnamed: 0,description,id,name,preflabel,semantic_groups,synonyms
0,,FlyBase:FBgn0033050,PNGase-like,Pngl,GENE,CG7865|Dmel\CG7865|PNGase|png1
1,,FlyBase:FBgn0035904,glutathione S-transferase omega 3,GstO3,GENE,GSTo
2,,FlyBase:FBgn0051469,uncharacterized protein,CG31469,GENE,CG9599|Dmel\CG31469
3,,FlyBase:FBgn0051092,Lipophorin receptor 2,LpR2,GENE,CG31092|CG4823|CG4834|Dmel\CG31092|LDLR|Lpr2|lpr2
4,,FlyBase:FBgn0037225,TweedleG,TwdlG,GENE,CG14643|Dmel\CG14643
