In [1]:
# @name transcriptomics
# @description notebook to build data-driven networks from various rna-seq edges datasets
# @author Núria Queralt Rosinach
# @date 04-02-2018

# gene names in ensembl
# http://uswest.ensembl.org/info/genome/genebuild/gene_names.html

# avoid interpret commas inside text by the csv reader
# https://stackoverflow.com/questions/21527057/python-parse-csv-ignoring-comma-with-double-quotes?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html

In [2]:
# to do:
#     * 

In [3]:
import pandas as pd
from biothings_client import get_client

### Load ngly1_{fly, hyman}-gene edges

#### what sets should i load?

* Chow fly transcriptome subsets:
    * all: 15.000 genes
    * FC1.5, FDR 5%: 386 genes
    * FC2.0, FDR 5%: 87 genes
* Freeze human transcriptome subsets: 
    * all: 1906 genes
    * FC1.5, FDR 5%: 1897 genes
    * FC2.0, FDR 5%: 1349 genes

**Conclusion**: I am going to load Chow_FC1.5 (paper analysis) and Freeze_all transcriptomes (it seems already filtered, and these are the set of genes they want to analyse).

In [4]:
chow = pd.read_csv('./ngly1-fly-chow-2018/out/fc1.5_fdr5_transcriptome_fly.csv')
print(chow.shape)
chow.head(2)

(386, 6)


Unnamed: 0,FlyBase ID,Symbol,log2FoldChange,pvalue,padj,Regulation
0,FBgn0035904,GstO3,0.576871,2.13e-08,2e-06,Upregulated
1,FBgn0051469,CG31469,0.575625,0.002022103,0.035791,Upregulated


In [5]:
freeze = pd.read_csv('./ngly1-human-freezelab-2018/out/all_transcriptome_human.csv')
print(freeze.shape)
freeze.head(2)

(1906, 7)


Unnamed: 0,ensembl_id,gene_name,gene_biotype,log2FoldChange,pvalue,padj,Regulation
0,ENSG00000138829,FBN2,protein_coding,4.147393,1.09e-74,1.81e-70,Upregulated
1,ENSG00000169515,CCDC8,protein_coding,5.233138,1.33e-53,5.519999999999999e-50,Upregulated


### Normalize human genes 2 entrez, HGNC Id

* **ensembl**: all genes 

#### Input ID list

In [6]:
# ensembl
ensembl = list(freeze.ensembl_id)
print(len(ensembl))
print(ensembl[0:5])

1906
['ENSG00000138829', 'ENSG00000169515', 'ENSG00000242265', 'ENSG00000204941', 'ENSG00000187720']


#### ID dictionaries: ensembl2entrez, ensembl2hgnc

In [7]:
# ensembl2entrez and ensembl2hgnc dict
mg = get_client('gene')
df = mg.querymany(ensembl, scopes = 'ensembl.gene', fields='entrezgene,HGNC', size=1, as_dataframe=True)

querying 1-1000...done.
querying 1001-1906...done.
Finished.
40 input query terms found no hit:
	['ENSG00000226958', 'ENSG00000199916', 'ENSG00000171282', 'ENSG00000264063', 'ENSG00000260833', 'ENS
Pass "returnall=True" to return complete lists of duplicate or missing query terms.


In [8]:
df.head(2)

Unnamed: 0_level_0,HGNC,_id,_score,entrezgene,notfound
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000138829,3604,2201,17.464752,2201.0,
ENSG00000169515,25367,83987,18.010357,83987.0,


In [9]:
# not found
missing = df[['notfound']].copy()
missing = missing.reset_index().rename(columns={'query': 'ensembl'})
missing = missing[missing['notfound'] == True][['ensembl']] 
missing_ensembl_l = list(missing.ensembl)

# save not found
with open('./ngly1-human-freezelab-2018/out/not_found_ensembl.list','w') as f:
    f.write('\n'.join(missing_ensembl_l))

In [10]:
# prepare ids dataframe for dictionary construction
ids =( df.reset_index()
         .rename(columns={'query': 'ensembl','HGNC': 'hgnc'}) 
         [['ensembl','hgnc','entrezgene']]
         .copy()
     )
ids['entrez'] = ids.entrezgene.apply(lambda x: str(round(x)) if x > 0 else 0)
print('ensembl:', type(ids['ensembl'][0]))
print('hgnc:', type(ids['hgnc'][0]))
print('entrez:', type(ids['entrez'][0]))
ids.head(2)

ensembl: <class 'str'>
hgnc: <class 'str'>
entrez: <class 'str'>


Unnamed: 0,ensembl,hgnc,entrezgene,entrez
0,ENSG00000138829,3604,2201.0,2201
1,ENSG00000169515,25367,83987.0,83987


In [11]:
# build dictionaries
# value types: 
# ensembl=> str, no nulls
# hgnc=> str, null=> float(nan)
# entrez=> str, null=> float(0)
# dictionaries for normalization to an ID (not null allowed). the final ID must be from one of the several schemes.
# namespace added. no namespace added to ensembl_ids
ensembl2entrez_dict = dict(zip(ids.ensembl,ids.entrez))
ensembl2hgnc_dict = dict(zip(ids.ensembl,ids.hgnc))

# associate ensembl for those genes without entrez: ensembl=> entrez > ensembl
for ensembl in ensembl2entrez_dict:
    if ensembl2entrez_dict[ensembl] == 0:
        ensembl2entrez_dict[ensembl] = 'ENSEMBL:'+ensembl
    else:
        entrez = ensembl2entrez_dict[ensembl]
        ensembl2entrez_dict[ensembl] = 'NCBIGene:'+entrez

# associate ensembl for those genes without hgnc: ensembl=> hgnc > ensembl
## obsolete: associate entrez else ensembl for those genes without hgnc: ensembl=> hgnc > entrez > ensembl
## monarch change: human genes are HGNC ID or ENSEMBL ID.  NGCBIGene ID for other species, more:
## gene names in ensembl
## http://uswest.ensembl.org/info/genome/genebuild/gene_names.html
for ensembl in ensembl2hgnc_dict:
    if isinstance(ensembl2hgnc_dict.get(ensembl),float):
        ensembl2hgnc_dict[ensembl] = 'ENSEMBL:'+ensembl #ensembl2entrez_dict[ensembl]
    else:
        hgnc = ensembl2hgnc_dict[ensembl]
        ensembl2hgnc_dict[ensembl] = 'HGNC:'+hgnc

#### Normalize ID to entrez, else: ensembl

In [12]:
# add field object_ensembl_id with namespace
freeze['o_ensembl_id'] = freeze.ensembl_id.apply(lambda x: 'ENSEMBL:' + str(x))

# Map ensembl to entrez
freeze['o_entrez_id'] = freeze.ensembl_id.apply(lambda x: ensembl2entrez_dict[x])
freeze.head(2)

Unnamed: 0,ensembl_id,gene_name,gene_biotype,log2FoldChange,pvalue,padj,Regulation,o_ensembl_id,o_entrez_id
0,ENSG00000138829,FBN2,protein_coding,4.147393,1.09e-74,1.81e-70,Upregulated,ENSEMBL:ENSG00000138829,NCBIGene:2201
1,ENSG00000169515,CCDC8,protein_coding,5.233138,1.33e-53,5.519999999999999e-50,Upregulated,ENSEMBL:ENSG00000169515,NCBIGene:83987


#### Normalize ID to hgnc, else: ensembl

In [13]:
# Map ensembl to hgnc
freeze['o_hgnc_id'] = freeze.ensembl_id.apply(lambda x: ensembl2hgnc_dict[x])
freeze.head(2)

Unnamed: 0,ensembl_id,gene_name,gene_biotype,log2FoldChange,pvalue,padj,Regulation,o_ensembl_id,o_entrez_id,o_hgnc_id
0,ENSG00000138829,FBN2,protein_coding,4.147393,1.09e-74,1.81e-70,Upregulated,ENSEMBL:ENSG00000138829,NCBIGene:2201,HGNC:3604
1,ENSG00000169515,CCDC8,protein_coding,5.233138,1.33e-53,5.519999999999999e-50,Upregulated,ENSEMBL:ENSG00000169515,NCBIGene:83987,HGNC:25367


### save edges

In [14]:
chow = ( chow
            .rename(columns={'FlyBase ID': 'flybase_id', 'Symbol': 'symbol', 'Regulation': 'regulation'})
            .assign(source = 'Chow')
            .assign(subject_id = 'FlyBase:FBgn0033050')
            .assign(subject_label = 'Pngl')
            .assign(property_id = 'RO:0002434')
            .assign(property_label = 'interacts with')
            .assign(reference_id = 'PMID:29346549')
)
chow['object_id'] = chow.flybase_id.apply(lambda x: 'FlyBase:' + str(x))
chow.to_csv('./ngly1-fly-chow-2018/out/chow_fc1.5_fdr5_transcriptome_fly_edges.csv', index=False)
chow.head(2)

Unnamed: 0,flybase_id,symbol,log2FoldChange,pvalue,padj,regulation,source,subject_id,subject_label,property_id,property_label,reference_id,object_id
0,FBgn0035904,GstO3,0.576871,2.13e-08,2e-06,Upregulated,Chow,FlyBase:FBgn0033050,Pngl,RO:0002434,interacts with,PMID:29346549,FlyBase:FBgn0035904
1,FBgn0051469,CG31469,0.575625,0.002022103,0.035791,Upregulated,Chow,FlyBase:FBgn0033050,Pngl,RO:0002434,interacts with,PMID:29346549,FlyBase:FBgn0051469


In [15]:
freeze = ( freeze
            .rename(columns={'gene_name': 'symbol', 'Regulation': 'regulation'})
            .assign(source = 'Freeze')
            .assign(s_hgnc_id = 'HGNC:17646')
            .assign(subject_label = 'NGLY1')
            .assign(s_entrez_id = 'NCBIGene:55768') 
            .assign(property_id = 'RO:0002434')
            .assign(property_label = 'interacts with')
            .assign(reference_id = 'NA')
)
freeze.to_csv('./ngly1-human-freezelab-2018/out/freeze_all_transcriptome_human_edges.csv', index=False)
freeze.head(2)

Unnamed: 0,ensembl_id,symbol,gene_biotype,log2FoldChange,pvalue,padj,regulation,o_ensembl_id,o_entrez_id,o_hgnc_id,source,s_hgnc_id,subject_label,s_entrez_id,property_id,property_label,reference_id
0,ENSG00000138829,FBN2,protein_coding,4.147393,1.09e-74,1.81e-70,Upregulated,ENSEMBL:ENSG00000138829,NCBIGene:2201,HGNC:3604,Freeze,HGNC:17646,NGLY1,NCBIGene:55768,RO:0002434,interacts with,
1,ENSG00000169515,CCDC8,protein_coding,5.233138,1.33e-53,5.519999999999999e-50,Upregulated,ENSEMBL:ENSG00000169515,NCBIGene:83987,HGNC:25367,Freeze,HGNC:17646,NGLY1,NCBIGene:55768,RO:0002434,interacts with,


### Prepare Regulation edges to build the graph
* concat chow and freeze
* give graph format
* build statements and concepts file
* concat with graph (edges and nodes): first curated > monarch > regulation
* drop duplicates ((edges/nodes) rows, concepts
* save graph
* format for neo4j
* save neo4j files

#### concat chow and freeze

In [16]:
import pandas as pd

In [17]:
# chow 
chow = pd.read_csv('./ngly1-fly-chow-2018/out/chow_fc1.5_fdr5_transcriptome_fly_edges.csv')
print(chow.shape)
print(chow.columns)
chow.head(2)

(386, 13)
Index(['flybase_id', 'symbol', 'log2FoldChange', 'pvalue', 'padj',
       'regulation', 'source', 'subject_id', 'subject_label', 'property_id',
       'property_label', 'reference_id', 'object_id'],
      dtype='object')


Unnamed: 0,flybase_id,symbol,log2FoldChange,pvalue,padj,regulation,source,subject_id,subject_label,property_id,property_label,reference_id,object_id
0,FBgn0035904,GstO3,0.576871,2.13e-08,2e-06,Upregulated,Chow,FlyBase:FBgn0033050,Pngl,RO:0002434,interacts with,PMID:29346549,FlyBase:FBgn0035904
1,FBgn0051469,CG31469,0.575625,0.002022103,0.035791,Upregulated,Chow,FlyBase:FBgn0033050,Pngl,RO:0002434,interacts with,PMID:29346549,FlyBase:FBgn0051469


In [18]:
# freeze
freeze = pd.read_csv('./ngly1-human-freezelab-2018/out/freeze_all_transcriptome_human_edges.csv')
print(freeze.shape)
print(freeze.columns)
freeze.head(2)

(1906, 17)
Index(['ensembl_id', 'symbol', 'gene_biotype', 'log2FoldChange', 'pvalue',
       'padj', 'regulation', 'o_ensembl_id', 'o_entrez_id', 'o_hgnc_id',
       'source', 's_hgnc_id', 'subject_label', 's_entrez_id', 'property_id',
       'property_label', 'reference_id'],
      dtype='object')


Unnamed: 0,ensembl_id,symbol,gene_biotype,log2FoldChange,pvalue,padj,regulation,o_ensembl_id,o_entrez_id,o_hgnc_id,source,s_hgnc_id,subject_label,s_entrez_id,property_id,property_label,reference_id
0,ENSG00000138829,FBN2,protein_coding,4.147393,1.09e-74,1.81e-70,Upregulated,ENSEMBL:ENSG00000138829,NCBIGene:2201,HGNC:3604,Freeze,HGNC:17646,NGLY1,NCBIGene:55768,RO:0002434,interacts with,
1,ENSG00000169515,CCDC8,protein_coding,5.233138,1.33e-53,5.519999999999999e-50,Upregulated,ENSEMBL:ENSG00000169515,NCBIGene:83987,HGNC:25367,Freeze,HGNC:17646,NGLY1,NCBIGene:55768,RO:0002434,interacts with,


#### select gene ID

In [19]:
# select and rename key columns
chow = (chow
           [['symbol', 'log2FoldChange', 'pvalue', 'padj',
       'regulation', 'source', 'subject_id', 'subject_label', 'property_id',
       'property_label', 'reference_id', 'object_id']]
            .rename(columns={'symbol': 'object_label','padj':'fdr'})
        
       )
print(chow.shape)
print(chow.columns)

freeze = (freeze
             [['symbol', 'log2FoldChange', 'pvalue',
       'padj', 'regulation','o_hgnc_id',
       'source', 's_hgnc_id', 'subject_label', 'property_id',
       'property_label', 'reference_id']]
            .rename(columns={'symbol': 'object_label','padj':'fdr','s_hgnc_id': 'subject_id','o_hgnc_id': 'object_id'})
                  
         )
print(freeze.shape)
print(freeze.columns)

(386, 12)
Index(['object_label', 'log2FoldChange', 'pvalue', 'fdr', 'regulation',
       'source', 'subject_id', 'subject_label', 'property_id',
       'property_label', 'reference_id', 'object_id'],
      dtype='object')
(1906, 12)
Index(['object_label', 'log2FoldChange', 'pvalue', 'fdr', 'regulation',
       'object_id', 'source', 'subject_id', 'subject_label', 'property_id',
       'property_label', 'reference_id'],
      dtype='object')


In [20]:
# reorder columns
chow = chow[['subject_id', 'subject_label', 'property_id',
       'property_label', 'object_id', 'object_label', 'log2FoldChange', 'pvalue', 'fdr', 'regulation',
       'source', 'reference_id']]
print(chow.shape)
print(chow.columns)
freeze = freeze[['subject_id', 'subject_label', 'property_id',
       'property_label', 'object_id', 'object_label', 'log2FoldChange', 'pvalue', 'fdr', 'regulation',
       'source', 'reference_id']]
print(freeze.shape)
print(freeze.columns)

(386, 12)
Index(['subject_id', 'subject_label', 'property_id', 'property_label',
       'object_id', 'object_label', 'log2FoldChange', 'pvalue', 'fdr',
       'regulation', 'source', 'reference_id'],
      dtype='object')
(1906, 12)
Index(['subject_id', 'subject_label', 'property_id', 'property_label',
       'object_id', 'object_label', 'log2FoldChange', 'pvalue', 'fdr',
       'regulation', 'source', 'reference_id'],
      dtype='object')


In [21]:
# concat
edges = pd.concat([chow,freeze],ignore_index=True)
print(edges.shape)

# see duplicates
print(len(edges[edges.duplicated(keep=False)]))

# drop duplicates
edges.drop_duplicates(inplace=True)
print(len(edges))

(2292, 12)
0
2292


#### build edges and nodes file
file format: csv
fill null wiht 'NA'. can be done with python function before saving
##### edges
1. `subject_id` str curie required
2. `object_id` str curie required
3. `property_id` str curie NA
4. `property_label` str NA
5. `property_description` str NA
6. `property_uri` str NA
7. `reference_uri` str NA
8. `reference_supporting_text` str NA
9. `reference_date` str NA | yyyy-mm-dd

In [22]:
import os
if not os.path.exists('./graph'): os.makedirs('./graph')

In [23]:
edges.head(2)

Unnamed: 0,subject_id,subject_label,property_id,property_label,object_id,object_label,log2FoldChange,pvalue,fdr,regulation,source,reference_id
0,FlyBase:FBgn0033050,Pngl,RO:0002434,interacts with,FlyBase:FBgn0035904,GstO3,0.576871,2.13e-08,2e-06,Upregulated,Chow,PMID:29346549
1,FlyBase:FBgn0033050,Pngl,RO:0002434,interacts with,FlyBase:FBgn0051469,CG31469,0.575625,0.002022103,0.035791,Upregulated,Chow,PMID:29346549


In [24]:
# give graph format
print(edges.columns)
curie_dct = {
    'ro': 'http://purl.obolibrary.org/obo/',
    'pmid': 'https://www.ncbi.nlm.nih.gov/pubmed/',
    'encode': 'https://www.encodeproject.org/search/?searchTerm='
}

edges_l = list()
for i, row in edges.iterrows():
    # property uri: http://purl.obolibrary.org/obo/RO_0002434
    property_uri = 'NA'
    if ':' in row['property_id']:
        property_uri = curie_dct[row['property_id'].split(':')[0].lower()]+row['property_id'].replace(':','_')       
    
    # reference_uri: https://www.ncbi.nlm.nih.gov/pubmed/25416956
    # capture nan or None values, i.e. all possible nulls
    if (isinstance(row['reference_id'], float) and str(row['reference_id']).lower() == 'nan') or row['reference_id'] is None:
        row['reference_id'] = 'NA'
    if ':' not in row['reference_id']:
        reference_uri = row['reference_id']
    else:
        try:
            reference_uri = curie_dct[row['reference_id'].split(':')[0].lower()]+row['reference_id'].split(':')[1]
        except KeyError:
            reference_uri = row['reference_id']
            print('There is a reference curie with and unrecognized namespace:', row['reference_id'])
    # build list of edges as list of dict, i.e a df, where a dict is an edge        
    edge = dict()
    edge['subject_id'] = row['subject_id']
    edge['object_id'] = row['object_id']
    edge['property_id'] = row['property_id']
    edge['property_label'] = row['property_label']
    edge['property_description'] = 'NA'
    edge['property_uri'] = property_uri
    edge['reference_uri'] = reference_uri
    edge['reference_supporting_text'] = 'To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.' if row['source'] == 'Chow' else 'This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.'
    edge['reference_date'] = '2018-03-15' if row['source'] == 'Chow' else 'NA'
    edges_l.append(edge)

# save edges file
pd.DataFrame(edges_l).fillna('NA').to_csv('./graph/rna_edges.csv', index=False)

Index(['subject_id', 'subject_label', 'property_id', 'property_label',
       'object_id', 'object_label', 'log2FoldChange', 'pvalue', 'fdr',
       'regulation', 'source', 'reference_id'],
      dtype='object')


##### nodes
1. `id` str curie required
2. `semantic_groups` str required
3. `preflabel` str label required
4. `synonyms` str NA
5. `description` str NA
6. **new** `name` str NA

In [25]:
# retrieve node attributes from biothings and build dictionary
# from biothings we retrieve: name (new attribute for short description), alias (synonyms), summary (description)
# symbols in this case come from the original source. otherwise are gonna be retrieved from biothings as well.
# build concept dict: {id:symbol}
concept_dct = dict()
for i, row in edges.iterrows():
    # node for subject
    concept_dct[row['subject_id']] = {'preflabel': row['subject_label']}
    # node for object
    concept_dct[row['object_id']] = {'preflabel': row['object_label']}
len(concept_dct.keys())

2293

In [26]:
# biothings api + dictionaries
# input list for api: since by id we have flybase, hgnc/entrez or ensembl, i am gonna use symbol
symbols = list()
for idx,symbol in concept_dct.items():
    #id = key.split(':')[1] if ':' in key else key
    symbols.append(symbol['preflabel'])
    
print(symbols[0:5])    
len(symbols)

['Pngl', 'GstO3', 'CG31469', 'LpR2', 'TwdlG']


2293

In [27]:
# api call
mg = get_client('gene')
df = mg.querymany(symbols, scopes = 'symbol,alias', fields='alias,name,summary', size=1, as_dataframe=True)
df.head(2)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-2293...done.
Finished.
1 input query terms found dup hits:
	[('Metazoa_SRP', 2)]
90 input query terms found no hit:
	['J01415.13', 'RP11-54O7.3', 'RP11-77I22.3', 'RP4-788L13.1', 'RP3-410C9.1', 'MIR3687', 'AC124789.1',
Pass "returnall=True" to return complete lists of duplicate or missing query terms.


Unnamed: 0_level_0,_id,_score,alias,name,notfound,summary
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Pngl,35527,67.018524,"[CG7865, Dmel\CG7865, PNGase, png1]",PNGase-like,,
GstO3,732973,71.2423,GSTo,glutathione S-transferase omega 3,,


In [28]:
print(df.shape)
print(len(concept_dct.keys()))

(2293, 6)
2293


In [29]:
# dictionaries {id: {name:, alias:, summary:}}
i = 0
print(len(concept_dct))
for symbol, row in df.iterrows():
    # associate concept to symbol
    for concept in concept_dct:
        if concept_dct[concept]['preflabel'] == symbol:
            i += 1
            # add attributes
            concept_dct[concept]['name'] = row['name']
            concept_dct[concept]['synonyms'] = row['alias']
            concept_dct[concept]['description'] = row['summary']
print(i)
print(len(concept_dct))
# i is 2295 because there is a symbol that is associated to 2 different concepts

2293
2295
2293


In [47]:
# build a list of nodes as list of dict, i.e a df, where a dict is a node
nodes_l = list()
for concept in concept_dct:
    # node for subject
    node = dict()
    node['id'] = concept
    node['semantic_groups'] = 'GENE'
    node['preflabel'] = concept_dct[concept]['preflabel']
    node['name'] = concept_dct[concept]['name']
    node['synonyms'] = '|'.join(list(concept_dct[concept]['synonyms'])) if isinstance(concept_dct[concept]['synonyms'], list) else concept_dct[concept]['synonyms']
    node['description'] = concept_dct[concept]['description']
    nodes_l.append(node)
    
# save nodes file    
pd.DataFrame(nodes_l).fillna('NA').to_csv('./graph/rna_nodes.csv', index=False)
len(nodes_l)

2293

In [None]:
len(edges_l)

In [48]:
import pandas as pd
df = pd.read_csv('./graph/rna_edges.csv')
df.head(5)

Unnamed: 0,object_id,property_description,property_id,property_label,property_uri,reference_date,reference_supporting_text,reference_uri,subject_id
0,FlyBase:FBgn0035904,,RO:0002434,interacts with,http://purl.obolibrary.org/obo/RO_0002434,2018-03-15,To understand how loss of NGLY1 contributes to...,https://www.ncbi.nlm.nih.gov/pubmed/29346549,FlyBase:FBgn0033050
1,FlyBase:FBgn0051469,,RO:0002434,interacts with,http://purl.obolibrary.org/obo/RO_0002434,2018-03-15,To understand how loss of NGLY1 contributes to...,https://www.ncbi.nlm.nih.gov/pubmed/29346549,FlyBase:FBgn0033050
2,FlyBase:FBgn0051092,,RO:0002434,interacts with,http://purl.obolibrary.org/obo/RO_0002434,2018-03-15,To understand how loss of NGLY1 contributes to...,https://www.ncbi.nlm.nih.gov/pubmed/29346549,FlyBase:FBgn0033050
3,FlyBase:FBgn0037225,,RO:0002434,interacts with,http://purl.obolibrary.org/obo/RO_0002434,2018-03-15,To understand how loss of NGLY1 contributes to...,https://www.ncbi.nlm.nih.gov/pubmed/29346549,FlyBase:FBgn0033050
4,FlyBase:FBgn0039711,,RO:0002434,interacts with,http://purl.obolibrary.org/obo/RO_0002434,2018-03-15,To understand how loss of NGLY1 contributes to...,https://www.ncbi.nlm.nih.gov/pubmed/29346549,FlyBase:FBgn0033050


In [49]:
import pandas as pd
df = pd.read_csv('./graph/rna_nodes.csv')
df.head(5)

Unnamed: 0,description,id,name,preflabel,semantic_groups,synonyms
0,,FlyBase:FBgn0033050,PNGase-like,Pngl,GENE,CG7865|Dmel\CG7865|PNGase|png1
1,,FlyBase:FBgn0035904,glutathione S-transferase omega 3,GstO3,GENE,GSTo
2,,FlyBase:FBgn0051469,uncharacterized protein,CG31469,GENE,CG9599|Dmel\CG31469
3,,FlyBase:FBgn0051092,Cupredoxin superfamily protein,LpR2,GENE,F23N20.3|F23N20_3|Low Phosphate Root2
4,,FlyBase:FBgn0037225,TweedleG,TwdlG,GENE,CG14643|Dmel\CG14643
