### Graph database builder for AtgO Ontology Viewer

## Introduction

This script generates the AtgO database (ontology DAG and raw interactions) in Neo4j v1.9.x.

This notebook is version 3 (V3). It adds information from the Gene Association file, including:

* Full gene name
* Synonyms
* SGD ID

## Log

* 8/22/2016 - Final version for the camera-ready paper
* 3/2/2016 - Adding new data, such as publications.
* 3/11/2016 - first working version.
* 4/14/2016 - missing SGD ID problems fixed.
* 4/25/2016 - New version for updated data files.

----
## Requirements
* Neo4j 1.9.9
* py2neo 2.0.x
* Rexster 2.4.x


## How to use
1. Install Neo4j 1.9.9 on the local machine
1. ```pip install py2neo==2.0.9```
1. Run Neo4j with ```./bin/neo4j start```

In [1]:
from py2neo import Graph
from py2neo import Node, Relationship
import pandas as pd
import numpy as np

# For legacy INDEXER
from py2neo import neo4j

# Start from scratch: connect with py2neo's default Graph() settings and clear
# the database. WARNING: delete_all() is destructive -- g.size prints 0 below.
g = Graph()
g.delete_all()

# Create (or fetch) the legacy node index named "Vertex". All lookup keys
# registered later in this notebook ("name", "sgd", "synonyms", ...) go here.
idx = g.legacy.get_or_create_index(neo4j.Node, "Vertex")

# Sanity check: show the index handle and the graph size after the wipe.
print(idx)
print(g.size)

Index(Node, 'http://localhost:7474/db/data/index/node/Vertex')
0


## Create Ontology DAG in Neo4j

In [2]:
# Load the term-info file that sits next to the notebook, for inspection only.
# `old_df_term_props` is not referenced by any other cell shown here
# (presumably the previous release of the table -- TODO confirm).
old_df_term_props = pd.read_csv('./atgo2_term_info.txt', sep="\t")
old_df_term_props.head()

Unnamed: 0,Term Number,Term Name,Similarity Score,Term Size
0,1,1,0.0,492
1,2,2,1.446,484
2,3,3,1.941,476
3,5,5,2.927,384
4,6,6,3.07,156


In [3]:
# Read properties for AtgO terms from the 8/22/2016 data drop
# (superseded path: './atgo2_term_info.txt').
# Columns used later: "Term Number", "Similarity Score", "Num Genes", "Curated name".

df_term_props = pd.read_csv('./data/8_22_2016/atgo2_term_info.txt', sep="\t")
df_term_props.tail()

Unnamed: 0,Term Number,Similarity Score,Num Genes,Curated name
215,2,1.446,489,AtgO:2
216,28,1.415,22,AtgO:28
217,45,1.412,8,AtgO:45
218,36,1.4,9,AtgO:36
219,1,0.0,496,AtgO:1


In [4]:
# Load the ontology edge list. Column names are supplied through `names=`,
# so every line of the file is read as data.
df_graph = pd.read_csv('./data/8_22_2016/AtgO2.txt', sep="\t", names=["target", "source", "type", "score"])

# Compare endpoints as strings so every node name has one consistent type.
s = df_graph["source"].astype(str)
t = df_graph["target"].astype(str)

# Distinct node names across both endpoints. Series.append() was removed in
# pandas 2.0; pd.concat() produces the same concatenated Series.
all_nodes = pd.concat([s, t]).unique()

print(len(all_nodes))
df_graph.tail()

716


Unnamed: 0,target,source,type,score
1516,1,227,default,0.0
1517,1,29,default,0.0
1518,1,28,default,0.0
1519,1,225,default,0.0
1520,1,2,default,0.0


In [5]:
# Assign genes: load the term -> genes table from the 8/22/2016 data drop
# (superseded path: 'AtgO2_terms_to_genes.txt').
# Each row holds a term ID and a '|'-separated list of gene symbols,
# e.g. 168 -> "CDC5|MCD1".

df2 = pd.read_csv('data/8_22_2016/AtgO2_terms_to_genes.txt', sep="\t", names=["term", "genes"])
df2.head()

Unnamed: 0,term,genes
0,168,CDC5|MCD1
1,78,TAX4|IRS4
2,217,IPL1|BIR1
3,218,VAC8|YCK3
4,215,SSH4|EAR1


In [6]:
# Map each term ID (as a string) to the list of gene symbols assigned to it.
# The "genes" column packs the symbols into a single '|'-delimited string.
assigned_genes = {
    str(term_id): packed_genes.split('|')
    for term_id, packed_genes in zip(df2["term"], df2["genes"])
}

# Spot-check one term.
assigned_genes["78"]

['TAX4', 'IRS4']

In [7]:
# Extract gene info from the gene association (GA) file.
# Column names are supplied here (header=None, names=HEADER), and any text
# following "!" on a line is treated as a comment by the parser.
HEADER = [
    "DB",
    "DB_Object_ID",
    "DB_Object_Symbol",
    "Qualifier",
    "GO ID",
    "DB:Reference",
    "Evidence",
    "With (or) From",
    "Aspect",
    "DB_Object_Name",
    "DB_Object_Synonym",
    "DB_Object_Type",
    "taxon",
    "Date",
    "Assigned_by",
    "Annotation Extension",
    "Gene Product Form ID",
]
df_ga = pd.read_csv(
    './gene_association.sgd',
    sep="\t",
    comment="!",
    header=None,
    names=HEADER,
)

df_ga.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,DB,DB_Object_ID,DB_Object_Symbol,Qualifier,GO ID,DB:Reference,Evidence,With (or) From,Aspect,DB_Object_Name,DB_Object_Synonym,DB_Object_Type,taxon,Date,Assigned_by,Annotation Extension,Gene Product Form ID
0,SGD,S000007287,15S_RRNA,,GO:0005763,SGD_REF:S000073641|PMID:6262728,IDA,,C,Ribosomal RNA of the small mitochondrial ribos...,Q0020|14s rRNA|15S_RRNA_2,gene,taxon:559292,20150612,SGD,,
1,SGD,S000007287,15S_RRNA,,GO:0032543,SGD_REF:S000073641|PMID:6262728,IC,GO:0005763,P,Ribosomal RNA of the small mitochondrial ribos...,Q0020|14s rRNA|15S_RRNA_2,gene,taxon:559292,20150612,SGD,,
2,SGD,S000007287,15S_RRNA,,GO:0003735,SGD_REF:S000073641|PMID:6262728,IC,GO:0005763,F,Ribosomal RNA of the small mitochondrial ribos...,Q0020|14s rRNA|15S_RRNA_2,gene,taxon:559292,20150612,SGD,,
3,SGD,S000007288,21S_RRNA,,GO:0005762,SGD_REF:S000073372|PMID:6759872,IDA,,C,Mitochondrial 21S rRNA,Q0158|21S_rRNA_3|21S_rRNA_4,gene,taxon:559292,20040202,SGD,,
4,SGD,S000007288,21S_RRNA,,GO:0032543,SGD_REF:S000073372|PMID:6759872,IMP,,P,Mitochondrial 21S rRNA,Q0158|21S_rRNA_3|21S_rRNA_4,gene,taxon:559292,20100715,SGD,,


In [8]:
# Build a lookup from gene symbol (and each of its aliases) to its details:
#   {"sgd": <DB_Object_ID>, "name": <DB_Object_Name>, "syn_list": [aliases]}
# The GA file repeats a gene once per annotation row; a later row for the same
# key simply overwrites the earlier entry.
gene_details = {}

for row in df_ga.itertuples():
    # Positions are shifted by one because row[0] is the DataFrame index.
    sgd_id = row[2]      # DB_Object_ID
    key = row[3]         # DB_Object_Symbol
    full_name = row[10]  # DB_Object_Name
    synonym = row[11]    # DB_Object_Synonym ('|'-delimited)

    # An empty synonym cell is read as NaN. Stringifying it would register a
    # bogus "nan" alias, so treat a missing value as "no synonyms".
    syn_list = [] if pd.isnull(synonym) else str(synonym).split('|')

    details = {
        "sgd": str(sgd_id),
        "name": str(full_name),
        "syn_list": syn_list,
    }

    gene_details[key] = details
    # Use aliases, too: every alias points at the same details dict.
    for syn in syn_list:
        gene_details[syn] = details


In [9]:
# Check gene list is complete: each lookup raises KeyError if the symbol or
# alias is missing from gene_details.
print(gene_details['AIM13'])
print(gene_details['AIM5'])
print(gene_details['AIM37'])
print(gene_details['FCJ1'])
print(gene_details['GSG1'])

# Special case for CIS1: force the key 'CIS1' to resolve to the ATG31 record,
# overriding whatever entry it mapped to before.
# NOTE(review): presumably another GA entry also claims the name CIS1 -- confirm.
original_entry = gene_details['ATG31']
gene_details['CIS1'] = original_entry

# Both keys should now print the same record.
print(gene_details['ATG31'])
print(gene_details['CIS1'])

{'syn_list': ['YFR011C', 'AIM13', 'MCS19'], 'name': 'Component of the MICOS complex', 'sgd': 'S000001907'}
{'syn_list': ['YBR262C', 'AIM5', 'FMP51', 'MCS12'], 'name': 'Component of the MICOS complex', 'sgd': 'S000000466'}
{'syn_list': ['YNL100W', 'AIM37', 'MCS27'], 'name': 'Component of the MICOS complex', 'sgd': 'S000005044'}
{'syn_list': ['YKR016W', 'AIM28', 'FCJ1', 'FMP13'], 'name': 'Component of the MICOS complex', 'sgd': 'S000001724'}
{'syn_list': ['YDR108W', 'GSG1', 'MUM1'], 'name': 'Component of transport protein particle (TRAPP) complex III', 'sgd': 'S000002515'}
{'syn_list': ['YDR022C', 'CIS1'], 'name': 'Autophagy-specific protein required for autophagosome formation', 'sgd': 'S000002429'}
{'syn_list': ['YDR022C', 'CIS1'], 'name': 'Autophagy-specific protein required for autophagosome formation', 'sgd': 'S000002429'}


In [10]:
# Test find term info: select the row(s) for term 243 and print the curated name.
# .values[0] raises IndexError when no row matches the term number.
ti = df_term_props.loc[df_term_props["Term Number"] == 243]
print(ti["Curated name"].values[0])

TOR complex catalytic subunits


In [11]:
def is_int(str_val):
    """Return True if ``str_val`` can be converted with ``int()``.

    Args:
        str_val: Value to test, normally a node name string.

    Returns:
        bool: True when ``int(str_val)`` succeeds, False otherwise.
    """
    try:
        int(str_val)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text. TypeError: objects int() cannot
        # accept, such as None. OverflowError: infinite floats.
        return False
    return True

def add_node(node_name, assigned_genes, index, df_t, gene_details=None):
    """Create (or fetch) the graph node for ``node_name`` and index it.

    A name that parses as an integer is treated as an ontology term and is
    decorated from ``df_t``; any other name uses itself as ``term_name``.

    Args:
        node_name (str): Term number (as a string) or gene symbol.
        assigned_genes (dict): Maps node name -> list of gene symbols.
        index: Legacy node index providing ``get_or_create``, ``add`` and
            ``add_if_none``.
        df_t (pandas.DataFrame): Term table with "Term Number",
            "Curated name", "Similarity Score" and "Num Genes" columns.
        gene_details (dict, optional): Maps gene symbol/alias -> dict with
            "sgd", "name" and "syn_list" keys. When omitted, the
            notebook-level ``gene_details`` is used, so existing
            four-argument calls keep working.

    Returns:
        The node returned by ``index.get_or_create``.
    """
    if gene_details is None:
        # Fall back to the notebook-level dict (the parameter shadows the
        # global name, hence the explicit globals() lookup).
        gene_details = globals()["gene_details"]

    node_props = {
        "name": str(node_name)
    }

    if is_int(node_name):
        # This is an ontology term
        tid = int(node_name)
        term_info = df_t.loc[df_t["Term Number"] == tid]

        if not term_info.empty:
            node_props["term_name"] = str(term_info["Curated name"].values[0])
            node_props["similarity_score"] = term_info["Similarity Score"].values[0]
            node_props["term_size"] = int(term_info["Num Genes"].values[0])
    else:
        node_props["term_name"] = str(node_name)

    has_assigned_genes = node_name in assigned_genes
    if has_assigned_genes:
        node_props["assigned_genes"] = assigned_genes[node_name]
        if "term_size" not in node_props:
            # No term-table row supplied a size: derive it from the gene list.
            node_props["term_size"] = len(assigned_genes[node_name])
            node_props["similarity_score"] = 0

    # This is a gene entry:
    is_known_gene = node_name in gene_details
    if is_known_gene:
        details = gene_details[node_name]
        node_props["sgd"] = str(details["sgd"])
        node_props["full_name"] = details["name"]
        node_props["synonyms"] = details["syn_list"]

    # All index values are stored lower-cased.
    node = index.get_or_create("name", node_name.lower(), node_props)

    if has_assigned_genes:
        for gene in node_props["assigned_genes"]:
            index.add("assigned_genes", gene.lower(), node)

    if is_known_gene:
        index.add_if_none("sgd", node_props["sgd"].lower(), node)
        index.add_if_none("full_name", node_props["full_name"].lower(), node)
        for gene in node_props["synonyms"]:
            index.add("synonyms", gene.lower(), node)

    # Extra index
    if "term_name" in node_props:
        index.add_if_none("term_name", node_props["term_name"].lower(), node)
    else:
        # Reached for a numeric name with no matching row in df_t: the node
        # is still created, but it cannot be indexed by term name.
        print("ERR!!!!!!!!!!!!")
        print(node_props)

    return node

In [12]:
# Create actual DAG in database

# Cache of node name -> node, so add_node() runs once per distinct name.
nodes = {}

for row in df_graph.itertuples():
    # row[0] is the DataFrame index; row[1] is "target", row[2] is "source".
    source_name, target_name = str(row[2]), str(row[1])

    # Resolve the source first, then the target, creating whichever endpoint
    # has not been seen yet.
    for node_name in (source_name, target_name):
        if node_name not in nodes:
            nodes[node_name] = add_node(node_name, assigned_genes, idx, df_term_props)

    source = nodes[source_name]
    target = nodes[target_name]

    # Ontology edges point from the child (source) to its parent (target).
    edge = Relationship(source, "child_of", target)
    edge.properties["type"] = "term_relation"
    g.create(edge)

In [13]:
# Graph size after the DAG insert; the 1521 printed below equals the number of
# rows in df_graph, i.e. one relationship per row.
print(g.size)

# test result: smoke-test queries against the legacy index.
# idx.query() returns a generator (see the repr printed below), so nothing is
# shown until a result is iterated. Only results1 and results4 are iterated
# here; results0, results2, results3 and res2 are assigned but never consumed.
# NOTE(review): index values are stored lower-cased by add_node(), so the
# upper-case terms (YBR056W, S000006124, PKA2) may not match -- confirm.
results0 = idx.query("name:YBR056W")
results1 = idx.query("name:ybr*")

results2 = idx.query("sgd:S000006124")
results3 = idx.query("synonyms:PKA2")
results4 = idx.query("full_name:*mapk*")


# Prints the generator object itself, then each node whose name starts with "ybr".
print(results1)
for n in results1:
    print(n)

res2 = idx.query("term_name:'autophagy and related processes'")

# Nodes whose full name contains "mapk".
for n in results4:
    print(n)

1521
<generator object Index.query at 0x116a11db0>
(n571 {full_name:"Putative glycoside hydrolase of the mitochondrial intermembrane space",name:"YBR056W",sgd:"S000000260",synonyms:["YBR056W","17-beta-hydroxysteroid dehydrogenase-like protein"],term_name:"YBR056W"})
(n665 {full_name:"Protein of unknown function",name:"YBR287W",sgd:"S000000491",synonyms:["YBR287W","ZSP1"],term_name:"YBR287W"})
(n698 {full_name:"Protein with a role in ER delivery of tail-anchored membrane proteins",name:"YBR137W",sgd:"S000000341",synonyms:["YBR137W"],term_name:"YBR137W"})
(n115 {full_name:"Middle sporulation-specific mitogen-activated protein kinase (MAPK)",name:"SMK1",sgd:"S000006258",synonyms:["YPR054W","mitogen-activated protein kinase SMK1"],term_name:"SMK1"})
(n485 {full_name:"Mitogen-activated protein kinase (MAPK)",name:"KSS1",sgd:"S000003272",synonyms:["YGR040W","mitogen-activated serine/threonine-protein kinase KSS1"],term_name:"KSS1"})
(n586 {full_name:"MAPKKK acting in the protein kinase C sig

## Insert raw interactions

In [14]:
# New raw interaction data from the 8/22/2016 data drop
# (superseded path: './raw_interactions_2_w_pm_ids.txt').
# Layout per the columns printed below: INT1, INT2, ten interaction-type
# columns, then ten more with the same names plus a ".1" suffix.
# create_edges() reads the first ten as scores and the ".1" ones as publications.
df_raw2 = pd.read_csv('./data/8_22_2016/intTable_kei_pm_ranks.txt', sep='\t')

print(df_raw2.shape)
print(df_raw2.columns)
df_raw2.head(10)

(122760, 22)
Index(['INT1', 'INT2', 'Co-expression', 'Genetic interactions',
       'Protein-protein interactions (high-throughput)',
       'Domain co-occurrence', 'Genomic context', 'Phylogenetic similarity',
       'Predicted from 3D structure', 'Conditional genetic interactions',
       'Co-citation', 'Protein-protein interactions (low-throughput)',
       'Co-expression.1', 'Genetic interactions.1',
       'Protein-protein interactions (high-throughput).1',
       'Domain co-occurrence.1', 'Genomic context.1',
       'Phylogenetic similarity.1', 'Predicted from 3D structure.1',
       'Conditional genetic interactions.1', 'Co-citation.1',
       'Protein-protein interactions (low-throughput).1'],
      dtype='object')


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,INT1,INT2,Co-expression,Genetic interactions,Protein-protein interactions (high-throughput),Domain co-occurrence,Genomic context,Phylogenetic similarity,Predicted from 3D structure,Conditional genetic interactions,...,Co-expression.1,Genetic interactions.1,Protein-protein interactions (high-throughput).1,Domain co-occurrence.1,Genomic context.1,Phylogenetic similarity.1,Predicted from 3D structure.1,Conditional genetic interactions.1,Co-citation.1,Protein-protein interactions (low-throughput).1
0,YBR056W,GPM2,0.798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,22745270,0,0,0,0,0,0,0,0,0
1,YBR056W,KKQ8,0.517,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,22289124,0,0,0,0,0,0,0,0,0
2,YBR056W,RTK1,0.824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,21348598,0,0,0,0,0,0,0,0,0
3,YBR056W,YBR287W,0.483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,24170935,0,0,0,0,0,0,0,0,0
4,YBR056W,YET2,0.763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,24170935,0,0,0,0,0,0,0,0,0
5,YBR056W,YIL055C,0.849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,26150418,0,0,0,0,0,0,0,0,0
6,YBR056W,YDL027C,0.58,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,GSE30054,0,0,0,0,0,0,0,0,0
7,YBR056W,YDL057W,0.275,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18854817,0,0,0,0,0,0,0,0,0
8,YBR056W,UGX2,0.986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,22289124,0,0,0,0,0,0,0,0,0
9,YBR056W,YGR130C,0.788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20581845,0,0,0,0,0,0,0,0,0


In [None]:
# NOTE(review): this cell has no execution count (In [None]) and `old_df_raw2`
# is not used by any other cell shown here. It loads the interaction file that
# sits next to the notebook -- presumably the previous release, kept for
# side-by-side comparison; confirm whether it is still needed.
old_df_raw2 = pd.read_csv('./raw_interactions_2_w_pm_ids.txt', sep="\t");
print(old_df_raw2.shape)
print(old_df_raw2.columns)
old_df_raw2.head(10)

In [15]:
# Names of the ten interaction-type score columns: positions 2..11, i.e. the
# columns right after INT1/INT2 and before the ".1" columns.
# These names become the relationship types in create_edges().
interaction_types = df_raw2.columns[2:12]
print(interaction_types)

Index(['Co-expression', 'Genetic interactions',
       'Protein-protein interactions (high-throughput)',
       'Domain co-occurrence', 'Genomic context', 'Phylogenetic similarity',
       'Predicted from 3D structure', 'Conditional genetic interactions',
       'Co-citation', 'Protein-protein interactions (low-throughput)'],
      dtype='object')


In [16]:
def create_edges(source, target, row, headers, g):
    """Create one relationship per non-zero interaction score in ``row``.

    Args:
        source: Start node of every relationship created.
        target: End node of every relationship created.
        row: Tuple from ``DataFrame.itertuples()``: the index, the two gene
            names, one score per entry in ``headers``, then one publication
            value per entry in ``headers`` (same order).
        headers: Interaction type names; ``headers[i]`` is used as the
            relationship type for the i-th score.
        g: Graph object whose ``create`` persists each relationship.

    Returns:
        None. Relationships are written through ``g.create``.
    """
    # row[0] is the DataFrame index and row[1:3] are the two gene names, so
    # scores start at position 3. Slice widths follow len(headers) instead of
    # assuming exactly ten interaction types.
    first_score = 3
    n_types = len(headers)
    scores = row[first_score:first_score + n_types]
    pubs = row[first_score + n_types:first_score + 2 * n_types]

    for i, score in enumerate(scores):
        if score == 0:
            # A zero score means this interaction type has no evidence.
            continue

        e = Relationship(source, headers[i], target)
        e.properties["score"] = score
        e.properties["publication"] = str(pubs[i])
        e.properties["type"] = "raw_interaction"
        g.create(e)
            
def create_node(key, idx, gene_details):
    """Fetch or create the node named ``key`` through the legacy index.

    Args:
        key (str): Gene symbol; stored as both ``name`` and ``term_name``.
        idx: Legacy node index providing ``get_or_create``.
        gene_details (dict): Maps gene symbol -> dict with "sgd", "name" and
            "syn_list"; when ``key`` is present its details are attached.

    Returns:
        The node returned by ``idx.get_or_create``, keyed under "name" by the
        lower-cased ``key``.
    """
    # Properties every node gets, known gene or not.
    props = {"name": key, "term_name": key}

    if key in gene_details:
        info = gene_details[key]
        props.update(
            sgd=info['sgd'],
            full_name=info["name"],
            synonyms=info["syn_list"],
        )

    return idx.get_or_create("name", key.lower(), props)

In [17]:
# Spot check: confirm that SNX4 is a key of gene_details (symbol or alias).
# Prints nothing when the key is absent.
if "SNX4" in gene_details:
    print("OK")

OK


In [18]:
def _resolve_gene_node(symbol, cache, idx, gene_details):
    """Return the node for ``symbol``, consulting ``cache`` before the index."""
    if symbol not in cache:
        # Nodes are indexed under the lower-cased name (see add_node and
        # create_node), so the lookup key must be lower-cased too. Otherwise
        # the exact-match lookup misses and every row falls through to
        # create_node().
        hits = idx.get("name", symbol.lower())
        if len(hits) == 0:
            cache[symbol] = create_node(symbol, idx, gene_details)
        else:
            cache[symbol] = hits[0]
    return cache[symbol]


# Gene symbol -> node. Each symbol appears in many interaction rows, so
# resolve it against the database only once.
raw_gene_nodes = {}

for row in df_raw2.itertuples():
    # row[0] is the DataFrame index; row[1] and row[2] are INT1 and INT2.
    source = _resolve_gene_node(str(row[1]), raw_gene_nodes, idx, gene_details)
    target = _resolve_gene_node(str(row[2]), raw_gene_nodes, idx, gene_details)

    # One relationship per non-zero interaction score in this row.
    create_edges(source, target, row, interaction_types, g)

In [19]:
# Final graph size after the raw interactions are inserted. After the DAG step
# this value equalled the number of relationships created (1521).
g.size

44555