# Graph database builder for AtgO Ontology Viewer

## Introduction

This script inserts AtgO data (the ontology DAG and the raw interactions) into a Neo4j database.

This is version 3. Additional information is added from the Gene Association file, including:

* Full gene name
* Synonyms
* SGD ID

## Log

* 3/2/2016 - Adding new data, such as publications.
* 3/11/2016 - first working version.
* 4/14/2016 - missing SGD ID problems fixed.

In [1]:
from py2neo import Graph
from py2neo import Node, Relationship
import pandas as pd
import numpy as np

# For legacy INDEXER
from py2neo import neo4j

# Start from scratch: connect with py2neo's default Graph() settings and wipe
# the database. WARNING: delete_all() is destructive -- everything already
# stored in the connected graph is removed before the import begins.
g = Graph()
g.delete_all()

# Create (or fetch) the legacy node index named "Vertex". Later cells use it
# to look nodes up by name and by the other indexed keys.
idx = g.legacy.get_or_create_index(neo4j.Node, "Vertex")

# Sanity check: show the index handle and the graph size right after the wipe.
print(idx)
print(g.size)

Index(Node, 'http://localhost:7474/db/data/index/node/Vertex')
0


## Create Ontology DAG in Neo4j

In [2]:
# Read properties for AtgO terms from a tab-separated file in the working
# directory. add_node() reads the "Term Number", "Term Name",
# "Similarity Score" and "Term Size" columns of this table.
df_term_props = pd.read_csv('./atgo2_term_info.txt', sep="\t")
# Show the last rows to confirm the table loaded.
df_term_props.tail()

Unnamed: 0,Term Number,Term Name,Similarity Score,Term Size
215,247,247,3.855,44
216,248,248,1.482,46
217,249,autophagy and related processes,3.856,300
218,250,250,2.43,431
219,251,251,2.004,444


In [3]:
# Load the ontology edge list. The file has no header row, so the three
# columns are named explicitly. Each row links a "source" (child term or gene)
# to the "target" term it belongs to.
df = pd.read_csv('./AtgO2.txt', sep="\t", names=["target", "source", "type"])

# Collect every distinct node name that appears on either end of an edge.
# pd.concat is used instead of Series.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
source_names = df["source"].astype(str)
target_names = df["target"].astype(str)
all_nodes = pd.concat([source_names, target_names]).unique()

print(len(all_nodes))
df.head()

712


Unnamed: 0,target,source,type
0,212,RIM13,gene
1,212,VPS4,gene
2,174,IWR1,gene
3,174,VPS21,gene
4,172,MSN1,gene


In [4]:
# Assign genes: load the term -> genes table. The file has no header row, so
# the two columns are named here. "genes" holds a '|'-separated list of gene
# symbols for the term in the same row.
df2 = pd.read_csv('AtgO2_terms_to_genes.txt', sep="\t", names=["term", "genes"])
df2.head()

Unnamed: 0,term,genes
0,1,RIM13|VPS4|IWR1|VPS21|MSN1|SSD1|AKR1|AKR2|GCN2...
1,2,RIM13|VPS4|IWR1|VPS21|MSN1|SSD1|AKR1|AKR2|GCN2...
2,3,RIM13|VPS4|IWR1|VPS21|MSN1|SSD1|AKR1|AKR2|GCN2...
3,5,VPS4|IWR1|VPS21|SSD1|AKR1|AKR2|GCN2|GCN4|CTK1|...
4,6,VPS4|IWR1|VPS21|GCN2|GCN4|CTK1|SEC7|DOM34|HBS1...


In [5]:
# Map each term ID (as a string) to the list of gene symbols assigned to it,
# obtained by splitting the '|'-separated "genes" column.
assigned_genes = {
    str(term): genes.split('|')
    for term, genes in zip(df2["term"], df2["genes"])
}

In [6]:
# Extract gene info from GA file
# Names for the 17 tab-separated columns; the file is read with header=None,
# so these names are the only column labels the frame gets.
HEADER = [
    "DB", "DB_Object_ID", "DB_Object_Symbol", "Qualifier", "GO ID",
    "DB:Reference", "Evidence", "With (or) From", "Aspect",
    "DB_Object_Name", "DB_Object_Synonym", "DB_Object_Type", "taxon",
    "Date", "Assigned_by", "Annotation Extension", "Gene Product Form ID",
]

# comment="!" makes pandas ignore the "!"-prefixed comment text in the file.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the truncated warning in the recorded output of this cell looks
# like pandas' DtypeWarning for exactly that situation -- confirm on re-run.
df_ga = pd.read_csv(
    './gene_association.sgd',
    sep="\t",
    comment="!",
    header=None,
    names=HEADER,
    low_memory=False,
)

df_ga.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,DB,DB_Object_ID,DB_Object_Symbol,Qualifier,GO ID,DB:Reference,Evidence,With (or) From,Aspect,DB_Object_Name,DB_Object_Synonym,DB_Object_Type,taxon,Date,Assigned_by,Annotation Extension,Gene Product Form ID
0,SGD,S000007287,15S_RRNA,,GO:0005763,SGD_REF:S000073641|PMID:6262728,IDA,,C,Ribosomal RNA of the small mitochondrial ribos...,Q0020|14s rRNA|15S_RRNA_2,gene,taxon:559292,20150612,SGD,,
1,SGD,S000007287,15S_RRNA,,GO:0032543,SGD_REF:S000073641|PMID:6262728,IC,GO:0005763,P,Ribosomal RNA of the small mitochondrial ribos...,Q0020|14s rRNA|15S_RRNA_2,gene,taxon:559292,20150612,SGD,,
2,SGD,S000007287,15S_RRNA,,GO:0003735,SGD_REF:S000073641|PMID:6262728,IC,GO:0005763,F,Ribosomal RNA of the small mitochondrial ribos...,Q0020|14s rRNA|15S_RRNA_2,gene,taxon:559292,20150612,SGD,,
3,SGD,S000007288,21S_RRNA,,GO:0005762,SGD_REF:S000073372|PMID:6759872,IDA,,C,Mitochondrial 21S rRNA,Q0158|21S_rRNA_3|21S_rRNA_4,gene,taxon:559292,20040202,SGD,,
4,SGD,S000007288,21S_RRNA,,GO:0032543,SGD_REF:S000073372|PMID:6759872,IMP,,P,Mitochondrial 21S rRNA,Q0158|21S_rRNA_3|21S_rRNA_4,gene,taxon:559292,20100715,SGD,,


In [7]:
# Build gene_details: gene symbol or alias -> details dict with the keys
#   "sgd"      : DB_Object_ID as a string
#   "name"     : DB_Object_Name as a string
#   "syn_list" : DB_Object_Synonym split on '|'
#
# Primary symbols and aliases are collected separately and merged at the end
# so that a gene's own symbol always takes precedence over another gene's
# alias. Writing both into one dict in file order let a later gene's alias
# silently replace an earlier gene's own entry.
primary_details = {}
alias_details = {}

ga_columns = ["DB_Object_ID", "DB_Object_Symbol", "DB_Object_Name", "DB_Object_Synonym"]

for sgd_id, key, full_name, synonym in zip(*(df_ga[col] for col in ga_columns)):
    # NOTE(review): an empty synonym cell is read as NaN, so str() turns it
    # into the literal alias "nan". Kept as-is; confirm whether such rows
    # exist before changing it, since downstream code stores syn_list as a
    # node property.
    syn_list = str(synonym).split('|')
    details = {
        "sgd": str(sgd_id),
        "name": str(full_name),
        "syn_list": syn_list,
    }

    primary_details[key] = details
    # Use aliases, too
    for syn in syn_list:
        alias_details[syn] = details

# Aliases first, then primary symbols on top so they win on a name clash.
gene_details = dict(alias_details)
gene_details.update(primary_details)


In [10]:
# Check gene list is complete: each lookup raises KeyError if the symbol or
# alias is missing from gene_details.
for symbol in ('AIM13', 'AIM5', 'AIM37', 'FCJ1', 'GSG1'):
    print(gene_details[symbol])

# Special case for CIS1: point it at the ATG31 entry.
gene_details['CIS1'] = gene_details['ATG31']

# Both names must now print the same details.
for symbol in ('ATG31', 'CIS1'):
    print(gene_details[symbol])

{'syn_list': ['YFR011C', 'AIM13', 'MCS19'], 'sgd': 'S000001907', 'name': 'Component of the MICOS complex'}
{'syn_list': ['YBR262C', 'AIM5', 'FMP51', 'MCS12'], 'sgd': 'S000000466', 'name': 'Component of the MICOS complex'}
{'syn_list': ['YNL100W', 'AIM37', 'MCS27'], 'sgd': 'S000005044', 'name': 'Component of the MICOS complex'}
{'syn_list': ['YKR016W', 'AIM28', 'FCJ1', 'FMP13'], 'sgd': 'S000001724', 'name': 'Component of the MICOS complex'}
{'syn_list': ['YDR108W', 'GSG1', 'MUM1'], 'sgd': 'S000002515', 'name': 'Component of transport protein particle (TRAPP) complex III'}
{'syn_list': ['YDR022C', 'CIS1'], 'sgd': 'S000002429', 'name': 'Autophagy-specific protein required for autophagosome formation'}
{'syn_list': ['YDR022C', 'CIS1'], 'sgd': 'S000002429', 'name': 'Autophagy-specific protein required for autophagosome formation'}


In [11]:
# Test find term info: print the name recorded for term number 243.
term_243_names = df_term_props.loc[df_term_props["Term Number"] == 243, "Term Name"]
print(term_243_names.values[0])

def is_int(str_val):
    """Return True if ``str_val`` can be converted by ``int()``, else False.

    Used to tell numeric term IDs (e.g. "243") apart from gene symbols
    (e.g. "RIM13").
    """
    try:
        int(str_val)
    except (ValueError, TypeError):
        # ValueError: text that is not an integer literal, e.g. "RIM13" or "".
        # TypeError: values int() cannot accept at all, e.g. None.
        return False
    return True

def add_node(node_name, assigned_genes, index, df_t, gene_info=None):
    """Create (or fetch) the indexed node for one ontology entry.

    Parameters
    ----------
    node_name : str
        Node identifier: a numeric term ID such as "243" or a gene symbol.
    assigned_genes : dict
        Maps a term ID (str) to the list of gene symbols assigned to it.
    index : object
        Index providing ``get_or_create(key, value, props)``,
        ``add(key, value, node)`` and ``add_if_none(key, value, node)``
        (the py2neo legacy index in this notebook).
    df_t : pandas.DataFrame
        Term table with the columns "Term Number", "Term Name",
        "Similarity Score" and "Term Size".
    gene_info : dict, optional
        Maps a gene symbol/alias to a dict with the keys "sgd", "name" and
        "syn_list". Defaults to the notebook-level ``gene_details``.

    Returns
    -------
    The node returned by ``index.get_or_create``.
    """
    if gene_info is None:
        # Backward-compatible default: earlier callers relied on the global.
        gene_info = gene_details

    node_props = {
        "name": str(node_name),
        # Default display name. Overwritten below when the term table has a
        # row for this ID. Without this default, a numeric ID missing from
        # the term table left "term_name" unset and the final index call
        # raised KeyError.
        "term_name": str(node_name),
    }

    if is_int(node_name):
        term_info = df_t.loc[df_t["Term Number"] == int(node_name)]
        if not term_info.empty:
            node_props["term_name"] = str(term_info["Term Name"].values[0])
            node_props["similarity_score"] = term_info["Similarity Score"].values[0]
            node_props["term_size"] = int(term_info["Term Size"].values[0])

    has_assigned_genes = node_name in assigned_genes
    if has_assigned_genes:
        node_props["assigned_genes"] = assigned_genes[node_name]

    # This is a gene entry:
    is_gene = node_name in gene_info
    if is_gene:
        details = gene_info[node_name]
        node_props["sgd"] = str(details["sgd"])
        node_props["full_name"] = details["name"]
        node_props["synonyms"] = details["syn_list"]

    # All index values are stored lower-cased.
    node = index.get_or_create("name", node_name.lower(), node_props)

    if has_assigned_genes:
        for gene in node_props["assigned_genes"]:
            index.add("assigned_genes", gene.lower(), node)

    if is_gene:
        index.add_if_none("sgd", node_props["sgd"].lower(), node)
        index.add_if_none("full_name", node_props["full_name"].lower(), node)
        for alias in node_props["synonyms"]:
            index.add("synonyms", alias.lower(), node)

    # Extra index
    index.add_if_none("term_name", node_props["term_name"].lower(), node)

    return node

cell cycle and Snf1 signaling pathway


In [12]:
# Create actual DAG in database

# Cache of nodes already created in this pass, keyed by node name, so that
# add_node() runs once per distinct name.
nodes = {}

for row in df.itertuples():
    # row[2] is the "source" column, row[1] the "target" column. The source
    # is resolved first, then the target.
    endpoints = []
    for name in (str(row[2]), str(row[1])):
        if name not in nodes:
            nodes[name] = add_node(name, assigned_genes, idx, df_term_props)
        endpoints.append(nodes[name])
    source, target = endpoints

    # source --child_of--> target, tagged as an ontology (term) relation.
    edge = Relationship(source, "child_of", target)
    edge.properties["type"] = "term_relation"
    g.create(edge)

In [13]:
# Report the graph size after the DAG import.
print(g.size)

# test result
# Smoke-test queries against the legacy index. idx.query() hands back a
# generator (see the repr printed by this cell), so only the results that are
# iterated below (results1 and results4) are actually consumed here.
# results0, results2 and results3 are created but never read.
results0 = idx.query("name:YBR056W")
results1 = idx.query("name:ybr*")

results2 = idx.query("sgd:S000006124")
results3 = idx.query("synonyms:PKA2")
results4 = idx.query("full_name:*mapk*")


# Printing the generator only shows its repr; the loop prints the actual hits.
print(results1)
for n in results1:
    print(n)

# NOTE(review): res2 is never iterated, and Lucene phrase queries are normally
# written with double quotes -- confirm this query matches as intended.
res2 = idx.query("term_name:'autophagy and related processes'")

for n in results4:
    print(n)

1445
<generator object query at 0x1092a0d38>
(n570 {full_name:"Protein of unknown function",name:"YBR287W",sgd:"S000000491",synonyms:["YBR287W","ZSP1"],term_name:"YBR287W"})
(n580 {full_name:"Putative glycoside hydrolase of the mitochondrial intermembrane space",name:"YBR056W",sgd:"S000000260",synonyms:["YBR056W","17-beta-hydroxysteroid dehydrogenase-like protein"],term_name:"YBR056W"})
(n698 {full_name:"Protein with a role in ER delivery of tail-anchored membrane proteins",name:"YBR137W",sgd:"S000000341",synonyms:["YBR137W"],term_name:"YBR137W"})
(n81 {full_name:"Middle sporulation-specific mitogen-activated protein kinase (MAPK)",name:"SMK1",sgd:"S000006258",synonyms:["YPR054W","mitogen-activated protein kinase SMK1"],term_name:"SMK1"})
(n589 {full_name:"MAPKKK acting in the protein kinase C signaling pathway",name:"BCK1",sgd:"S000003631",synonyms:["YJL095W","LAS3","SAP3","SLK1","SSP31","mitogen-activated protein kinase kinase kinase BCK1"],term_name:"BCK1"})
(n591 {full_name:"MAPKK 

## Insert raw interactions

In [14]:
# New raw interaction data (tab-separated, first line is the header).
# Columns INT1 and INT2 name the interacting pair. They are followed by the
# score columns and then by the same-named ".1" columns, which create_edges()
# stores as each relationship's "publication" property.
df_raw2 = pd.read_csv('./raw_interactions_w_pm_ids.txt', sep="\t");
print(df_raw2.shape)
df_raw2.head(10)

(26548, 18)


Unnamed: 0,INT1,INT2,Co-expression,Genetic interactions,Protein-protein interactions,Domain co-occurrence,Genomic context,Phylogenetic similarity,Predicted from 3D structure,Conditional genetic interactions,Co-expression.1,Genetic interactions.1,Protein-protein interactions.1,Domain co-occurrence.1,Genomic context.1,Phylogenetic similarity.1,Predicted from 3D structure.1,Conditional genetic interactions.1
0,YBR056W,GPM2,1.998,0,0,0,0,0,0,0,22745270,0,0,0,0,0,0,0
1,YBR056W,KKQ8,1.642,0,0,0,0,0,0,0,22289124,0,0,0,0,0,0,0
2,YBR056W,RTK1,2.047,0,0,0,0,0,0,0,21348598,0,0,0,0,0,0,0
3,YBR056W,YBR287W,1.612,0,0,0,0,0,0,0,24170935,0,0,0,0,0,0,0
4,YBR056W,YET2,1.942,0,0,0,0,0,0,0,24170935,0,0,0,0,0,0,0
5,YBR056W,YIL055C,2.096,0,0,0,0,0,0,0,26150418,0,0,0,0,0,0,0
6,YBR056W,YDL027C,1.706,0,0,0,0,0,0,0,GSE30054,0,0,0,0,0,0,0
7,YBR056W,YDL057W,1.436,0,0,0,0,0,0,0,18854817,0,0,0,0,0,0,0
8,YBR056W,UGX2,2.716,0,0,0,0,0,0,0,22289124,0,0,0,0,0,0,0
9,YBR056W,YGR130C,1.982,0,0,0,0,0,0,0,20581845,0,0,0,0,0,0,0


In [15]:
# Every column label after INT1/INT2: the score columns followed by their
# ".1"-suffixed counterparts. Passed to create_edges() as the relationship
# type names.
interaction_types = df_raw2.columns[2:]
print(interaction_types)

Index(['Co-expression', 'Genetic interactions', 'Protein-protein interactions',
       'Domain co-occurrence', 'Genomic context', 'Phylogenetic similarity',
       'Predicted from 3D structure', 'Conditional genetic interactions',
       'Co-expression.1', 'Genetic interactions.1',
       'Protein-protein interactions.1', 'Domain co-occurrence.1',
       'Genomic context.1', 'Phylogenetic similarity.1',
       'Predicted from 3D structure.1', 'Conditional genetic interactions.1'],
      dtype='object')


In [16]:
def create_edges(source, target, row, headers, g):
    """Create one relationship per non-zero interaction score in ``row``.

    Parameters
    ----------
    source, target
        The two nodes to connect.
    row : tuple
        One ``DataFrame.itertuples()`` row laid out as
        ``(index, INT1, INT2, score_1..score_N, pub_1..pub_N)``.
    headers : sequence of str
        Labels for ``row[3:]``; ``headers[i]`` becomes the relationship type
        of the i-th score.
    g
        Graph object; every relationship is written with ``g.create()``.

    Returns
    -------
    list
        The relationships created for this row (empty when every score is 0).
    """
    # Everything after (index, INT1, INT2) is N scores followed by N
    # publication entries. Deriving N from the row instead of hard-coding
    # row[3:11] / row[11:] keeps the pairing right for any number of types.
    values = row[3:]
    half = len(values) // 2
    scores = values[:half]
    pubs = values[half:]

    edges = []
    for i, score in enumerate(scores):
        if score == 0:
            # A zero score means no interaction of this type.
            continue

        e = Relationship(source, headers[i], target)
        e.properties["score"] = score
        e.properties["publication"] = str(pubs[i])
        e.properties["type"] = "raw_interaction"
        g.create(e)
        edges.append(e)

    return edges
            
def create_node(key, idx, gene_details):
    """Fetch or create the indexed node for the gene named ``key``.

    The node always carries "name" and "term_name" (both set to ``key``).
    When ``key`` is present in ``gene_details``, "sgd", "full_name" and
    "synonyms" are filled in from that entry as well. The node is registered
    in ``idx`` under the "name" key with the lower-cased ``key`` as value,
    and whatever ``idx.get_or_create`` returns is handed back.
    """
    props = {"name": key, "term_name": key}

    if key in gene_details:
        info = gene_details[key]
        props.update(
            sgd=info['sgd'],
            full_name=info["name"],
            synonyms=info["syn_list"],
        )

    return idx.get_or_create("name", key.lower(), props)

In [17]:
# Spot check: confirm that a known gene symbol (SNX4) is present in
# gene_details before the raw interactions are inserted.
if "SNX4" in gene_details:
    print("OK")

OK


In [18]:
# Insert the raw interactions: for every row, resolve (or create) both gene
# nodes, then create one relationship per non-zero score.
for row in df_raw2.itertuples():
    source_name = str(row[1])   # INT1
    target_name = str(row[2])   # INT2

    # add_node() and create_node() register nodes under the lower-cased name,
    # so the lookup key is lower-cased to match. Looking up the original-case
    # symbol does not match those entries on a case-sensitive index, which
    # forces every row through create_node().
    source_hits = idx.get("name", source_name.lower())
    target_hits = idx.get("name", target_name.lower())

    if len(target_hits) == 0:
        target = create_node(target_name, idx, gene_details)
    else:
        target = target_hits[0]

    if len(source_hits) == 0:
        source = create_node(source_name, idx, gene_details)
    else:
        source = source_hits[0]

    create_edges(source, target, row, interaction_types, g)

In [19]:
# Final graph size, read after the raw interactions have been inserted.
g.size

39769