# Database builder scripts for Ontology Viewer

This notebook inserts AtgO data (the ontology DAG and the raw interactions) into a Neo4j database.

In [1]:
from py2neo import Graph
from py2neo import Node, Relationship
import pandas as pd
import numpy as np

# For legacy INDEXER
from py2neo import neo4j

# Start from scratch...
# NOTE: Graph() is called with no arguments; the index URL printed by this
# cell shows it resolves to the local server (http://localhost:7474/db/data/).
# WARNING: delete_all() runs unconditionally, so every run of this cell empties
# that graph before anything is rebuilt (g.size prints 0 right after).
g = Graph()
g.delete_all()

# Create index object
# "Vertex" is the legacy node index that later cells use (via `idx`) to look
# nodes up by the "name" and "term_name" keys.
idx = g.legacy.get_or_create_index(neo4j.Node, "Vertex")

print(idx)
print(g.size)

Index(Node, 'http://localhost:7474/db/data/index/node/Vertex')
0


## Create Ontology DAG in Neo4j

In [2]:
# Load the per-term property table for the AtgO terms (tab-separated, with a header row)
term_info_path = './atgo2_term_info.txt'
df_term_props = pd.read_csv(term_info_path, sep="\t")
df_term_props.tail(10)

Unnamed: 0,Term Number,Term Name,Similarity Score,Term Size
210,242,242,2.524,23
211,243,cell cycle and Snf1 signaling pathway,5.303,24
212,244,244,3.356,28
213,245,245,3.483,30
214,246,246,2.285,41
215,247,247,3.855,44
216,248,248,1.482,46
217,249,autophagy and related processes,3.856,300
218,250,250,2.43,431
219,251,251,2.004,444


In [3]:
# Read the ontology edge list: one tab-separated row per (target, source, type).
# The file has no header row, so column names are supplied here.
df = pd.read_csv('./AtgO2.txt', sep="\t", names=["target", "source", "type"])

# Compare identifiers as strings so numeric term IDs and gene symbols can be mixed.
s = df["source"].astype(str)
t = df["target"].astype(str)

# Every distinct identifier that appears at either end of an edge.
# pd.concat is used because Series.append was removed in pandas 2.0.
all_nodes = pd.concat([s, t]).unique()

print(len(all_nodes))
df.head()

712


Unnamed: 0,target,source,type
0,212,RIM13,gene
1,212,VPS4,gene
2,174,IWR1,gene
3,174,VPS21,gene
4,172,MSN1,gene


In [4]:
# Load the term -> genes table; the file has no header, so the two columns are named here
term_genes_path = 'AtgO2_terms_to_genes.txt'
df2 = pd.read_csv(term_genes_path, sep="\t", names=["term", "genes"])
df2.head()

Unnamed: 0,term,genes
0,1,RIM13|VPS4|IWR1|VPS21|MSN1|SSD1|AKR1|AKR2|GCN2...
1,2,RIM13|VPS4|IWR1|VPS21|MSN1|SSD1|AKR1|AKR2|GCN2...
2,3,RIM13|VPS4|IWR1|VPS21|MSN1|SSD1|AKR1|AKR2|GCN2...
3,5,VPS4|IWR1|VPS21|SSD1|AKR1|AKR2|GCN2|GCN4|CTK1|...
4,6,VPS4|IWR1|VPS21|GCN2|GCN4|CTK1|SEC7|DOM34|HBS1...


In [5]:
# Map each term ID (as a string) to its list of genes by splitting the
# pipe-delimited "genes" column. Each record is (index, term, genes).
assigned_genes = {
    str(term): genes.split('|')
    for _, term, genes in df2.itertuples()
}

In [6]:
# Smoke test: select the row for one known term number and show its name
ti = df_term_props[df_term_props["Term Number"] == 243]
print(ti["Term Name"].iloc[0])

def is_int(str_val):
    """Report whether ``int(str_val)`` succeeds.

    Returns True when the conversion works and False when it raises
    ValueError; any other exception type propagates to the caller.
    """
    try:
        int(str_val)
    except ValueError:
        return False
    else:
        return True

def add_node(node_name, assigned_genes, index, df_t):
    """Get or create the indexed graph node for an ontology term or a gene.

    Parameters
    ----------
    node_name : str
        Identifier taken from the edge list. A value that parses as an integer
        is looked up in ``df_t`` by "Term Number"; any other value is used as
        its own ``term_name``.
    assigned_genes : dict
        Maps identifier -> list of genes. When ``node_name`` is a key, that
        list is stored as the node's ``assigned_genes`` property.
    index
        Index object providing ``get_or_create(key, value, props)`` and
        ``add_if_none(key, value, node)``.
    df_t : pandas.DataFrame
        Term table with the columns "Term Number", "Term Name",
        "Similarity Score" and "Term Size".

    Returns
    -------
    The node returned by ``index.get_or_create``.
    """
    node_props = {
        "name": str(node_name),
        # Default label. Without it, an integer-like identifier that is absent
        # from df_t had no "term_name" and the extra-index step raised KeyError.
        "term_name": str(node_name),
    }

    # Integer-like identifiers are ontology terms; everything else is a gene.
    try:
        tid = int(node_name)
    except ValueError:
        tid = None

    if tid is not None:
        term_info = df_t.loc[df_t["Term Number"] == tid]

        if not term_info.empty:
            # Only the first matching row is used.
            node_props["term_name"] = str(term_info["Term Name"].values[0])
            # Stored as plain Python numbers rather than numpy scalars.
            node_props["similarity_score"] = float(term_info["Similarity Score"].values[0])
            node_props["term_size"] = int(term_info["Term Size"].values[0])

    if node_name in assigned_genes:
        node_props["assigned_genes"] = assigned_genes[node_name]

    # Primary index entry: key "name" -> identifier.
    node = index.get_or_create("name", node_name, node_props)

    # Extra index
    index.add_if_none("term_name", node_props["term_name"], node)

    return node

cell cycle and Snf1 signaling pathway


In [7]:
# Build the ontology DAG in the database: one "child_of" edge per edge-list row

# Cache of nodes already fetched/created, keyed by identifier string
nodes = {}

# Each record is (index, target, source, type); the edge points source -> target
for _, target_raw, source_raw, _kind in df.itertuples():
    endpoints = []
    # Resolve the source first, then the target, reusing cached nodes
    for endpoint_name in (str(source_raw), str(target_raw)):
        if endpoint_name not in nodes:
            nodes[endpoint_name] = add_node(endpoint_name, assigned_genes, idx, df_term_props)
        endpoints.append(nodes[endpoint_name])
    source, target = endpoints

    edge = Relationship(source, "child_of", target)
    # Tag the edge so it can be told apart from the raw-interaction edges
    edge.properties["type"] = "term_relation"
    g.create(edge)

In [8]:
# Graph size after the DAG has been written
print(g.size)

# test result: exact-match lookup on the extra "term_name" index key
res2 = idx.get("term_name", "autophagy and related processes")
print(res2[0])

# test result: query-string lookup on the "name" key
results = idx.query("name:YBR056W")
for n in results:
    print(n)

1445
(n671 {assigned_genes:["VPS4","VPS21","SSD1","AKR1","AKR2","GCN2","GCN4","CTK1","SGV1","PTK2","PTK1","ARF1","SEC7","ATG8","SNX4","IRS4","IPL1","BIR1","RHO1","PKC1","ATG14","VPS30","ARP2","ACT1","STT4","LSB6","VMA2","YPT32","YPT31","GGA1","EPL1","ESA1","SMK1","CDC28","KIN82","FPK1","CDC5","PSK2","GPD1","ATG7","OPI1","CBF1","VAC8","YCK3","NTH1","NVJ1","MIG1","MON1","CCZ1","ARL1","KDX1","PTC1","ATG18","RIM15","GIS1","VPS9","ATG19","ATG26","TOF2","CSM1","STE11","STE50","NBP2","GCN3","BMH1","BMH2","SWH1","ARF2","GEA1","GEA2","SIP2","SNF1","SLG1","WSC3","KIN1","KIN2","AFT1","AFT2","SIC1","GTR2","YCK1","YCK2","YPK1","YPK2","PHO85","PHO80","YPT52","UBP3","BRE5","SEC4","YPT1","CKA2","CKA1","ATG1","ATG13","ATG10","ATG27","SSK22","SSK2","VPS29","PEP8","ATG3","MID2","MTL1","VPS52","VPS51","RCK2","HOG1","PKH1","PKH2","TOR2","TOR1","PEP12","VAM7","VAM3","VTC3","VTC4","VTC2","COG3","ATG12","SNF8","VPS36","HAL5","SAT4","TLG2","VPS13","VPS1","CMK1","SLT2","MPS1","SKM1","STE20","CLA4","REG1","DUN1"

## Insert raw interactions

In [9]:
# Load the pairwise raw interaction table (tab-separated, with a header row)
raw_interactions_path = './raw_interactions.txt'
df_raw = pd.read_csv(raw_interactions_path, sep="\t")
print(df_raw.shape)
df_raw.tail()

(26548, 10)


Unnamed: 0,INT1,INT2,Co-expression,Genetic interactions,Protein-protein interactions,Domain co-occurrence,Genomic context,Phylogenetic similarity,Predicted from 3D structure,Conditional genetic interactions
26543,SAC1,TCD2,0.0,1.155,0.0,0.0,0,0.0,0,0
26544,SAC1,PKR1,0.0,2.771,2.515,0.0,0,0.0,0,0
26545,TCD1,TCD2,1.154,3.299,4.542,4.13,0,4.609,0,0
26546,TCD1,PKR1,0.0,1.051,0.0,0.0,0,0.0,0,0
26547,TCD2,PKR1,0.0,0.996,0.0,0.0,0,0.0,0,0


In [10]:
# The first two columns (INT1, INT2) name the interacting pair; every
# remaining column is one interaction type holding a score.
n_endpoint_cols = 2
interaction_types = df_raw.columns[n_endpoint_cols:]
print(interaction_types)

Index(['Co-expression', 'Genetic interactions', 'Protein-protein interactions',
       'Domain co-occurrence', 'Genomic context', 'Phylogenetic similarity',
       'Predicted from 3D structure', 'Conditional genetic interactions'],
      dtype='object')


In [11]:
def create_edges(source, target, row, headers, g):
    """Create one relationship per non-zero interaction score in ``row``.

    Parameters
    ----------
    source, target
        Nodes to connect; each relationship goes from ``source`` to ``target``.
    row
        A record from ``DataFrame.itertuples()``. Positions 3 onward are read
        as scores; positions 0-2 are skipped.
    headers
        Relationship type names; ``headers[i]`` names the i-th score.
    g
        Graph object; ``g.create(e)`` is called once per relationship.

    Returns
    -------
    list
        The relationships that were created (empty when every score is 0).
        Earlier versions returned None, so callers that ignore the result
        are unaffected.
    """
    edges = []

    scores = row[3:]
    for i, score in enumerate(scores):
        # A score of 0 means "no evidence of this type": write no edge.
        if score == 0:
            continue

        e = Relationship(source, headers[i], target)
        e.properties["score"] = score
        # Tag the edge so it can be told apart from the ontology edges.
        e.properties["type"] = "raw_interaction"
        g.create(e)
        edges.append(e)

    return edges

In [12]:
# Nodes already resolved in this pass, keyed by gene name, so each gene is
# looked up in the index once instead of once per interaction row.
gene_nodes = {}


def get_gene_node(name):
    """Return the indexed node for ``name``, creating a bare node if none exists."""
    if name not in gene_nodes:
        hits = idx.get("name", name)
        if len(hits) == 0:
            # NOTE(review): nodes created here only carry "name"; unlike the
            # nodes made by add_node they get no "term_name" property or
            # index entry -- confirm this is intended.
            gene_nodes[name] = idx.get_or_create("name", name, {"name": name})
        else:
            gene_nodes[name] = hits[0]
    return gene_nodes[name]


for row in df_raw.itertuples():
    # row[0] is the DataFrame index; row[1] and row[2] are the INT1/INT2 columns.
    source = get_gene_node(str(row[1]))
    target = get_gene_node(str(row[2]))

    # One relationship per non-zero score column.
    # (An earlier variant stored all scores as properties on a single
    # "interact_with" relationship; it was replaced by per-type relationships.)
    create_edges(source, target, row, interaction_types, g)

In [13]:
# Graph size after the raw interactions have been inserted; compare with the
# value printed right after the DAG was built.
g.size

39769