# Creation of the Neo4j Database


In [6]:
import config
import time

import pandas as pd

from neo4j import GraphDatabase

In [7]:
# Open the Neo4j driver with the URI and credentials taken from the local
# `config` module. Transport encryption is explicitly switched off.
driver = GraphDatabase.driver(
    config.uri,
    auth=(config.user, config.password),
    encrypted=False,
)

# Check whether the connection works
def check_connectivity(driver):
    """Return True if a trivial query round-trips through ``driver``.

    Opens a session, runs ``RETURN 1 as result`` and inspects the single
    record that comes back.

    Args:
        driver: Object whose ``session()`` returns a context manager offering
            ``run(query)``; the value returned by ``run`` must provide
            ``single()`` (for example a neo4j driver).

    Returns:
        bool: True when the record's ``result`` field equals 1. False when the
        record is missing or has another value, or when any exception is
        raised while talking to the database. The reason for a failure is
        printed so it is not lost.
    """
    try:
        with driver.session() as session:
            record = session.run("RETURN 1 as result").single()
            if record and record["result"] == 1:
                return True

            print(f"Connectivity check returned an unexpected record: {record!r}")
            return False
    except Exception as error:
        # Keep the boolean contract, but say why the check failed instead of
        # silently discarding the exception.
        print(f"Connectivity check failed: {type(error).__name__}: {error}")
        return False


# Run the connectivity check once and report the outcome.
connection_ok = check_connectivity(driver)
print("Verbindung erfolgreich hergestellt." if connection_ok else "Fehler bei der Verbindungsherstellung.")

Verbindung erfolgreich hergestellt.


## Indexing of Database
### Delete Database Edges and Nodes

In [8]:
# Delete nodes for genes
def delete_gene_nodes(batch_size = 10000, db_driver = None):
    """Delete all ``gene`` nodes, together with their relationships, in batches.

    The ids of all ``gene`` nodes are read first; the nodes are then removed
    with ``DETACH DELETE`` in chunks of ``batch_size`` ids, printing one
    timestamped progress line per chunk.

    Note: nodes are looked up again by their ``id`` property, so a ``gene``
    node without an ``id`` is not matched by the delete query.

    Args:
        batch_size: Number of node ids sent with each delete query. Must be
            a positive integer.
        db_driver: Driver used to open the session. Defaults to the
            notebook-level ``driver`` when omitted.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    active_driver = driver if db_driver is None else db_driver

    with active_driver.session() as session:
        result = session.run("MATCH (n:gene) RETURN n.id AS id")
        node_ids = [record["id"] for record in result]

        # Ceiling division: an exact multiple of batch_size must not report
        # one batch more than is actually executed.
        total_batches = (len(node_ids) + batch_size - 1) // batch_size

        starts = range(0, len(node_ids), batch_size)
        for batch_number, start in enumerate(starts, start=1):
            session.run("""
            UNWIND $batch AS id
            MATCH (n:gene {id: id})
            DETACH DELETE n
            """, batch=node_ids[start:start + batch_size])

            now = time.strftime("%H:%M:%S", time.localtime())
            print(f"{now}\tBatch {batch_number} of {total_batches} complete")

# To delete everything: delete the Docker container
# docker rm container-number -v

In [9]:
# Intentionally disabled: the destructive clean-up below is wrapped in a string
# literal so that it does not run. Remove the surrounding triple quotes to
# actually delete all gene nodes.
"""## DELETES Nodes and Edges
delete_gene_nodes()
print("Delete Gene Nodes done.")"""

'## DELETES Nodes and Edges\ndelete_gene_nodes()\nprint("Delete Gene Nodes done.")'

## Load Data as Dataframes


In [10]:
# Node tables (genes, proteins), read in this order.
df_nodes_gene, df_nodes_protein = map(
    pd.read_csv,
    ('../processed_data/nodes_genes.csv', '../processed_data/nodes_protein.csv'),
)

# Edge tables (protein-protein, protein-gene), read in this order.
df_edges_protein, df_edges_protein_gene = map(
    pd.read_csv,
    ('../processed_data/edges_protein.csv', '../processed_data/edges_protein_gene.csv'),
)

In [11]:
# Wall-clock timestamp taken before the import starts; the last cell of the
# notebook subtracts it from the end timestamp to report the elapsed time.
start_time = time.time()

## Proteins
### Protein Nodes

**CYPHER QUERY:**
```
CREATE (p:protein {id: 'Protein stable ID'})
```

**Node-Types:**
* protein
	

**Node-Properties:**
* id

In [12]:
# Create the index on protein.id (skipped if it already exists) before loading data.
with driver.session() as index_session:
    index_session.run("""CREATE INDEX IF NOT EXISTS FOR (p:protein) ON (p.id)""")

In [13]:
# Sanity check: number of rows in the protein node table.
protein_node_summary = f"There are {df_nodes_protein.shape[0]} Protein nodes"
print(protein_node_summary)

There are 104235 Protein nodes


In [14]:
# One parameter map per protein node. Reading the column directly avoids the
# per-row overhead of DataFrame.iterrows() and yields plain Python values.
data = [{'id': protein_id} for protein_id in df_nodes_protein['Protein ID'].tolist()]

batch_size = 500
progress_every = 50000  # rows between progress messages (every 100 batches)

# Ceiling division, so an exact multiple of batch_size is not over-counted.
total_batches = (len(data) + batch_size - 1) // batch_size

# The query text is the same for every batch, so it is defined once.
# NOTE: CREATE always adds new nodes; running this cell twice duplicates them.
query = """
UNWIND $batch AS row
CREATE (p:protein {id: row.id})
"""

with driver.session() as session:
    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]

        # Pass the batch as a query parameter
        session.run(query, batch=batch)

        if i % progress_every == 0:
            now = time.strftime("%H:%M:%S", time.localtime())
            print(f"{now}\tBatch {i // batch_size + 1} of {total_batches} complete")

print("done!!")

16:03:42	Batch 1 of 209 complete
16:03:44	Batch 101 of 209 complete
16:03:46	Batch 201 of 209 complete
done!!


### Protein-Protein Edges
Every edge is an interaction between two proteins.
The interaction type is not specified further.


**Cypher Query:**
``` 
MATCH (s:protein{id:'protein1'})
MATCH (t:protein{id:'protein2'})
CREATE (s)-[:interaction]->(t)
```

In [15]:
# Sanity check: number of rows in the protein-protein edge table.
protein_edge_summary = f"There are {df_edges_protein.shape[0]} Protein-Protein Interactions"
print(protein_edge_summary)

There are 13715404 Protein-Protein Interactions


In [16]:
# Each edge is a (left protein id, right protein id) pair; in Cypher the pair
# is addressed as row[0] and row[1].
protein1 = df_edges_protein['left Protein ID'].tolist()
protein2 = df_edges_protein['right Protein ID'].tolist()
data = list(zip(protein1, protein2))

batch_size = 1000000

# Ceiling division, so an exact multiple of batch_size is not over-counted.
total_batches = (len(data) + batch_size - 1) // batch_size

# The query text is the same for every batch, so it is defined once.
# Pairs whose endpoints are not found by the MATCH clauses create no edge.
# NOTE: CREATE always adds new relationships; running this cell twice
# duplicates them.
query = """
UNWIND $batch AS row
MATCH (s:protein{id:row[0]})
MATCH (t:protein{id:row[1]})
CREATE (s)-[:interaction]->(t)
"""

with driver.session() as session:
    for batch_number, i in enumerate(range(0, len(data), batch_size), start=1):
        batch = data[i:i + batch_size]

        session.run(query, batch=batch)

        # Progress is reported after every batch (i is always a multiple of
        # batch_size, so no extra condition is needed).
        now = time.strftime("%H:%M:%S", time.localtime())
        print(f"{now}\tBatch {batch_number} of {total_batches} complete")

print("done!!")

16:04:05	Batch 1 of 14 complete
16:04:23	Batch 2 of 14 complete
16:04:41	Batch 3 of 14 complete
16:05:01	Batch 4 of 14 complete
16:05:23	Batch 5 of 14 complete
16:05:44	Batch 6 of 14 complete
16:06:14	Batch 7 of 14 complete
16:06:34	Batch 8 of 14 complete
16:06:57	Batch 9 of 14 complete
16:07:20	Batch 10 of 14 complete
16:07:45	Batch 11 of 14 complete
16:08:09	Batch 12 of 14 complete
16:08:28	Batch 13 of 14 complete
16:08:56	Batch 14 of 14 complete
done!!


## Genes
### Gene Nodes

**CYPHER QUERY:**
```
CREATE (p:gene {
    id: Gene ID,
    gene_name: Gene Name,
    norm_healthy_tpm: norm healthy TPM,
    norm_cancerous_tpm: norm cancerous TPM,
    Δ_TPM: Δ TPM,
    Δ_type: Δ type,
    z_score: z score,
    Δ_TPM_relevant: Δ TPM relevant})
```

**Node-Types:**
* gene
	

**Node-Properties:**
* Gene ID
* Gene Name
* norm healthy TPM
* norm cancerous TPM
* Δ TPM
* Δ type
* z score
* Δ TPM relevant


In [17]:
# Create the index on gene.id (skipped if it already exists) before loading data.
with driver.session() as index_session:
    index_session.run("""CREATE INDEX IF NOT EXISTS FOR (p:gene) ON (p.id)""")

In [18]:
# Sanity check: number of rows in the gene node table.
gene_node_summary = f"There are {df_nodes_gene.shape[0]} Gene nodes"
print(gene_node_summary)

There are 17627 Gene nodes


In [19]:
# Node property name -> column of the gene node table.
# The keys are used as the keys of every parameter map AND as `row.<key>` in
# the Cypher query below. Map keys are case-sensitive: previously the maps
# used `norm_healthy_TPM` / `norm_cancerous_TPM` while the query read
# `row.norm_healthy_tpm` / `row.norm_cancerous_tpm`, so both values resolved
# to null and the two properties were never stored.
gene_columns = {
    'id': 'Gene ID',
    'gene_name': 'Gene Name',
    'norm_healthy_tpm': 'norm healthy TPM',
    'norm_cancerous_tpm': 'norm cancerous TPM',
    'Δ_TPM': 'Δ TPM',
    'Δ_type': 'Δ type',
    'z_score': 'z score',
    'Δ_TPM_relevant': 'Δ TPM relevant',
}

# Read each column once (plain Python values) instead of iterating row by row.
column_values = [df_nodes_gene[column].tolist() for column in gene_columns.values()]
data = [dict(zip(gene_columns, values)) for values in zip(*column_values)]

batch_size = 1000
progress_every = 10000  # rows between progress messages (every 10 batches)

# Ceiling division, so an exact multiple of batch_size is not over-counted.
total_batches = (len(data) + batch_size - 1) // batch_size

# The query text is the same for every batch, so it is defined once.
# NOTE: CREATE always adds new nodes; running this cell twice duplicates them.
query = """
UNWIND $batch AS row
CREATE (p:gene {
    id: row.id,
    gene_name: row.gene_name,
    norm_healthy_tpm: row.norm_healthy_tpm,
    norm_cancerous_tpm: row.norm_cancerous_tpm,
    Δ_TPM: row.Δ_TPM,
    Δ_type: row.Δ_type,
    z_score: row.z_score,
    Δ_TPM_relevant: row.Δ_TPM_relevant})
"""

with driver.session() as session:
    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]

        session.run(query, batch=batch)

        if i % progress_every == 0:
            now = time.strftime("%H:%M:%S", time.localtime())
            print(f"{now}\tBatch {i // batch_size + 1} of {total_batches} complete")

print("done!!")

16:09:15	Batch 1 of 18 complete
16:09:16	Batch 11 of 18 complete
done!!


### Protein-Gene Edges
Every edge is a connection between a protein and a gene.
The connection type is not specified further.


**Cypher Query:**
``` 
MATCH (s:protein{id:'id'})
MATCH (t:gene{id:'id'})
CREATE (s)-[:connection]->(t)
```


In [20]:
# Sanity check: number of rows in the protein-gene edge table.
protein_gene_summary = f"There are {df_edges_protein_gene.shape[0]} Protein-Gene connections"
print(protein_gene_summary)

There are 101731 Protein-Gene connections


In [21]:
# One parameter map per edge. The keys contain a space, so the Cypher query
# reads them with bracket syntax (row['Protein ID']). The columns are read
# directly instead of iterating row by row with DataFrame.iterrows().
data = [
    {'Protein ID': protein_id, 'Gene ID': gene_id}
    for protein_id, gene_id in zip(
        df_edges_protein_gene['Protein ID'].tolist(),
        df_edges_protein_gene['Gene ID'].tolist(),
    )
]

batch_size = 100000

# Ceiling division, so an exact multiple of batch_size is not over-counted.
total_batches = (len(data) + batch_size - 1) // batch_size

# The query text is the same for every batch, so it is defined once.
# Pairs whose endpoints are not found by the MATCH clauses create no edge.
# NOTE: CREATE always adds new relationships; running this cell twice
# duplicates them.
query = """
UNWIND $batch AS row
MATCH (s:protein{id:row['Protein ID']})
MATCH (t:gene{id:row['Gene ID']})
CREATE (s)-[:connection]->(t)
"""

with driver.session() as session:
    for batch_number, i in enumerate(range(0, len(data), batch_size), start=1):
        batch = data[i:i + batch_size]

        session.run(query, batch=batch)

        # Progress is reported after every batch (i is always a multiple of
        # batch_size, so no extra condition is needed).
        now = time.strftime("%H:%M:%S", time.localtime())
        print(f"{now}\tBatch {batch_number} of {total_batches} complete")

print("done!!")

16:09:21	Batch 1 of 2 complete
16:09:22	Batch 2 of 2 complete
done!!


In [22]:
# Take the end timestamp and report the total run time in seconds and minutes.
end_time = time.time()

print(
    f"Time elapsed in seconds:\t{end_time - start_time}",
    f"Time elapsed in minutes:\t{(end_time - start_time) / 60}",
    sep="\n",
)

Time elapsed in seconds:	342.6218113899231
Time elapsed in minutes:	5.710363523165385
