# Load Protein and Gene Data into Neo4j


In [1]:
import config
import time

import pandas as pd

from neo4j import GraphDatabase

In [2]:
# Set up the connection to the Neo4j server.
# The URI and the credentials are read from the imported `config` module;
# `encrypted=False` is passed through to the driver as a connection option.
driver = GraphDatabase.driver(config.uri, auth=(config.user, config.password), encrypted=False)

# Check that the connection works
def check_connectivity(driver):
    """Return True if a trivial query round-trips through ``driver``.

    Opens a session, runs ``RETURN 1 as result`` and checks that the single
    record that comes back carries the value 1.

    Args:
        driver: Object whose ``session()`` returns a context manager offering
            ``run(query)`` (here: the Neo4j driver created above).

    Returns:
        bool: True if the server answered with 1, otherwise False. This
        function never raises: any exception is reported on stdout and
        turned into False.
    """
    try:
        with driver.session() as session:
            record = session.run("RETURN 1 as result").single()
            if record and record["result"] == 1:
                return True
            # The query ran but did not return the expected value.
            print(f"Connectivity check failed: unexpected response {record!r}")
            return False
    except Exception as e:
        # Best-effort check: keep returning False, but say why it failed
        # instead of silently discarding the error.
        print(f"Connectivity check failed: {type(e).__name__}: {e}")
        return False


# Report the outcome of the connectivity check
print(
    "Verbindung erfolgreich hergestellt."
    if check_connectivity(driver)
    else "Fehler bei der Verbindungsherstellung."
)

Verbindung erfolgreich hergestellt.


## Indexing of Database
### Delete Database Edges and Nodes

In [3]:
# Delete Nodes for Genes
def delete_gene_nodes(batch_size = 10000):
    """Delete all ``gene`` nodes, together with their relationships, in batches.

    First collects the ``id`` property of every ``gene`` node, then removes the
    nodes ``batch_size`` ids at a time with ``DETACH DELETE`` and prints one
    timestamped progress line per batch. Uses the module-level ``driver``.

    Args:
        batch_size: Number of node ids sent with each delete statement.
            Must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    with driver.session() as session:
        result = session.run("MATCH (n:gene) RETURN n.id AS id")
        node_ids = [record["id"] for record in result]

        # Ceiling division: the real number of batches, also when the number of
        # ids is an exact multiple of batch_size (or zero).
        total_batches = -(-len(node_ids) // batch_size)

        starts = range(0, len(node_ids), batch_size)
        for batch_number, start in enumerate(starts, start=1):
            batch = node_ids[start:start + batch_size]
            # consume() waits for the statement to finish, so the progress line
            # below is only printed once the batch has really been deleted.
            session.run("""
            UNWIND $batch AS id
            MATCH (n:gene {id: id})
            DETACH DELETE n
            """, batch=batch).consume()

            now = time.strftime("%H:%M:%S", time.localtime())
            print(f"{now}\tBatch {batch_number} of {total_batches} complete")
            
# Disabled on purpose: deleting the protein nodes this way takes too long.
# If everything should be deleted, delete the Docker container instead.
# The function below is only a string literal, so it is never defined or run;
# as the last expression of the cell the string is merely echoed as output.
# NOTE: the query inside uses a single-quoted f" ... " spanning several lines,
# which is not valid Python on its own - restore triple quotes before re-enabling.
"""def delete_protein_nodes(batch_size=10000):
    with driver.session() as session:
        while True:
            # Knoten direkt in Batches laden und löschen
            result = session.run(f"
                MATCH (n:protein)
                WITH n LIMIT $batch_size
                DETACH DELETE n
                RETURN count(n) as deleted_count
            ", batch_size=batch_size)

            # Die Anzahl der gelöschten Knoten auslesen
            deleted_count = result.single()["deleted_count"]

            # Batch-Informationen ausgeben
            now = time.strftime("%H:%M:%S", time.localtime())
            print(f"{now}\tBatch complete, {deleted_count} nodes deleted")

            # Wenn keine Knoten mehr gelöscht wurden, beende die Schleife
            if deleted_count == 0:
                break"""

'def delete_protein_nodes(batch_size=10000):\n    with driver.session() as session:\n        while True:\n            # Knoten direkt in Batches laden und löschen\n            result = session.run(f"\n                MATCH (n:protein)\n                WITH n LIMIT $batch_size\n                DETACH DELETE n\n                RETURN count(n) as deleted_count\n            ", batch_size=batch_size)\n\n            # Die Anzahl der gelöschten Knoten auslesen\n            deleted_count = result.single()["deleted_count"]\n\n            # Batch-Informationen ausgeben\n            now = time.strftime("%H:%M:%S", time.localtime())\n            print(f"{now}\tBatch complete, {deleted_count} nodes deleted")\n\n            # Wenn keine Knoten mehr gelöscht wurden, beende die Schleife\n            if deleted_count == 0:\n                break'

In [4]:
# The destructive delete call is wrapped in a string literal, so this cell does
# not delete anything when it is run (presumably to protect the database during
# "Run All" - confirm). Remove the surrounding triple quotes to actually run it.
"""## DELETES Nodes and Edges
delete_gene_nodes()
print("Delete Gene Nodes done.")"""

14:02:41	Batch 1 of 4 complete
14:02:45	Batch 2 of 4 complete
14:02:49	Batch 3 of 4 complete
14:02:52	Batch 4 of 4 complete
Delete Gene Nodes done.


## Load Data as Dataframes


In [8]:
# Read the four pre-processed CSV tables, in this order:
# gene nodes, protein nodes, protein-protein edges, protein-gene edges.
(
    df_nodes_gene,
    df_nodes_protein,
    df_edges_protein,
    df_edges_protein_gene,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../processed_data/nodes_genes.csv',
        '../processed_data/nodes_protein.csv',
        '../processed_data/edges_protein.csv',
        '../processed_data/edges_protein_gene.csv',
    )
)

In [6]:
# Start the wall-clock timer; the last cell of the notebook subtracts this
# value from its own timestamp to report the elapsed time.
start_time = time.time()

## Proteins
### Protein Nodes

**CYPHER QUERY:**
```
CREATE (p:protein {id: 'Protein stable ID'})
```

**Node-Types:**
* protein
	

**Node-Properties:**
* id

In [7]:
# Create an index on the `id` property of protein nodes (no-op if it already
# exists); the edge-loading cells look proteins up by exactly this property.
query = "CREATE INDEX IF NOT EXISTS FOR (p:protein) ON (p.id)"

with driver.session() as session:
    session.run(query)

In [8]:
# Row count of the protein node table = number of protein nodes the next cell creates
print(f"There are {df_nodes_protein.shape[0]} Protein nodes")

There are 101731 Protein nodes


In [9]:
# One parameter map per protein node. Reading the column directly yields the
# same values as DataFrame.iterrows() without building a Series for every row.
data = [{'id': protein_id} for protein_id in df_nodes_protein['Protein stable ID'].tolist()]

batch_size = 500
report_every = 100  # print a progress line for every 100th batch
# Ceiling division: the real number of batches, also when len(data) is an
# exact multiple of batch_size.
total_batches = -(-len(data) // batch_size)

with driver.session() as session:
    # The statement is the same for every batch; only the $batch parameter changes.
    query = """
    UNWIND $batch AS row
    CREATE (p:protein {id: row.id})
    """

    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]

        # Pass the batch as a query parameter. consume() waits for the statement
        # to finish, so a failing batch is reported in the iteration that sent it.
        session.run(query, batch=batch).consume()

        batch_index = i // batch_size
        # Counting batches (not rows) keeps the reporting correct for any batch_size.
        if batch_index % report_every == 0:
            now = time.strftime("%H:%M:%S", time.localtime())
            print(f"{now}\tBatch {batch_index + 1} of {total_batches} complete")

print("done!!")

10:15:25	Batch 1 of 204 complete
10:15:27	Batch 101 of 204 complete
10:15:28	Batch 201 of 204 complete
done!!


### Protein-Protein Edges
Every Edge is an Interaction between two Proteins.
Interaction type is not defined further.


**Cypher Query:**
``` 
MATCH (s:protein{id:'protein1'})
MATCH (t:protein{id:'protein2'})
CREATE (s)-[:INTERACTS]->(t)
```

**Edge-Types:**
* INTERACTS

In [10]:
# Row count of the protein-protein edge table (one row per interaction to create)
print(f"There are {df_edges_protein.shape[0]} Protein-Protein Interactions")

There are 11247242 Protein-Protein Interactions


In [11]:
# One (source id, target id) pair per interaction row.
protein1 = df_edges_protein['protein1'].tolist()
protein2 = df_edges_protein['protein2'].tolist()
data = list(zip(protein1, protein2))

batch_size = 1000000
# Ceiling division: the real number of batches, also when len(data) is an
# exact multiple of batch_size.
total_batches = -(-len(data) // batch_size)

with driver.session() as session:
    # The statement is the same for every batch; row[0] / row[1] are the two
    # protein ids of a pair. Pairs whose proteins are not found create no edge.
    query = """
    UNWIND $batch AS row
    MATCH (s:protein{id:row[0]})
    MATCH (t:protein{id:row[1]})
    CREATE (s)-[:INTERACTS]->(t)
    """

    for batch_number, i in enumerate(range(0, len(data), batch_size), start=1):
        batch = data[i:i + batch_size]

        # consume() waits for the statement to finish, so the progress line is
        # only printed once the batch has really been written.
        session.run(query, batch=batch).consume()

        # Every batch is reported (the former `i % batch_size == 0` was always true).
        now = time.strftime("%H:%M:%S", time.localtime())
        print(f"{now}\tBatch {batch_number} of {total_batches} complete")

print("done!!")

10:15:51	Batch 1 of 12 complete
10:16:09	Batch 2 of 12 complete
10:16:27	Batch 3 of 12 complete
10:16:46	Batch 4 of 12 complete
10:17:02	Batch 5 of 12 complete
10:17:18	Batch 6 of 12 complete
10:17:33	Batch 7 of 12 complete
10:17:48	Batch 8 of 12 complete
10:18:04	Batch 9 of 12 complete
10:18:19	Batch 10 of 12 complete
10:18:34	Batch 11 of 12 complete
10:18:40	Batch 12 of 12 complete
done!!


## Genes
### Gene Nodes

**CYPHER QUERY:**
```
CREATE (p:gene {
    id: id,
    gene_name: gene_name,
    norm_healthy_tpm: norm_healthy_tpm,
    norm_cancerous_tpm: norm_cancerous_tpm,
    delta_tpm: delta_tpm,
    delta_type: delta_type,
    delta_tpm_relevant: delta_tpm_relevant})
```

**Node-Types:**
* gene
	

**Node-Properties:**
* id
* gene_name
* norm_healthy_tpm
* norm_cancerous_tpm
* delta_tpm
* delta_type
* delta_tpm_relevant


In [9]:
# Create an index on the `id` property of gene nodes (no-op if it already
# exists); the protein-gene edge cell looks genes up by exactly this property.
query = "CREATE INDEX IF NOT EXISTS FOR (p:gene) ON (p.id)"

with driver.session() as session:
    session.run(query)

In [10]:
# Row count of the gene node table = number of gene nodes created further below
print(f"There are {df_nodes_gene.shape[0]} Gene nodes")

There are 32865 Gene nodes


In [12]:
# CSV column name -> node property name.
gene_property_by_column = {
    'id': 'id',
    'name': 'gene_name',
    'norm healthy tpm': 'norm_healthy_tpm',
    'norm cancerous tpm': 'norm_cancerous_tpm',
    'delta tpm': 'delta_tpm',
    'delta type': 'delta_type',
    'delta tpm relevant': 'delta_tpm_relevant',
}

# One parameter map per gene node. Reading whole columns with tolist() (plain
# Python scalars) avoids building a Series per row as DataFrame.iterrows() does.
gene_property_names = list(gene_property_by_column.values())
gene_columns = [df_nodes_gene[column].tolist() for column in gene_property_by_column]
data = [dict(zip(gene_property_names, values)) for values in zip(*gene_columns)]

batch_size = 1000
report_every = 10  # print a progress line for every 10th batch
# Ceiling division: the real number of batches, also when len(data) is an
# exact multiple of batch_size.
total_batches = -(-len(data) // batch_size)

with driver.session() as session:
    # The statement is the same for every batch; only the $batch parameter changes.
    query = """
    UNWIND $batch AS row
    CREATE (p:gene {
        id: row.id,
        gene_name: row.gene_name,
        norm_healthy_tpm: row.norm_healthy_tpm,
        norm_cancerous_tpm: row.norm_cancerous_tpm,
        delta_tpm: row.delta_tpm,
        delta_type: row.delta_type,
        delta_tpm_relevant: row.delta_tpm_relevant})
    """

    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]

        # consume() waits for the statement to finish, so a failing batch is
        # reported in the iteration that sent it.
        session.run(query, batch=batch).consume()

        batch_index = i // batch_size
        # Counting batches (not rows) keeps the reporting correct for any batch_size.
        if batch_index % report_every == 0:
            now = time.strftime("%H:%M:%S", time.localtime())
            print(f"{now}\tBatch {batch_index + 1} of {total_batches} complete")

print("done!!")

14:07:39	Batch 1 of 33 complete
14:07:41	Batch 11 of 33 complete
14:07:43	Batch 21 of 33 complete
14:07:45	Batch 31 of 33 complete
done!!


### Protein-Gene Edges
Every Edge is a Connection between a Protein and a gene.
Interaction type is not defined further.


**Cypher Query:**
``` 
MATCH (s:protein{id:'id'})
MATCH (t:gene{id:'id'})
CREATE (s)-[:CONNECTION]->(t)
```

**Edge-Types:**
* CONNECTION

In [13]:
# Row count of the protein-gene edge table (one row per connection to create)
print(f"There are {df_edges_protein_gene.shape[0]} Protein-Gene connections")

There are 101731 Protein-Gene connections


In [14]:
# One parameter map per protein-gene connection. Reading whole columns with
# tolist() avoids building a Series per row as DataFrame.iterrows() does.
data = [
    {'Protein stable ID': protein_id, 'Gene stable ID': gene_id}
    for protein_id, gene_id in zip(
        df_edges_protein_gene['Protein stable ID'].tolist(),
        df_edges_protein_gene['Gene stable ID'].tolist(),
    )
]

batch_size = 100000
# Ceiling division: the real number of batches, also when len(data) is an
# exact multiple of batch_size.
total_batches = -(-len(data) // batch_size)

with driver.session() as session:
    # The statement is the same for every batch. Rows whose protein or gene is
    # not found create no edge.
    query = """
    UNWIND $batch AS row
    MATCH (s:protein{id:row['Protein stable ID']})
    MATCH (t:gene{id:row['Gene stable ID']})
    CREATE (s)-[:CONNECTION]->(t)
    """

    for batch_number, i in enumerate(range(0, len(data), batch_size), start=1):
        batch = data[i:i + batch_size]

        # consume() waits for the statement to finish, so the progress line is
        # only printed once the batch has really been written.
        session.run(query, batch=batch).consume()

        # Every batch is reported (the former `i % batch_size == 0` was always true).
        now = time.strftime("%H:%M:%S", time.localtime())
        print(f"{now}\tBatch {batch_number} of {total_batches} complete")

print("done!!")

14:08:16	Batch 1 of 2 complete
14:08:23	Batch 2 of 2 complete
done!!


In [17]:
# Stop the timer and report how long has passed since `start_time` was set
end_time = time.time()
elapsed_seconds = end_time - start_time

print(f"Time elapsed in seconds:\t{elapsed_seconds}")
print(f"Time elapsed in minutes:\t{elapsed_seconds / 60}")

Time elapsed in seconds:	207.3143174648285
Time elapsed in minutes:	3.4552386244138082
