## 005_Neo4J_First_Database

In [16]:
# BASE
import os
import json

# PYEED
from pyeed.core import ProteinRecord

# COOL
from neo4j import GraphDatabase

In [17]:
# Connection settings for the Neo4j server.
# The NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD environment variables take
# precedence so that credentials do not have to be committed with the
# notebook; the previous literals are kept as fallbacks so an existing local
# setup keeps working unchanged.
# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
URI = os.environ.get("NEO4J_URI", "neo4j://localhost")
AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "niklasniklas1"),
)

# Open a short-lived driver only to check that the server can be reached with
# these settings; the driver is closed again when the block exits.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()

In [18]:
# Print the name of every database known to the server.
# The driver is managed by its own `with` block: chaining
# `GraphDatabase.driver(...).session()` in a single `with` only closes the
# session and leaves the driver (and its connections) open.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session() as session:
        result = session.run("SHOW DATABASES yield name;")
        for line in result:
            print(line['name'])

neo4j
system


In [19]:
# Working directory the notebook was started from; the input folder is
# resolved relative to it.
current_path = os.getcwd()

# <cwd>/data/filtered_protein_fetch_ids_all
input_folder_filters_regions = os.path.join(
    current_path,
    "data",
    "filtered_protein_fetch_ids_all",
)

In [20]:
# Read every JSON file in the input folder into `protein_dic`, keyed by the
# file name up to its first dot.
protein_dic = {}

for file in os.listdir(input_folder_filters_regions):
    if not file.endswith(".json"):
        # ignore anything that is not a JSON file
        continue

    # Pin the encoding: JSON text is UTF-8, and without an explicit encoding
    # `open` falls back to the platform's locale default.
    with open(
        os.path.join(input_folder_filters_regions, file), "r", encoding="utf-8"
    ) as f:
        # read in the file as a dic
        file_data = json.load(f)

    # 'protein' is handed to ProteinRecord.from_json_string; the other three
    # entries are stored exactly as they were loaded.
    protein_dic[file.split('.')[0]] = {
        'protein': ProteinRecord.from_json_string(file_data['protein']),
        'alignment': file_data['alignment'],
        'TEM-Domain': file_data['TEM-Domain'],
        'TEM-Domain-Diff': file_data['TEM-Domain-Diff'],
    }

print(len(protein_dic))
print(protein_dic['TEM-1'].keys())

210
dict_keys(['protein', 'alignment', 'TEM-Domain', 'TEM-Domain-Diff'])


In [None]:
def add_protein(driver, protein: ProteinRecord, embedding):
    """Create a ``Protein`` node for *protein* and attach *embedding* to it.

    Args:
        driver: Object exposing ``execute_query`` (presumably a Neo4j driver).
        protein: Record whose ``id``, ``name``, ``sequence``, ``ec_number``
            and ``mol_weight`` attributes are sent as query parameters;
            ``mol_weight`` is stored under the node property ``molar_mass``.
        embedding: Value passed to ``db.create.setNodeVectorProperty`` for
            the node's ``embedding`` property.

    Returns:
        Whatever ``driver.execute_query`` returns.
    """
    cypher = """
        CREATE (p:Protein {id: $id, name: $name, sequence: $sequence, ec_number: $ec_number, molar_mass: $molar_mass})
        WITH p
        CALL db.create.setNodeVectorProperty(p, 'embedding', $embedding)
        """

    # Keyword names here must match the $placeholders in the query above.
    parameters = {
        "id": protein.id,
        "name": protein.name,
        "sequence": protein.sequence,
        "ec_number": protein.ec_number,
        "embedding": embedding,
        "molar_mass": protein.mol_weight,
    }
    return driver.execute_query(cypher, **parameters)


def add_embedding(driver, protein_id: str, embedding):
    """Set the ``embedding`` vector property on an existing ``Protein`` node.

    Args:
        driver: Object exposing ``execute_query`` (presumably a Neo4j driver).
        protein_id: Value matched against the node's ``id`` property.
        embedding: Value passed to ``db.create.setNodeVectorProperty``.

    Returns:
        Whatever ``driver.execute_query`` returns.
    """
    cypher = """
        MATCH (p:Protein {id: $id})
        CALL db.create.setNodeVectorProperty(p, 'embedding', $embedding)
        """

    # Keyword names must match the $placeholders in the query above.
    parameters = {"id": protein_id, "embedding": embedding}
    return driver.execute_query(cypher, **parameters)


def initialize_vector_index(driver):
    """Create the vector index ``embedding`` on ``Protein.embedding``.

    The statement uses ``IF NOT EXISTS`` and configures the index for
    1024-dimensional vectors with cosine similarity.

    Args:
        driver: Object whose ``session()`` returns a context manager with a
            ``run`` method (presumably a Neo4j driver).

    Returns:
        None.
    """
    create_index_cypher = """
    CREATE VECTOR INDEX embedding IF NOT EXISTS
    FOR (p:Protein)
    ON p.embedding
    OPTIONS {indexConfig: {
     `vector.dimensions`: 1024,
     `vector.similarity_function`: 'cosine'
    }}
    """

    # Run the statement in its own session
    with driver.session() as db_session:
        db_session.run(create_index_cypher)


def get_protein_data_as_df(driver, protein_id: str, n_results: int):
    """Query the ``embedding`` vector index with one protein's embedding.

    The ``Protein`` node whose ``id`` equals *protein_id* is looked up and its
    ``embedding`` property is passed to ``db.index.vector.queryNodes``.

    Args:
        driver: Object whose ``session()`` returns a context manager with a
            ``run`` method (presumably a Neo4j driver).
        protein_id: ``id`` property of the protein to search from.
        n_results: Number of results requested from the index.

    Returns:
        A pandas DataFrame with the columns ``id``, ``score`` and
        ``sequence``, one row per returned record. When the query yields no
        records the frame is empty but still has these three columns.
    """
    # Imported here because pandas is not imported in the visible part of
    # this file; without it the `pd` reference below raises NameError.
    import pandas as pd

    # Define the query
    query = """
    MATCH (p:Protein {id: $protein_id})
    CALL db.index.vector.queryNodes('embedding', $n_results, p.embedding)
    YIELD node, score
    RETURN node.id AS id, score, node.sequence as sequence
    """

    # Execute the query and collect the results while the session is open
    with driver.session() as session:
        result = session.run(query, protein_id=protein_id, n_results=n_results)
        records = result.data()

    # Name the columns explicitly (same order as the RETURN clause) so an
    # empty result still produces a frame with the expected schema.
    return pd.DataFrame(records, columns=["id", "score", "sequence"])