In [1]:
from pyeed import Pyeed
from pyeed.model import GOAnnotation, Protein

  from .autonotebook import tqdm as notebook_tqdm


# Create Neo4j DB

Create local Neo4j DB without authentication.
Graph data science plugin is not installed.

```bash
docker run -it --name pyeed-neo4j \
  -p 7474:7474 \--user="$(id -u):$(id -g)" \
  -e NEO4J_AUTH=none \
  -p 7687:7687 \
  -v $HOME/Documents/db/data:/data \
  -v $HOME/Documents/db/logs:/logs \
  -v $HOME/Documents/db/import:/var/lib/neo4j/import \
  -v $HOME/Documents/db/plugins:/plugins \
  -e NEO4J_AUTH=neo4j/test \
  -e NEO4JLABS_PLUGINS='["apoc"]' \
  -e NEO4J_dbms_security_procedures_unrestricted="apoc.*" \
  -d neo4j:latest
```

# Connect to DB

Neo4j DB is hosted locally via Docker.
Also possible to use free hosted Neo4j Sandbox (not tested).

In [2]:
uri = "bolt://127.0.0.1:7687"
user = None
password = None

# Create a Pyeed object, automatically connecting to the database
eedb = Pyeed(uri)
eedb.db._wipe_database()

# DB connector is a property of the Pyeed object
print(eedb.db)

# If this is the first time you are running this script, the pyeed graph model needs to be initialized
first_time = False
if first_time:
    eedb.db._initialize_db_constraints(user=user, password=password)

📡 Connected to database.
All data has been wiped from the database.
<pyeed.dbconnect.DatabaseConnector object at 0x110b157d0>


In [3]:
ids = [
    "P04182",
    "Q6QDP7",
    "P04182",
    "P29758",
    "A0A851UXD9",
    "A0A8C6HVU6",
    "A0A8C6GQ10",
    "A0A1U7QEB0",
    "A0A6I9L5L6",
    "G3HVE0",
    "A0A8J6G992",
    "A0A8C6W4W5",
    "A0A8B9YUY7",
    "L8I4V3",
    "A0A6P3IYQ1",
    "A0A452EKJ3",
    "A0A6P5B7Q0",
    "F1MYG0",
    "A0A5J5MK22",
    "A0A6J0Y425",
    "Q3ZCF5",
    "P00330",  # ADH
    "J8LIG6",
    "A0AA35J9C9",
    "P00331",
    "J8Q680",
    "J5PRJ1",
    "A0A1X7R1I9",
    "Q6FQA4",
    "C5DNB7",
    "Q9P4C2",
    "C5DHM6",
    "Q757I1",
    "A0A7H9HSD9",
    "P20369",
    "H2AXS6",
    "G0W4V9",
    "A0A1G4M9V8",
    "A0A1G4KF85",
    "A0A1G4JJF2",
    "G8ZTZ5",
    "A0A1G4MBD6",
    "A0A7H9HSJ3",
    "J7SA96",
    "G0VK69",
]

# Fetch proteins from primary database
eedb.fetch_from_primary_db(ids)
eedb.calculate_sequence_embeddings()

# number of nodes and edges in db
print(eedb.db.stats())

[32m2024-10-14 16:18:48.472[0m | [34m[1mDEBUG   [0m | [36mpyeed.adapter.primary_db_adapter[0m:[36mmake_request[0m:[36m142[0m - [34m[1mSending 9 requests in batches of 5[0m
[32m2024-10-14 16:18:48.476[0m | [34m[1mDEBUG   [0m | [36mpyeed.adapter.primary_db_adapter[0m:[36msend_request[0m:[36m123[0m - [34m[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'P04182,Q6QDP7,P04182,P29758,A0A851UXD9'}[0m
[32m2024-10-14 16:18:48.577[0m | [34m[1mDEBUG   [0m | [36mpyeed.adapter.primary_db_adapter[0m:[36msend_request[0m:[36m123[0m - [34m[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'A0A8C6HVU6,A0A8C6GQ10,A0A1U7QEB0,A0A6I9L5L6,G3HVE0'}[0m
[32m2024-10-14 16:18:48.806[0m | [34m[1mDEBUG   [0m | [36mpyeed.adapter.primary_db_adapter[0m:[36msend_request[0m:[36m123[0m - [34m[1mSending request to https://www.ebi.ac.uk/prot

{'nodes': 135, 'relationships': 293}


To use the web interface, open a browser and go to `http://localhost:7474/`.

# Query DB

In [4]:
## Query using pyeed graph objects
# Get all proteins
proteins = Protein.nodes.all()
print("Number of proteins in database: ", len(proteins))

# Get protein with id P04182
protein = Protein.nodes.get(accession_id="P04182")
print(protein)

# Get all protein which are accociated with GO term GO:0005739 (mitochondrion)
go_annotation = GOAnnotation.nodes.get(go_id="GO:0005739")
mito_proteins = protein.go_annotation.all()
print("Number of proteins associated with GO:0005739: ", len(mito_proteins))


## Or execute cypher query
# Get all organisms that have at least two connected proteins
query = """
MATCH (o:Organism)<-[:ORIGINATES_FROM]-(p:Protein)
WITH o, COUNT(p) AS proteinCount
WHERE proteinCount >= 2
RETURN o
"""

organisms = eedb.db.execute_read(query)
print("Number of organisms with at least two proteins: ", len(organisms))

Number of proteins in database:  44
{'accession_id': 'P04182', 'sequence': 'MLSKLASLQTVAALRRGLRTSVASATSVATKKTEQGPPSSEYIFERESKYGAHNYHPLPVALERGKGIYMWDVEGRQYFDFLSAYGAVSQGHCHPKIIEAMKSQVDKLTLTSRAFYNNVLGEYEEYITKLFNYNKVLPMNTGVEAGETACKLARRWGYTVKGIQKYKAKIVFAVGNFWGRTLSAVSSSTDPTSYDGFGPFMPGFETIPYNDLPALERALQDPNVAAFMVEPIQGEAGVIVPDPGYLTGVRELCTRHQVLFIADEIQTGLARTGRWLAVDHENVRPDIVLLGKALSGGLYPVSAVLCDDDIMLTIKPGEHGSTYGGNPLGCRIAIAALEVLEEEHLAENADKMGAILRKELMKLPSDVVTAVRGKGLLNAIVIRETKDCDAWKVCLRLRDNGLLAKPTHGDIIRLAPPLVIKEDEIRESVEIINKTILSF', 'name': 'Ornithine aminotransferase, mitochondrial', 'seq_length': 439, 'mol_weight': 48333.0, 'ec_number': '2.6.1.13', 'nucleotide_id': None, 'locus_tag': None, 'structure_ids': None, 'go_terms': None, 'embedding': [0.02024856023490429, -0.10120689868927002, -0.054875459522008896, 0.05940677598118782, -0.08618494868278503, -0.029551653191447258, 0.09230533987283707, -0.051247984170913696, -0.14654004573822021, 0.02974345162510872, 0.057191673666238785, -0.08889872580766678, 0.

# Embeddings

In [9]:
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

# Annotations of interest (Alcohol dehydrogenase annotation and proline biosynthesis annotation --> two different protein families)
adh_go_id = "GO:0004022"
ploline_biosynthesis_go_id = "GO:0055129"

# Query to get all proteins with embeddings and get the label based on the annotations
query = """
MATCH (p:Protein)
OPTIONAL MATCH (p)-[:ASSOCIATED_WITH]-(g:GOAnnotation)
WITH p, collect(g.go_id) AS go_ids
RETURN p.accession_id AS protein_id, 
       p.embedding AS embedding,
       CASE 
           WHEN 'GO:0055129' IN go_ids THEN 'proline biosynthesis'
           WHEN 'GO:0004022' IN go_ids THEN 'ADH'
           ELSE 'no annotation'
       END AS label
"""

result = eedb.db.execute_read(query)

# Prepare data for visualization
data = dict(
    protein_id=[],
    embedding=[],
    label=[],
)
for record in result:
    data["protein_id"].append(record["protein_id"])
    data["embedding"].append(record["embedding"])
    data["label"].append(record["label"])

protein_ids, embeddings, labels = (
    data["protein_id"],
    np.array(data["embedding"]),
    data["label"],
)

colors = []
for label in labels:
    if label == "ADH":
        colors.append("red")
    elif label == "proline biosynthesis":
        colors.append("blue")
    else:
        colors.append("green")

hover_texts = [
    "<br>".join(
        [
            f"{key}: {value}"
            for key, value in Protein.nodes.get(
                accession_id=protein_id
            ).__dict__.items()
            if key
            in [
                "accession_id",
                "mol_weight",
                "ec_number",
                "seq_length",
                "mol_weight",
                "name",
            ]
        ]
    )
    for protein_id in protein_ids
]


# Apply t-SNE to Reduce Embeddings to 2D
tsne = TSNE(n_components=2, random_state=42, perplexity=5, max_iter=3000)
embeddings_2d = tsne.fit_transform(embeddings)

unique_labels = set(labels)  # Find the unique labels
traces = []

for label in unique_labels:
    indices = [i for i, l in enumerate(labels) if l == label]
    trace = go.Scatter(
        x=[embeddings_2d[i, 0] for i in indices],
        y=[embeddings_2d[i, 1] for i in indices],
        mode="markers",
        marker=dict(
            size=8,
            color=colors[indices[0]],
        ),
        name=label,
        text=[hover_texts[i] for i in indices],
    )
    traces.append(trace)

layout = go.Layout(
    title="2D t-SNE Visualization of Protein Embeddings",
    xaxis_title="t-SNE Dimension 1",
    yaxis_title="t-SNE Dimension 2",
    width=900,
    height=600,
)

fig = go.Figure(data=traces, layout=layout)

fig.show()

In [6]:
# close connection
eedb.db.close()

🔌 Connection closed.
