## Query by compound

Find similar compounds by SMILES, then retrieve the associated reactions and the enzymes that catalyze them.

<table style="border-collapse: collapse; border: none; max-width: 800px; margin: 0 auto;">
  <tr>
    <td style="border: none; padding: 10px; text-align: center;">
      <img src="imgs/Pyridoxal-phosphate.svg" alt="Pyridoxal phosphate" style="width: 100%; max-width: 300px; height: auto; object-fit: contain;">
      <h3>Pyridoxal phosphate</h3>
    </td>
    <td style="border: none; padding: 10px; text-align: center;">
      <img src="imgs/retinal.png" alt="Retinal" style="width: 100%; max-width: 300px; height: auto; object-fit: contain;">
      <h3>Retinal</h3>
    </td>
  </tr>
  <tr>
    <td style="border: none; padding: 10px; text-align: center;">
      <img src="imgs/riboflavin.png" alt="Riboflavin" style="width: 100%; max-width: 300px; height: auto; object-fit: contain;">
      <h3>Riboflavin</h3>
    </td>
    <td style="border: none; padding: 10px; text-align: center;">
      <img src="imgs/S-adenosyl_methionine.png" alt="S-adenosyl methionine" style="width: 100%; max-width: 300px; height: auto; object-fit: contain;">
      <h3>S-adenosyl methionine</h3>
    </td>
  </tr>
</table>

In [1]:
from dotenv import load_dotenv
import os

from graph_db.db_connection import Neo4jConnection

# Load variables from a .env file into the process environment, then read the
# Neo4j connection settings. Each value is None when its variable is not set.
load_dotenv()

uri, username, password = (
    os.getenv(var_name)
    for var_name in ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD")
)

In [17]:
# Cypher query for a single compound, looked up by $compound_id. It walks
#   Compound -[:PRODUCT_OF]-> Reaction <-[:CATALYZES]- Protein
# and, where present, attaches the Genome that CONTAINS the protein and the
# Sample that genome ORIGINATED_FROM. At most $limit rows are returned.
#
# The WHERE clause belongs to the last OPTIONAL MATCH, so the temperature /
# latitude / longitude bounds only decide whether `s` is bound: rows whose
# sample fails a bound are still returned, with null sample columns. A bound
# passed as null is skipped.
#
# NOTE(review): the sample property is spelled `longigute` in all three places
# below -- confirm this matches the property name stored in the graph.
query = """
MATCH (c:Compound {compound_id: $compound_id})
MATCH (c)-[:PRODUCT_OF]->(r:Reaction)
MATCH (p:Protein)-[:CATALYZES]->(r)
OPTIONAL MATCH (m:Genome)-[:CONTAINS]->(p)
OPTIONAL MATCH (m)-[:ORIGINATED_FROM]->(s:Sample)
WHERE 
   ($min_temp IS NULL OR toFloat(s.temperature) >= $min_temp) AND
   ($max_temp IS NULL OR toFloat(s.temperature) <= $max_temp) AND
   ($min_lat IS NULL OR toFloat(s.latitude) >= $min_lat) AND
   ($max_lat IS NULL OR toFloat(s.latitude) <= $max_lat) AND
   ($min_lon IS NULL OR toFloat(s.longigute) >= $min_lon) AND
   ($max_lon IS NULL OR toFloat(s.longigute) <= $max_lon)
RETURN 
    r.reaction_id AS reaction_id,
    r.name AS reaction_name,
    c.compound_id AS compound_id,
    c.name AS compound_name,
    c.smiles AS compound_smiles,
    p.protein_id AS catalyzing_protein_id,
    p.name AS catalyzing_protein_name,
    p.ec_numbers AS catalyzing_protein_ec_numbers,
    m.genome_id AS genome_id,
    m.gtdb_classification AS gtdb_classification,
    s.biosample_id AS sample_id,
    s.temperature AS sample_temperature,
    s.depth AS sample_depth,
    s.latitude AS sample_latitude,
    s.longigute AS sample_longitude
LIMIT $limit"""

# Compound IDs to query, mapped to the display name used in progress messages.
compounds = dict(
    cpd00304="Retinal",
    cpd00220="Riboflavin (B2)",
    cpd00016="Pyridoxal phosphate (B6)",
    cpd00017="S-Adenosyl-L-methionine (SAM-e)",
)

# Every compound currently carries the same value (10).
# NOTE(review): these keys do not match the display names in `compounds`, and
# no visible cell reads this dict -- confirm how it is meant to be used.
compound_market_sizes = dict.fromkeys(
    ("Retinal", "Riboflavin", "Pyridoxal phosphate", "SAM"),
    10,
)

# Parameters shared by every compound query. All sample filters are disabled
# (None), so only the row cap applies.
shared_params = {
    "min_temp": None,
    "max_temp": None,
    "min_lat": None,
    "max_lat": None,
    "min_lon": None,
    "max_lon": None,
    "limit": 100000,
}

# Open one connection up front and reuse it for every compound, instead of
# constructing a new Neo4jConnection on each iteration.
# NOTE(review): assumes Neo4jConnection.query may be called repeatedly on the
# same instance -- confirm against graph_db.db_connection.
conn = Neo4jConnection(uri, username, password)

# Query results per compound, keyed by compound ID.
compound_results = {}
for cpd_id, cpd_name in compounds.items():
    params = {"compound_id": cpd_id, **shared_params}
    compound_results[cpd_id] = conn.query(query, params)
    print(f"Found {len(compound_results[cpd_id])} protein hits for compound: {cpd_name}")

Found 13130 protein hits for compound: Retinal
Found 45080 protein hits for compound: Riboflavin (B2)
Found 90729 protein hits for compound: Pyridoxal phosphate (B6)
Found 52033 protein hits for compound: S-Adenosyl-L-methionine (SAM-e)


In [28]:
from src.io import save_results_to_tsv
        
# Write each compound's query results to its own TSV file, named after the
# compound ID.
for cpd_id, res in compound_results.items():
    tsv_path = f"outputs/compound_query/{cpd_id}.tsv"
    save_results_to_tsv(res, tsv_path)

Saved 13130 records to outputs/compound_query/cpd00304.tsv
Saved 45080 records to outputs/compound_query/cpd00220.tsv
Saved 90729 records to outputs/compound_query/cpd00016.tsv
Saved 52033 records to outputs/compound_query/cpd00017.tsv


In [41]:
# pandas is imported here because no other visible cell brings `pd` into
# scope; without it this cell raises NameError on a fresh kernel.
import pandas as pd

from src.visualization import plot_compound_statistics

# Reload the per-compound TSV files from outputs/compound_query/ (the paths
# the export cell writes to), keyed by compound ID.
processed_data = {
    cpd_id: pd.read_csv(f"outputs/compound_query/{cpd_id}.tsv", sep="\t")
    for cpd_id in compounds
}

plot_compound_statistics(
    processed_data,
    shp_file_path="data/maps/ne_110m_ocean/ne_110m_ocean.shp",
    output_dir="outputs/figures",
    base_font_size=14,
)

All figures have been saved to outputs/figures


<img src="outputs/figures/compound_cpd00016_statistics.png" alt="Statistics for cpd00016 (Pyridoxal phosphate)">
<br>
<img src="outputs/figures/compound_cpd00017_statistics.png" alt="Statistics for cpd00017 (S-Adenosyl-L-methionine)">
<br>
<img src="outputs/figures/compound_cpd00220_statistics.png" alt="Statistics for cpd00220 (Riboflavin)">
<br>
<img src="outputs/figures/compound_cpd00304_statistics.png" alt="Statistics for cpd00304 (Retinal)">

## Query by similar compound

In [5]:
# Cypher query: starting from the compound identified by $compound_id, follow
# CHEMICAL_SIMILARITY relationships (in either direction) to other compounds,
# keeping those whose relationship `distance` is <= $distance_threshold. For
# each similar compound, return the reactions it is a PRODUCT_OF and the
# proteins that catalyze them. Rows are ordered by ascending distance and
# capped at $limit.
#
# NOTE: this rebinds `query`, replacing the compound-ID query string defined
# in the earlier cell.
query = """
MATCH (c:Compound {compound_id: $compound_id})-[sim:CHEMICAL_SIMILARITY]-(similar:Compound)
MATCH (similar)-[:PRODUCT_OF]->(r:Reaction)
MATCH (p:Protein)-[:CATALYZES]->(r)
WHERE toFloat(sim.distance) <= $distance_threshold
RETURN 
    r.reaction_id AS reaction_id,
    r.name AS reaction_name,
    similar.compound_id AS similar_compound_id,
    similar.name AS similar_compound_name,
    similar.smiles AS similar_compound_smiles,
    toFloat(sim.distance) AS similarity_distance,
    p.protein_id AS catalyzing_protein_id,
    p.name AS catalyzing_protein_name,
    p.ec_numbers AS catalyzing_protein_ec_numbers
ORDER BY similarity_distance ASC
LIMIT $limit"""


# Query parameters: the seed compound, the maximum similarity distance to
# accept, and the row cap.
params = dict(
    compound_id="cpd00304",
    distance_threshold=0.9,
    limit=1000000,
)

conn = Neo4jConnection(uri, username, password)
results = conn.query(query, params)

# Preview the first returned record (assumes the query produced at least one hit).
results[0]

<Record reaction_id='rxn00209' reaction_name='Pyridoxine 5-phosphate:oxygen oxidoreductase' similar_compound_id='cpd00016' similar_compound_name='Pyridoxal phosphate' similar_compound_smiles='Cc1ncc(COP(=O)([O-])[O-])c(C=O)c1O' similarity_distance=0.859375 catalyzing_protein_id='OceanDNA-b43668_00045_5' catalyzing_protein_name="pyridoxamine 5'-phosphate oxidase [EC:1.4.3.5]" catalyzing_protein_ec_numbers=['1.4.3.5']>

In [6]:
from src.io import neo4j_records_to_dataframe

# Convert the similarity-query records to a DataFrame, passing the TSV path as
# `output_file`, then preview the first rows.
hits_tsv_path = "outputs/compound_query/KG_hits_cpd00304.tsv"
res_df = neo4j_records_to_dataframe(results, output_file=hits_tsv_path)
res_df.head()

DataFrame saved to outputs/compound_query/KG_hits_cpd00304.tsv


Unnamed: 0,reaction_id,reaction_name,similar_compound_id,similar_compound_name,similar_compound_smiles,similarity_distance,catalyzing_protein_id,catalyzing_protein_name,catalyzing_protein_ec_numbers
0,rxn00209,Pyridoxine 5-phosphate:oxygen oxidoreductase,cpd00016,Pyridoxal phosphate,Cc1ncc(COP(=O)([O-])[O-])c(C=O)c1O,0.859375,OceanDNA-b43668_00045_5,pyridoxamine 5'-phosphate oxidase [EC:1.4.3.5],[1.4.3.5]
1,rxn00209,Pyridoxine 5-phosphate:oxygen oxidoreductase,cpd00016,Pyridoxal phosphate,Cc1ncc(COP(=O)([O-])[O-])c(C=O)c1O,0.859375,OceanDNA-b29607_00006_12,pyridoxamine 5'-phosphate oxidase [EC:1.4.3.5],[1.4.3.5]
2,rxn00209,Pyridoxine 5-phosphate:oxygen oxidoreductase,cpd00016,Pyridoxal phosphate,Cc1ncc(COP(=O)([O-])[O-])c(C=O)c1O,0.859375,OceanDNA-b28865_00099_2,pyridoxamine 5'-phosphate oxidase [EC:1.4.3.5],[1.4.3.5]
3,rxn00209,Pyridoxine 5-phosphate:oxygen oxidoreductase,cpd00016,Pyridoxal phosphate,Cc1ncc(COP(=O)([O-])[O-])c(C=O)c1O,0.859375,OceanDNA-b32438_00177_2,pyridoxamine 5'-phosphate oxidase [EC:1.4.3.5],[1.4.3.5]
4,rxn00209,Pyridoxine 5-phosphate:oxygen oxidoreductase,cpd00016,Pyridoxal phosphate,Cc1ncc(COP(=O)([O-])[O-])c(C=O)c1O,0.859375,OceanDNA-b5580_00140_3,pyridoxamine 5'-phosphate oxidase [EC:1.4.3.5],[1.4.3.5]
