In [1]:
from dotenv import load_dotenv
import os

from graph_db.db_connection import Neo4jConnection

load_dotenv()
uri = os.getenv("NEO4J_URI")
username = os.getenv("NEO4J_USERNAME")
password = os.getenv("NEO4J_PASSWORD")

In [2]:
conn = Neo4jConnection(uri, username, password)

In [None]:
conn.query("MATCH (n) RETURN n LIMIT 10")

## Compounds and reactions with available CATALYZES relation

In [4]:
import pandas as pd

df = pd.read_csv("available_enzymes.csv")
df

Unnamed: 0,reaction_id,ec_numbers,name,products
0,rxn00001,[3.6.1.1],diphosphate phosphohydrolase,"[ (2) cpd00009[0], (1) cpd00067[0]]"
1,rxn00002,[3.5.1.54],urea-1-carboxylate amidohydrolase,"[ (2) cpd00011[0], (2) cpd00013[0]]"
2,rxn00003,[2.2.1.6],pyruvate:pyruvate acetaldehydetransferase (dec...,"[ (2) cpd00020[0], (1) cpd00067[0]]"
3,rxn00004,[4.1.3.17],4-hydroxy-4-methyl-2-oxoglutarate pyruvate-lya...,[ (2) cpd00020[0]]
4,rxn00006,"[1.11.1.21, 1.11.1.6]",hydrogen-peroxide:hydrogen-peroxide oxidoreduc...,"[ (2) cpd00001[0], (1) cpd00007[0]]"
...,...,...,...,...
120,rxn00238,[2.7.4.6],ATP:nucleoside-diphosphate phosphotransferase,"[ (1) cpd00008[0], (1) cpd00173[0]]"
121,rxn00241,"[3.6.1.15, 3.6.1.5, 3.6.5.1, 3.6.5.2, 3.6.5.3,...",GTP phosphohydrolase,"[ (1) cpd00009[0], (1) cpd00031[0], (1) cpd000..."
122,rxn00279,[4.1.1.12],L-aspartate 4-carboxy-lyase (L-alanine-forming),"[ (1) cpd00011[0], (1) cpd00035[0]]"
123,rxn00281,[2.3.1.-],rxn00281,"[ (1) cpd00010[0], (1) cpd02121[0]]"


In [5]:
import pandas as pd
import re
from typing import List

def extract_unique_compound_ids(df: pd.DataFrame, column_name: str) -> List[str]:
    # Join all strings in the column
    all_products = ' '.join(df[column_name].astype(str))
    
    # Use regex to find all compound IDs
    compound_ids = re.findall(r'cpd\d+', all_products)
    
    # Return unique compound IDs
    return list(set(compound_ids))

def extract_unique_ec_numbers(df: pd.DataFrame, column_name: str) -> List[str]:
    # Join all strings in the column
    all_ec_numbers = ' '.join(df[column_name].astype(str))
    
    # Use regex to find all EC numbers
    ec_numbers = re.findall(r'\d+\.\d+\.\d+\.[-\d]+', all_ec_numbers)
    
    # Return unique EC numbers
    return list(set(ec_numbers))

unique_compounds = extract_unique_compound_ids(df, 'products')
unique_ec = extract_unique_ec_numbers(df, 'ec_numbers')

print("Unique compound IDs:")
print(unique_compounds)

print("Unique EC numbers:")
print(unique_ec)

Unique compound IDs:
['cpd01414', 'cpd01569', 'cpd00056', 'cpd00071', 'cpd00279', 'cpd07924', 'cpd00027', 'cpd00186', 'cpd00477', 'cpd00110', 'cpd00009', 'cpd00755', 'cpd00173', 'cpd00023', 'cpd02508', 'cpd00355', 'cpd01770', 'cpd05167', 'cpd00169', 'cpd00196', 'cpd02227', 'cpd00122', 'cpd00012', 'cpd00002', 'cpd02097', 'cpd00038', 'cpd00128', 'cpd00061', 'cpd00109', 'cpd00013', 'cpd00190', 'cpd00304', 'cpd00008', 'cpd00025', 'cpd00005', 'cpd02311', 'cpd00146', 'cpd00031', 'cpd00020', 'cpd00022', 'cpd00689', 'cpd01588', 'cpd00018', 'cpd00924', 'cpd03802', 'cpd00281', 'cpd00015', 'cpd00007', 'cpd00035', 'cpd00014', 'cpd02121', 'cpd00686', 'cpd00003', 'cpd00024', 'cpd00004', 'cpd00001', 'cpd00053', 'cpd00843', 'cpd00016', 'cpd00017', 'cpd00447', 'cpd10516', 'cpd00010', 'cpd00006', 'cpd00199', 'cpd00165', 'cpd00091', 'cpd00103', 'cpd00797', 'cpd00062', 'cpd03460', 'cpd00101', 'cpd00182', 'cpd00050', 'cpd00111', 'cpd00067', 'cpd00011', 'cpd00837', 'cpd02882', 'cpd00421', 'cpd02469', 'cpd02

In [1]:
from src.databases import extract_reaction_data

reactions_path = "data/modelSEED/reactions.json"
compounds_path = "data/modelSEED/compounds.json"

reactions, compounds = extract_reaction_data(reactions_path, compounds_path)

In [2]:
[r for r in reactions if r["id"] == "rxn18035"][0]

{'abbreviation': '3.4.11.4-RXN.c',
 'abstract_reaction': None,
 'aliases': ['EcoCyc: 3.4.11.4-RXN',
  'MetaCyc: 3.4.11.4-RXN',
  'Name: alanine-phenylalanine-proline arylamidase; aminoexotripeptidase; aminotripeptidase; imidoendopeptidase; lymphopeptidase; peptidase B; peptidase T; tripeptidase; tripeptide aminopeptidase'],
 'code': '(1) cpd00001[0] + (1) cpd28237[0] <=> (1) cpd22369[0] + (1) cpd26871[0]',
 'compound_ids': 'cpd00001;cpd22369;cpd26871;cpd28237',
 'definition': '(1) H2O[0] + (1) TRIPEPTIDES[0] <=> (1) Amino-Acids-20[0] + (1) DIPEPTIDES[0]',
 'deltag': 10000000.0,
 'deltagerr': 10000000.0,
 'direction': '=',
 'ec_numbers': ['3.4.11.4'],
 'equation': '(1) cpd00001[0] + (1) cpd28237[0] <=> (1) cpd22369[0] + (1) cpd26871[0]',
 'id': 'rxn18035',
 'is_obsolete': 0,
 'is_transport': 0,
 'linked_reaction': None,
 'name': 'aminotripeptidase',
 'notes': ['GCP', 'EQP'],
 'pathways': None,
 'reversibility': '?',
 'source': 'Primary Database',
 'status': 'OK',
 'stoichiometry': '-1:c

In [8]:
import pandas as pd
from typing import List, Dict

def create_compound_dataframe(unique_ids: List[str], compound_dicts: List[Dict]) -> pd.DataFrame:
    # Create a dictionary to store the compound information
    compound_info = {}
    
    # Create a lookup dictionary for faster access
    compound_lookup = {d['id']: d for d in compound_dicts}
    
    # Iterate through the unique IDs
    for compound_id in unique_ids:
        if compound_id in compound_lookup:
            compound = compound_lookup[compound_id]
            compound_info[compound_id] = {
                'name': compound['name'],
                'smiles': compound['smiles']
            }
        else:
            # If the compound is not found in the lookup, add it with empty values
            compound_info[compound_id] = {
                'name': '',
                'smiles': ''
            }
    
    # Create a DataFrame from the compound_info dictionary
    df = pd.DataFrame.from_dict(compound_info, orient='index', columns=['name', 'smiles'])
    
    # Reset the index to make the compound ID a column
    df.reset_index(inplace=True)
    df.columns = ['compound_id', 'name', 'smiles']
    
    return df



available_compounds = create_compound_dataframe(unique_compounds, compounds)
available_compounds.to_csv("available_compounds.tsv", sep="\t", index=False)
available_compounds.head()

Unnamed: 0,compound_id,name,smiles
0,cpd01414,Tetrathionate,O=S(=O)([O-])SSS(=O)(=O)[O-]
1,cpd01569,"1,2-Benzoquinone",O=C1C=CC=CC1=O
2,cpd00056,TPP,Cc1ncc(C[n+]2csc(CCOP(=O)([O-])OP(=O)([O-])O)c...
3,cpd00071,Acetaldehyde,CC=O
4,cpd00279,Acetoacetyl-CoA,CC(=O)CC(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)CO...


## Assigning protein IDs to BGCs

In [None]:
!zcat /home/ec2-user/SageMaker/efs/sandbox/sandbox/development/epereira/case_studies/oceandna/results/oceandna_bgc_gene_ids/oceandna_bgc_contig_id2coords2gene_ids.tsv.gz | head -n 5

In [None]:
file_path = (
    "/home/ec2-user/SageMaker/efs/sandbox/sandbox/development/epereira/case_studies/
    "oceandna/results/oceandna_bgc_gene_ids/oceandna_bgc_contig_id2coords2gene_ids.tsv.gz"
)

