In [2]:
!pip install -qr requirements.txt

In [3]:
from dotenv import load_dotenv
import os

from graph_db.db_connection import Neo4jConnection

load_dotenv()
uri = os.getenv("NEO4J_URI")
username = os.getenv("NEO4J_USERNAME")
password = os.getenv("NEO4J_PASSWORD")

In [4]:
query = """
MATCH (p:Protein)
WHERE $ec_number IN p.ec_numbers
MATCH (r:Reaction)
WHERE any(ec IN r.ec_numbers WHERE ec IN p.ec_numbers)
OPTIONAL MATCH (m:Genome)-[:CONTAINS]->(p)
OPTIONAL MATCH (m)-[:ORIGINATED_FROM]->(s:Sample)
WHERE
   ($min_temp IS NULL OR toFloat(s.temperature) >= $min_temp) AND
   ($max_temp IS NULL OR toFloat(s.temperature) <= $max_temp) AND
   ($min_lat IS NULL OR toFloat(s.latitude) >= $min_lat) AND
   ($max_lat IS NULL OR toFloat(s.latitude) <= $max_lat) AND
   ($min_lon IS NULL OR toFloat(s.longigute) >= $min_lon) AND
   ($max_lon IS NULL OR toFloat(s.longigute) <= $max_lon)
RETURN 
   p.protein_id AS protein_id,
   p.name AS protein_name,
   p.ec_numbers AS protein_ec_numbers,
   m.genome_id AS genome_id,
   m.gtdb_classification AS gtdb_classification,
   s.biosample_id AS sample_id,
   s.temperature AS sample_temperature,
   s.depth AS sample_depth,
   s.latitude AS sample_latitude,
   s.longigute AS sample_longitude,
   collect(distinct r.reaction_id) AS associated_reaction_ids
LIMIT $limit"""

enzymes_dict = {
    "3.1.1.3": "Lipase",
    "3.2.1.1": "Amylase",
    "3.2.1.4": "Cellulase",
    "3.4.21.62": "Serine Protease",
    "3.2.1.23": "Lactase",
    "3.2.1.8": "Xylanase",
    "1.11.1.6": "Catalase"
}

market_sizes = {
    "Xylanase": 19100,
    "Serine Protease": 3540,
    "Amylase": 1840.8,
    "Cellulase": 1685.8,
    "Lactase": 1230,
    "Lipase": 591,
    "Catalase": 387.4
}

In [5]:
results = {}
for ec_number, enzyme in enzymes_dict.items():
    conn = Neo4jConnection(uri, username, password)
    params = {
      "ec_number": f"{ec_number}",
      "min_temp": None,
      "max_temp": None,
      "min_lat": None,
      "max_lat": None,
      "min_lon": None,
      "max_lon": None,
      "limit": 70000
    }
    results[ec_number] = conn.query(query, params)
    print(f"Found {len(results[ec_number])} results for {enzyme} ({ec_number})")

Found 12300 results for Lipase (3.1.1.3)
Found 4834 results for Amylase (3.2.1.1)
Found 3551 results for Cellulase (3.2.1.4)
Found 38 results for Serine Protease (3.4.21.62)
Found 10183 results for Lactase (3.2.1.23)
Found 6496 results for Xylanase (3.2.1.8)
Found 7003 results for Catalase (1.11.1.6)


In [6]:
r = results["1.11.1.6"][-1]
r.data()

{'protein_id': 'OceanDNA-b22900_00165_4',
 'protein_name': 'manganese catalase [EC:1.11.1.6]',
 'protein_ec_numbers': ['1.11.1.6'],
 'genome_id': 'OceanDNA-b22900',
 'gtdb_classification': 'd__Bacteria;p__Planctomycetota;c__Planctomycetes;o__Planctomycetales;f__Planctomycetaceae;g__Gimesia;s__Gimesia maris',
 'sample_id': 'SAMEA4397295',
 'sample_temperature': '-1.298837',
 'sample_depth': '35.0',
 'sample_latitude': '78.9343',
 'sample_longitude': '79.0758',
 'associated_reaction_ids': ['rxn00006',
  'rxn00427',
  'rxn01932',
  'rxn06212',
  'rxn11962',
  'rxn19264',
  'rxn22404',
  'rxn26776',
  'rxn26777',
  'rxn27744',
  'rxn28877',
  'rxn31381',
  'rxn31397',
  'rxn38673']}

### Save results to files

In [7]:
from src.io import save_results_to_tsv

for ec, res in results.items():
    save_results_to_tsv(res, f"outputs/enzyme_query/{ec}.tsv")

Saved 12300 records to outputs/enzyme_query/3.1.1.3.tsv
Saved 4834 records to outputs/enzyme_query/3.2.1.1.tsv
Saved 3551 records to outputs/enzyme_query/3.2.1.4.tsv
Saved 38 records to outputs/enzyme_query/3.4.21.62.tsv
Saved 10183 records to outputs/enzyme_query/3.2.1.23.tsv
Saved 6496 records to outputs/enzyme_query/3.2.1.8.tsv
Saved 7003 records to outputs/enzyme_query/1.11.1.6.tsv


## Retrieve protein sequences

In [None]:
import os
import pandas as pd
from src.databases import group_by_mag, retrieve_sequences

input_dir = "outputs/enzyme_query/"
output_dir = "outputs/enzyme_sequences/"
oceandna_dir = (
    "/home/ec2-user/SageMaker/efs/sandbox/sandbox/development/jolespin/"
    "EC2_WorkingDirectory/science/Databases/NewAtlantisPlanktonic/Sources/"
    "OceanDNA/Genomes/Prokaryotic/"
)

for filename in os.listdir(input_dir):
    if filename.endswith(".tsv"):
        # Construct full file paths
        input_file = os.path.join(input_dir, filename)
        output_file = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}.faa")

        # Read the TSV file
        df = pd.read_csv(input_file, sep="\t")
        protein_ids = df.protein_id.values.tolist()

        # Group proteins by MAG and retrieve sequences
        mag_groups = group_by_mag(protein_ids)
        retrieve_sequences(oceandna_dir, mag_groups, output_file)
        
        print(f"Processed {filename} and saved sequences to {output_file}")

Processed 3.2.1.1.tsv and saved sequences to outputs/enzyme_sequences/3.2.1.1.faa


In [24]:
import pandas as pd
from src.databases import group_by_mag, retrieve_sequences

protease = pd.read_csv("outputs/enzyme_query/KG_hits_3.4.21.62.tsv", sep="\t")
protease_ids = protease.protein_id.values.tolist()

oceandna_dir = "/home/ec2-user/SageMaker/efs/sandbox/sandbox/development/jolespin/EC2_WorkingDirectory/science/Databases/NewAtlantisPlanktonic/Sources/OceanDNA/Genomes/Prokaryotic/"
outfile = "outputs/enzyme_sequences/OceanDNA_3.4.21.62.faa"

mag_groups = group_by_mag(protease_ids)
retrieve_sequences(oceandna_dir, mag_groups, outfile)

## Reconstruct phylogenetic tree

In [3]:
from src.phylo import reconstruct_phylogenetic_tree, visualize_tree

faa_file = "outputs/enzyme_sequences/OceanDNA_3.4.21.62.faa"
output_tree_file = "outputs/tree_3.4.21.62.newick"

tree = reconstruct_phylogenetic_tree(faa_file, output_tree_file)
visualize_tree(output_tree_file, method='matplotlib', output_file="outputs/tree_3.4.21.62.png")

Tree visualization saved as 'outputs/tree_3.4.21.62.png_matplotlib.png'


## Candidate enzymes with high industrial relevance


| Enzyme Name | EC | Main Usage | Market Size (USD Million) | Year | Industry | Source |
|-------------|----|-----------|-----------------------------|------|----------|--------|
| Xylanase | 3.2.1.8 | Baking industry, animal feed, paper and pulp industry | 19,100 | 2023 | Baking industry, animal feed, paper and pulp industry | [Cognitive Market Research](https://www.cognitivemarketresearch.com/xylanase-market-report) |
| Serine Protease | 3.4.21.62 | Detergents, food processing, leather industry, pharmaceuticals | 3,540 | 2023 | Detergents, Food Processing, Pharmaceuticals, Animal Feed | [Mordor Intelligence](https://www.mordorintelligence.com/industry-reports/proteases-market/market-size) |
| Amylase | 3.2.1.1 | Starch processing, baking, brewing, textile industry | 1,840.8 | 2023 | Food & Beverage, Detergents, Textiles | [Persistence Market Research](https://www.persistencemarketresearch.com/market-research/alpha-amylase-market.asp) |
| Cellulase | 3.2.1.4 | Biofuel production, textile industry (stone-washing denim), paper and pulp industry | 1,685.8 | 2023 | Textiles, Biofuels, Paper & Pulp | [Future Market Insights](https://www.futuremarketinsights.com/reports/cellulase-market) |
| Lactase | 3.2.1.23 | Dairy industry (lactose-free products), food and beverage | 1,230 | 2023 | Dairy industry (lactose-free products), food and beverage | [Global Market Insights](https://www.gminsights.com/industry-analysis/lactase-market) |
| Lipase | 3.1.1.3 | Food industry (dairy, baking), detergents, biofuel production | 591 | 2023 | Detergents, Food Processing, Pharmaceuticals, Personal Care | [Global Market Insights](https://www.gminsights.com/industry-analysis/microbial-lipase-market) |
| Catalase | 1.11.1.6 | Food preservation, textile industry (bleaching), cosmetics | 387.4 | 2022 | Food preservation, textile industry (bleaching), cosmetics | [Future Market Insights](https://www.futuremarketinsights.com/reports/catalase-market) |

In [6]:
from src.io import filter_ocean_data

enzymes_dict = {
    "3.1.1.3": "Lipase",
    "3.2.1.1": "Amylase",
    "3.2.1.4": "Cellulase",
    "3.4.21.62": "Serine Protease",
    "3.2.1.23": "Lactase",
    "3.2.1.8": "Xylanase",
    "1.11.1.6": "Catalase"
}

bboxes = {
    "3.1.1.3": {"lat": (7.01, 39.50), "long": (-39.02, -6.15)},
    "3.2.1.1": {"lat": (24.17, 39.67), "long": (-80.95, -53.35)},
    "3.2.1.4": {"lat": (-13.5, 10), "long": (-93.3, -76.5)},
    "3.4.21.62": {"lat": None, "long": None},
    "3.2.1.23": {"lat": None, "long": None},
    "3.2.1.8": {"lat": None, "long": None},
    "1.11.1.6": {"lat": None, "long": None},
}

filtered_data = {}
for ec, name in enzymes_dict.items():
    file_path = f"outputs/enzyme_query/KG_hits_{ec}.tsv"
    df = filter_ocean_data(
        file_path,
        depth_range=(100, 10000),
        temperature_range=(-5, 40),
        latitude_range=bboxes[ec]["lat"],
        longitude_range=bboxes[ec]["long"]
    )
    if not df.empty:
        filtered_data[ec] = df
    else:
        print(f"No records found for filtering conditions for ec: {ec}")

## Represent query results

In [7]:
from src.visualization import process_results, plot_ec_number_statistics

enzymes_dict = {
    "3.1.1.3": "Lipase",
    "3.2.1.1": "Amylase",
    "3.2.1.4": "Cellulase",
    "3.4.21.62": "Serine Protease",
    "3.2.1.23": "Lactase",
    "3.2.1.8": "Xylanase",
    "1.11.1.6": "Catalase"
}

market_sizes = {
    "Xylanase": 19100,
    "Serine Protease": 3540,
    "Amylase": 1840.8,
    "Cellulase": 1685.8,
    "Lactase": 1230,
    "Lipase": 591,
    "Catalase": 387.4
}

plot_ec_number_statistics(
    filtered_data,
    shp_file_path="data/maps/ne_110m_ocean/ne_110m_ocean.shp",
    enzymes_dict=enzymes_dict,
    market_sizes=market_sizes,
    output_dir="outputs/figures",
    base_font_size=14,
    )

All figures have been saved to outputs/figures


![results1](outputs/figures/ec_number_3.2.1.8_statistics.png)
<br>
![results1](outputs/figures/ec_number_3.2.1.1_statistics.png)
<br>
![results1](outputs/figures/ec_number_3.2.1.4_statistics.png)
<br>
![results1](outputs/figures/ec_number_3.1.1.3_statistics.png)