In [1]:
from dotenv import load_dotenv
import os

from graph_db.db_connection import Neo4jConnection

# Populate the process environment (presumably from a local .env file via
# python-dotenv), then read the three Neo4j connection settings from it.
# Any setting that is not defined comes back as None.
load_dotenv()
uri, username, password = (
    os.environ.get(env_name)
    for env_name in ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD")
)

# Query results keyed by EC number; filled in by the query loop further down.
results = {}

In [2]:
# Cypher query: proteins annotated with a given EC number, plus the genome that
# contains each protein and the sample that genome originated from.
#
# Parameters: $ec_number, $min_temp/$max_temp, $min_lat/$max_lat,
# $min_lon/$max_lon (each bound is ignored when NULL) and $limit.
#
# NOTE(review): the previous version returned `s.longigute` while filtering on
# `s.longitude`. Which spelling the Sample nodes actually carry cannot be told
# from this notebook, so both are read through coalesce(); the filter and the
# returned column now always agree. Drop the misspelled key once the stored
# property name is confirmed.
#
# NOTE(review): the range predicates belong to the second OPTIONAL MATCH, so a
# sample outside the bounds should come back as NULL sample columns rather than
# removing the protein row - confirm this is the intended filtering behaviour.
query = """
MATCH (p:Protein)
WHERE $ec_number IN p.ec_numbers
OPTIONAL MATCH (m:Genome)-[:CONTAINS]->(p)
OPTIONAL MATCH (m)-[:ORIGINATED_FROM]->(s:Sample)
WHERE
   ($min_temp IS NULL OR toFloat(s.temperature) >= $min_temp) AND
   ($max_temp IS NULL OR toFloat(s.temperature) <= $max_temp) AND
   ($min_lat IS NULL OR toFloat(s.latitude) >= $min_lat) AND
   ($max_lat IS NULL OR toFloat(s.latitude) <= $max_lat) AND
   ($min_lon IS NULL OR toFloat(coalesce(s.longitude, s.longigute)) >= $min_lon) AND
   ($max_lon IS NULL OR toFloat(coalesce(s.longitude, s.longigute)) <= $max_lon)
RETURN
   p.protein_id AS protein_id,
   p.name AS protein_name,
   p.ec_numbers AS protein_ec_numbers,
   m.genome_id AS genome_id,
   m.gtdb_classification AS gtdb_classification,
   s.biosample_id AS sample_id,
   s.temperature AS sample_temperature,
   s.depth AS sample_depth,
   s.latitude AS sample_latitude,
   coalesce(s.longitude, s.longigute) AS sample_longitude
LIMIT $limit"""

# EC number -> enzyme name. Insertion order is the order in which the enzymes
# are queried and reported.
enzymes_dict = dict(
    [
        ("3.1.1.3", "Lipase"),
        ("3.2.1.1", "Amylase"),
        ("3.2.1.4", "Cellulase"),
        ("3.4.21.62", "Serine Protease"),
        ("3.2.1.23", "Lactase"),
        ("3.2.1.8", "Xylanase"),
        ("1.11.1.6", "Catalase"),
    ]
)

# Enzyme name -> market size; the figures print this value as "$<value> million".
market_sizes = dict(
    [
        ("Xylanase", 19100),
        ("Serine Protease", 3540),
        ("Amylase", 1840.8),
        ("Cellulase", 1685.8),
        ("Lactase", 1230),
        ("Lipase", 591),
        ("Catalase", 387.4),
    ]
)

In [3]:
# Run the same query once per enzyme. Every environmental bound is passed as
# None, so the `$param IS NULL` guards in the query switch all sample filters off.
for ec_number, enzyme in enzymes_dict.items():
    params = dict(
        ec_number=f"{ec_number}",
        min_temp=None,
        max_temp=None,
        min_lat=None,
        max_lat=None,
        min_lon=None,
        max_lon=None,
        limit=50000,
    )

    # A fresh connection object is created for every EC number.
    conn = Neo4jConnection(uri, username, password)
    results[ec_number] = conn.query(query, params)

    hit_count = len(results[ec_number])
    print(f"Found {hit_count} results for {enzyme} ({ec_number})")

Found 12300 results for Lipase (3.1.1.3)
Found 4834 results for Amylase (3.2.1.1)
Found 3551 results for Cellulase (3.2.1.4)
Found 38 results for Serine Protease (3.4.21.62)
Found 10183 results for Lactase (3.2.1.23)
Found 6496 results for Xylanase (3.2.1.8)
Found 7003 results for Catalase (1.11.1.6)


### Save results to pickle file

In [4]:
import pickle

# Specify the file path where you want to save the pickle file
pickle_file_path = 'outputs/results.pickle'

# open(..., 'wb') does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(pickle_file_path), exist_ok=True)

# Save the results dictionary to the pickle file.
# Only load this file back from a trusted location: unpickling runs arbitrary code.
with open(pickle_file_path, 'wb') as file:
    pickle.dump(results, file)

In [8]:
# Peek at the last record returned for EC 1.11.1.6 (catalase) to see what a raw
# query result looks like.
r = results["1.11.1.6"][-1]
r

<Record protein_id='OceanDNA-b22900_00165_4' protein_name='manganese catalase [EC:1.11.1.6]' protein_ec_numbers='1.11.1.6' genome_id='OceanDNA-b22900' gtdb_classification='d__Bacteria;p__Planctomycetota;c__Planctomycetes;o__Planctomycetales;f__Planctomycetaceae;g__Gimesia;s__Gimesia maris' sample_id='SAMEA4397295' sample_temperature='-1.298837' sample_depth='35.0' sample_latitude='78.9343' sample_longitude='79.0758'>

In [10]:
# The same record via .data(); the cell output shows it as a plain dict keyed by
# the query's column aliases.
r.data()

{'protein_id': 'OceanDNA-b22900_00165_4',
 'protein_name': 'manganese catalase [EC:1.11.1.6]',
 'protein_ec_numbers': '1.11.1.6',
 'genome_id': 'OceanDNA-b22900',
 'gtdb_classification': 'd__Bacteria;p__Planctomycetota;c__Planctomycetes;o__Planctomycetales;f__Planctomycetaceae;g__Gimesia;s__Gimesia maris',
 'sample_id': 'SAMEA4397295',
 'sample_temperature': '-1.298837',
 'sample_depth': '35.0',
 'sample_latitude': '78.9343',
 'sample_longitude': '79.0758'}

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import re


def process_results(results):
    """Turn raw query results into one DataFrame per EC number.

    Args:
        results: Mapping of EC number to an iterable of records, each exposing
            a ``data()`` method that returns a dict of column values.

    Returns:
        dict: EC number -> DataFrame. Every frame gains a ``phylum`` column
        ('Unknown' when no classification is available), and the sample
        temperature/depth/latitude/longitude columns, where present, are
        converted to numbers with unparseable values becoming NaN.
    """
    to_numeric = ('sample_temperature', 'sample_depth', 'sample_latitude', 'sample_longitude')
    frames = {}

    for ec_number, query_results in results.items():
        frame = pd.DataFrame([record.data() for record in query_results])

        # Derive the phylum from the GTDB lineage string when that column is present.
        if 'gtdb_classification' not in frame.columns:
            frame['phylum'] = 'Unknown'
        else:
            frame['phylum'] = frame['gtdb_classification'].apply(
                lambda lineage: extract_phylum(lineage) if pd.notna(lineage) else 'Unknown'
            )

        # Values arrive as strings; coerce whichever numeric columns exist.
        for column in to_numeric:
            if column in frame.columns:
                frame[column] = pd.to_numeric(frame[column], errors='coerce')

        frames[ec_number] = frame

    return frames

def extract_phylum(classification_string):
    """Return the phylum named in a GTDB-style lineage string.

    The phylum is the text following ``p__`` up to the next ``;`` (or the end
    of the string).

    Args:
        classification_string: Lineage such as
            ``'d__Bacteria;p__Planctomycetota;c__Planctomycetes'``.

    Returns:
        str: The phylum name, or ``'Unknown'`` when the input is not a string
        or carries no non-empty ``p__`` field.
    """
    # Explicit type guard instead of a bare `except:`: non-string input (None,
    # NaN, bytes, ...) still maps to 'Unknown', but KeyboardInterrupt and
    # genuine programming errors are no longer silently swallowed.
    if not isinstance(classification_string, str):
        return 'Unknown'

    match = re.search(r'p__([^;]+)', classification_string)
    return match.group(1) if match else 'Unknown'

def plot_ec_number_statistics(processed_data, shp_file_path, enzymes_dict, market_sizes, base_font_size=12):
    """Render and save one summary figure per EC number.

    Each figure contains a title banner, a text summary (hit count, market
    size, sample temperature/depth/latitude/longitude ranges and the most
    frequent phyla), a scatter of sample coordinates drawn over the shapefile,
    and a pie chart of phylum shares. Figures are written to
    ``outputs/figures/ec_number_<ec_number>_statistics.png``; the folder is
    created if it does not exist.

    Args:
        processed_data: Mapping of EC number to a DataFrame with the columns
            ``sample_temperature``, ``sample_depth``, ``sample_latitude``,
            ``sample_longitude`` and ``phylum``.
        shp_file_path: Path of the shapefile drawn as the map background.
        enzymes_dict: Mapping of EC number to enzyme name.
        market_sizes: Mapping of enzyme name to market size, printed as
            "$<value> million".
        base_font_size: Base font size that all text elements are scaled from.

    Returns:
        None. The function's only effect is the PNG files it writes (and a
        printed notice for every EC number that is skipped).
    """
    # savefig does not create missing directories.
    output_dir = 'outputs/figures'
    os.makedirs(output_dir, exist_ok=True)

    world = gpd.read_file(shp_file_path)

    for ec_number, df in processed_data.items():
        # An EC number with zero hits yields a frame without the sample columns;
        # skip it instead of failing with a KeyError halfway through the batch.
        if df.empty:
            print(f"Skipping {ec_number}: no hits to plot")
            continue

        fig = plt.figure(figsize=(24, 8))  # Adjusted for single row layout
        gs = fig.add_gridspec(2, 3, height_ratios=[0.1, 1])

        enzyme_name = enzymes_dict.get(ec_number, "Unknown Enzyme")
        market_size = market_sizes.get(enzyme_name, "N/A")

        # Title banner spanning the whole top row
        ax_title = fig.add_subplot(gs[0, :])
        ax_title.axis('off')
        ax_title.text(0.5, 0.5, f'Statistics for EC Number: {ec_number} - {enzyme_name}\nTotal Hits: {len(df)} | Market Size (2023): ${market_size} million',
                      fontsize=base_font_size*1.8, ha='center', va='center', fontweight='bold')

        # Summary text
        ax_summary = fig.add_subplot(gs[1, 0])
        summary_text = f"""
Enzyme: {enzyme_name}
EC Number: {ec_number}
Number of Hits: {len(df)}
Market Size (2023): ${market_size} million

Temperature Range: {df['sample_temperature'].min():.2f} to {df['sample_temperature'].max():.2f} °C
Depth Range: {df['sample_depth'].min():.2f} to {df['sample_depth'].max():.2f} m
Latitude Range: {df['sample_latitude'].min():.2f} to {df['sample_latitude'].max():.2f}
Longitude Range: {df['sample_longitude'].min():.2f} to {df['sample_longitude'].max():.2f}

Top 5 Phyla:
{df['phylum'].value_counts().head().to_string()}
"""
        ax_summary.text(0.05, 0.95, summary_text, fontsize=base_font_size*1.1,
                        transform=ax_summary.transAxes, ha='left', va='top', linespacing=1.5)
        ax_summary.axis('off')

        # Map with sample locations
        ax_map = fig.add_subplot(gs[1, 1])
        world.plot(ax=ax_map, color='lightgrey', edgecolor='black')
        ax_map.scatter(df['sample_longitude'], df['sample_latitude'], c='red', s=10, alpha=0.7)
        ax_map.set_title('Sample Locations', fontsize=base_font_size*1.5)
        ax_map.set_xlabel('Longitude', fontsize=base_font_size*1.2)
        ax_map.set_ylabel('Latitude', fontsize=base_font_size*1.2)
        ax_map.set_xlim(-180, 180)
        ax_map.set_ylim(-90, 90)
        ax_map.tick_params(labelsize=base_font_size)

        # Phylum distribution: phyla below 2% of the hits are pooled into 'Other'
        ax_pie = fig.add_subplot(gs[1, 2])
        phylum_counts = df['phylum'].value_counts()
        other_threshold = 0.02
        other_mask = phylum_counts / phylum_counts.sum() < other_threshold
        other_count = phylum_counts[other_mask].sum()
        phylum_counts_grouped = phylum_counts[~other_mask].copy()
        # Only add the pooled wedge when something was pooled; otherwise the
        # chart would carry an empty, zero-percent 'Other' label.
        if other_count > 0:
            phylum_counts_grouped['Other'] = other_count
        wedges, texts, autotexts = ax_pie.pie(phylum_counts_grouped.values,
                                              labels=phylum_counts_grouped.index,
                                              autopct='%1.1f%%',
                                              startangle=90,
                                              textprops={'fontsize': base_font_size*0.8})
        ax_pie.set_title('Phylum Distribution', fontsize=base_font_size*1.5)

        # Adjust legend for pie chart
        plt.setp(autotexts, size=base_font_size*0.8, weight="bold")
        ax_pie.legend(wedges, phylum_counts_grouped.index,
                      title="Phyla",
                      loc="center left",
                      bbox_to_anchor=(1, 0, 0.5, 1),
                      fontsize=base_font_size*0.8)

        fig.tight_layout()
        fig.savefig(f'{output_dir}/ec_number_{ec_number}_statistics.png', dpi=300, bbox_inches='tight')
        # Close this specific figure so figures do not accumulate across the loop.
        plt.close(fig)


# Main execution: build one DataFrame per EC number, then write one summary
# figure for each of them.
processed_data = process_results(results)

plot_ec_number_statistics(
    processed_data,
    'data/maps/ne_110m_ocean/ne_110m_ocean.shp',
    enzymes_dict,
    market_sizes,
    base_font_size=14,
)

## Candidate enzymes with high industrial relevance


| Enzyme Name | EC | Main Usage | Market Size (USD Million) | Year | Industry | Source |
|-------------|----|-----------|-----------------------------|------|----------|--------|
| Xylanase | 3.2.1.8 | Baking industry, animal feed, paper and pulp industry | 19,100 | 2023 | Baking industry, animal feed, paper and pulp industry | [Cognitive Market Research](https://www.cognitivemarketresearch.com/xylanase-market-report) |
| Serine Protease | 3.4.21.62 | Detergents, food processing, leather industry, pharmaceuticals | 3,540 | 2023 | Detergents, Food Processing, Pharmaceuticals, Animal Feed | [Mordor Intelligence](https://www.mordorintelligence.com/industry-reports/proteases-market/market-size) |
| Amylase | 3.2.1.1 | Starch processing, baking, brewing, textile industry | 1,840.8 | 2023 | Food & Beverage, Detergents, Textiles | [Persistence Market Research](https://www.persistencemarketresearch.com/market-research/alpha-amylase-market.asp) |
| Cellulase | 3.2.1.4 | Biofuel production, textile industry (stone-washing denim), paper and pulp industry | 1,685.8 | 2023 | Textiles, Biofuels, Paper & Pulp | [Future Market Insights](https://www.futuremarketinsights.com/reports/cellulase-market) |
| Lactase | 3.2.1.23 | Dairy industry (lactose-free products), food and beverage | 1,230 | 2023 | Dairy industry (lactose-free products), food and beverage | [Global Market Insights](https://www.gminsights.com/industry-analysis/lactase-market) |
| Lipase | 3.1.1.3 | Food industry (dairy, baking), detergents, biofuel production | 591 | 2023 | Detergents, Food Processing, Pharmaceuticals, Personal Care | [Global Market Insights](https://www.gminsights.com/industry-analysis/microbial-lipase-market) |
| Catalase | 1.11.1.6 | Food preservation, textile industry (bleaching), cosmetics | 387.4 | 2022 | Food preservation, textile industry (bleaching), cosmetics | [Future Market Insights](https://www.futuremarketinsights.com/reports/catalase-market) |