In [1]:
import logging
import os

import matplotlib.pyplot as plt
import numpy as np
from dotenv import load_dotenv
from pyeed import Pyeed

In [None]:
# Location of the BLAST DNA data used by this notebook.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs on other machines.
path_to_data_blast = "/home/nab/Niklas/TEM-lactamase/data/003_data_pull/blast_data_dna/2025-01-19_12-37-48"


# The Neo4j password is read from the environment (populated from .env by
# load_dotenv) so that it never appears in the notebook source.
load_dotenv()
password = os.getenv("NEO4J_NIKLAS_TEM_CLEAN")
# An unset OR empty variable is rejected here, with a message that names the
# variable actually being read, instead of failing later at connect time.
if not password:
    raise ValueError(
        "NEO4J_NIKLAS_TEM_CLEAN is not set in the environment or the .env file."
    )


logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
LOGGER = logging.getLogger(__name__)


# Connect to the Neo4j instance and set up the database constraints.
# NOTE(review): the recorded output of this cell prints the full connection URL,
# including the password — clear the cell output before sharing or committing.
uri = "bolt://129.69.129.130:2123"
user = "neo4j"
eedb = Pyeed(uri, user=user, password=password)
eedb.db.initialize_db_constraints(user, password)

📡 Connected to database.
the connection url is bolt://neo4j:<redacted>@129.69.129.130:2123


In [26]:
# Read in all TEM-lactamase proteins (258 expected, per the original note) and
# collect their accession ids: for each TEM name, the protein node's own
# accession_id plus any ids stored under its IdenticalIds property.
data_tem_ids = {}

base_url_tem_family_card = 'http://purl.obolibrary.org/obo/ARO_3000014'
# Neighbour of the family node that must not be treated as a TEM variant.
# NOTE(review): presumably a non-variant term (e.g. the parent) that the
# undirected match also reaches — confirm against the ontology.
excluded_neighbour_url = 'http://purl.obolibrary.org/obo/ARO_3000078'

# Get all direct neighbours (children) of the TEM-lactamase family node.
# The URL is passed as a query parameter instead of being interpolated into the
# Cypher text, so special characters in a value can never alter the query.
query = """
MATCH (o:OntologyObject {name: $name})-[*1..1]-(n) RETURN n
"""

result = eedb.db.execute_read(query, parameters={"name": base_url_tem_family_card})

# Same lookup for every TEM term: the protein node(s) directly attached to the
# ontology object. With a parameter the query text is loop-invariant.
query_tem_url = """
MATCH (o:OntologyObject {name: $name})-[*1..1]-(n:Protein) RETURN n
"""

for single_tem in result:
    if single_tem['n']['name'] == excluded_neighbour_url:
        continue
    tem_name = single_tem['n']['label']
    tem_url = single_tem['n']['name']

    # Look up the protein attached to this ontology URL; terms without a
    # protein node are skipped.
    result_tem_url = eedb.db.execute_read(query_tem_url, parameters={"name": tem_url})
    if len(result_tem_url) == 0:
        continue
    # Only the first matching protein is used.
    result_tem_url = result_tem_url[0]

    # IdenticalIds is optional on the protein node; when it is missing only the
    # node's own accession_id is recorded.
    data_tem_ids[tem_name] = (
        result_tem_url['n'].get('IdenticalIds', []) + [result_tem_url['n']['accession_id']]
    )


print(data_tem_ids)
print(len(data_tem_ids))
print(data_tem_ids['TEM-1'])
print(data_tem_ids['TEM-10'])
    

{'TEM-52': ['CAA73933.1', 'WP_015058977.1'], 'TEM-49': ['CAA71324.1'], 'TEM-48': ['CAA71323.1'], 'TEM-47': ['CAA71322.1', 'WP_063864914.1'], 'TEM-45': ['CAA64682.1'], 'TEM-43': ['WP_063864912.1', 'AAC32889.2'], 'TEM-42': ['CAA66659.1'], 'TEM-40': ['WP_021526512.1', 'CBX53726.1'], 'TEM-39': ['WP_148044474.1'], 'TEM-37': ['WP_159373457.1'], 'TEM-36': ['WP_075985685.1'], 'TEM-35': ['WP_063864910.1'], 'TEM-34': ['WP_015379489.1', 'AGE11905.1'], 'TEM-33': ['ADL13944.1', 'WP_013279314.1'], 'TEM-32': ['WP_052944427.1'], 'TEM-31': ['WP_165539487.1'], 'TEM-30': ['CAD24670.1'], 'TEM-29': ['CAA76796.1', 'WP_032490103.1'], 'TEM-28': ['AAC32891.1'], 'TEM-26': ['WP_047028173.1'], 'TEM-24': ['CAA46345.1'], 'TEM-22': ['CAA76795.1'], 'TEM-21': ['CAA76794.1'], 'TEM-20': ['CAA76793.1', 'WP_063864893.1'], 'TEM-19': ['AFN21551.1'], 'TEM-17': ['CAA74912.2'], 'TEM-16': ['WP_063864870.1', 'CAA46346.1'], 'TEM-15': ['CAO98721.1'], 'TEM-12': ['WP_042065300.1', 'AAA25053.1'], 'TEM-11': ['WP_063864800.1', 'AAW6660