In [17]:
%reload_ext autoreload
%autoreload 2

import logging
import os

import itertools
from dotenv import load_dotenv
from pyeed import Pyeed
from pyeed.analysis.embedding_analysis import EmbeddingTool
from pyeed.analysis.mutation_detection import MutationDetection
from pyeed.analysis.standard_numbering import StandardNumberingTool

In [18]:
# Directory holding the BLAST pull this analysis is based on.
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
path_to_data_blast = "/home/nab/Niklas/TEM-lactamase/data/003_data_pull/blast_data_dna/2025-01-19_12-37-48"


# The Neo4j password comes from the environment (load_dotenv() reads the .env file),
# so it never has to be written into the notebook itself.
load_dotenv()
password = os.getenv("NEO4J_NIKLAS_TEM_CLEAN")
if password is None:
    # Name the actual variable so the failure tells the reader what to add to .env.
    raise ValueError("NEO4J_NIKLAS_TEM_CLEAN is not set in the .env file.")


# Notebook-wide logger: timestamp, level and message at INFO and above.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
LOGGER = logging.getLogger(__name__)


# Open the pyeed connection to the Neo4j instance and set up its constraints
# (the recorded output shows constraints that already exist being reported and skipped).
uri = "bolt://129.69.129.130:2123"
user = "neo4j"
eedb = Pyeed(uri, user=user, password=password)
eedb.db.initialize_db_constraints(user, password)

📡 Connected to database.
the connection url is bolt://neo4j:<redacted>@129.69.129.130:2123
Loaded /home/nab/Niklas/pyeed/src/pyeed/model.py
Connecting to bolt://neo4j:<redacted>@129.69.129.130:2123
Setting up indexes and constraints...

Found model.StrictStructuredNode
 ! Skipping class model.StrictStructuredNode is abstract
Found model.Organism
 + Creating node unique constraint for taxonomy_id on label Organism for class model.Organism
{code: Neo.ClientError.Schema.EquivalentSchemaRuleAlreadyExists} {message: An equivalent constraint already exists, 'Constraint( id=4, name='constraint_unique_Organism_taxonomy_id', type='UNIQUENESS', schema=(:Organism {taxonomy_id}), ownedIndex=3 )'.}
Found model.Site
 + Creating node unique constraint for site_id on label Site for class model.Site
{code: Neo.ClientError.Schema.EquivalentSchemaRuleAlreadyExists} {message: An equivalent constraint already exists, 'Constraint( id=6, name='constraint_unique_Site_site_id', type='UN

In [19]:
# Build ids_tem: one entry per ontology node directly linked to the TEM family node.
# Each entry always carries 'tem_name' and 'tem_url'; 'identical_ids' and 'accession_id'
# are added only when a Protein node is linked to that ontology node.
# (The original note speaks of 258 TEM-lactamase proteins — not verified here.)
ids_tem = {}

base_url_tem_family_card = 'http://purl.obolibrary.org/obo/ARO_3000014'

# single-hop neighbours of the TEM family ontology node
query = f"""
MATCH (o:OntologyObject {{name: '{base_url_tem_family_card}'}})-[*1..1]-(n) RETURN n
"""

result = eedb.db.execute_read(query)

for record in result:
    ontology_node = record['n']
    if ontology_node['name'] == 'http://purl.obolibrary.org/obo/ARO_3000078':
        # this one neighbour is deliberately left out
        # NOTE(review): presumably a non-variant node such as a parent term — confirm
        continue

    tem_name = ontology_node['label']
    tem_url = ontology_node['name']
    entry = {'tem_name': tem_name, 'tem_url': tem_url}
    ids_tem[tem_name] = entry

    # look up the Protein node attached to this ontology URL
    query_tem_url = f"""
    MATCH (o:OntologyObject {{name: '{tem_url}'}})-[*1..1]-(n:Protein) RETURN n
    """

    protein_hits = eedb.db.execute_read(query_tem_url)
    if len(protein_hits) == 0:
        # no protein linked: the entry keeps only its name and URL
        continue
    protein_node = protein_hits[0]['n']

    # only the first hit is used; a protein without the property gets an empty list
    entry['identical_ids'] = (
        protein_node['IdenticalIds'] if 'IdenticalIds' in protein_node else []
    )
    entry['accession_id'] = protein_node['accession_id']

In [20]:
# Dump the whole mapping for a visual check; entries without a linked protein
# show only 'tem_name' and 'tem_url' (see e.g. TEM-51 in the recorded output).
print(ids_tem)

{'TEM-52': {'tem_name': 'TEM-52', 'tem_url': 'http://purl.obolibrary.org/obo/ARO_3000921', 'identical_ids': ['CAA73933.1'], 'accession_id': 'WP_015058977.1'}, 'TEM-51': {'tem_name': 'TEM-51', 'tem_url': 'http://purl.obolibrary.org/obo/ARO_3000920'}, 'TEM-50': {'tem_name': 'TEM-50', 'tem_url': 'http://purl.obolibrary.org/obo/ARO_3000919'}, 'TEM-49': {'tem_name': 'TEM-49', 'tem_url': 'http://purl.obolibrary.org/obo/ARO_3000918', 'identical_ids': [], 'accession_id': 'CAA71324.1'}, 'TEM-48': {'tem_name': 'TEM-48', 'tem_url': 'http://purl.obolibrary.org/obo/ARO_3000917', 'identical_ids': [], 'accession_id': 'CAA71323.1'}, 'TEM-47': {'tem_name': 'TEM-47', 'tem_url': 'http://purl.obolibrary.org/obo/ARO_3000916', 'identical_ids': ['CAA71322.1'], 'accession_id': 'WP_063864914.1'}, 'TEM-46': {'tem_name': 'TEM-46', 'tem_url': 'http://purl.obolibrary.org/obo/ARO_3000915'}, 'TEM-45': {'tem_name': 'TEM-45', 'tem_url': 'http://purl.obolibrary.org/obo/ARO_3000914', 'identical_ids': [], 'accession_id':

In [21]:
# max number of neighbours requested per protein from the embedding vector index
n_neighbours = 100
# Name handed to StandardNumberingTool here and to the mutation detection call further down,
# so both refer to the same standard numbering.
name_of_standard_numbering_tool = "standard_numbering_pairwise_blaTEM1a"

# pyeed analysis helpers, created once and reused by the cells below
et = EmbeddingTool()
sn = StandardNumberingTool(name=name_of_standard_numbering_tool)
md = MutationDetection()

# Accession of the blaTEM-1a reference sequence. The accession it is stored under in the
# database is resolved in the following cell, hence the None placeholder.
blaTEM1a_id = 'AAB59737.1'
blaTEM1a_database_id = None

In [22]:
# Find the corresponding database id for the blaTEM-1a reference.
# The reference accession may be stored directly as a protein's accession_id or only
# appear in that protein's identical_ids; either way the protein's own accession_id
# is the id used in the database.
blaTEM1a_database_id = None  # reset so re-running this cell never keeps a stale match

for tem_data in ids_tem.values():
    if 'accession_id' not in tem_data:
        # entries without a linked protein have nothing to match against
        continue

    accession_id = tem_data['accession_id']
    if blaTEM1a_id == accession_id or blaTEM1a_id in tem_data['identical_ids']:
        blaTEM1a_database_id = accession_id
        break

if blaTEM1a_database_id is None:
    # Fail here instead of passing None on as the base sequence id to later database calls.
    raise LookupError(
        f"Reference sequence {blaTEM1a_id} was not found among the TEM proteins "
        "(neither as accession_id nor in identical_ids)."
    )

print(blaTEM1a_database_id)


CAD09800.1


In [23]:

# Small trial run of the pairwise standard numbering on three sequences.
smoke_test_ids = ('CAD09800.1', 'AAM52207.1', 'WP_015058977.1')

# The identical call is issued twice, each time with a fresh list of the same ids.
# In the recorded output the second pass reports no new alignments.
for _ in range(2):
    sn.apply_standard_numbering_pairwise(
        base_sequence_id=blaTEM1a_database_id,
        db=eedb.db,
        list_of_seq_ids=list(smoke_test_ids),
    )

# Disabled debug call, kept for reference:
# mutations = md.get_mutations_between_sequences(
#    'CAD09800.1', 'WP_015058977.1', eedb.db, name_of_standard_numbering_tool, save_to_db=True, debug=True
# )


[32m2025-03-07 19:44:50.385[0m | [1mINFO    [0m | [36mpyeed.analysis.standard_numbering[0m:[36mapply_standard_numbering_pairwise[0m:[36m378[0m - [1mPairs: [('CAD09800.1', 'AAM52207.1'), ('CAD09800.1', 'WP_015058977.1')][0m


Output()

[32m2025-03-07 19:44:50.410[0m | [1mINFO    [0m | [36mpyeed.analysis.standard_numbering[0m:[36mapply_standard_numbering_pairwise[0m:[36m393[0m - [1mPairwise alignment results: [{'query_id': 'CAD09800.1', 'target_id': 'AAM52207.1', 'score': 282.0, 'identity': 0.993006993006993, 'gaps': 0, 'mismatches': 2, 'query_aligned': 'MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW', 'target_aligned': 'MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTTPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW'}, {'query_id': 'CAD09800.1', 'target_id': 'WP_015058977.1', 'score': 

Output()

[32m2025-03-07 19:44:50.509[0m | [1mINFO    [0m | [36mpyeed.analysis.standard_numbering[0m:[36mapply_standard_numbering_pairwise[0m:[36m393[0m - [1mPairwise alignment results: [][0m
[32m2025-03-07 19:44:50.509[0m | [1mINFO    [0m | [36mpyeed.analysis.standard_numbering[0m:[36mapply_standard_numbering_pairwise[0m:[36m409[0m - [1mNo alignment found for CAD09800.1[0m


In [24]:
# For every TEM variant that has a linked protein:
#   1. fetch its nearest neighbours from the embedding vector index,
#   2. apply the pairwise standard numbering of those sequences against the blaTEM-1a base,
#   3. detect the mutations between the base and each neighbour and save them to the database.
# Each (base, neighbour) pair is processed at most once across the whole run.
# NOTE(review): the recorded run of this cell ended with "IndexError: string index out of
# range"; the traceback is not part of the saved output, so the failing call is unknown.

# A set gives constant-time membership checks. Only the (base, neighbour) direction is stored:
# the base is always the first element, so a reversed tuple could never match a candidate.
already_processed_pairs = set()


for tem_data in ids_tem.values():

    if "accession_id" not in tem_data:
        # no protein linked to this TEM entry
        continue

    # get the closest neighbours
    results = et.find_nearest_neighbors_based_on_vector_index(
        index_name="vector_index_Protein_embedding",
        query_protein_id=tem_data['accession_id'],
        number_of_neighbors=n_neighbours,
        db=eedb.db,
    )

    ids = [neighbour[0] for neighbour in results] + [tem_data['accession_id']]

    sn.apply_standard_numbering_pairwise(
       base_sequence_id=blaTEM1a_database_id, db=eedb.db, list_of_seq_ids=ids
    )

    # Pair the base sequence with every neighbour, base first; the reverse direction is
    # not included. The neighbour list can name the same id twice (the query protein is
    # appended even if the index already returned it), so dict.fromkeys de-duplicates it
    # while keeping the order. Without that, a pair could be processed and saved twice
    # within the same batch. The base paired with itself is dropped here as well.
    pairs_to_process = [
        (blaTEM1a_database_id, neighbour)
        for neighbour in dict.fromkeys(ids)
        if neighbour != blaTEM1a_database_id
        and (blaTEM1a_database_id, neighbour) not in already_processed_pairs
    ]
    LOGGER.info(f"The number of permutations to process is: {len(pairs_to_process)}")

    # remember the new pairs so later iterations skip them
    already_processed_pairs.update(pairs_to_process)

    for base_id, neighbour_id in pairs_to_process:
        LOGGER.info(f"Processing pair {base_id} and {neighbour_id}")

        mutations = md.get_mutations_between_sequences(
            base_id, neighbour_id, eedb.db, name_of_standard_numbering_tool, save_to_db=True
        )

        LOGGER.info(f"The mutations are: {mutations}, there are {len(mutations)} mutations")



[32m2025-03-07 19:44:54.093[0m | [1mINFO    [0m | [36mpyeed.analysis.standard_numbering[0m:[36mapply_standard_numbering_pairwise[0m:[36m374[0m - [1mPair CAD09800.1 and WP_015058977.1 already exists under the same standard numbering node[0m
[32m2025-03-07 19:44:54.094[0m | [1mINFO    [0m | [36mpyeed.analysis.standard_numbering[0m:[36mapply_standard_numbering_pairwise[0m:[36m374[0m - [1mPair CAD09800.1 and AAM52207.1 already exists under the same standard numbering node[0m
[32m2025-03-07 19:44:54.094[0m | [1mINFO    [0m | [36mpyeed.analysis.standard_numbering[0m:[36mapply_standard_numbering_pairwise[0m:[36m378[0m - [1mPairs: [('CAD09800.1', 'ARF37825.1'), ('CAD09800.1', 'ARF37504.1'), ('CAD09800.1', 'ARF43254.1'), ('CAD09800.1', 'ARF43174.1'), ('CAD09800.1', 'ARF43417.1'), ('CAD09800.1', 'ARF37488.1'), ('CAD09800.1', 'WP_075985686.1'), ('CAD09800.1', 'ARF37720.1'), ('CAD09800.1', 'ARF29615.1'), ('CAD09800.1', 'ARF37351.1'), ('CAD09800.1', 'ARF42786.1'),

Output()

[32m2025-03-07 19:44:54.167[0m | [1mINFO    [0m | [36mpyeed.analysis.standard_numbering[0m:[36mapply_standard_numbering_pairwise[0m:[36m393[0m - [1mPairwise alignment results: [{'query_id': 'CAD09800.1', 'target_id': 'ARF37825.1', 'score': 278.0, 'identity': 0.986013986013986, 'gaps': 0, 'mismatches': 4, 'query_aligned': 'MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW', 'target_aligned': 'MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTTPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGGQATMDERNRQIAEIGASLIKHW'}, {'query_id': 'CAD09800.1', 'target_id': 'ARF37504.1', 'score': 278.

Output()

[32m2025-03-07 19:44:56.182[0m | [1mINFO    [0m | [36mpyeed.analysis.standard_numbering[0m:[36mapply_standard_numbering_pairwise[0m:[36m393[0m - [1mPairwise alignment results: [{'query_id': 'CAD09800.1', 'target_id': 'CAA71324.1', 'score': 276.0, 'identity': 0.9825174825174825, 'gaps': 0, 'mismatches': 5, 'query_aligned': 'MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW', 'target_aligned': 'MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASKRGSRGIIAALGPDGKPSRIVVIYMTGGQATMDERNRQIAEIGASLIKHW'}, {'query_id': 'CAD09800.1', 'target_id': 'WP_063864916.1', 'score':

IndexError: string index out of range

NameError: name 'df' is not defined

In [None]:
# eedb.fetch_from_primary_db(ids=['AAB59737.1'], db='ncbi_protein')


[32m2025-03-07 11:38:30.500[0m | [1mINFO    [0m | [36mpyeed.main[0m:[36mfetch_from_primary_db[0m:[36m87[0m - [1mFound 34374 sequences in the database.[0m
[32m2025-03-07 11:38:30.501[0m | [1mINFO    [0m | [36mpyeed.main[0m:[36mfetch_from_primary_db[0m:[36m89[0m - [1mFetching 1 sequences from ncbi_protein.[0m
[32m2025-03-07 11:38:30.526[0m | [1mINFO    [0m | [36mpyeed.adapter.primary_db_adapter[0m:[36mexecute_requests[0m:[36m140[0m - [1mStarting requests for 1 batches.[0m
[32m2025-03-07 11:38:30.527[0m | [34m[1mDEBUG   [0m | [36mpyeed.adapter.primary_db_adapter[0m:[36mexecute_requests[0m:[36m142[0m - [34m[1mPrepared 1 request payloads.[0m
[32m2025-03-07 11:38:30.532[0m | [34m[1mDEBUG   [0m | [36mpyeed.adapter.primary_db_adapter[0m:[36m_fetch_response[0m:[36m121[0m - [34m[1mSending request to https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi with parameters: {'retmode': 'text', 'rettype': 'genbank', 'db': 'protein', 'i

In [16]:
# Scratch check: are two sequence strings exactly equal?
# They are not (recorded result: False) — e.g. 'DLVKYSPV' in a versus 'DLVEYSPV' in b.
# NOTE(review): this cell carries execution count 16 but sits after cell 24, so it was run
# out of order; it does not depend on any other cell.
a = 'sequence: MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTTPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW'
b = 'sequence: MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW'

a == b

False