In [10]:
%reload_ext autoreload
%autoreload 2
import json
import logging
import numpy as np
import pandas as pd
from pyeed import Pyeed

from pyeed.analysis.ontology_loading import OntologyAdapter

In [3]:
# Root logging setup: INFO and above, each record prefixed with timestamp and level
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Notebook-wide logger used by the cells below
LOGGER = logging.getLogger(__name__)

In [4]:
import os
from getpass import getpass

# Connection settings for the Neo4j instance behind the pyeed database.
# Credentials must not live in the notebook: they are read from the environment
# (NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD). The URI and user fall back to the
# local development defaults; the password has no default and is prompted for
# interactively when the environment variable is missing or empty.
uri = os.environ.get("NEO4J_URI", "bolt://127.0.0.1:1123")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass(f"Neo4j password for user '{user}': ")

# Create a Pyeed object, automatically connecting to the database
eedb = Pyeed(uri, user, password)

📡 Connected to database.


In [5]:
# Optional reset for test runs: uncomment the two calls below to wipe the
# database and remove all constraints before starting from scratch.
# eedb.db.wipe_database(date='2024-12-13')
# eedb.db.remove_db_constraints(user=user, password=password)

# `eedb.db` is the database connector (type `DatabaseConnector`) owned by the
# Pyeed object; log its current statistics.
db_stats = eedb.db.stats()
LOGGER.info(f"Database stats: {db_stats}")

# The constraints defined in the pyeed graph model have to be created the first
# time the database is initialized. The call is repeated on every run here.
eedb.db.initialize_db_constraints(user=user, password=password)

2024-12-16 14:35:56,311 - INFO - Database stats: {'nodes': 9052, 'relationships': 26156}


the connection url is bolt://neo4j:niklasonlytems@127.0.0.1:1123
Loaded /home/nab/Niklas/pyeed/src/pyeed/model.py
Connecting to bolt://neo4j:niklasonlytems@127.0.0.1:1123
Setting up indexes and constraints...

Found model.StrictStructuredNode
 ! Skipping class model.StrictStructuredNode is abstract
Found model.Organism
 + Creating node unique constraint for taxonomy_id on label Organism for class model.Organism
{code: Neo.ClientError.Schema.EquivalentSchemaRuleAlreadyExists} {message: An equivalent constraint already exists, 'Constraint( id=15, name='constraint_unique_Organism_taxonomy_id', type='UNIQUENESS', schema=(:Organism {taxonomy_id}), ownedIndex=14 )'.}
Found model.Site
 + Creating node unique constraint for site_id on label Site for class model.Site
{code: Neo.ClientError.Schema.EquivalentSchemaRuleAlreadyExists} {message: An equivalent constraint already exists, 'Constraint( id=10, name='constraint_unique_Site_site_id', type='UNIQUENESS', schema=(:Site {site_id}), ownedIndex=

In [None]:
# Setup is finished; everything below works with the data
LOGGER.info("Setup complete")

# Read the TEM ids from the JSON file
tem_ids_path = "/home/nab/Niklas/TEM-lactamase/data/TEM_Ids/TEM_Ids.json"
with open(tem_ids_path, "r") as ids_file:
    dict_id_name = json.load(ids_file)

# Fetch all of the listed proteins, using 'ncbi_protein' as the primary database
eedb.fetch_from_primary_db(dict_id_name, db='ncbi_protein')

In [6]:
# Load the combined TEM lactamase table (semicolon-separated) into a DataFrame
# and print its first rows as a quick sanity check
tem_csv_path = '/home/nab/Niklas/TEM-lactamase/data/002_combined_data/TEM_lactamase.csv'
df = pd.read_csv(tem_csv_path, sep=';')
print(df.head())

   Unnamed: 0 protein_name phenotype    protein_id protein_id_database
0           0        TEM-1        2b      AAP20891          AAP20891.1
1           1        TEM-2        2b      CAJ85677          CAJ85677.1
2           2        TEM-3       2be      SAQ02853          SAQ02853.1
3           3        TEM-4       2be      CDR98216          CDR98216.1
4           4        TEM-5       2be  WP_109963600      WP_109963600.1


In [None]:
from pyeed.analysis.standard_numbering import StandardNumberingTool

# Apply the standard numbering with AAP20891.1 as the base sequence
# (the entry listed as TEM-1 in the CSV preview).
standard_numbering = StandardNumberingTool(name="test_standard_numbering_all")
standard_numbering.apply_standard_numbering(
    base_sequence_id='AAP20891.1',
    db=eedb.db,
    # optional restriction to the TEM list, currently disabled:
    # list_of_seq_ids=df['protein_id_database'].tolist(),
)

In [12]:
# Pairwise mutation detection across the TEM variants listed in `df`.
#
# For every protein that has a database id we
#   1. look up its closest neighbours with the embedding tool,
#   2. keep only the neighbours that are themselves part of the TEM list,
#   3. pairwise-align the protein against every such neighbour that has not been
#      handled yet (in either direction),
#   4. detect the mutations for each new pair and save them to the database.

from pyeed.analysis.embedding_analysis import EmbeddingTool
from pyeed.analysis.sequence_alignment import PairwiseAligner
from pyeed.analysis.mutation_detection import MutationDetection

# number of closest neighbours requested per protein
n_neighbours = 40000

et = EmbeddingTool()
pa = PairwiseAligner()
md = MutationDetection()

# Ids of the TEM list (rows without a database id are dropped). This does not
# change between iterations, so it is computed once instead of once per protein.
tem_209_ids = df['protein_id_database'].dropna().tolist()
tem_209_id_set = set(tem_209_ids)

# Count the number of pairwise alignments performed. Each protein contributes
# its self-pair plus every not-yet-seen partner, so for n proteins that all find
# each other we expect n * (n + 1) / 2 pairs, i.e. 209 * 210 / 2 = 21945 for 209.
counter = 0

# Every processed pair is stored in BOTH directions. A set gives constant-time
# membership tests; the previous list made each lookup a linear scan.
already_processed_pairs = set()

# Results of all mutation detections, accumulated over the whole run
# (previously reset per protein, which discarded all but the last protein's results).
mutations = []

# iterate over the different protein ids in df
for index, row in df.iterrows():
    print(f"Processing protein {index+1} of {len(df)} with a db id of {row['protein_id_database']}")
    if pd.isna(row['protein_id_database']):
        print(f"Skipping protein {index+1} of {len(df)} because it does not have a database id")
        continue

    # id of the current protein in the database
    base_sequence_id = row['protein_id_database']

    # The result is a list of (sequence_id, distance) tuples; the protein itself
    # is returned as well.
    closest_neighbours = et.find_closest_matches_simple(start_sequence_id=base_sequence_id, db=eedb.db, n=n_neighbours)
    closest_neighbours_ids = [neighbour[0] for neighbour in closest_neighbours]

    # For the moment only neighbours that are in the TEM list are of interest.
    # Sorting makes the processing order reproducible between runs.
    intersection = sorted(set(closest_neighbours_ids) & tem_209_id_set)

    # The base sequence is always the first element of a pair and the neighbour
    # the second; the reverse direction is deliberately not generated.
    permutations = [(base_sequence_id, neighbour) for neighbour in intersection]

    # Drop the pairs that were already handled while processing an earlier protein.
    permuations_to_process = [pair for pair in permutations if pair not in already_processed_pairs]

    # Nothing new for this protein (e.g. a duplicated id in the table): skip it
    # rather than handing an empty pair list to the aligner.
    if not permuations_to_process:
        continue

    # Remember the new pairs in both directions.
    for first_id, second_id in permuations_to_process:
        already_processed_pairs.add((first_id, second_id))
        already_processed_pairs.add((second_id, first_id))

    # Pairwise alignment between the base sequence and its new partners.
    pairwise_alignment = pa.align_multipairwise(ids=intersection, db=eedb.db, pairs=permuations_to_process)
    counter += len(permuations_to_process)

    # Mutation detection for every new pair; a sequence is never compared with itself.
    for sequence_id1, sequence_id2 in permuations_to_process:
        if sequence_id1 == sequence_id2:
            continue

        # NOTE(review): the standard numbering cell of this notebook creates a tool
        # named "test_standard_numbering_all", while this call refers to
        # "test_standard_numbering" - confirm which numbering is intended here.
        result = md.get_mutations_between_sequences(
            sequence_id1=sequence_id1,
            sequence_id2=sequence_id2,
            db=eedb.db,
            save_to_db=True,
            standard_numbering_tool_name="test_standard_numbering",
        )
        mutations.append(result)


print(f"The number of pairwise alignments performed is: {counter}")


Processing protein 1 of 265 with a db id of AAP20891.1


The permutations of the neighbours including the base sequence are: 209


Skipping permutation 98 of 209 because they are the same
Processing protein 2 of 265 with a db id of CAJ85677.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 124 of 208 because they are the same
Processing protein 3 of 265 with a db id of SAQ02853.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 157 of 207 because they are the same
Processing protein 4 of 265 with a db id of CDR98216.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 6 of 206 because they are the same
Processing protein 5 of 265 with a db id of WP_109963600.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 201 of 205 because they are the same
Processing protein 6 of 265 with a db id of CAA41038.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 120 of 204 because they are the same
Processing protein 7 of 265 with a db id of WP_109874025.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 179 of 203 because they are the same
Processing protein 8 of 265 with a db id of CAA46344.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 183 of 202 because they are the same
Processing protein 9 of 265 with a db id of APG33178.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 13 of 201 because they are the same
Processing protein 10 of 265 with a db id of nan
Skipping protein 10 of 265 because it does not have a database id
Processing protein 11 of 265 with a db id of AKC98298.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 183 of 200 because they are the same
Processing protein 12 of 265 with a db id of KJO56189.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 104 of 199 because they are the same
Processing protein 13 of 265 with a db id of nan
Skipping protein 13 of 265 because it does not have a database id
Processing protein 14 of 265 with a db id of nan
Skipping protein 14 of 265 because it does not have a database id
Processing protein 15 of 265 with a db id of KLP91446.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 116 of 198 because they are the same
Processing protein 16 of 265 with a db id of CAA46346.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 176 of 197 because they are the same
Processing protein 17 of 265 with a db id of CAA74912.2
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 37 of 196 because they are the same
Processing protein 18 of 265 with a db id of nan
Skipping protein 18 of 265 because it does not have a database id
Processing protein 19 of 265 with a db id of AFN21551.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 36 of 195 because they are the same
Processing protein 20 of 265 with a db id of ACB22021.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 40 of 194 because they are the same
Processing protein 21 of 265 with a db id of CAA76794.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 162 of 193 because they are the same
Processing protein 22 of 265 with a db id of CAA76795.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 128 of 192 because they are the same
Processing protein 23 of 265 with a db id of nan
Skipping protein 23 of 265 because it does not have a database id
Processing protein 24 of 265 with a db id of CCG28759.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 38 of 191 because they are the same
Processing protein 25 of 265 with a db id of nan
Skipping protein 25 of 265 because it does not have a database id
Processing protein 26 of 265 with a db id of KLG19745.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 161 of 190 because they are the same
Processing protein 27 of 265 with a db id of nan
Skipping protein 27 of 265 because it does not have a database id
Processing protein 28 of 265 with a db id of AAC32891.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 116 of 189 because they are the same
Processing protein 29 of 265 with a db id of CAA76796.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 35 of 188 because they are the same
Processing protein 30 of 265 with a db id of CAD24670.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 13 of 187 because they are the same
Processing protein 31 of 265 with a db id of ARF45649.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 186 of 186 because they are the same
Processing protein 32 of 265 with a db id of CTA52364.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 9 of 185 because they are the same
Processing protein 33 of 265 with a db id of ADL13944.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 56 of 184 because they are the same
Processing protein 34 of 265 with a db id of AGQ50511.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 159 of 183 because they are the same
Processing protein 35 of 265 with a db id of AKA60778.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 102 of 182 because they are the same
Processing protein 36 of 265 with a db id of APT65830.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 26 of 181 because they are the same
Processing protein 37 of 265 with a db id of HAH6232254.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 34 of 180 because they are the same
Processing protein 38 of 265 with a db id of nan
Skipping protein 38 of 265 because it does not have a database id
Processing protein 39 of 265 with a db id of QDO66746.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 78 of 179 because they are the same
Processing protein 40 of 265 with a db id of CBX53726.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 1 of 178 because they are the same
Processing protein 41 of 265 with a db id of nan
Skipping protein 41 of 265 because it does not have a database id
Processing protein 42 of 265 with a db id of nan
Skipping protein 42 of 265 because it does not have a database id
Processing protein 43 of 265 with a db id of AAC32889.2
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 110 of 177 because they are the same
Processing protein 44 of 265 with a db id of nan
Skipping protein 44 of 265 because it does not have a database id
Processing protein 45 of 265 with a db id of CAA64682.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 12 of 176 because they are the same
Processing protein 46 of 265 with a db id of nan
Skipping protein 46 of 265 because it does not have a database id
Processing protein 47 of 265 with a db id of CAA71322.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 157 of 175 because they are the same
Processing protein 48 of 265 with a db id of CAA71323.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 63 of 174 because they are the same
Processing protein 49 of 265 with a db id of CAA71324.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 163 of 173 because they are the same
Processing protein 50 of 265 with a db id of nan
Skipping protein 50 of 265 because it does not have a database id
Processing protein 51 of 265 with a db id of nan
Skipping protein 51 of 265 because it does not have a database id
Processing protein 52 of 265 with a db id of AEC32455.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 59 of 172 because they are the same
Processing protein 53 of 265 with a db id of AAD22538.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 97 of 171 because they are the same
Processing protein 54 of 265 with a db id of AAD22539.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 141 of 170 because they are the same
Processing protein 55 of 265 with a db id of ABB97007.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 84 of 169 because they are the same
Processing protein 56 of 265 with a db id of nan
Skipping protein 56 of 265 because it does not have a database id
Processing protein 57 of 265 with a db id of ACJ43254.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 19 of 168 because they are the same
Processing protein 58 of 265 with a db id of nan
Skipping protein 58 of 265 because it does not have a database id
Processing protein 59 of 265 with a db id of nan
Skipping protein 59 of 265 because it does not have a database id
Processing protein 60 of 265 with a db id of AAC05975.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 140 of 167 because they are the same
Processing protein 61 of 265 with a db id of BCD58813.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 106 of 166 because they are the same
Processing protein 62 of 265 with a db id of nan
Skipping protein 62 of 265 because it does not have a database id
Processing protein 63 of 265 with a db id of AAK17194.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 136 of 165 because they are the same
Processing protein 64 of 265 with a db id of nan
Skipping protein 64 of 265 because it does not have a database id
Processing protein 65 of 265 with a db id of nan
Skipping protein 65 of 265 because it does not have a database id
Processing protein 66 of 265 with a db id of nan
Skipping protein 66 of 265 because it does not have a database id
Processing protein 67 of 265 with a db id of AAD33116.2
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 99 of 164 because they are the same
Processing protein 68 of 265 with a db id of CAB92324.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 105 of 163 because they are the same
Processing protein 69 of 265 with a db id of nan
Skipping protein 69 of 265 because it does not have a database id
Processing protein 70 of 265 with a db id of AAF01046.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 6 of 162 because they are the same
Processing protein 71 of 265 with a db id of AAL03985.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 9 of 161 because they are the same
Processing protein 72 of 265 with a db id of AAF19151.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 118 of 160 because they are the same
Processing protein 73 of 265 with a db id of nan
Skipping protein 73 of 265 because it does not have a database id
Processing protein 74 of 265 with a db id of nan
Skipping protein 74 of 265 because it does not have a database id
Processing protein 75 of 265 with a db id of nan
Skipping protein 75 of 265 because it does not have a database id
Processing protein 76 of 265 with a db id of AAF05613.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 129 of 159 because they are the same
Processing protein 77 of 265 with a db id of AAF05614.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 46 of 158 because they are the same
Processing protein 78 of 265 with a db id of AAF05612.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 44 of 157 because they are the same
Processing protein 79 of 265 with a db id of AAF05611.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 67 of 156 because they are the same
Processing protein 80 of 265 with a db id of AAM15527.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 90 of 155 because they are the same
Processing protein 81 of 265 with a db id of AAL29433.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 100 of 154 because they are the same
Processing protein 82 of 265 with a db id of AAL29434.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 54 of 153 because they are the same
Processing protein 83 of 265 with a db id of AAL29435.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 32 of 152 because they are the same
Processing protein 84 of 265 with a db id of AAL29436.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 64 of 151 because they are the same
Processing protein 85 of 265 with a db id of CAC43229.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 100 of 150 because they are the same
Processing protein 86 of 265 with a db id of CAC43230.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 72 of 149 because they are the same
Processing protein 87 of 265 with a db id of AAG44570.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 34 of 148 because they are the same
Processing protein 88 of 265 with a db id of AAK14792.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 3 of 147 because they are the same
Processing protein 89 of 265 with a db id of nan
Skipping protein 89 of 265 because it does not have a database id
Processing protein 90 of 265 with a db id of AAK30619.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 15 of 146 because they are the same
Processing protein 91 of 265 with a db id of BAB16308.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 52 of 145 because they are the same
Processing protein 92 of 265 with a db id of AAF66653.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 5 of 144 because they are the same
Processing protein 93 of 265 with a db id of CAC85660.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 57 of 143 because they are the same
Processing protein 94 of 265 with a db id of CAC85661.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 119 of 142 because they are the same
Processing protein 95 of 265 with a db id of CAC67290.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 116 of 141 because they are the same
Processing protein 96 of 265 with a db id of AAM22276.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 72 of 140 because they are the same
Processing protein 97 of 265 with a db id of AAK85244.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 31 of 139 because they are the same
Processing protein 98 of 265 with a db id of AAK85245.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 54 of 138 because they are the same
Processing protein 99 of 265 with a db id of AAK85243.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 75 of 137 because they are the same
Processing protein 100 of 265 with a db id of nan
Skipping protein 100 of 265 because it does not have a database id
Processing protein 101 of 265 with a db id of AAM18924.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 127 of 136 because they are the same
Processing protein 102 of 265 with a db id of AAK82652.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 18 of 135 because they are the same
Processing protein 103 of 265 with a db id of EAC0197234.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 49 of 134 because they are the same
Processing protein 104 of 265 with a db id of ADI77428.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 107 of 133 because they are the same
Processing protein 105 of 265 with a db id of AAM61953.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 35 of 132 because they are the same
Processing protein 106 of 265 with a db id of AAM52207.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 84 of 131 because they are the same
Processing protein 107 of 265 with a db id of AAM52215.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 3 of 130 because they are the same
Processing protein 108 of 265 with a db id of AAM28884.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 71 of 129 because they are the same
Processing protein 109 of 265 with a db id of AAT46413.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 61 of 128 because they are the same
Processing protein 110 of 265 with a db id of AAL68923.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 7 of 127 because they are the same
Processing protein 111 of 265 with a db id of AAL77062.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 74 of 126 because they are the same
Processing protein 112 of 265 with a db id of AAS89982.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 99 of 125 because they are the same
Processing protein 113 of 265 with a db id of AAS89983.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 66 of 124 because they are the same
Processing protein 114 of 265 with a db id of AAS89984.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 99 of 123 because they are the same
Processing protein 115 of 265 with a db id of AAN04881.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 65 of 122 because they are the same
Processing protein 116 of 265 with a db id of BAM36530.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 75 of 121 because they are the same
Processing protein 117 of 265 with a db id of PLC74546.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 37 of 120 because they are the same
Processing protein 118 of 265 with a db id of nan
Skipping protein 118 of 265 because it does not have a database id
Processing protein 119 of 265 with a db id of nan
Skipping protein 119 of 265 because it does not have a database id
Processing protein 120 of 265 with a db id of AAO85882.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 44 of 119 because they are the same
Processing protein 121 of 265 with a db id of AAQ01671.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 88 of 118 because they are the same
Processing protein 122 of 265 with a db id of AAQ98890.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 63 of 117 because they are the same
Processing protein 123 of 265 with a db id of AAQ93490.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 92 of 116 because they are the same
Processing protein 124 of 265 with a db id of AAQ93491.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 27 of 115 because they are the same
Processing protein 125 of 265 with a db id of AAT46414.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 67 of 114 because they are the same
Processing protein 126 of 265 with a db id of AAT45742.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 84 of 113 because they are the same
Processing protein 127 of 265 with a db id of AAR89358.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 57 of 112 because they are the same
Processing protein 128 of 265 with a db id of AAR89359.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 61 of 111 because they are the same
Processing protein 129 of 265 with a db id of CAG34105.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 57 of 110 because they are the same
Processing protein 130 of 265 with a db id of CAI29263.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 97 of 109 because they are the same
Processing protein 131 of 265 with a db id of AAR10958.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 91 of 108 because they are the same
Processing protein 132 of 265 with a db id of BAD89187.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 53 of 107 because they are the same
Processing protein 133 of 265 with a db id of AAS19171.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 21 of 106 because they are the same
Processing protein 134 of 265 with a db id of AAS79107.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 71 of 105 because they are the same
Processing protein 135 of 265 with a db id of ADB79815.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 71 of 104 because they are the same
Processing protein 136 of 265 with a db id of AAV83795.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 55 of 103 because they are the same
Processing protein 137 of 265 with a db id of CAL08007.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 19 of 102 because they are the same
Processing protein 138 of 265 with a db id of AAW47922.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 82 of 101 because they are the same
Processing protein 139 of 265 with a db id of AAZ23494.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 76 of 100 because they are the same
Processing protein 140 of 265 with a db id of nan
Skipping protein 140 of 265 because it does not have a database id
Processing protein 141 of 265 with a db id of AAX56615.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 91 of 99 because they are the same
Processing protein 142 of 265 with a db id of ABD60314.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 83 of 98 because they are the same
Processing protein 143 of 265 with a db id of AAY85632.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 7 of 97 because they are the same
Processing protein 144 of 265 with a db id of CAJ17559.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 66 of 96 because they are the same
Processing protein 145 of 265 with a db id of AAZ14083.2
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 12 of 95 because they are the same
Processing protein 146 of 265 with a db id of AAZ14084.2
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 53 of 94 because they are the same
Processing protein 147 of 265 with a db id of ABB84515.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 18 of 93 because they are the same
Processing protein 148 of 265 with a db id of CAJ32372.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 58 of 92 because they are the same
Processing protein 149 of 265 with a db id of ABC96711.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 24 of 91 because they are the same
Processing protein 150 of 265 with a db id of ACI32333.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 30 of 90 because they are the same
Processing protein 151 of 265 with a db id of ABI74448.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 79 of 89 because they are the same
Processing protein 152 of 265 with a db id of ABI74447.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 4 of 88 because they are the same
Processing protein 153 of 265 with a db id of AGA83484.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 76 of 87 because they are the same
Processing protein 154 of 265 with a db id of ACO07310.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 70 of 86 because they are the same
Processing protein 155 of 265 with a db id of ABG77582.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 12 of 85 because they are the same
Processing protein 156 of 265 with a db id of AML08013.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 37 of 84 because they are the same
Processing protein 157 of 265 with a db id of ABI81768.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 72 of 83 because they are the same
Processing protein 158 of 265 with a db id of ABQ00181.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 28 of 82 because they are the same
Processing protein 159 of 265 with a db id of ABM54869.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 4 of 81 because they are the same
Processing protein 160 of 265 with a db id of ABM54870.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 73 of 80 because they are the same
Processing protein 161 of 265 with a db id of nan
Skipping protein 161 of 265 because it does not have a database id
Processing protein 162 of 265 with a db id of ABO64442.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 35 of 79 because they are the same
Processing protein 163 of 265 with a db id of ACF32746.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 66 of 78 because they are the same
Processing protein 164 of 265 with a db id of ABX71157.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 19 of 77 because they are the same
Processing protein 165 of 265 with a db id of nan
Skipping protein 165 of 265 because it does not have a database id
Processing protein 166 of 265 with a db id of ACI25375.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 75 of 76 because they are the same
Processing protein 167 of 265 with a db id of ACJ04051.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 37 of 75 because they are the same
Processing protein 168 of 265 with a db id of ACR22829.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 43 of 74 because they are the same
Processing protein 169 of 265 with a db id of AJC64564.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 70 of 73 because they are the same
Processing protein 170 of 265 with a db id of nan
Skipping protein 170 of 265 because it does not have a database id
Processing protein 171 of 265 with a db id of BAV00618.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 53 of 72 because they are the same
Processing protein 172 of 265 with a db id of nan
Skipping protein 172 of 265 because it does not have a database id
Processing protein 173 of 265 with a db id of nan
Skipping protein 173 of 265 because it does not have a database id
Processing protein 174 of 265 with a db id of nan
Skipping protein 174 of 265 because it does not have a database id
Processing protein 175 of 265 with a db id of nan
Skipping protein 175 of 265 because it does not have a database id
Processing protein 176 of 265 with a db id of KOP91900.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 2 of 71 because they are the same
Processing protein 177 of 265 with a db id of CBJ06718.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 61 of 70 because they are the same
Processing protein 178 of 265 with a db id of CAA65888.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 46 of 69 because they are the same
Processing protein 179 of 265 with a db id of nan
Skipping protein 179 of 265 because it does not have a database id
Processing protein 180 of 265 with a db id of nan
Skipping protein 180 of 265 because it does not have a database id
Processing protein 181 of 265 with a db id of AKQ12673.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 35 of 68 because they are the same
Processing protein 182 of 265 with a db id of ADP20705.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 43 of 67 because they are the same
Processing protein 183 of 265 with a db id of ADR71220.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 42 of 66 because they are the same
Processing protein 184 of 265 with a db id of CCA61905.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 1 of 65 because they are the same
Processing protein 185 of 265 with a db id of AEG64812.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 25 of 64 because they are the same
Processing protein 186 of 265 with a db id of AET99222.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 56 of 63 because they are the same
Processing protein 187 of 265 with a db id of ADM61585.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 38 of 62 because they are the same
Processing protein 188 of 265 with a db id of AEL17198.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 29 of 61 because they are the same
Processing protein 189 of 265 with a db id of AEL79515.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 42 of 60 because they are the same
Processing protein 190 of 265 with a db id of AEL88240.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 15 of 59 because they are the same
Processing protein 191 of 265 with a db id of APY23677.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 54 of 58 because they are the same
Processing protein 192 of 265 with a db id of nan
Skipping protein 192 of 265 because it does not have a database id
Processing protein 193 of 265 with a db id of AFC75523.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 17 of 57 because they are the same
Processing protein 194 of 265 with a db id of AFC75524.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 24 of 56 because they are the same
Processing protein 195 of 265 with a db id of AFC75525.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 20 of 55 because they are the same
Processing protein 196 of 265 with a db id of AFE48832.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 14 of 54 because they are the same
Processing protein 197 of 265 with a db id of AEK48085.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 20 of 53 because they are the same
Processing protein 198 of 265 with a db id of BAL68178.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 43 of 52 because they are the same
Processing protein 199 of 265 with a db id of nan
Skipping protein 199 of 265 because it does not have a database id
Processing protein 200 of 265 with a db id of nan
Skipping protein 200 of 265 because it does not have a database id
Processing protein 201 of 265 with a db id of AFS44742.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 41 of 51 because they are the same
Processing protein 202 of 265 with a db id of nan
Skipping protein 202 of 265 because it does not have a database id
Processing protein 203 of 265 with a db id of nan
Skipping protein 203 of 265 because it does not have a database id
Processing protein 204 of 265 with a db id of ALJ78859.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 40 of 50 because they are the same
Processing protein 205 of 265 with a db id of AGZ20205.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 32 of 49 because they are the same
Processing protein 206 of 265 with a db id of AGK82336.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 2 of 48 because they are the same
Processing protein 207 of 265 with a db id of AAG45415.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 40 of 47 because they are the same
Processing protein 208 of 265 with a db id of AGL39384.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 16 of 46 because they are the same
Processing protein 209 of 265 with a db id of AGW25367.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 31 of 45 because they are the same
Processing protein 210 of 265 with a db id of AIF78090.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 28 of 44 because they are the same
Processing protein 211 of 265 with a db id of AHA80960.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 27 of 43 because they are the same
Processing protein 212 of 265 with a db id of AHA49909.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 11 of 42 because they are the same
Processing protein 213 of 265 with a db id of OCN67279.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 21 of 41 because they are the same
Processing protein 214 of 265 with a db id of AJO16044.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 28 of 40 because they are the same
Processing protein 215 of 265 with a db id of AJO16045.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 7 of 39 because they are the same
Processing protein 216 of 265 with a db id of AHJ78622.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 5 of 38 because they are the same
Processing protein 217 of 265 with a db id of CDN33426.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 24 of 37 because they are the same
Processing protein 218 of 265 with a db id of nan
Skipping protein 218 of 265 because it does not have a database id
Processing protein 219 of 265 with a db id of AIS39742.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 1 of 36 because they are the same
Processing protein 220 of 265 with a db id of AIW68620.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 26 of 35 because they are the same
Processing protein 221 of 265 with a db id of nan
Skipping protein 221 of 265 because it does not have a database id
Processing protein 222 of 265 with a db id of nan
Skipping protein 222 of 265 because it does not have a database id
Processing protein 223 of 265 with a db id of nan
Skipping protein 223 of 265 because it does not have a database id
Processing protein 224 of 265 with a db id of AMD11804.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 23 of 34 because they are the same
Processing protein 225 of 265 with a db id of APT67991.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 23 of 33 because they are the same
Processing protein 226 of 265 with a db id of AQT03459.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 9 of 32 because they are the same
Processing protein 227 of 265 with a db id of AQX83499.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 4 of 31 because they are the same
Processing protein 228 of 265 with a db id of ARF19528.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 16 of 30 because they are the same
Processing protein 229 of 265 with a db id of AUS83547.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 25 of 29 because they are the same
Processing protein 230 of 265 with a db id of AUT06962.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 18 of 28 because they are the same
Processing protein 231 of 265 with a db id of AUT06963.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 10 of 27 because they are the same
Processing protein 232 of 265 with a db id of AVP73880.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 22 of 26 because they are the same
Processing protein 233 of 265 with a db id of AWH90786.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 7 of 25 because they are the same
Processing protein 234 of 265 with a db id of AWI33307.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 24 of 24 because they are the same
Processing protein 235 of 265 with a db id of ACT97652.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 13 of 23 because they are the same
Processing protein 236 of 265 with a db id of AXL10707.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 15 of 22 because they are the same
Processing protein 237 of 265 with a db id of ADN79098.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 4 of 21 because they are the same
Processing protein 238 of 265 with a db id of AYR04769.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 6 of 20 because they are the same
Processing protein 239 of 265 with a db id of MBG9969038.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 1 of 19 because they are the same
Processing protein 240 of 265 with a db id of QBG79064.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 1 of 18 because they are the same
Processing protein 241 of 265 with a db id of QDC28520.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 14 of 17 because they are the same
Processing protein 242 of 265 with a db id of QDY98370.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 15 of 16 because they are the same
Processing protein 243 of 265 with a db id of QPG87090.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 7 of 15 because they are the same
Processing protein 244 of 265 with a db id of QWY17601.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 10 of 14 because they are the same
Processing protein 245 of 265 with a db id of ULU82600.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 11 of 13 because they are the same
Processing protein 246 of 265 with a db id of ULU82601.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 2 of 12 because they are the same
Processing protein 247 of 265 with a db id of UTS94241.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 4 of 11 because they are the same
Processing protein 248 of 265 with a db id of UUT29265.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 6 of 10 because they are the same
Processing protein 249 of 265 with a db id of UUT29266.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 6 of 9 because they are the same
Processing protein 250 of 265 with a db id of WEG44935.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 6 of 8 because they are the same
Processing protein 251 of 265 with a db id of MBY8666561.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 1 of 7 because they are the same
Processing protein 252 of 265 with a db id of nan
Skipping protein 252 of 265 because it does not have a database id
Processing protein 253 of 265 with a db id of nan
Skipping protein 253 of 265 because it does not have a database id
Processing protein 254 of 265 with a db id of nan
Skipping protein 254 of 265 because it does not have a database id
Processing protein 255 of 265 with a db id of nan
Skipping protein 255 of 265 because it does not have a database id
Processing protein 256 of 265 with a db id of nan
Skipping protein 256 of 265 because it does not have a database id
Processing protein 257 of 265 with a db id of nan
Skipping protein 257 of 265 because it does not have a database id
Processing protein 258 of 265 with a db id of nan
Skipping protein 258 of 265 because it does not have a database id
Processing protein 259 of 265 with a db id of AEQ59620.1
The permutations of the neighbours including the base sequence are: 20

Skipping permutation 6 of 6 because they are the same
Processing protein 260 of 265 with a db id of AAD45935.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 1 of 5 because they are the same
Processing protein 261 of 265 with a db id of CAA66659.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 3 of 4 because they are the same
Processing protein 262 of 265 with a db id of AAN05028.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 2 of 3 because they are the same
Processing protein 263 of 265 with a db id of AAN05029.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 2 of 2 because they are the same
Processing protein 264 of 265 with a db id of AAK71474.1
The permutations of the neighbours including the base sequence are: 209


Skipping permutation 1 of 1 because they are the same
Processing protein 265 of 265 with a db id of nan
Skipping protein 265 of 265 because it does not have a database id
The number of pairwise alignments performed is: 21945


In [None]:
# Accession ids of every row in `df` that has a `protein_id_database` value
# (rows where that column is missing are left out), plus how many there are.
ids_tems = df.loc[df['protein_id_database'].notna(), 'protein_id_database'].tolist()
intersection_number_of_tems = len(ids_tems)

In [None]:
# Mutational analysis: build a square matrix whose rows and columns follow the order of
# `ids_tems` and whose cell [i, j] holds the number of MUTATION relationships the database
# reports between protein i and protein j.
# The matrix starts out as all zeros, so the diagonal and every pair without a recorded
# mutation stay at 0.
distance_matrix_mutations = np.zeros((intersection_number_of_tems, intersection_number_of_tems))

# The relationship pattern is undirected (`-[r:MUTATION]-`) because the direction of a
# mutation is not important here.
query = """
MATCH (p1:Protein)-[r:MUTATION]-(p2:Protein)
WHERE p1.accession_id IN $ids AND p2.accession_id IN $ids
RETURN p1.accession_id AS protein1, p2.accession_id AS protein2, COUNT(r) AS mutations
"""

results_mutations = eedb.db.execute_read(query, {"ids": ids_tems})
print(results_mutations[:10])

# Index the result rows once by (protein1, protein2) instead of scanning the whole result
# list for every matrix cell. `setdefault` keeps the first row seen for a pair, which is
# what a linear "first match" search over the records would return as well.
mutation_count_by_pair = {}
for record in results_mutations:
    mutation_count_by_pair.setdefault((record["protein1"], record["protein2"]), record["mutations"])

# Fill the off-diagonal cells; pairs that are missing from the query result count as 0.
for i, row_id in enumerate(ids_tems):
    for j, column_id in enumerate(ids_tems):
        if i != j:
            distance_matrix_mutations[i, j] = mutation_count_by_pair.get((row_id, column_id), 0)

# The row/column labels of the matrix are exactly the ids, in the same order.
protein_ids_mutations = list(ids_tems)

# save the distance matrix and its labels to numpy files
np.save("/home/nab/Niklas/TEM-lactamase/data/002_combined_data/distance_matrix_mutations.npy", distance_matrix_mutations)
np.save("/home/nab/Niklas/TEM-lactamase/data/002_combined_data/protein_ids_mutations.npy", protein_ids_mutations)


In [None]:
# Reload the distance matrices that were saved earlier, each one paired with the
# `protein_ids_*` array that is loaded next to it.
# The cosine and the euclidean embedding matrices both read protein_ids_embeddings.npy.

distance_matrix_embeddings_cosine, protein_ids_embeddings_cosine = (
    np.load("/home/nab/Niklas/TEM-lactamase/data/002_combined_data/distance_matrix_embeddings_cosine.npy"),
    np.load("/home/nab/Niklas/TEM-lactamase/data/002_combined_data/protein_ids_embeddings.npy"),
)

distance_matrix_embeddings_euclidean, protein_ids_embeddings_euclidean = (
    np.load("/home/nab/Niklas/TEM-lactamase/data/002_combined_data/distance_matrix_embeddings_euclidean.npy"),
    np.load("/home/nab/Niklas/TEM-lactamase/data/002_combined_data/protein_ids_embeddings.npy"),
)

distance_matrix_pairwise, protein_ids_pairwise = (
    np.load("/home/nab/Niklas/TEM-lactamase/data/002_combined_data/distance_matrix_pairwise.npy"),
    np.load("/home/nab/Niklas/TEM-lactamase/data/002_combined_data/protein_ids_pairwise.npy"),
)

distance_matrix_mutations, protein_ids_mutations = (
    np.load("/home/nab/Niklas/TEM-lactamase/data/002_combined_data/distance_matrix_mutations.npy"),
    np.load("/home/nab/Niklas/TEM-lactamase/data/002_combined_data/protein_ids_mutations.npy"),
)

## Plots

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Histogram of the number of mutations between the proteins.
# Every matrix entry (the full matrix, i.e. both triangles and the diagonal) is weighted
# by 1 / (number of entries), so the bar heights are relative frequencies that sum to 1.
# `density=True` is deliberately not used: it normalises the *area* under the histogram,
# so the bar heights only sum to 1 when every bin happens to have width 1.
# the x-axis is the number of mutations and the y-axis is the frequency

mutation_counts_flat = distance_matrix_mutations.flatten()
plt.hist(
    mutation_counts_flat,
    bins=100,
    # max(..., 1) avoids a division by zero for an empty matrix
    weights=np.full(mutation_counts_flat.shape, 1.0 / max(mutation_counts_flat.size, 1)),
)
plt.xlabel("Number of mutations")
plt.ylabel("Frequency")
plt.title("Histogram of the number of mutations between the proteins")
plt.show()


In [None]:
# next we are interested in combining the data from the different matrices
# it is important to always ensure that the same ids are used in the different matrices
# they might vary in order, but in all cases they have the same number of entries
# we can handle this with the various protein_ids_* numpy arrays
# usually each point here is a data value together with the tuple of the two ids it belongs to

In [None]:
# we want a plot where we compare the identity vs. the number of mutations
# this is a simple scatter plot; we expect to know the result quite well but want to confirm it
# the x axis will show the number of mutations, the y axis the identity
# we only need to include the upper triangle of the matrix since the lower triangle is a mirror of it
# coloring should not play a role here




In [None]:
# same plot as before but with euclidean distance:
# one dot per sampled protein pair, x = value from the euclidean embedding distance matrix,
# y = value from the pairwise matrix (labelled "Identity" below)
#
# NOTE(review): `itertools`, `protein_ids`, `colors`, `dict_id_label_new_ids_none`,
# `protein_ids_embedding` and `distance_matrix_embedding_euclidean` are not defined in the
# cells visible around this one (the loading cell spells its variables `*_embeddings_*`);
# confirm they exist in the kernel before running this cell.

fig = plt.figure(figsize=(15, 10))

# tracking
count_points = 0
already_plotted_labels = set()

# all index pairs (i, j) with i < j, i.e. the second half of the matrix is skipped
combinations_of_indexes = list(itertools.combinations(range(len(protein_ids)), 2))
print(len(combinations_of_indexes))
total_number = len(combinations_of_indexes)

# Plot only a random subset of the pairs.
# - The sample size is capped at the number of available pairs, because sampling without
#   replacement raises a ValueError when more samples than items are requested.
# - A locally seeded generator makes the figure reproducible without touching the global
#   numpy random state.
sampling_rng = np.random.default_rng(42)
random_choices = sampling_rng.choice(total_number, size=min(5000, total_number), replace=False)

for index in random_choices:
    # combinations() never yields i == j, so no diagonal check is needed here
    i, j = combinations_of_indexes[index]

    # pairs coloured 'gray' are grouped under one legend entry,
    # every other pair is labelled via the id of its first protein
    if colors[i, j] == 'gray':
        label = 'undefined'
    else:
        label = dict_id_label_new_ids_none[protein_ids_embedding[i]]

    if label in already_plotted_labels:
        # label already has a legend entry: draw a smaller, more transparent dot
        plt.scatter(
            distance_matrix_embedding_euclidean[i, j],
            distance_matrix_pairwise[i, j],
            c=colors[i, j],
            alpha=0.2,
            s=30,
            edgecolor="k",
        )
    else:
        # first dot of this label: drawn larger and more opaque, and it carries the
        # legend entry for the label
        plt.scatter(
            distance_matrix_embedding_euclidean[i, j],
            distance_matrix_pairwise[i, j],
            c=colors[i, j],
            label=label,
            alpha=0.7,
            s=50,
            edgecolor="k",
        )
        already_plotted_labels.add(label)

    count_points += 1

plt.title("Euclidean Distance vs Identity, points plaaced n = " + str(count_points))
plt.xlabel("Euclidean Distance")
plt.ylabel("Identity")
plt.grid()
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Align the sequences: run the multi-pairwise alignment over every protein in the
# database that has an accession id.
aligner = PairwiseAligner()

# fetch all ids
query = """
        MATCH (p:Protein) 
        WHERE p.accession_id IS NOT NULL
        RETURN p.accession_id AS accession_id
        """
ids = [record['accession_id'] for record in eedb.db.execute_read(query)]
# show the number of ids and a small sample instead of dumping the complete list
print(len(ids))
print(ids[:5])

aligner.align_multipairwise(db=eedb.db, ids=ids)

In [None]:
# Fetch the DNA entries for the proteins
# NOTE(review): presumably this looks up the DNA records belonging to the proteins already
# stored in the database and may take a while to run — confirm against the pyeed implementation.
eedb.fetch_dna_entries_for_proteins()

In [None]:
# Goal: find out how many of the TEM proteins have a DNA sequence linked to them.
# A protein counts as linked when a (DNA)-[ENCODES]->(Protein) relationship exists.
# The linked ids are then compared with the TEM protein ids taken from `dict_id_name`
# to see which of those have no DNA sequence.

query = """
        MATCH (d:DNA)-[e:ENCODES]->(p:Protein)
        WHERE p.accession_id IS NOT NULL
        RETURN p.accession_id AS accession_id
        """
encoded_records = eedb.db.execute_read(query)
dna_protein_ids = []
for encoded_record in encoded_records:
    dna_protein_ids.append(encoded_record['accession_id'])
print(len(dna_protein_ids))
# example output: ['AQT03459.1', 'AFC75523.1', 'CAB92324.1', 'AAF01046.1', 'AFC75524.1']
print(dna_protein_ids[:5])

# ids of the TEM proteins as stored in the dict
dict_ids = list(dict_id_name)
print(len(dict_ids))
# example output: ['AAP20891', 'CAJ85677', 'SAQ02853', 'CDR98216', 'WP_109963600']
print(dict_ids[:5])

# careful: the dict ids carry no version number while the database ids do,
# so everything from the first '.' onwards is cut off the database ids
dna_protein_ids = [versioned_id.partition('.')[0] for versioned_id in dna_protein_ids]
print(len(dna_protein_ids))
print(dna_protein_ids[:5])

# dict ids that do not show up among the DNA-linked ids
ids_in_dict = set(dict_ids)
ids_with_dna = set(dna_protein_ids)
diff_ids = list(ids_in_dict - ids_with_dna)
print(len(diff_ids))
print(diff_ids)

# human-readable summary
print(f" {len(diff_ids)} of the TEM proteins do not have a DNA sequence linked to them")

In [None]:
# First compute the vector embeddings for the protein sequences.
# NOTE(review): presumably the embeddings are stored in the database for later analysis
# and the call can be slow — confirm against the pyeed implementation.
eedb.calculate_sequence_embeddings()

In [None]:
# Import the OWL ontology file (aro.owl from the CARD ontology folder) into the database.
ontology_adapter = OntologyAdapter()
db = eedb.db
file_path = "/home/nab/Niklas/TEM-lactamase/CARD_Data_Ontologies/aro.owl"

# the adapter reads the file and writes its content through the database connector
ontology_adapter.import_ontology_file_in_db(file_path, db)

In [None]:
# Next step: take the proteins listed in the CARD index so they can be linked to the
# ontology structure. The index is a tab-separated file with these columns:
# ARO Accession, CVTERM ID, Model Sequence ID, Model ID, Model Name, ARO Name,
# Protein Accession, DNA Accession, AMR Gene Family, Drug Class, Resistance Mechanism,
# CARD Short Name
file_path = "/home/nab/Niklas/TEM-lactamase/CARD_Data_Data/aro_index.tsv"

# read the index and keep only the rows that carry a protein accession
df = pd.read_csv(file_path, sep="\t").dropna(subset=["Protein Accession"])

# fetching these proteins into the database is currently disabled:
# eedb.fetch_from_primary_db(df["Protein Accession"].tolist(), db='ncbi_protein')

In [None]:
# Link the proteins to the ontology: for every row of the CARD index, the Protein node with
# the row's protein accession is connected to the OntologyObject node of the row's ARO term
# through an ASSOCIATED_WITH relationship.
# NOTE(review): the pyeed model declares
#     go_annotation = RelationshipTo("GOAnnotation", "ASSOCIATED_WITH")
# while this query targets OntologyObject nodes — confirm that reusing the relationship
# type here is intended.

# The Cypher statement is identical for every row, so it is defined once outside the loop.
# MERGE only creates the relationship when it does not exist yet; when either MATCH finds
# no node, nothing is written for that row.
query = """
    MATCH (p:Protein {accession_id: $protein_accession})
    MATCH (a:OntologyObject {name: $aro_accession})
    MERGE (p)-[:ASSOCIATED_WITH]->(a)
    """

for index, row in df.iterrows():
    # ontology nodes are named by their full IRI,
    # example: ARO:3002527 needs to be name: http://purl.obolibrary.org/obo/ARO_3002527
    aro_iri = f"http://purl.obolibrary.org/obo/{row['ARO Accession'].replace(':', '_')}"

    # one write per index row
    eedb.db.execute_write(
        query,
        parameters={"protein_accession": row["Protein Accession"], "aro_accession": aro_iri},
    )