# Infer active sites on De Novo Designed Enzymes

# Data downloaded from two papers

1. https://www.nature.com/articles/s41587-024-02214-2#data-availability (https://zenodo.org/records/11176444) for "experimentally_tested_lysozymes.xlsx" and "experimentally_tested_metrics.csv"
2. serine_hydrolases taken from the SI of: https://www.science.org/doi/pdf/10.1126/science.adu2454?casa_token=iDQKvu36FKgAAAAA:Yq6Hc3SJ-cjQ9apHGT2RM6FgbknzYCMhLzeXA39nKcjgLwz5tIvCGRGguuPtRrw67d8om0V_LLkRP-4 for each of the reported sequences


In [1]:
import pandas as pd
import pandas as pd
import os 
import sys
sys.path.append('/disk1/ariane/vscode/enzyme-tk/')
from enzymetk.sequence_search_blast import BLAST
from enzymetk.save_step import Save
from sciutil import SciUtil
from Bio import SeqIO



# First filter to combine the sequences keeping only those that are active

### Large language models generate functional protein sequences across diverse families 
The supplemental table downloaded from the computational paper disagrees with the Madani et al. paper, so the supplemental table was downloaded from https://www.nature.com/articles/s41587-022-01618-2#Sec25 instead, labelled: 41587_2022_1618_MOESM3_ESM_Madani_et_all_supp
#### 5 lysozyme families
1. Phage lysozyme (PF00959) 20
2. Glyco_hydro_108 (PF05838)
3. Glucosaminidase (PF01832)
4. Transglycosylase (PF06737)
5. Pesticin (PF16754)

### Computational design of Serine Hydrolases
https://www.science.org/doi/10.1126/science.adu2454#supplementary-materials
Serine hydrolases: Super and Win had catalytic activity for 4MU-Ac and 4MU-Bu. The catalytic residues were taken from the figures in the paper.

5. Super: S128, D53, H95
6. Win: S142, D37, H17
7. Win1, Win31, dadt1: H17, S142, D37
8. Charliet2: S30, D113, H89
9. kent1: S53, D138, H37
10. momi120_103 S49, D105, H90
11. n8: S105 or S91, H89, D77
12. superfast: S147, H114, D53
13. supercool: S146, D53, H113
14. Momi120-74: S49, D107, H92

In [107]:
import re

# Every source table is reduced to this shared schema before being concatenated.
columns = ['name', 'model', 'sequence', 'CR', 'doi']


def _load_active_designs(label, doi):
    """Load one results table and return its active designs in the shared schema.

    Reads ``denovo/input_data/{label}.csv``, keeps the rows whose ``Activity`` is 1
    and returns a new frame restricted to ``columns``: ``name`` is built as
    ``<Family>_<id>``, ``model`` and ``sequence`` are copied from ``Model`` and
    ``Sequence``, ``CR`` is left empty and ``doi`` is set to the given DOI.
    """
    table = pd.read_csv(f'denovo/input_data/{label}.csv')
    # .copy() so the column assignments below write to an independent frame
    # instead of a filtered slice of `table` (avoids SettingWithCopyWarning).
    active = table[table['Activity'] == 1].copy()
    active['model'] = active['Model'].values
    active['name'] = [f'{f}_{i}' for f, i in active[['Family', 'id']].values]
    active['sequence'] = active['Sequence'].values
    active['CR'] = None
    active['doi'] = doi
    return active[columns]


# ProGen lysozymes (Madani et al.): keep only the sequences flagged as functional.
label = '41587_2022_1618_MOESM3_ESM_Madani_et_all_supp'
progen_df = pd.read_csv(f'denovo/input_data/{label}.csv')
# Compare on the upper-cased string form so the filter works whether pandas parsed
# the column as text ('TRUE') or as booleans (True).
progen_df = progen_df[progen_df['functional?'].astype(str).str.upper() == 'TRUE'].copy()
progen_df['model'] = 'progen'
progen_df['CR'] = None
progen_df['doi'] = '10.1038/s41587-022-01618-2'
progen_df = progen_df[columns]

# Serine hydrolases
# NOTE: 'CR' is selected below but never assigned here, so it must already be a
# column of serine_hydrolases.csv.
label = 'serine_hydrolases'
serine_df = pd.read_csv(f'denovo/input_data/{label}.csv', index_col=0)
serine_df = serine_df.assign(
    name=serine_df['id'].values,
    sequence=serine_df['seq'].values,
    model='RFDiffusion',
    doi='10.1126/science.adu2454',
)[columns]

# Data tested from Kevin Yang paper; rows whose model is 'pre-test' or 'test' are dropped.
label = 'experimentally_tested_metrics'
df = _load_active_designs(label, doi='10.1038/s41587-024-02214-2')
df = df[~df['model'].isin(['pre-test', 'test'])]

# Chorismate mutase table: keep only the BmDCA rows.
label = 'chorismate_mutase_CM_table'
cm_df = _load_active_designs(label, doi='10.1038/s41587-024-02214-2')
cm_df = cm_df[cm_df['model'] == 'BmDCA']

# Combine all
denovo_df = pd.concat([progen_df, serine_df, df, cm_df])
# FASTA-safe identifier: the name with every non-alphanumeric character removed.
denovo_df['id'] = [re.sub('[^0-9a-zA-Z]+', '', n) for n in denovo_df['name'].values]
denovo_df.to_csv('denovo/denovo_dataset.csv', index=False)

In [108]:
# Display the combined dataset with repeated names collapsed to their first row.
# The result is only shown; denovo_df itself is not modified.
denovo_df.drop_duplicates('name')

Unnamed: 0,name,model,sequence,CR,doi,id
0,L001,progen,AAESYEASLTRLLKNEGGYTDHPSDPGGPTNFGITLADARRYWKGN...,,10.1038/s41587-022-01618-2,L001
1,L002,progen,AAPANAASESTWDALAQCESGGNWGTSTGNGFSGGLQFTPSTWAAF...,,10.1038/s41587-022-01618-2,L002
2,L003,progen,AAPVDTWDRVAQCESGGNWSINTGNGYYGGLQFSQHTWAAYGGTQY...,,10.1038/s41587-022-01618-2,L003
3,L004,progen,APAFVKAEKKYGVNAIAMTSIAALESGWGKSKRAVKDNNLTGLGVY...,,10.1038/s41587-022-01618-2,L004
4,L005,progen,ASAGAAPTHDWDGVAQCESGGNWGINTGNGYYGGLQFSHSTWVANG...,,10.1038/s41587-022-01618-2,L005
...,...,...,...,...,...,...
2675,CM_2676,BmDCA,TIEIVRSEIEDLDREILALIDKRVNLAERVLKIKRANNLPINDQKQ...,,10.1038/s41587-024-02214-2,CM2676
2689,CM_2690,BmDCA,IELLKLREKIDVVDDQILELLNERMHLVKQVGEWKQKQGTAIYVPE...,,10.1038/s41587-024-02214-2,CM2690
2720,CM_2721,BmDCA,VTTENLTALRSQINELDGQLLELLAKRMQISAEIAAYKKEHNMPIL...,,10.1038/s41587-024-02214-2,CM2721
2736,CM_2737,BmDCA,TQTTEQLNQLRRQIDEIDNSLVEQLSKRFRIVREIGQYKKEHNMTV...,,10.1038/s41587-024-02214-2,CM2737


In [110]:
# Export the de novo dataset as FASTA (header = id column) so it can be run through squidly
fasta_records = [f'>{entry}\n{seq}\n' for entry, seq in denovo_df[['id', 'sequence']].values]
with open('denovo/denovo_dataset.fasta', 'w+') as fout:
    fout.writelines(fasta_records)

In [111]:
# Print (not execute) the squidly command for the de novo FASTA with a BLAST database,
# --blast-threshold 50 and --mean-prob 0.5, so it can be copied into a shell.
print(f'squidly denovo/denovo_dataset.fasta esm2_t36_3B_UR50D denovo/output_data/ denovo_3B --database /disk1/ariane/vscode/squidly/data/reviewed_sprot_08042025.csv --blast-threshold 50 --mean-prob 0.5')

squidly denovo/denovo_dataset.fasta esm2_t36_3B_UR50D denovo/output_data/ denovo_3B --database /disk1/ariane/vscode/squidly/data/reviewed_sprot_08042025.csv --blast-threshold 50 --mean-prob 0.5


In [87]:
# Print (not execute) the squidly command without the --database / --blast-threshold
# options; outputs are named denovo_dataset_3B.
print(f'squidly denovo/denovo_dataset.fasta esm2_t36_3B_UR50D denovo/output_data/ denovo_dataset_3B --mean-prob 0.5')

squidly denovo/denovo_dataset.fasta esm2_t36_3B_UR50D denovo/output_data/ denovo_dataset_3B --mean-prob 0.5


In [113]:
# Load the ensemble results CSV and count the rows per value of the `tool` column.
# NOTE: `df` is reassigned here; it no longer refers to the frame built in earlier cells.
df = pd.read_csv('denovo/output_data/denovo_3B_ensemble.csv')
df['tool'].value_counts()

Not-found    526
BLAST        183
squidly       30
Name: tool, dtype: int64

In [98]:
# Show only the rows whose `tool` value is 'BLAST'.
df[df['tool'] == 'BLAST']

Unnamed: 0,id,residues,tool
3,L004,58|83,BLAST
11,L012,14,BLAST
15,L019,62|78,BLAST
17,L021,0|166,BLAST
42,L056,14|23,BLAST
...,...,...,...
725,CM2431,0|0,BLAST
727,CM2457,0|0,BLAST
728,CM2472,0|0,BLAST
730,CM2584,0|0,BLAST


In [105]:
# Load the pickled squidly output and keep the rows where Squidly_Ensemble_Residues is non-empty.
# NOTE(review): read_pickle can execute arbitrary code while loading — only open pickles produced locally.
df = pd.read_pickle('denovo/output_data/denovo_3B_squidly.pkl')
df = df[df['Squidly_Ensemble_Residues'] != '']
df

Unnamed: 0_level_0,Squidly_CR_Position,Squidly_CR_probabilities,Squidly_CR_representations,all_AS_probs,label_1,Squidly_CR_Position_1,Squidly_CR_probabilities_1,Squidly_CR_representations_1,all_AS_probs_1,label_2,...,all_AS_probs_3,label_4,Squidly_CR_Position_4,Squidly_CR_probabilities_4,Squidly_CR_representations_4,all_AS_probs_4,mean,aleatoric,epistemic,Squidly_Ensemble_Residues
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
L003,13,0.9997974,"[[-0.0953205, -0.07024201, -0.7475638, -0.0688...","[0.0018, 0.0013, 0.001, 0.004, 0.0011, 0.0014,...",L003,13,0.99983275,"[[0.4811278, -0.028638866, 0.020045452, 0.4397...","[0.0012, 0.0009, 0.0007, 0.0022, 0.002, 0.0016...",L003,...,"[0.0028, 0.005, 0.0036, 0.0076, 0.0045, 0.0051...",L003,,,[],"[0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.001...",[0.9976400100000001],[0.011743330398519256],[-0.009386137368189582],13
L005,17,0.9997516,"[[-0.089712836, -0.06702872, -0.67186654, -0.0...","[0.0024, 0.0012, 0.0025, 0.0017, 0.0013, 0.001...",L005,17,0.9997532,"[[0.4298938, -0.025566332, 0.018090963, 0.3928...","[0.0012, 0.0012, 0.0022, 0.0013, 0.001, 0.0011...",L005,...,"[0.0034, 0.0042, 0.0076, 0.004, 0.0062, 0.0066...",L005,,,[],"[0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.001...",[0.89312001],[0.3851323150900789],[-0.28417911436997706],17
L002,18,0.9997694,"[[-0.09002136, -0.06573059, -0.6999352, -0.065...","[0.002, 0.0025, 0.0016, 0.0024, 0.0014, 0.0028...",L002,18,0.9998229,"[[0.46744657, -0.02862038, 0.020515256, 0.4253...","[0.0012, 0.0012, 0.0009, 0.0018, 0.0008, 0.001...",L002,...,"[0.0031, 0.0062, 0.0048, 0.0069, 0.0034, 0.006...",L002,,,[],"[0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.001...",[0.99354001],[0.03204456323600807],[-0.02560549404791341],18
L075,14,0.998844,"[[-0.048933987, -0.03416989, -0.37807256, -0.0...","[0.0023, 0.0031, 0.0018, 0.0012, 0.0014, 0.001...",L075,14,0.99974436,"[[0.45347774, -0.026474845, 0.014075136, 0.418...","[0.0011, 0.0022, 0.0008, 0.0007, 0.0011, 0.000...",L075,...,"[0.002, 0.0055, 0.0019, 0.0025, 0.0029, 0.0018...",L075,,,[],"[0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.001...",[0.98686001],[0.064664967161641],[-0.05161169745793149],14
L009,,,[],"[0.0019, 0.0015, 0.0013, 0.003, 0.0019, 0.0011...",L009,21,0.99890256,"[[0.071626015, 0.069256075, 0.031381954, 0.074...","[0.0018, 0.0019, 0.0034, 0.0027, 0.0012, 0.001...",L009,...,"[0.0019, 0.0023, 0.0036, 0.0037, 0.0019, 0.002...",L009,,,[],"[0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.001...",[0.71730001],[0.8342875012804213],[-0.5959566195587858],21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MDHASRMDH409,171,0.9998472,"[[-0.2532988, 0.13214558, -0.26530963, 0.00919...","[0.0018, 0.0036, 0.0011, 0.0012, 0.0009, 0.005...",MDHASRMDH409,171,0.999936,"[[-0.11890301, -0.6468959, -0.25390473, 0.2805...","[0.0011, 0.0014, 0.0006, 0.0011, 0.0006, 0.001...",MDHASRMDH409,...,"[0.0019, 0.0025, 0.0015, 0.0015, 0.0018, 0.006...",MDHASRMDH409,171,0.99985063,"[[-0.108086824, -0.82718563, 0.43563244, -0.07...","[0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.001...",[0.9998400100000001],[0.0007998699936657438],[-0.0006399027927484996],171
MDHESMMSAT0BE41,173,0.9998708,"[[-0.27488753, 0.14502588, -0.2930703, 0.00913...","[0.0026, 0.0013, 0.0012, 0.0011, 0.0011, 0.001...",MDHESMMSAT0BE41,173,0.9999337,"[[-0.11422788, -0.62275344, -0.24320397, 0.269...","[0.0011, 0.0007, 0.0011, 0.0006, 0.0006, 0.001...",MDHESMMSAT0BE41,...,"[0.0017, 0.0016, 0.0016, 0.0014, 0.0013, 0.001...",MDHESMMSAT0BE41,173,0.9998561,"[[-0.1037118, -0.78663653, 0.41454464, -0.0713...","[0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.001...",[0.9998600100000001],[0.0006998849948325466],[-0.0005599147938899965],173
MDH7455,188|190,0.9903232|0.99982506,"[[-0.015909469, 0.008886394, -0.021008663, 0.0...","[0.0048, 0.0012, 0.0011, 0.0019, 0.0011, 0.001...",MDH7455,190,0.99992454,"[[-0.08323757, -0.4515609, -0.17888026, 0.1970...","[0.0012, 0.0006, 0.0006, 0.0007, 0.0006, 0.000...",MDH7455,...,"[0.0031, 0.0012, 0.0013, 0.0018, 0.0023, 0.001...",MDH7455,190,0.9998399,"[[-0.08565694, -0.6820892, 0.35770816, -0.0656...","[0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.001...",[0.9998000100000001],[0.0009998199863308047],[-0.0007998599856641938],190
MDHESMMSAMGYP001005886523,179,0.9998498,"[[-0.25871652, 0.13553713, -0.27300143, 0.0084...","[0.0038, 0.0017, 0.0013, 0.0016, 0.0019, 0.001...",MDHESMMSAMGYP001005886523,179,0.9999254,"[[-0.090730876, -0.49430457, -0.19423223, 0.21...","[0.0012, 0.0008, 0.0006, 0.0006, 0.0006, 0.000...",MDHESMMSAMGYP001005886523,...,"[0.004, 0.0027, 0.0016, 0.0019, 0.0022, 0.0023...",MDHESMMSAMGYP001005886523,179,0.99983406,"[[-0.08326885, -0.64866734, 0.34127882, -0.059...","[0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.001...",[0.9998000100000001],[0.0009998199863308047],[-0.0007998599856641938],179


In [76]:
# NOTE(review): this cell's execution count (76) is lower than that of the cells above it,
# so the recorded output may come from an earlier state of `df` — re-run to confirm.
df

Unnamed: 0_level_0,Squidly_CR_Position,Squidly_CR_probabilities,Squidly_CR_representations,all_AS_probs,label_1,Squidly_CR_Position_1,Squidly_CR_probabilities_1,Squidly_CR_representations_1,all_AS_probs_1,label_2,...,all_AS_probs_3,label_4,Squidly_CR_Position_4,Squidly_CR_probabilities_4,Squidly_CR_representations_4,all_AS_probs_4,mean,aleatoric,epistemic,Squidly_Ensemble_Residues
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CuSOD_N133_0.938,,,[],"[0.0019, 0.0012, 0.0011, 0.0013, 0.0011, 0.001...",CuSOD_N133_0.938,,,[],"[0.0017, 0.0012, 0.001, 0.0015, 0.001, 0.001, ...",CuSOD_N133_0.938,...,"[0.0018, 0.0015, 0.0014, 0.0013, 0.0012, 0.001...",CuSOD_N133_0.938,,,[],"[0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.001...",[],[],[],
CuSOD_N382_0.622,,,[],"[0.0017, 0.0016, 0.0011, 0.0012, 0.0012, 0.001...",CuSOD_N382_0.622,,,[],"[0.0016, 0.0016, 0.0011, 0.001, 0.002, 0.0011,...",CuSOD_N382_0.622,...,"[0.0016, 0.0019, 0.0016, 0.0016, 0.0013, 0.001...",CuSOD_N382_0.622,,,[],"[0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.001...",[],[],[],
CuSOD_N5_0.362,,,[],"[0.0017, 0.0016, 0.0011, 0.0011, 0.0012, 0.001...",CuSOD_N5_0.362,,,[],"[0.0016, 0.0014, 0.001, 0.001, 0.0014, 0.001, ...",CuSOD_N5_0.362,...,"[0.0017, 0.0016, 0.0015, 0.0012, 0.0011, 0.001...",CuSOD_N5_0.362,,,[],"[0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.001...",[],[],[],
CuSOD_N405_0.854,,,[],"[0.0018, 0.0014, 0.0011, 0.0011, 0.0012, 0.001...",CuSOD_N405_0.854,,,[],"[0.0016, 0.0023, 0.001, 0.001, 0.0017, 0.0011,...",CuSOD_N405_0.854,...,"[0.0017, 0.0018, 0.0012, 0.0013, 0.0013, 0.001...",CuSOD_N405_0.854,,,[],"[0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.001...",[0.52288001],[0.8624724737173309],[-0.5234353766571687],67
CuSOD_N101_0.980,,,[],"[0.0017, 0.0038, 0.002, 0.0017, 0.0033, 0.0013...",CuSOD_N101_0.980,,,[],"[0.0014, 0.0041, 0.0022, 0.0011, 0.003, 0.0024...",CuSOD_N101_0.980,...,"[0.0019, 0.0062, 0.0026, 0.003, 0.0029, 0.002,...",CuSOD_N101_0.980,,,[],"[0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.001...","[0.7250800099999999, 0.55202001]","[0.9267656788685884, 1.1738466630645787]","[-0.693671845885146, -0.845852400887346]",74|82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MDH_7455,188|190,0.9903232|0.99982506,"[[-0.015909469, 0.008886394, -0.021008663, 0.0...","[0.0048, 0.0012, 0.0011, 0.0019, 0.0011, 0.001...",MDH_7455,190,0.99992454,"[[-0.08323757, -0.4515609, -0.17888026, 0.1970...","[0.0012, 0.0006, 0.0006, 0.0007, 0.0006, 0.000...",MDH_7455,...,"[0.0031, 0.0012, 0.0013, 0.0018, 0.0023, 0.001...",MDH_7455,190,0.9998399,"[[-0.08565694, -0.6820892, 0.35770816, -0.0656...","[0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.001...",[0.9998000100000001],[0.0009998199863308047],[-0.0007998599856641938],190
MDH_ESM_MSA_MGYP001005886523,179,0.9998498,"[[-0.25871652, 0.13553713, -0.27300143, 0.0084...","[0.0038, 0.0017, 0.0013, 0.0016, 0.0019, 0.001...",MDH_ESM_MSA_MGYP001005886523,179,0.9999254,"[[-0.090730876, -0.49430457, -0.19423223, 0.21...","[0.0012, 0.0008, 0.0006, 0.0006, 0.0006, 0.000...",MDH_ESM_MSA_MGYP001005886523,...,"[0.004, 0.0027, 0.0016, 0.0019, 0.0022, 0.0023...",MDH_ESM_MSA_MGYP001005886523,179,0.99983406,"[[-0.08326885, -0.64866734, 0.34127882, -0.059...","[0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.001...",[0.9998000100000001],[0.0009998199863308047],[-0.0007998599856641938],179
MDH_ESM_MSA_0_MGYP001094065905,191,0.9998443,"[[-0.2618748, 0.13587447, -0.27553186, 0.00836...","[0.0029, 0.0026, 0.0013, 0.0016, 0.002, 0.0012...",MDH_ESM_MSA_0_MGYP001094065905,191,0.9999249,"[[-0.08452272, -0.46041983, -0.18147936, 0.200...","[0.0011, 0.0008, 0.0006, 0.0006, 0.0008, 0.000...",MDH_ESM_MSA_0_MGYP001094065905,...,"[0.0022, 0.0012, 0.0014, 0.0017, 0.0014, 0.001...",MDH_ESM_MSA_0_MGYP001094065905,191,0.99982303,"[[-0.081120364, -0.64219534, 0.33667243, -0.06...","[0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.001...",[0.99978001],[0.001099774976161051],[-0.0008798191757357735],191
MDH_ASR_MDH_275,194,0.9998387,"[[-0.24771197, 0.12856025, -0.2629465, 0.00679...","[0.0051, 0.0013, 0.0012, 0.0021, 0.0017, 0.001...",MDH_ASR_MDH_275,194,0.9999211,"[[-0.07563782, -0.40839976, -0.16177227, 0.178...","[0.0014, 0.0007, 0.0008, 0.0013, 0.0012, 0.001...",MDH_ASR_MDH_275,...,"[0.0043, 0.0015, 0.0019, 0.0028, 0.003, 0.003,...",MDH_ASR_MDH_275,194,0.99983037,"[[-0.082027495, -0.65116155, 0.34125763, -0.06...","[0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.001...",[0.99978001],[0.001099774976161051],[-0.0008798191757357735],194


In [None]:
# Convert the serine hydrolase FASTA into a CSV with columns (id, seq) and re-export it
# as FASTA under denovo/output_data.
# NOTE(review): this cell is repeated verbatim under "Test generated serine hydrolases".
label = 'serine_hydrolases'

fasta_label = f'denovo/input_data/{label}.fasta'
rows = []
for record in SeqIO.parse(fasta_label, "fasta"):
    rows.append([record.id, str(record.seq)])
df = pd.DataFrame(rows, columns=['id', 'seq'])
# NOTE(review): this overwrites denovo/input_data/serine_hydrolases.csv with only the id and
# seq columns, while the dataset-building cell selects a 'CR' column from the same file —
# confirm that re-running this does not drop existing annotations.
df.to_csv(f'denovo/input_data/{label}.csv')
with open(f'denovo/output_data/{label}.fasta', 'w+') as fout:
    for entry, seq in df[['id', 'seq']].values:
        fout.write(f'>{entry}\n{seq}\n')
# From here on fasta_label points at the re-exported copy, not the original input file.
fasta_label = f'denovo/output_data/{label}.fasta'

# Test generated lysozymes

There are 3 lysozymes in M-CSA: https://www.ebi.ac.uk/thornton-srv/m-csa/search/?s=lysozyme

In [3]:
# Export the experimentally tested lysozymes (Name, sequence) as FASTA; `label` and
# `fasta_label` are left set for the squidly cells that follow.
label = 'experimentally_tested_lysozymes'
fasta_label = f'denovo/output_data/{label}.fasta'

df = pd.read_csv(f'denovo/input_data/{label}.csv')
fasta_text = ''.join(f'>{entry}\n{seq}\n' for entry, seq in df[['Name', 'sequence']].to_numpy())
with open(fasta_label, 'w+') as fout:
    fout.write(fasta_text)

In [4]:
# Run squidly with the BLAST database on the FASTA in `fasta_label`; os.system returns the exit status.
# NOTE(review): the recorded output shows a traceback inside run_blast and a non-zero
# status (256), i.e. this run did not complete.
os.system(f'squidly {fasta_label} esm2_t36_3B_UR50D denovo/output_data/ {label}_3B --database /disk1/ariane/vscode/squidly/data/reviewed_sprot_08042025.csv --blast-threshold 50')

[93m--------------------------------------------------------------------------------[0m
[93mRunning BLAST on the following DB: 	/disk1/ariane/vscode/squidly/data/reviewed_sprot_08042025.csv	[0m
[93m--------------------------------------------------------------------------------[0m


[31m╭─[0m[31m────────────────────[0m[31m [0m[1;31mTraceback [0m[1;2;31m(most recent call last)[0m[31m [0m[31m─────────────────────[0m[31m─╮[0m
[31m│[0m [2;33m/disk1/ariane/miniconda3/envs/metagenomics/lib/python3.10/site-packages/squi[0m [31m│[0m
[31m│[0m [2;33mdly/[0m[1;33m__main__.py[0m:[94m112[0m in [92mrun[0m                                                   [31m│[0m
[31m│[0m                                                                              [31m│[0m
[31m│[0m   [2m109 [0m[2m│   │   │   [0m[94mreturn[0m                                                     [31m│[0m
[31m│[0m   [2m110 [0m[2m│   │   [0m                                                               [31m│[0m
[31m│[0m   [2m111 [0m[2m│   │   [0m[2m# Run blast [0m                                                   [31m│[0m
[31m│[0m [31m❱ [0m112 [2m│   │   [0mblast_df = run_blast(query_df, database_df, output_folder, run [31m│[0m
[31m│[0m   [

256

In [5]:
# Print the same squidly command (with `fasta_label` and `label` filled in) so it can be run from a shell.
print(f'squidly {fasta_label} esm2_t36_3B_UR50D denovo/output_data/ {label}_3B --database /disk1/ariane/vscode/squidly/data/reviewed_sprot_08042025.csv --blast-threshold 50')

squidly denovo/output_data/experimentally_tested_lysozymes.fasta esm2_t36_3B_UR50D denovo/output_data/ experimentally_tested_lysozymes_3B --database /disk1/ariane/vscode/squidly/data/reviewed_sprot_08042025.csv --blast-threshold 50


In [None]:
# Run squidly once per AEGAN-benchmark checkpoint (Scheme3_16000_1 ... _5) on the FASTA
# currently held in `fasta_label`; outputs are named `{label}_AEGAN_3B_<run>`.
# The five commands differ only in the run number and the LSTM checkpoint timestamp,
# so they are generated from one template instead of five hand-copied lines.
AEGAN_MODEL_DIR = '../models/FinalModels/CLEANED_reproducing_AEGAN_benchmark_squidly_scheme_3_esm2_t36_3B_UR50D_2025-03-04'
AEGAN_LSTM_STAMPS = {
    1: '04-03-25_12-49',
    2: '04-03-25_14-09',
    3: '04-03-25_15-14',
    4: '04-03-25_17-28',
    5: '04-03-25_19-44',
}

aegan_exit_codes = {}
for run, stamp in AEGAN_LSTM_STAMPS.items():
    scheme_dir = f'{AEGAN_MODEL_DIR}/Scheme3_16000_{run}'
    aegan_exit_codes[run] = os.system(
        f'squidly {fasta_label} esm2_t36_3B_UR50D denovo/output_data/ {label}_AEGAN_3B_{run}'
        f' --as-threshold 0.9'
        f' --lstm-model-as {scheme_dir}/LSTM/models/{stamp}_128_2_0.2_400_best_model.pth'
        f' --cr-model-as {scheme_dir}/models/temp_best_model.pt'
    )
# Display every run's exit status (non-zero means the command failed) rather than only the last one.
aegan_exit_codes


In [None]:
# Run squidly once per CataloDB checkpoint (Scheme3_16000_1 ... _5) on the FASTA currently
# held in `fasta_label`; outputs are named `{label}_3B_<run>`.
# The five commands differ only in the run number and the LSTM checkpoint timestamp,
# so they are generated from one template instead of five hand-copied lines.
CATALODB_MODEL_DIR = '../models/FinalModels/CataloDB_models_3_esm2_t36_3B_UR50D_2025-04-13'
CATALODB_LSTM_STAMPS = {
    1: '13-04-25_15-07',
    2: '13-04-25_16-48',
    3: '13-04-25_19-16',
    4: '13-04-25_20-57',
    5: '13-04-25_22-36',
}

catalodb_exit_codes = {}
for run, stamp in CATALODB_LSTM_STAMPS.items():
    scheme_dir = f'{CATALODB_MODEL_DIR}/Scheme3_16000_{run}'
    catalodb_exit_codes[run] = os.system(
        f'squidly {fasta_label} esm2_t36_3B_UR50D denovo/output_data/ {label}_3B_{run}'
        f' --as-threshold 0.9'
        f' --lstm-model-as {scheme_dir}/LSTM/models/{stamp}_128_2_0.2_100_best_model.pth'
        f' --cr-model-as {scheme_dir}/models/temp_best_model.pt'
    )
# Display every run's exit status (non-zero means the command failed) rather than only the last one.
catalodb_exit_codes


# Test generated families

1. CuSOD: copper superoxide dismutase or Superoxide dismutase: https://www.ebi.ac.uk/thornton-srv/m-csa/entry/138/
2. MDH: malate dehydrogenase: https://www.ebi.ac.uk/thornton-srv/m-csa/search/?s=malate+dehydrogenase


In [None]:
# Export every row of the experimentally tested metrics table (id, Sequence) as FASTA;
# `label` and `fasta_label` are left set for the squidly cell that follows.
label = 'experimentally_tested_metrics'
fasta_label = f'denovo/output_data/{label}.fasta'

df = pd.read_csv(f'denovo/input_data/{label}.csv')
fasta_text = ''.join(f'>{entry}\n{seq}\n' for entry, seq in df[['id', 'Sequence']].to_numpy())
with open(fasta_label, 'w+') as fout:
    fout.write(fasta_text)

In [None]:
# Run squidly once per CataloDB checkpoint (Scheme3_16000_1 ... _5) on the FASTA currently
# held in `fasta_label`; outputs are named `{label}_3B_<run>`.
# The five commands differ only in the run number and the LSTM checkpoint timestamp,
# so they are generated from one template instead of five hand-copied lines.
CATALODB_MODEL_DIR = '../models/FinalModels/CataloDB_models_3_esm2_t36_3B_UR50D_2025-04-13'
CATALODB_LSTM_STAMPS = {
    1: '13-04-25_15-07',
    2: '13-04-25_16-48',
    3: '13-04-25_19-16',
    4: '13-04-25_20-57',
    5: '13-04-25_22-36',
}

catalodb_exit_codes = {}
for run, stamp in CATALODB_LSTM_STAMPS.items():
    scheme_dir = f'{CATALODB_MODEL_DIR}/Scheme3_16000_{run}'
    catalodb_exit_codes[run] = os.system(
        f'squidly {fasta_label} esm2_t36_3B_UR50D denovo/output_data/ {label}_3B_{run}'
        f' --as-threshold 0.9'
        f' --lstm-model-as {scheme_dir}/LSTM/models/{stamp}_128_2_0.2_100_best_model.pth'
        f' --cr-model-as {scheme_dir}/models/temp_best_model.pt'
    )
# Display every run's exit status (non-zero means the command failed) rather than only the last one.
catalodb_exit_codes


# Test generated serine hydrolases

1. serine_hydrolases taken from the SI of: https://www.science.org/doi/pdf/10.1126/science.adu2454?casa_token=iDQKvu36FKgAAAAA:Yq6Hc3SJ-cjQ9apHGT2RM6FgbknzYCMhLzeXA39nKcjgLwz5tIvCGRGguuPtRrw67d8om0V_LLkRP-4 for each of the reported sequences

Unfortunately, only 18 sequences are provided of the hundreds tested.

In [None]:
label = 'serine_hydrolases'

# Parse the supplied FASTA into (id, sequence) rows and store them as a CSV
fasta_label = f'denovo/input_data/{label}.fasta'
rows = [[record.id, str(record.seq)] for record in SeqIO.parse(fasta_label, "fasta")]
df = pd.DataFrame(rows, columns=['id', 'seq'])
df.to_csv(f'denovo/input_data/{label}.csv')

# Re-export under output_data; fasta_label now points at that copy for the squidly cells
fasta_label = f'denovo/output_data/{label}.fasta'
with open(fasta_label, 'w+') as fout:
    fout.writelines(f'>{entry}\n{seq}\n' for entry, seq in df[['id', 'seq']].values)

In [None]:
# Run squidly once per AEGAN-benchmark checkpoint (Scheme3_16000_1 ... _5) on the FASTA
# currently held in `fasta_label`; outputs are named `{label}_AEGAN_3B_<run>`.
# The five commands differ only in the run number and the LSTM checkpoint timestamp,
# so they are generated from one template instead of five hand-copied lines.
AEGAN_MODEL_DIR = '../models/FinalModels/CLEANED_reproducing_AEGAN_benchmark_squidly_scheme_3_esm2_t36_3B_UR50D_2025-03-04'
AEGAN_LSTM_STAMPS = {
    1: '04-03-25_12-49',
    2: '04-03-25_14-09',
    3: '04-03-25_15-14',
    4: '04-03-25_17-28',
    5: '04-03-25_19-44',
}

aegan_exit_codes = {}
for run, stamp in AEGAN_LSTM_STAMPS.items():
    scheme_dir = f'{AEGAN_MODEL_DIR}/Scheme3_16000_{run}'
    aegan_exit_codes[run] = os.system(
        f'squidly {fasta_label} esm2_t36_3B_UR50D denovo/output_data/ {label}_AEGAN_3B_{run}'
        f' --as-threshold 0.9'
        f' --lstm-model-as {scheme_dir}/LSTM/models/{stamp}_128_2_0.2_400_best_model.pth'
        f' --cr-model-as {scheme_dir}/models/temp_best_model.pt'
    )
# Display every run's exit status (non-zero means the command failed) rather than only the last one.
aegan_exit_codes


In [None]:
# Run squidly once per CataloDB checkpoint (Scheme3_16000_1 ... _5) on the FASTA currently
# held in `fasta_label`; outputs are named `{label}_3B_<run>`.
# The five commands differ only in the run number and the LSTM checkpoint timestamp,
# so they are generated from one template instead of five hand-copied lines.
CATALODB_MODEL_DIR = '../models/FinalModels/CataloDB_models_3_esm2_t36_3B_UR50D_2025-04-13'
CATALODB_LSTM_STAMPS = {
    1: '13-04-25_15-07',
    2: '13-04-25_16-48',
    3: '13-04-25_19-16',
    4: '13-04-25_20-57',
    5: '13-04-25_22-36',
}

catalodb_exit_codes = {}
for run, stamp in CATALODB_LSTM_STAMPS.items():
    scheme_dir = f'{CATALODB_MODEL_DIR}/Scheme3_16000_{run}'
    catalodb_exit_codes[run] = os.system(
        f'squidly {fasta_label} esm2_t36_3B_UR50D denovo/output_data/ {label}_3B_{run}'
        f' --as-threshold 0.9'
        f' --lstm-model-as {scheme_dir}/LSTM/models/{stamp}_128_2_0.2_100_best_model.pth'
        f' --cr-model-as {scheme_dir}/models/temp_best_model.pt'
    )
# Display every run's exit status (non-zero means the command failed) rather than only the last one.
catalodb_exit_codes


# Ensemble the predictions 

In [None]:
# Read in the predictions from the ensemble
from tqdm import tqdm 
import numpy as np

def compute_uncertainties(df, prob_columns, mean_prob=0.5):
    """Combine per-position probabilities from an ensemble of models, row by row.

    Every cell of ``df[prob_columns]`` must hold one model's sequence of
    per-position probabilities for that row, and all models must cover the
    same number of positions. For each position the function computes, with
    entropies in nats and ``H`` the binary (Bernoulli) entropy:

    * mean      -- average of the model probabilities (plus a 1e-8 offset),
    * aleatoric -- average over models of ``H(p_model)``,
    * epistemic -- ``H(mean)`` minus the aleatoric term.

    Only positions whose mean exceeds ``mean_prob`` are reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per sequence.
    prob_columns : list of str
        Columns of ``df`` holding the per-model probability sequences. Any
        number of models (at least one) is accepted.
    mean_prob : float, default 0.5
        Threshold the mean probability must exceed for a position to be kept.

    Returns
    -------
    means, aleatorics, epistemics : list of list of float
        One inner list per row with the values for the kept positions.
    residues : list of str
        One string per row: the kept zero-based positions joined by ``'|'``
        (empty string when no position passes the threshold).

    Raises
    ------
    ValueError
        If ``prob_columns`` is empty, or a row has a missing prediction or
        models that disagree on the number of positions.
    """
    if len(prob_columns) == 0:
        raise ValueError('prob_columns must name at least one probability column')

    eps = 1e-8  # for numerical stability inside the logarithms

    def binary_entropy(p):
        # Entropy of a Bernoulli(p) outcome; clip guards against p + eps creeping above 1.
        p = np.clip(p, 0.0, 1.0)
        return -(p * np.log(p + eps) + (1.0 - p) * np.log(1.0 - p + eps))

    means, aleatorics, epistemics, residues = [], [], [], []
    for row_i, row in enumerate(tqdm(df[prob_columns].values)):
        try:
            # Shape (num_models, num_positions).
            probs = np.array([np.asarray(p, dtype=float) for p in row], dtype=float)
        except (TypeError, ValueError) as err:
            raise ValueError(
                f'row {row_i}: could not stack the model predictions into a '
                f'(models x positions) array (missing or unequal-length predictions?)'
            ) from err
        if probs.ndim != 2:
            raise ValueError(
                f'row {row_i}: expected one probability sequence per model, '
                f'got an array of shape {probs.shape}'
            )

        mean_probs = np.mean(probs + eps, axis=0)
        # Aleatoric: expected entropy of the individual model predictions.
        aleatoric = np.mean(binary_entropy(probs), axis=0)
        # Epistemic: entropy of the averaged prediction minus the expected entropy.
        epistemic = binary_entropy(mean_probs) - aleatoric

        keep = np.flatnonzero(mean_probs > mean_prob)
        means.append(mean_probs[keep].tolist())
        aleatorics.append(aleatoric[keep].tolist())
        epistemics.append(epistemic[keep].tolist())
        residues.append('|'.join(str(j) for j in keep))
    return means, aleatorics, epistemics, residues


label = 'serine_hydrolases'

# Line up the five per-model squidly outputs side by side, keyed on the sequence label.
# The first frame keeps its column names; each later frame's clashing columns take its
# model number as a suffix (e.g. all_AS_probs_2).
squidly_ensemble = pd.DataFrame()
for i in range(1, 6):
    squidly = pd.read_pickle(f'denovo/output_data/{label}_3B_{i}_squidly.pkl').set_index('label')
    squidly_ensemble = squidly_ensemble.join(squidly, rsuffix=f'_{i}', how='outer')

# Probability columns of models 1-5 as named by the join above.
cols = ['all_AS_probs'] + [f'all_AS_probs_{n}' for n in range(2, 6)]
means, aleatorics, epistemics, residues = compute_uncertainties(
    squidly_ensemble, cols, mean_prob=0.5
)
squidly_ensemble = squidly_ensemble.assign(
    mean_prob=means,
    aleatoric=aleatorics,
    residues=residues,
)
squidly_ensemble.to_pickle(f'denovo/output_data/{label}_3B_AEGAN_squidly_ensemble.pkl')


In [None]:
# Attach the ensemble predictions to the input table (indexed by sequence id), then turn
# each row's predicted positions into the amino-acid letters found at those positions.
df = pd.read_csv(f'denovo/input_data/{label}.csv').set_index('id')
df = df.join(squidly_ensemble, how='left')

residues = []
for pos, seq in df[['residues', 'seq']].values:
    # Rows with no match in the left join carry NaN, and rows where nothing passed the
    # threshold carry an empty string: either way there is nothing to look up.
    if not isinstance(pos, str) or not pos or not isinstance(seq, str):
        residues.append(None)
        continue
    try:
        residues.append('|'.join(seq[int(i)] for i in pos.split('|')))
    except (ValueError, IndexError):
        # Unparseable position or a position past the end of the sequence: keep the
        # row but record no residues, without hiding unrelated errors.
        residues.append(None)

df['Residues'] = residues
df.to_pickle(f'denovo/output_data/{label}_3B_squidly_ensemble.pkl')
df['Residues'].value_counts()

In [None]:
# Build the same five-model ensemble for the experimentally tested lysozymes.
label = 'experimentally_tested_lysozymes'

# Join the per-model outputs on the sequence label; clashing columns from models 2-5
# receive the model number as a suffix.
squidly_ensemble = pd.DataFrame()
for i in range(1, 6):
    squidly = pd.read_pickle(f'denovo/output_data/{label}_3B_{i}_squidly.pkl').set_index('label')
    squidly_ensemble = squidly_ensemble.join(squidly, rsuffix=f'_{i}', how='outer')

cols = ['all_AS_probs'] + [f'all_AS_probs_{n}' for n in range(2, 6)]
means, aleatorics, epistemics, residues = compute_uncertainties(
    squidly_ensemble, cols, mean_prob=0.5
)
squidly_ensemble = squidly_ensemble.assign(
    mean_prob=means,
    aleatoric=aleatorics,
    residues=residues,
)

In [None]:
label = 'experimentally_tested_lysozymes'

# Attach the ensemble predictions to the input table (indexed by Name), then turn each
# row's predicted positions into the amino-acid letters found at those positions.
df = pd.read_csv(f'denovo/input_data/{label}.csv').set_index('Name')
df = df.join(squidly_ensemble, how='left')

residues = []
for pos, seq in df[['residues', 'sequence']].values:
    # Rows with no match in the left join carry NaN, and rows where nothing passed the
    # threshold carry an empty string: either way there is nothing to look up.
    if not isinstance(pos, str) or not pos or not isinstance(seq, str):
        residues.append(None)
        continue
    try:
        residues.append('|'.join(seq[int(i)] for i in pos.split('|')))
    except (ValueError, IndexError):
        # Unparseable position or a position past the end of the sequence: keep the
        # row but record no residues, without hiding unrelated errors.
        residues.append(None)

df['Residues'] = residues
df.to_pickle(f'denovo/output_data/{label}_3B_squidly_ensemble.pkl')

In [None]:
# Display the current `df` (whatever the most recently executed cell left in it).
df

In [None]:
# Narrow `df` to the rows whose 'Active' column is 'Y' and show the result.
is_active = df['Active'] == 'Y'
df = df.loc[is_active]
df

# Finally test the families

In [None]:
# Build the five-model ensemble for the 'experimentally_tested_metrics' table and look up
# the predicted residues, following the same steps used for the other datasets.
label = 'experimentally_tested_metrics'

# Join the per-model outputs on the sequence label; clashing columns from models 2-5
# receive the model number as a suffix.
squidly_ensemble = pd.DataFrame()
for i in range(1, 6):
    squidly = pd.read_pickle(f'denovo/output_data/{label}_3B_{i}_squidly.pkl')
    squidly.set_index('label', inplace=True)
    squidly_ensemble = squidly_ensemble.join(squidly, how='outer', rsuffix=f'_{i}')

cols = ['all_AS_probs', 'all_AS_probs_2', 'all_AS_probs_3', 'all_AS_probs_4', 'all_AS_probs_5']
means, aleatorics, epistemics, residues = compute_uncertainties(squidly_ensemble, cols, mean_prob=0.5)
squidly_ensemble['mean_prob'] = means
squidly_ensemble['aleatoric'] = aleatorics
squidly_ensemble['residues'] = residues

# Attach the predictions to the input table (indexed by id).
df = pd.read_csv(f'denovo/input_data/{label}.csv').set_index('id')
df = df.join(squidly_ensemble, how='left')

residues = []
for pos, seq in df[['residues', 'Sequence']].values:
    # Rows with no match in the left join carry NaN, and rows where nothing passed the
    # threshold carry an empty string: either way there is nothing to look up.
    if not isinstance(pos, str) or not pos or not isinstance(seq, str):
        residues.append(None)
        continue
    try:
        residues.append('|'.join(seq[int(i)] for i in pos.split('|')))
    except (ValueError, IndexError):
        # Unparseable position or a position past the end of the sequence: keep the
        # row but record no residues, without hiding unrelated errors.
        residues.append(None)

df['Residues'] = residues
df.to_pickle(f'denovo/output_data/{label}_3B_squidly_ensemble.pkl')

In [None]:
# Reload the saved ensemble table for 'experimentally_tested_metrics' from its pickle,
# so the cells below can run without recomputing the ensemble.
label = 'experimentally_tested_metrics'
df = pd.read_pickle(f'denovo/output_data/{label}_3B_squidly_ensemble.pkl')

In [None]:
# Per-family breakdown: how often each predicted residue pattern occurs.
for family in set(df['Family'].values):
    tmp_df = df.loc[df['Family'] == family]
    banner = f'----------------------- {family} ------------------'
    print(banner)
    print(tmp_df['Residues'].value_counts())
    

In [None]:
# Inspect the 'Model' column of the current `df`.
df['Model']

In [None]:
# Keep only rows with Activity == 1, then report the residue-pattern counts for every
# Family x Model pairing (pairings without rows are still printed, with a count of 0).
df = df.loc[df['Activity'] == 1]
for family in set(df['Family'].values):
    in_family = df['Family'] == family
    for model in set(df['Model'].values):
        tmp_df = df.loc[in_family & (df['Model'] == model)]
        banner = f'----------------------- {family} {model} {len(tmp_df)} ------------------'
        print(banner)
        print(tmp_df['Residues'].value_counts())

# Test CM enzymes as well

3. CM: chorismate mutase: https://www.ebi.ac.uk/thornton-srv/m-csa/search/?s=chorismate+mutase+

Expected catalytic residues are either:
Glu198, Arg16, Glu246, Arg157, Lys168, Asn194 (main-N), Thr242
or
Arg7, Glu78, Arg116, Tyr108, Arg90, Arg63, Cys75


K        227 (lysine)  
L        134 (leucine)  
R         93 (Arginine)  
R|K       39 
L|K       32

In [None]:
label = 'chorismate_mutase_CM_table'
df = pd.read_csv(f'denovo/input_data/{label}.csv')

# Write the sequences as FASTA (header = id) so squidly can read them.
fasta_label = f'denovo/output_data/{label}.fasta'
with open(fasta_label, 'w') as fout:
    for entry, seq in df[['id', 'Sequence']].values:
        fout.write(f'>{entry}\n{seq}\n')

# Run squidly once per ensemble member (the five Scheme3_16000_<i> checkpoints), writing
# each run into denovo/output_data/ under the run name {label}_3B_<i>. Only the member
# number and the timestamp prefix of the LSTM checkpoint differ between runs.
squidly_model_dir = '../models/FinalModels/CataloDB_models_3_esm2_t36_3B_UR50D_2025-04-13'
lstm_checkpoint_stamps = {
    1: '13-04-25_15-07',
    2: '13-04-25_16-48',
    3: '13-04-25_19-16',
    4: '13-04-25_20-57',
    5: '13-04-25_22-36',
}

failed_runs = []
for model_i, stamp in lstm_checkpoint_stamps.items():
    cmd = (
        f'squidly {fasta_label} esm2_t36_3B_UR50D denovo/output_data/ {label}_3B_{model_i} '
        f'--as-threshold 0.9 '
        f'--lstm-model-as {squidly_model_dir}/Scheme3_16000_{model_i}/LSTM/models/{stamp}_128_2_0.2_100_best_model.pth '
        f'--cr-model-as {squidly_model_dir}/Scheme3_16000_{model_i}/models/temp_best_model.pt'
    )
    # os.system reports the command's exit status; a non-zero value means the run failed.
    # Every member is still attempted so one failure does not hide the others.
    if os.system(cmd) != 0:
        failed_runs.append(model_i)

# Fail loudly here rather than letting a later cell read a missing or stale pickle.
if failed_runs:
    raise RuntimeError(f'squidly failed for ensemble member(s) {failed_runs} on {fasta_label}')


In [None]:
# Build the same five-model ensemble for the chorismate mutase (CM) table.
label = 'chorismate_mutase_CM_table'

# Join the per-model outputs on the sequence label; clashing columns from models 2-5
# receive the model number as a suffix.
squidly_ensemble = pd.DataFrame()
for i in range(1, 6):
    squidly = pd.read_pickle(f'denovo/output_data/{label}_3B_{i}_squidly.pkl').set_index('label')
    squidly_ensemble = squidly_ensemble.join(squidly, rsuffix=f'_{i}', how='outer')

cols = ['all_AS_probs'] + [f'all_AS_probs_{n}' for n in range(2, 6)]
means, aleatorics, epistemics, residues = compute_uncertainties(
    squidly_ensemble, cols, mean_prob=0.5
)
squidly_ensemble = squidly_ensemble.assign(
    mean_prob=means,
    aleatoric=aleatorics,
    residues=residues,
)


In [None]:
label = 'chorismate_mutase_CM_table'
df = pd.read_csv(f'denovo/input_data/{label}.csv')
# The ids are cast to strings before they become the join key against the squidly labels.
df['id'] = [str(i) for i in df['id'].values]
df.set_index('id', inplace=True)
# Outer join: rows present on only one side are kept, with NaN in the other side's columns.
df = df.join(squidly_ensemble, how='outer')

residues = []
for pos, seq in df[['residues', 'Sequence']].values:
    # NaN (no prediction or no sequence for this id) or an empty string (nothing passed
    # the threshold) means there is nothing to look up.
    if not isinstance(pos, str) or not pos or not isinstance(seq, str):
        residues.append(None)
        continue
    try:
        residues.append('|'.join(seq[int(i)] for i in pos.split('|')))
    except (ValueError, IndexError):
        # Unparseable position or a position past the end of the sequence: keep the
        # row but record no residues, without hiding unrelated errors.
        residues.append(None)

df['Residues'] = residues
df.to_pickle(f'denovo/output_data/{label}_3B_squidly_ensemble.pkl')

In [None]:
# Reload the saved ensemble table and count how often each predicted residue pattern
# occurs. Relies on `label` still being set by an earlier cell.
df = pd.read_pickle(f'denovo/output_data/{label}_3B_squidly_ensemble.pkl')
df['Residues'].value_counts()

In [None]:
# Number of rows per value of the 'Model' column.
df['Model'].value_counts()

In [None]:
# Keep only the rows flagged as above threshold and report how many remain.
passes_threshold = df['above threshold'] == True
df = df.loc[passes_threshold]
len(df)

In [None]:
# Per-model breakdown: group size and how often each predicted residue pattern occurs.
for model in set(df['Model'].values):
    tmp_df = df.loc[df['Model'] == model]
    banner = f'----------------------- {model} {len(tmp_df)}------------------'
    print(banner)
    print(tmp_df['Residues'].value_counts())
    

# For each of these as well, run ensemble version with squidly but then also with BLAST