In [105]:
import urllib.request
import os
import pandas as pd
import zipfile
import pathlib
import numpy as np
from collections import Counter
import h5py

In [192]:
df = pd.read_csv("./data/foldseek/toxins.csv")
df

FileNotFoundError: [Errno 2] No such file or directory: '../data/foldseek/toxins.csv'

### get structures for foldseek

In [42]:
output_dir = "./data/foldseek/alphafold_structures"
uniprot_ids = df["Entry"].unique()
os.makedirs(output_dir, exist_ok=True)
fail = []

# Download structures
for uniprot_id in uniprot_ids:
    pdb_url = f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb"
    save_path = os.path.join(output_dir, f"{uniprot_id}.pdb")
    try:
        urllib.request.urlretrieve(pdb_url, save_path)
        print(f"Downloaded: {uniprot_id}")
    except Exception as e:
        fail.append(uniprot_id)
        print(f"Failed: {uniprot_id} - {e}")

print("Download complete.")
print(f"Failed entries:{fail}")

Downloaded: A0A0B4U9L8
Downloaded: A0A0B5A8P4
Downloaded: A0A0B5AC95
Downloaded: A0A0D4WV12
Downloaded: A0A0N7CSQ4
Downloaded: A0A0S4FKT4
Downloaded: A0A193CHJ5
Downloaded: A0A1L4BJ98
Downloaded: A0A1Z0YU59
Downloaded: A0A2D0TC04
Downloaded: A0A2L0ART2
Downloaded: A0A2U8QPE6
Downloaded: A0A384E129
Downloaded: A0A411EZW9
Downloaded: A0A4V8GZX0
Downloaded: A0A5C2A2T2
Downloaded: A0A6B7FMR5
Downloaded: A0S864
Downloaded: A0S865
Downloaded: A3R0T9
Downloaded: A4PBQ9
Downloaded: A4VBF0
Downloaded: A6MEY4
Downloaded: A8CG78
Downloaded: A8CG82
Downloaded: A8CG86
Downloaded: A8CG89
Downloaded: A8E2V8
Downloaded: A8QL52
Downloaded: A8QL56
Downloaded: A8YPR6
Downloaded: A8YPR9
Downloaded: B1P1D9
Downloaded: B1P1E0
Downloaded: B1P1E1
Downloaded: B1P1E2
Downloaded: B1P1E3
Downloaded: B2KPN7
Downloaded: B3EWF4
Downloaded: B3EWF5
Downloaded: B3EWF6
Downloaded: B3EWF8
Downloaded: B3EWH0
Downloaded: B6CQR5
Downloaded: B7FDP2
Downloaded: B7S4N9
Downloaded: B8K1W0
Downloaded: C0HJD3
Downloaded: C0HJE7
D

In [43]:
failed_df = df[df['Entry'].isin(fail)]

# Write to FASTA file
with open("./data/foldseek/failed_entries.fasta", "w") as fasta:
    for _, row in failed_df.iterrows():
        fasta.write(f">{row['Entry']}\n{row['Sequence']}\n")

In [44]:
# Define paths
source_dir = pathlib.Path("./data/foldseek/result/")  # Directory with zip files
destination_dir = pathlib.Path("./data/foldseek/alphafold_structures/")
destination_dir.mkdir(parents=True, exist_ok=True)  # Ensure destination exists

# Iterate over zip files in the directory
for zip_path in source_dir.glob("*.zip"):
    with zipfile.ZipFile(zip_path, 'r') as z:
        for file in z.namelist():
            if file.endswith(".pdb"):
                z.extract(file, destination_dir)

print("Extraction complete!")

Extraction complete!


In [45]:
num_entries = len(os.listdir(destination_dir))
print(f"Total entries: {num_entries}")

Total entries: 5041


In [46]:
import re
from pathlib import Path

# Define the directory containing the files
destination_dir = Path("./data/foldseek/alphafold_structures")

# Track the base names without the additional suffixes
base_names = set()

# Regex pattern to capture base name (e.g., A0A8U0LTF0)
base_name_pattern = re.compile(r"^[A-Za-z0-9]+")  # This will match alphanumeric characters at the start of the filename

# Identify and delete the files with extra suffixes
for file in destination_dir.iterdir():
    if file.is_file():
        # Extract base name using regex
        match = base_name_pattern.match(file.stem)
        if match:
            name_part = match.group(0)  # The base name before any suffixes

            # Check if it's an AlphaFold prediction with extra information
            if name_part not in base_names:
                base_names.add(name_part)
            else:  # If the base name already exists, delete the file
                file.unlink()
                print(f"Deleted: {file.name}")

print("Cleanup complete!")


Cleanup complete!


In [47]:
# Define the directory containing the files
destination_dir = Path("./data/foldseek/alphafold_structures")

# Track non-unique names (base name before "_")
non_unique_names = []

for file in destination_dir.iterdir():
    if file.is_file():
        name_part = file.stem.split("_")[0]  # Get the base name before "_"
        non_unique_names.append(name_part)

# Count how many times each name appears
name_counts = Counter(non_unique_names)

# Identify duplicates
duplicates = [name for name, count in name_counts.items() if count > 1]
# Print the results
print(f"Total non-unique names: {len(non_unique_names)}")
print(f"Total unique names: {len(set(non_unique_names))}")
print(f"Duplicates (overlapping names): {duplicates}")

Total non-unique names: 5040
Total unique names: 5040
Duplicates (overlapping names): []


In [48]:
set(df["Entry"]).symmetric_difference(set(non_unique_names))

{'.DS'}

In [49]:
for file in destination_dir.iterdir():
    if file.is_file():
        name_part = file.stem.split("_")[0]
        if name_part == ".DS":
            continue
        file.rename(f"data/foldseek/alphafold_structures/{name_part}.pdb")

### run foldseek

In [53]:
def write_fasta(df, filename):
    """Writes a DataFrame to a FASTA file."""
    with open(filename, "w") as f:
        for _, row in df.iterrows():
            f.write(f">{row['Entry']}\n{row['Sequence']}\n")

write_fasta(df, "./data/foldseek/toxins.fasta")

In [130]:
!foldseek createdb data/foldseek/alphafold_structures/ data/foldseek/tox_structures

[33mdata/foldseek/tox_structures exists and will be overwritten
[39mcreatedb data/foldseek/alphafold_structures/ data/foldseek/tox_structures 

MMseqs Version:             	10.941cd33
Use GPU                     	0
Path to ProstT5             	
Chain name mode             	0
Createdb extraction mode    	0
Interface distance threshold	8
Write mapping file          	0
Mask b-factor threshold     	0
Coord store mode            	2
Write lookup file           	1
Input format                	0
File Inclusion Regex        	.*
File Exclusion Regex        	^$
Threads                     	14
Verbosity                   	3

Output file: data/foldseek/tox_structures
Time for merging to tox_structures_ss: 0h 0m 0s 2ms
Time for merging to tox_structures_h: 0h 0m 0s 2ms
Time for merging to tox_structures_ca: 0h 0m 0s 5ms
Time for merging to tox_structures: 0h 0m 0s 2ms
Ignore 0 out of 5039.
Too short: 0, incorrect: 0, not proteins: 0.
Time for processing: 0h 0m 0s 213ms


In [67]:
!foldseek easy-search data/foldseek/alphafold_structures/* data/foldseek/tox_structures data/foldseek/alignment_results.txt data/foldseek/tmp/ --max-seqs 5039

[33mdata/foldseek/alignment_results.txt exists and will be overwritten
[39measy-search data/foldseek/alphafold_structures/A0A023VZM6.pdb data/foldseek/alphafold_structures/A0A023VZR2.pdb data/foldseek/alphafold_structures/A0A023W082.pdb data/foldseek/alphafold_structures/A0A023W090.pdb data/foldseek/alphafold_structures/A0A023W0B6.pdb data/foldseek/alphafold_structures/A0A023W0C3.pdb data/foldseek/alphafold_structures/A0A023W0U0.pdb data/foldseek/alphafold_structures/A0A023W0V6.pdb data/foldseek/alphafold_structures/A0A023W0V9.pdb data/foldseek/alphafold_structures/A0A023W0W9.pdb data/foldseek/alphafold_structures/A0A023W123.pdb data/foldseek/alphafold_structures/A0A023W140.pdb data/foldseek/alphafold_structures/A0A023W145.pdb data/foldseek/alphafold_structures/A0A023W157.pdb data/foldseek/alphafold_structures/A0A023W163.pdb data/foldseek/alphafold_structures/A0A023W168.pdb data/foldseek/alphafold_structures/A0A059T2H4.pdb data/foldseek/alphafold_structures/A0A059U906.pdb data/folds

In [146]:
!foldseek easy-cluster data/foldseek/alphafold_structures data/foldseek/tox_clusters/ data/foldseek/tmp/ --alignment-type 1
# 1 = TMalign

[33mdata/foldseek/tox_clusters/ exists and will be overwritten
[39measy-cluster data/foldseek/alphafold_structures data/foldseek/tox_clusters/ data/foldseek/tmp/ --alignment-type 1 

MMseqs Version:                     	10.941cd33
Substitution matrix                 	aa:3di.out,nucl:3di.out
Seed substitution matrix            	aa:3di.out,nucl:3di.out
Sensitivity                         	4
k-mer length                        	0
Target search mode                  	0
k-score                             	seq:2147483647,prof:2147483647
Max sequence length                 	65535
Max results per query               	300
Split database                      	0
Split mode                          	2
Split memory limit                  	0
Coverage threshold                  	0
Coverage mode                       	0
Compositional bias                  	1
Compositional bias                  	1
Diagonal scoring                    	true
Exact k-mer matching                	0
M

In [147]:
scores = pd.read_csv(
    "./data/foldseek/alignment_results.txt",
    sep="\t",
    names=["Query", "Target", "Fident", "AliLength", "Mismatches", "GapOpenings", "QueryStart", "QueryEnd", "TargetStart", "TargetEnd", "E-val", "Bit"],
    header=0
)
scores[:5]

Unnamed: 0,Query,Target,Fident,AliLength,Mismatches,GapOpenings,QueryStart,QueryEnd,TargetStart,TargetEnd,E-val,Bit
0,A0A023VZM6,A0A023W0U0,0.935,78,5,0,1,78,1,78,1.634e-09,229
1,A0A023VZM6,P58924,0.126,86,67,0,1,78,3,88,1.395,33
2,A0A023VZM6,B1P1A9,0.151,86,60,0,3,74,2,87,1.493,26
3,A0A023VZM6,W4VRV4,0.17,88,61,0,1,75,1,88,3.158,26
4,A0A023VZM6,C8CK78,0.182,81,61,0,3,78,5,85,1.303,25


### make Matrix

In [148]:
matrix = np.empty((len(df), len(df)))
matrix[:] = np.nan
evalue_matrix = pd.DataFrame(matrix, index=df["Entry"], columns=df["Entry"])
evalue_matrix.iloc[:5, :5]

Entry,A0A0B4U9L8,A0A0B5A8P4,A0A0B5AC95,A0A0D4WV12,A0A0N7CSQ4
Entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A0A0B4U9L8,,,,,
A0A0B5A8P4,,,,,
A0A0B5AC95,,,,,
A0A0D4WV12,,,,,
A0A0N7CSQ4,,,,,


In [149]:
for _, row in scores.iterrows():
    query_index = row['Query']
    subject_index = row['Target']
    evalue = row['Fident']

    # Update the DataFrame
    evalue_matrix.at[query_index, subject_index] = evalue
evalue_matrix.iloc[:5, :5]

Entry,A0A0B4U9L8,A0A0B5A8P4,A0A0B5AC95,A0A0D4WV12,A0A0N7CSQ4
Entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A0A0B4U9L8,1.0,,,,
A0A0B5A8P4,,1.0,0.704,,
A0A0B5AC95,,0.713,1.0,,
A0A0D4WV12,,,,1.0,
A0A0N7CSQ4,,,,,1.0


In [150]:
nan_replacement = 0
zero_replacement = 0.00001
#zero_replacement = np.nanmin(evalue_matrix[(evalue_matrix != 0) & (~np.isnan(evalue_matrix))])

In [151]:
replaced_matrix = np.where(np.isnan(matrix), nan_replacement, matrix)
replaced_matrix = np.where(replaced_matrix == 0, zero_replacement, replaced_matrix)
replaced_matrix[:5,:5]

array([[1.00e+00, 1.00e-05, 1.00e-05, 1.00e-05, 1.00e-05],
       [1.00e-05, 1.00e+00, 7.04e-01, 1.00e-05, 1.00e-05],
       [1.00e-05, 7.13e-01, 1.00e+00, 1.00e-05, 1.00e-05],
       [1.00e-05, 1.00e-05, 1.00e-05, 1.00e+00, 1.00e-05],
       [1.00e-05, 1.00e-05, 1.00e-05, 1.00e-05, 1.00e+00]])

In [152]:
symmetric_matrix = (replaced_matrix + replaced_matrix.T) / 2
symmetric_matrix[:5,:5]

array([[1.000e+00, 1.000e-05, 1.000e-05, 1.000e-05, 1.000e-05],
       [1.000e-05, 1.000e+00, 7.085e-01, 1.000e-05, 1.000e-05],
       [1.000e-05, 7.085e-01, 1.000e+00, 1.000e-05, 1.000e-05],
       [1.000e-05, 1.000e-05, 1.000e-05, 1.000e+00, 1.000e-05],
       [1.000e-05, 1.000e-05, 1.000e-05, 1.000e-05, 1.000e+00]])

In [153]:
# log_matrix = np.log(symmetric_matrix)
# log_matrix[:5,:5]
# scaled_matrix = (log_matrix - np.min(log_matrix)) / (np.max(log_matrix) - np.min(log_matrix))
# scaled_matrix[:5,:5]
# matrix = 1 - scaled_matrix
# matrix[:5,:5]

In [154]:
evalue_matrix.loc[:, :] = symmetric_matrix
evalue_matrix.iloc[:5,:5]

Entry,A0A0B4U9L8,A0A0B5A8P4,A0A0B5AC95,A0A0D4WV12,A0A0N7CSQ4
Entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A0A0B4U9L8,1.0,1e-05,1e-05,1e-05,1e-05
A0A0B5A8P4,1e-05,1.0,0.7085,1e-05,1e-05
A0A0B5AC95,1e-05,0.7085,1.0,1e-05,1e-05
A0A0D4WV12,1e-05,1e-05,1e-05,1.0,1e-05
A0A0N7CSQ4,1e-05,1e-05,1e-05,1e-05,1.0


### create h5 file

In [155]:
with h5py.File("data/foldseek/fident_matrix.h5", "w") as f:
    for row_name, row_values in evalue_matrix.iterrows():
        f.create_dataset(row_name, data=row_values.to_numpy(dtype=np.float64))  # Ensure float64

# To verify saved data
with h5py.File("data/foldseek/fident_matrix.h5", "r") as f:
    for name in f:
        print(name, f[name][:])

A0A023VZM6 [1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
A0A023VZR2 [1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
A0A023W082 [1.0000e-05 1.0000e-05 1.0000e-05 ... 1.0000e-05 9.8005e-02 1.0000e-05]
A0A023W090 [1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
A0A023W0B6 [1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
A0A023W0C3 [1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
A0A023W0U0 [1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
A0A023W0V6 [1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
A0A023W0V9 [1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
A0A023W0W9 [1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
A0A023W123 [1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
A0A023W140 [1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
A0A023W145 [1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
A0A023W157 [1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
A0A023W163 [1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
A0A023W168 [1.e-05 1.e-05 1.e-05 ... 1.e-05 1.e-05 1.e-05]
A0A059T2H4 [1.0000e-05 7.4005e-0

In [156]:
df["Protein families"].value_counts()

Protein families
other                                      558
Snake three-finger toxin family            485
Long (4 C-C) scorpion toxin superfamily    337
Phospholipase A2 family                    291
Neurotoxin 10 (Hwtx-1) family              284
                                          ... 
AVIT (prokineticin) family                  12
Bradykinin-related peptide family           12
Conotoxin B superfamily                     11
Neurotoxin 21 family                        11
Cationic peptide 03 (latarcin) family       11
Name: count, Length: 63, dtype: int64

In [157]:
df["Protein families"] = df["Protein families"].apply(lambda x: "other" if df["Protein families"].value_counts()[x] <= 10 else x)
df

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment
0,A0A0B4U9L8,Vipera ammodytes ammodytes (Western sand viper),Zinc metalloproteinase-disintegrin-like protein F1 (EC 3.4.24.-) (Fibrinogenase 1) (Metalloproteinase F1) (P-IIIa metalloproteinase F1) (Snake venom metalloproteinase) (SVMP) (VaF1),Venom metalloproteinase (M12B) family,MLQVLLVTICLAVFPYQGSSIILESGNVNDYEVVYPQKLTALLKGAIQQPEQKYEDAMQYEFKVNGKPVVLHLEKNKGLFSEDYSETHYSPDGREITTNPPVEDHCYYHGHIQNDAHLTASISACNGLKGHFQLRGETYLIEPLKIPDSEAHAVYKYENVEKEDEGPKKCGVTQTNWKSDEPIKASQFILTPEQRAYMNANKYIKLAIVVDNVMFRKYTGNFTAIRTRIYEIVNTLNLIYTILNIHIALVFLEIWSKGDSINVQSVVDVTLNSFGEWRERDLLNRKRHDNAQLLTGINFNGDTIGFGFVGSMCIPKKSVGIVQDHSKTHLLVATTMAHELGHNLGINHDGDSCTCQANSCIMAAKLSHQPSYQFSDCSINELWMYLISHTPRCILNEPLTTDVVSPAVCGNYVVEEGEECDCGSLWYCRNPCCDAATCKLKPGAECGDGVCCYQCRFVTAGTVCRPARSECDIPEYCSGQSVECPMDHIQKNGKPCLMNHGYCYNGRCPIMIHQCIALWGPGTTVSSDVCFQRNESGQGYSYCRRENNQNIPCAPQDVKCGRLYCKFHNVNTLPCNFKYSDFAPDYGLVDHGTKCGDGKVCNSNRQCVDVNTAY,
1,A0A0B5A8P4,Conus geographus (Geography cone) (Nubecula geographus),Con-Ins G3 (Insulin 3) [Cleaved into: Con-Ins G3 B chain; Con-Ins G3 A chain],Insulin family,MTTSFYFLLVALGLLLYVCQSSFGNQHTRNSDTPKHRCGSELADQYVQLCHGKRNDAGKKRGRASPLWQRQGFLSMLKAKRNEAFFLQRDGRGIVEVCCDNPCTVATLRTFCH,
2,A0A0B5AC95,Conus geographus (Geography cone) (Nubecula geographus),Con-Ins G1a (Insulin 1) [Cleaved into: Con-Ins G1 B chain; Con-Ins G1a A chain],Insulin family,MTTSSYFLLMALGLLLYVCQSSFGNQHTRTFDTPKHRCGSEITNSYMDLCYRKRNDAGEKRGRASPLWQRRGSLSKLKARAKRNGAFHLPRDGRGVVEHCCHRPCSNAEFKKYCG,
3,A0A0D4WV12,Sicarius terrosus (Cave spider),Dermonecrotic toxin StSicTox-betaIB1i (EC 4.6.1.-) (Phospholipase D) (PLD) (Sphingomyelin phosphodiesterase D) (SMD) (SMase D) (Sphingomyelinase D),Arthropod phospholipase D family,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRTYHGVPCDCFRSCTRSEKFSKYLDYIRQLTTPGNSKFRSRLILLVLDLKLNPLSSSAAYNAGADVARNLLDNYWQRGDSKARAYIVLSLETIAGAEFITGFKDTMKKEGFDEKYYDKIGWDFSGNEDLGKIRDVLESHGIREHIWQGDGITNCLPRDDNRLKQAISRRYSPTYVYADKVYTWSIDKESSIENALRLGVDGVMTNYPARVISVLGEREFSGKLRLATYDDNPWEK,
4,A0A0N7CSQ4,Scolopendra mutilans (Chinese red-headed centipede) (Scolopendra subspinipes mutilans),Tau-scoloptoxin(04)-Ssm1b (Tau-SLPTX(04)-Ssm1b) (Toxin RhTx2) [Cleaved into: Tau-scoloptoxin(04)-Ssm1a (Tau-SLPTX(04)-Ssm1a) (Toxin RhTx)],Scoloptoxin-04 family,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCNGVTCPSGYRCSIVDKQCIKKEK,
...,...,...,...,...,...,...
5034,W4VSI7,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-13,Neurotoxin 21 family,MKPTISILIFFALAVAIMGHRLNSGYGIPHIVEKLPNGQWCRTPGDDCSESKQCCKPEDTATYAHGCSQQWSGQRGELVKMCYICNKESSMC,
5035,W4VSI8,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-8,Neurotoxin 25 family,MMKLYSLVIIATLAAAAFAATSEEISAAVSEIISQHQEDLERYAKIVERGEEPKKYIRCSKQLGQSCYLNCECCGASAVCEDIKYICKDKVSDNSILDAMGKAWNAVGNSISRYYCSAE,
5036,W4VSI9,Trittame loki (Brush-footed trapdoor spider),U10-barytoxin-Tl1a (U10-BATX-Tl1a) (Toxin ICK-3),Neurotoxin 10 (Hwtx-1) family,MKTLVLVAVLGVASLYLLSSASEVQQLSPAEEEFRAFVSTFGGLFETEERGVDSEDCRAMFGGCGEDNDCCLHLGCKTTKLPPFANPYCAWDGTTGRK,
5037,X5IFY8,Conus geographus (Geography cone) (Nubecula geographus),Contryphan-G,Conotoxin O2 superfamily,MGKLTILVLVAAVLLSTQAMVQGDGDQPAARNAVPRDDNPDGPSAKFMNVQRRSGCPWEPWCG,


In [158]:
df.to_csv("data/foldseek/fident_matrix.csv", columns=["Entry", "Protein families", "Organism"], header=["identifier", "Protein families", "Organism"], index=False)

In [159]:
!protspace-json -i data/foldseek/fident_matrix.h5 -m data/foldseek/fident_matrix.csv -o data/foldseek/fident_matrix.json --methods pca2 pacmap2 umap2

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.

'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [126]:
!protspace --json data/foldseek/fident_matrix.json

Dash is running on http://127.0.0.1:8050/

INFO: Dash is running on http://127.0.0.1:8050/

 * Serving Flask app 'protspace.app'
 * Debug mode: on
^C


### add cluster labels

In [182]:
df = pd.read_csv('./data/model_input.csv', index_col=0)
df

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,Cluster,Cluster_Rep,Train_Cluster_Rep,Val_Cluster_Rep,Test_Cluster_Rep,protein_category
0,A0A0D4WV12,Sicarius terrosus (Cave spider),Dermonecrotic toxin StSicTox-betaIB1i (EC 4.6.1.-) (Phospholipase D) (PLD) (Sphingomyelin phosphodiesterase D) (SMD) (SMase D) (Sphingomyelinase D),Phospholipase family,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRTYHGVPCDCFRSCTRSEKFSKYLDYIRQLTTPGNSKFRSRLILLVLDLKLNPLSSSAAYNAGADVARNLLDNYWQRGDSKARAYIVLSLETIAGAEFITGFKDTMKKEGFDEKYYDKIGWDFSGNEDLGKIRDVLESHGIREHIWQGDGITNCLPRDDNRLKQAISRRYSPTYVYADKVYTWSIDKESSIENALRLGVDGVMTNYPARVISVLGEREFSGKLRLATYDDNPWEK,mature_seq (NO_SP),8,True,True,False,False,other
1,A0A0N7CSQ4,Scolopendra mutilans (Chinese red-headed centipede) (Scolopendra subspinipes mutilans),Tau-scoloptoxin(04)-Ssm1b (Tau-SLPTX(04)-Ssm1b) (Toxin RhTx2) [Cleaved into: Tau-scoloptoxin(04)-Ssm1a (Tau-SLPTX(04)-Ssm1a) (Toxin RhTx)],Scoloptoxin family,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCNGVTCPSGYRCSIVDKQCIKKEK,full_seq (SP),12,True,True,False,False,scoloptoxin
2,A0A1L4BJ98,Hemiscorpius lepturus (Scorpion),Dermonecrotic toxin Hl-PLD1 (EC 4.6.1.-) (Phospholipase D) (PLD) (Sphingomyelin phosphodiesterase D) (SMD) (SMase D) (Sphingomyelinase D),Phospholipase family,MAHCYYNSKRGCNRVMKTVALVVLISTVMVEESRGDSQEDKKRPIWNIGHMVNAVKQIEEFLDLGANALEADVTFDDNGNPKWTYHGTPCDCFRDCLRWEYVDEYLKRIRELTSPGSSKFRKGFILLMLDLKISKLSDNAKSKAGKEIADMIIKRLWSGSGEKAQLYIVLSFPYVNDIEFVRAFRERVKSKGFASEAEKRIGWDISGNEDLGKIRDAYQKLGITDNVWQSDGITNCLTRSHDRLAEAVCKRDSDKEWPSLKKVYYWTVDKQSSMKEALKVGVDGMITNDPDDLVAVLNEFSGTHRLANINDSPWQKIPRPKSNC,full_seq (SP),8,False,True,False,False,other
3,A0A5C2A2T2,Conus purpurascens (Purple cone),Conodipine-P1 (Cdpi-P1) (EC 3.1.1.4) (Phosphatidylcholine 2-acylhydrolase) (Phospholipase A2) (PLA2) [Cleaved into: Conodipine-P1 alpha subunit; Conodipine-P1 beta subunit],Phospholipase family,MKLLAPVLWAMAALGVTWLVAVDSKESCTKHSNGCSTPLRLPCQEYFRPACDIHDNCYHCGTIFGISRKECDDAFLKDMNTLCKKLGSNSATCPARGKREVTSHRATSIAHSRLWKTALDQKSFLNRKARQAILLTPNSCLYWANNFYMAVHVFGARSYSRTTDPKDCQGLKHCLPNH,mature_seq (NO_SP),35,False,True,False,False,phospholipase_a2
4,A0A6B7FMR5,Vipera ammodytes ammodytes (Western sand viper),Disintegrin-like/cysteine-rich protein MPIII-3 (D'C protein MPIII-3) (Metalloproteinase-like protein of class P-III MPIII-3) (Snake venom metalloproteinase precursor-derived protein MPIII-3) (SVMP precursor-derived protein MPIII-3) (Snake venom metalloproteinase-like) (SVMP-like) (Vaa-MPIII-3) (VaaMPIII-3),Venom metalloproteinase (M12B) family,MIQVLLVIICLAVFPYQVSSIILESGNINNYEVVYPQKVTALPKGAIQQLEQKYEDAMQYQFKVKGEPVVLHLEKNKDFFPEDYSETHYSPDDREITTNPPVEDHCYYYGHIQNDADSTASISACNGLKGYFTLRGVTYLIEPLKIPESEAHAIYKYENVEKEDEDPKKCEFRRAGTECRPARSECDVAEYCTGQSAECPTDVFHSNGKPCLNNFGYCYNGNCPIMYHQCYALFGPNATVGQDGCFEWNKKGESYFYCRKENDVPIPCAPEDIKCGRLFCELIKNTCKYDYSEDPDYGMVDHGTKCGDGKVCINRHCVDVTTAY,full_seq (SP),36,True,False,False,True,metalloproteinase
...,...,...,...,...,...,...,...,...,...,...,...,...
1040,W4VSB6,Conus victoriae (Queen Victoria cone),Conotoxin Vc7.1 (H_Vc7.1),Conotoxin family,MNTAGRLLLLCLALGLVFESLGIPVADDVEAVRDTDPDEKDPSVHNSLKAVYGDCGGERCRFGCCKTDDGEEKCQHFGCP,full_seq (SP),744,False,True,False,False,conotoxin
1041,W4VSB9,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-9,Neurotoxin family,MMKLYSLVIIATLAAAAFAATSEEISAAVSEIISQHQQDLERYAKIVERGEEPKKYIRCSKQLGEKCDLNCECCGAAAYCEDIVYICKEKISDNSILNAFGQAMTAMGNAVSRYYCDAE,full_seq (SP),19,False,True,False,False,neurotoxin
1042,W4VSG7,Conus victoriae (Queen Victoria cone),Conotoxin Vc1 (H_Vc1),Conotoxin family,MRTSGRLLLLCLAVGLLLESQAHPNADAGDATRDVGSDRTSVELSKMLKGWQAEKGQRKASAPKKFYVYPPVRRSFY,full_seq (SP),746,True,True,False,False,conotoxin
1043,W4VSI6,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-18,Neurotoxin family,MKTIFALVFCCAIAVVVLGFGENEGSTIDHDQNNCKGPGSRCSNKNECCKPKDMETYTYYCGSRWDSSSGDFVRKCVICNRESSMC,full_seq (SP),748,True,True,False,False,neurotoxin


In [183]:
clustering = pd.read_csv('./data/foldseek/tox_clusters/_cluster.tsv', sep="\t", header=None, names=["rep", "member"])
clustering

Unnamed: 0,rep,member
0,A0A023VZM6,A0A023VZM6
1,A0A023VZM6,A0A023W0U0
2,A0A023VZM6,Q2I2P4
3,A0A023VZM6,I6S7G5
4,A0A023VZM6,P0DQA6
...,...,...
5034,B6DD34,B6DCY1
5035,B6DD34,B6DD60
5036,B6DD34,B6DD62
5037,B6DD34,Q5Y4V9


In [184]:
clustering["rep"].value_counts()

rep
P01385    458
P00618    280
P01140    254
D2Y1Y6    238
B1P1H6    231
         ... 
P0DJL6      1
P0DJK7      1
P0DJK4      1
P0CV87      1
C0HJA8      1
Name: count, Length: 562, dtype: int64

In [185]:
cluster_map = dict(zip(clustering["member"], clustering["rep"]))
cluster_map

{'A0A023VZM6': 'A0A023VZM6',
 'A0A023W0U0': 'A0A023VZM6',
 'Q2I2P4': 'A0A023VZM6',
 'I6S7G5': 'A0A023VZM6',
 'P0DQA6': 'A0A023VZM6',
 'P0DQA7': 'A0A023VZM6',
 'P0DQA8': 'A0A023VZM6',
 'A0A023W082': 'A0A023W082',
 'A0A023W123': 'A0A023W082',
 'A0A023W090': 'A0A023W082',
 'A0A023W0V6': 'A0A023W0V6',
 'A0A023W0W9': 'A0A023W0W9',
 'A0A023W0B6': 'A0A023W0W9',
 'A0A023W0V9': 'A0A023W0W9',
 'A0A023W157': 'A0A023W0W9',
 'A0A023W163': 'A0A023W0W9',
 'A0A023W168': 'A0A023W0W9',
 'P84715': 'A0A023W0W9',
 'Q2PE51': 'A0A023W0W9',
 'A0A023W140': 'A0A023W140',
 'A0A125S9E1': 'A0A125S9E1',
 'A0A142C197': 'A0A142C197',
 'A0A2I6EDN2': 'A0A2I6EDN2',
 'P15471': 'A0A2I6EDN2',
 'C0HKF6': 'A0A2I6EDN2',
 'P0C8V1': 'A0A2I6EDN2',
 'A0A346CED7': 'A0A346CED7',
 'A0A346CIA0': 'A0A346CIA0',
 'A0A346CIB3': 'A0A346CIA0',
 'A0S864': 'A0A346CIA0',
 'A0S865': 'A0A346CIA0',
 'A7X3S0': 'A0A346CIA0',
 'A7X3S2': 'A0A346CIA0',
 'A7X3V0': 'A0A346CIA0',
 'Q06ZW0': 'A0A346CIA0',
 'A0A346CIB2': 'A0A346CIA0',
 'A0A3G1VU81': 'A0A3

In [186]:
df["Cluster ID"] = df["Entry"].map(cluster_map)
df

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,Cluster,Cluster_Rep,Train_Cluster_Rep,Val_Cluster_Rep,Test_Cluster_Rep,protein_category,Cluster ID
0,A0A0D4WV12,Sicarius terrosus (Cave spider),Dermonecrotic toxin StSicTox-betaIB1i (EC 4.6.1.-) (Phospholipase D) (PLD) (Sphingomyelin phosphodiesterase D) (SMD) (SMase D) (Sphingomyelinase D),Phospholipase family,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRTYHGVPCDCFRSCTRSEKFSKYLDYIRQLTTPGNSKFRSRLILLVLDLKLNPLSSSAAYNAGADVARNLLDNYWQRGDSKARAYIVLSLETIAGAEFITGFKDTMKKEGFDEKYYDKIGWDFSGNEDLGKIRDVLESHGIREHIWQGDGITNCLPRDDNRLKQAISRRYSPTYVYADKVYTWSIDKESSIENALRLGVDGVMTNYPARVISVLGEREFSGKLRLATYDDNPWEK,mature_seq (NO_SP),8,True,True,False,False,other,B2KKW0
1,A0A0N7CSQ4,Scolopendra mutilans (Chinese red-headed centipede) (Scolopendra subspinipes mutilans),Tau-scoloptoxin(04)-Ssm1b (Tau-SLPTX(04)-Ssm1b) (Toxin RhTx2) [Cleaved into: Tau-scoloptoxin(04)-Ssm1a (Tau-SLPTX(04)-Ssm1a) (Toxin RhTx)],Scoloptoxin family,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCNGVTCPSGYRCSIVDKQCIKKEK,full_seq (SP),12,True,True,False,False,scoloptoxin,P0DM27
2,A0A1L4BJ98,Hemiscorpius lepturus (Scorpion),Dermonecrotic toxin Hl-PLD1 (EC 4.6.1.-) (Phospholipase D) (PLD) (Sphingomyelin phosphodiesterase D) (SMD) (SMase D) (Sphingomyelinase D),Phospholipase family,MAHCYYNSKRGCNRVMKTVALVVLISTVMVEESRGDSQEDKKRPIWNIGHMVNAVKQIEEFLDLGANALEADVTFDDNGNPKWTYHGTPCDCFRDCLRWEYVDEYLKRIRELTSPGSSKFRKGFILLMLDLKISKLSDNAKSKAGKEIADMIIKRLWSGSGEKAQLYIVLSFPYVNDIEFVRAFRERVKSKGFASEAEKRIGWDISGNEDLGKIRDAYQKLGITDNVWQSDGITNCLTRSHDRLAEAVCKRDSDKEWPSLKKVYYWTVDKQSSMKEALKVGVDGMITNDPDDLVAVLNEFSGTHRLANINDSPWQKIPRPKSNC,full_seq (SP),8,False,True,False,False,other,B2KKW0
3,A0A5C2A2T2,Conus purpurascens (Purple cone),Conodipine-P1 (Cdpi-P1) (EC 3.1.1.4) (Phosphatidylcholine 2-acylhydrolase) (Phospholipase A2) (PLA2) [Cleaved into: Conodipine-P1 alpha subunit; Conodipine-P1 beta subunit],Phospholipase family,MKLLAPVLWAMAALGVTWLVAVDSKESCTKHSNGCSTPLRLPCQEYFRPACDIHDNCYHCGTIFGISRKECDDAFLKDMNTLCKKLGSNSATCPARGKREVTSHRATSIAHSRLWKTALDQKSFLNRKARQAILLTPNSCLYWANNFYMAVHVFGARSYSRTTDPKDCQGLKHCLPNH,mature_seq (NO_SP),35,False,True,False,False,phospholipase_a2,P86976
4,A0A6B7FMR5,Vipera ammodytes ammodytes (Western sand viper),Disintegrin-like/cysteine-rich protein MPIII-3 (D'C protein MPIII-3) (Metalloproteinase-like protein of class P-III MPIII-3) (Snake venom metalloproteinase precursor-derived protein MPIII-3) (SVMP precursor-derived protein MPIII-3) (Snake venom metalloproteinase-like) (SVMP-like) (Vaa-MPIII-3) (VaaMPIII-3),Venom metalloproteinase (M12B) family,MIQVLLVIICLAVFPYQVSSIILESGNINNYEVVYPQKVTALPKGAIQQLEQKYEDAMQYQFKVKGEPVVLHLEKNKDFFPEDYSETHYSPDDREITTNPPVEDHCYYYGHIQNDADSTASISACNGLKGYFTLRGVTYLIEPLKIPESEAHAIYKYENVEKEDEDPKKCEFRRAGTECRPARSECDVAEYCTGQSAECPTDVFHSNGKPCLNNFGYCYNGNCPIMYHQCYALFGPNATVGQDGCFEWNKKGESYFYCRKENDVPIPCAPEDIKCGRLFCELIKNTCKYDYSEDPDYGMVDHGTKCGDGKVCINRHCVDVTTAY,full_seq (SP),36,True,False,False,True,metalloproteinase,U6BLN5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1040,W4VSB6,Conus victoriae (Queen Victoria cone),Conotoxin Vc7.1 (H_Vc7.1),Conotoxin family,MNTAGRLLLLCLALGLVFESLGIPVADDVEAVRDTDPDEKDPSVHNSLKAVYGDCGGERCRFGCCKTDDGEEKCQHFGCP,full_seq (SP),744,False,True,False,False,conotoxin,P0DRA3
1041,W4VSB9,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-9,Neurotoxin family,MMKLYSLVIIATLAAAAFAATSEEISAAVSEIISQHQQDLERYAKIVERGEEPKKYIRCSKQLGEKCDLNCECCGAAAYCEDIVYICKEKISDNSILNAFGQAMTAMGNAVSRYYCDAE,full_seq (SP),19,False,True,False,False,neurotoxin,P0DKY9
1042,W4VSG7,Conus victoriae (Queen Victoria cone),Conotoxin Vc1 (H_Vc1),Conotoxin family,MRTSGRLLLLCLAVGLLLESQAHPNADAGDATRDVGSDRTSVELSKMLKGWQAEKGQRKASAPKKFYVYPPVRRSFY,full_seq (SP),746,True,True,False,False,conotoxin,W4VSG7
1043,W4VSI6,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-18,Neurotoxin family,MKTIFALVFCCAIAVVVLGFGENEGSTIDHDQNNCKGPGSRCSNKNECCKPKDMETYTYYCGSRWDSSSGDFVRKCVICNRESSMC,full_seq (SP),748,True,True,False,False,neurotoxin,B1P1H6


In [187]:
df["Cluster ID"].value_counts()

Cluster ID
P01140    40
D2Y1Y6    40
B1P1H6    38
P01385    34
P60980    33
          ..
P0DJK4     1
P0DJK3     1
P0DPM0     1
P0CJ36     1
V9ISG0     1
Name: count, Length: 405, dtype: int64

In [188]:
cluster_counts = df["Cluster ID"].value_counts()

df.loc[df["Cluster ID"].isin(cluster_counts[cluster_counts < 10].index), "Cluster ID"] = "other"
df

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,Cluster,Cluster_Rep,Train_Cluster_Rep,Val_Cluster_Rep,Test_Cluster_Rep,protein_category,Cluster ID
0,A0A0D4WV12,Sicarius terrosus (Cave spider),Dermonecrotic toxin StSicTox-betaIB1i (EC 4.6.1.-) (Phospholipase D) (PLD) (Sphingomyelin phosphodiesterase D) (SMD) (SMase D) (Sphingomyelinase D),Phospholipase family,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRTYHGVPCDCFRSCTRSEKFSKYLDYIRQLTTPGNSKFRSRLILLVLDLKLNPLSSSAAYNAGADVARNLLDNYWQRGDSKARAYIVLSLETIAGAEFITGFKDTMKKEGFDEKYYDKIGWDFSGNEDLGKIRDVLESHGIREHIWQGDGITNCLPRDDNRLKQAISRRYSPTYVYADKVYTWSIDKESSIENALRLGVDGVMTNYPARVISVLGEREFSGKLRLATYDDNPWEK,mature_seq (NO_SP),8,True,True,False,False,other,other
1,A0A0N7CSQ4,Scolopendra mutilans (Chinese red-headed centipede) (Scolopendra subspinipes mutilans),Tau-scoloptoxin(04)-Ssm1b (Tau-SLPTX(04)-Ssm1b) (Toxin RhTx2) [Cleaved into: Tau-scoloptoxin(04)-Ssm1a (Tau-SLPTX(04)-Ssm1a) (Toxin RhTx)],Scoloptoxin family,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCNGVTCPSGYRCSIVDKQCIKKEK,full_seq (SP),12,True,True,False,False,scoloptoxin,other
2,A0A1L4BJ98,Hemiscorpius lepturus (Scorpion),Dermonecrotic toxin Hl-PLD1 (EC 4.6.1.-) (Phospholipase D) (PLD) (Sphingomyelin phosphodiesterase D) (SMD) (SMase D) (Sphingomyelinase D),Phospholipase family,MAHCYYNSKRGCNRVMKTVALVVLISTVMVEESRGDSQEDKKRPIWNIGHMVNAVKQIEEFLDLGANALEADVTFDDNGNPKWTYHGTPCDCFRDCLRWEYVDEYLKRIRELTSPGSSKFRKGFILLMLDLKISKLSDNAKSKAGKEIADMIIKRLWSGSGEKAQLYIVLSFPYVNDIEFVRAFRERVKSKGFASEAEKRIGWDISGNEDLGKIRDAYQKLGITDNVWQSDGITNCLTRSHDRLAEAVCKRDSDKEWPSLKKVYYWTVDKQSSMKEALKVGVDGMITNDPDDLVAVLNEFSGTHRLANINDSPWQKIPRPKSNC,full_seq (SP),8,False,True,False,False,other,other
3,A0A5C2A2T2,Conus purpurascens (Purple cone),Conodipine-P1 (Cdpi-P1) (EC 3.1.1.4) (Phosphatidylcholine 2-acylhydrolase) (Phospholipase A2) (PLA2) [Cleaved into: Conodipine-P1 alpha subunit; Conodipine-P1 beta subunit],Phospholipase family,MKLLAPVLWAMAALGVTWLVAVDSKESCTKHSNGCSTPLRLPCQEYFRPACDIHDNCYHCGTIFGISRKECDDAFLKDMNTLCKKLGSNSATCPARGKREVTSHRATSIAHSRLWKTALDQKSFLNRKARQAILLTPNSCLYWANNFYMAVHVFGARSYSRTTDPKDCQGLKHCLPNH,mature_seq (NO_SP),35,False,True,False,False,phospholipase_a2,other
4,A0A6B7FMR5,Vipera ammodytes ammodytes (Western sand viper),Disintegrin-like/cysteine-rich protein MPIII-3 (D'C protein MPIII-3) (Metalloproteinase-like protein of class P-III MPIII-3) (Snake venom metalloproteinase precursor-derived protein MPIII-3) (SVMP precursor-derived protein MPIII-3) (Snake venom metalloproteinase-like) (SVMP-like) (Vaa-MPIII-3) (VaaMPIII-3),Venom metalloproteinase (M12B) family,MIQVLLVIICLAVFPYQVSSIILESGNINNYEVVYPQKVTALPKGAIQQLEQKYEDAMQYQFKVKGEPVVLHLEKNKDFFPEDYSETHYSPDDREITTNPPVEDHCYYYGHIQNDADSTASISACNGLKGYFTLRGVTYLIEPLKIPESEAHAIYKYENVEKEDEDPKKCEFRRAGTECRPARSECDVAEYCTGQSAECPTDVFHSNGKPCLNNFGYCYNGNCPIMYHQCYALFGPNATVGQDGCFEWNKKGESYFYCRKENDVPIPCAPEDIKCGRLFCELIKNTCKYDYSEDPDYGMVDHGTKCGDGKVCINRHCVDVTTAY,full_seq (SP),36,True,False,False,True,metalloproteinase,U6BLN5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1040,W4VSB6,Conus victoriae (Queen Victoria cone),Conotoxin Vc7.1 (H_Vc7.1),Conotoxin family,MNTAGRLLLLCLALGLVFESLGIPVADDVEAVRDTDPDEKDPSVHNSLKAVYGDCGGERCRFGCCKTDDGEEKCQHFGCP,full_seq (SP),744,False,True,False,False,conotoxin,other
1041,W4VSB9,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-9,Neurotoxin family,MMKLYSLVIIATLAAAAFAATSEEISAAVSEIISQHQQDLERYAKIVERGEEPKKYIRCSKQLGEKCDLNCECCGAAAYCEDIVYICKEKISDNSILNAFGQAMTAMGNAVSRYYCDAE,full_seq (SP),19,False,True,False,False,neurotoxin,other
1042,W4VSG7,Conus victoriae (Queen Victoria cone),Conotoxin Vc1 (H_Vc1),Conotoxin family,MRTSGRLLLLCLAVGLLLESQAHPNADAGDATRDVGSDRTSVELSKMLKGWQAEKGQRKASAPKKFYVYPPVRRSFY,full_seq (SP),746,True,True,False,False,conotoxin,other
1043,W4VSI6,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-18,Neurotoxin family,MKTIFALVFCCAIAVVVLGFGENEGSTIDHDQNNCKGPGSRCSNKNECCKPKDMETYTYYCGSRWDSSSGDFVRKCVICNRESSMC,full_seq (SP),748,True,True,False,False,neurotoxin,B1P1H6


In [189]:
df["Cluster ID"].value_counts()

Cluster ID
other         640
D2Y1Y6         40
P01140         40
B1P1H6         38
P01385         34
P60980         33
P13487         23
Q2XXR7         22
B3EWF2         20
P15968         18
A0A346CIA0     16
B3EWN2         14
O76963         14
U6BLN5         13
P0C612         13
P0DQX1         12
P61791         12
P0DM68         12
D2Y2R1         11
A0A3S9V8K6     10
R4GUQ3         10
Name: count, dtype: int64

In [191]:
write_fasta(df, "./data/model_input.fasta")

In [190]:
df.to_csv("data/model_input.csv", index=False)