In [197]:
import pandas as pd
from Bio import SeqIO
import numpy as np
from sklearn.utils import shuffle

In [198]:
df = pd.read_csv('data/toxins1.tsv', sep='\t')
df

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment
0,A0A068B6Q6,Conus betulinus (Beech cone),Conotoxin Bt1.8,Conotoxin A superfamily,PDGRNAAAKAFDLITPTVRKGCCSNPACILNNPNQCG,fragment
1,A0A0B4U9L8,Vipera ammodytes ammodytes (Western sand viper),Zinc metalloproteinase-disintegrin-like protei...,"Venom metalloproteinase (M12B) family, P-III s...",MLQVLLVTICLAVFPYQGSSIILESGNVNDYEVVYPQKLTALLKGA...,
2,A0A0B5A8P4,Conus geographus (Geography cone) (Nubecula ge...,Con-Ins G3 (Insulin 3) [Cleaved into: Con-Ins ...,Insulin family,MTTSFYFLLVALGLLLYVCQSSFGNQHTRNSDTPKHRCGSELADQY...,
3,A0A0B5AC95,Conus geographus (Geography cone) (Nubecula ge...,Con-Ins G1a (Insulin 1) [Cleaved into: Con-Ins...,Insulin family,MTTSSYFLLMALGLLLYVCQSSFGNQHTRTFDTPKHRCGSEITNSY...,
4,A0A0D4WTV1,Loxosceles arizonica (Arizona brown spider),Dermonecrotic toxin LarSicTox-betaID1 (EC 4.6....,"Arthropod phospholipase D family, Class II sub...",EGAEQDGSERTDGGRPIWNIAHMVNNKQAIDKYLDKGANSVESDVS...,fragment
...,...,...,...,...,...,...
6423,P0CI18,Hastula hectica (Sea snail) (Impages hectica),Augerpeptide hhe6.3,,VLFTPPELLGCGNRCSDDCCKWGRCQPGCTD,
6424,P0CI19,Hastula hectica (Sea snail) (Impages hectica),Augerpeptide hhe6.4,,SSLHCGDDPWCPTGCCENEDCDIGCKRDWEKSRSQP,
6425,P0CI21,Hastula hectica (Sea snail) (Impages hectica),Augerpeptide hhe53,,GLSQSGCQAFTGRWCVGCERLRSRVVWECSPKRVVNSI,
6426,Q45RU8,Conus striatus (Striated cone),Conotoxin S5.1,,HNDLIRAGLTVCLSENRKRLTCSGLLNMAGSVCCKVDTSCCSSQ,fragment


In [199]:
df['Protein families'] = df['Protein families'].str.split(',').str[0]
df['Protein families'] = df['Protein families'].str.split(';').str[0]

In [200]:
filtered = df[df['Protein families'].str.contains('superfamily', na=False)]['Protein families'].unique()
filtered

array(['Conotoxin A superfamily', 'Conotoxin O1 superfamily',
       'Conotoxin M superfamily',
       'Long (4 C-C) scorpion toxin superfamily',
       'Short scorpion toxin superfamily', 'Conotoxin B superfamily',
       'Formicidae venom precursor-01 superfamily',
       'Conotoxin D superfamily', 'O2 superfamily',
       'AB hydrolase superfamily', 'Conotoxin T superfamily',
       'Non-disulfide-bridged peptide (NDBP) superfamily',
       'Long (3 C-C) scorpion toxin superfamily',
       'Conotoxin J superfamily', 'Conotoxin O2 superfamily',
       'Conotoxin S superfamily', 'Conotoxin C superfamily',
       'Conotoxin L superfamily', 'Conotoxin I2 superfamily',
       'Conotoxin I1 superfamily', 'Conotoxin P superfamily',
       'Conotoxin U superfamily', 'Conotoxin V superfamily',
       'Conotoxin Y superfamily', 'Conotoxin K superfamily',
       'Conotoxin I3 superfamily', 'Pg turripeptide superfamily',
       'Conopeptide P-like superfamily', 'O1 superfamily',
       'Conotox

In [201]:
df["Protein families"].value_counts()

Protein families
Snake three-finger toxin family            532
Long (4 C-C) scorpion toxin superfamily    418
Phospholipase A2 family                    414
Neurotoxin 10 (Hwtx-1) family              289
Venom metalloproteinase (M12B) family      281
                                          ... 
Scoloptoxin family                           1
Scolopendra neurotoxin 8 family              1
Scolopendra toxin 6 family                   1
Scolopendra toxin 7 family                   1
SLPTX(15) family                             1
Name: count, Length: 235, dtype: int64

In [202]:
df[df["Protein families"].str.contains("phospholipase", case=False, na=False)]["Protein families"].value_counts()

Protein families
Phospholipase A2 family             414
Arthropod phospholipase D family    217
Phospholipase B-like family           2
Name: count, dtype: int64

In [203]:
df['Protein families'] = df['Protein families'].replace('I1 superfamily', 'Conotoxin I1 superfamily')
df['Protein families'] = df['Protein families'].replace('O1 superfamily', 'Conotoxin O1 superfamily')
df['Protein families'] = df['Protein families'].replace('O2 superfamily', 'Conotoxin O2 superfamily')
df['Protein families'] = df['Protein families'].replace('E superfamily', 'Conotoxin E superfamily')
df['Protein families'] = df['Protein families'].replace('F superfamily', 'Conotoxin F superfamily')
df['Protein families'] = df['Protein families'].replace('Conotoxin M family', 'Conotoxin M superfamily')
df['Protein families'] = df['Protein families'].replace('Conotoxin B2 family', 'Conotoxin B2 superfamily')
df['Protein families'] = df['Protein families'].replace('Conotoxin O1 family', 'Conotoxin O1 superfamily')
df['Protein families'] = df['Protein families'].replace('Conotoxin O2 family', 'Conotoxin O2 superfamily')

In [204]:
mapping = {
    r'Conotoxin.*': 'Conotoxin family',
    r'Neurotoxin.*': 'Neurotoxin family',
    r'Scoloptoxin.*|Scolopendra.*': 'Scoloptoxin family',
    r'Caterpillar.*': 'Caterpillar family',
    r'Teretoxin.*': 'Teretoxin family',
    r'Limacoditoxin.*': 'Limacoditoxin family',
    r'Scutigerotoxin.*': 'Scutigerotoxin family',
    r'Cationic peptide.*': 'Cationic peptide family',
    r'Formicidae venom.*': 'Formicidae venom family',
    r'Bradykinin-potentiating peptide family|Natriuretic peptide family': 'Natriuretic, Bradykinin potentiating peptide family',
    r'.*phospholipase.*|.*Phospholipase.*': 'Phospholipase family'
}

# Apply mapping
for pattern, replacement in mapping.items():
    df['Protein families'] = df['Protein families'].str.replace(pattern, replacement, regex=True)

df['Protein families'].value_counts()

Protein families
Conotoxin family                           1131
Neurotoxin family                          1015
Phospholipase family                        633
Snake three-finger toxin family             532
Long (4 C-C) scorpion toxin superfamily     418
                                           ... 
DNase II family                               1
5'-nucleotidase family                        1
Diuretic hormone class 2 family               1
CART family                                   1
SLPTX(15) family                              1
Name: count, Length: 97, dtype: int64

In [205]:
# just some MSA shenanigans
# filtered_df = df[df["Protein families"].str.contains("Neurotoxin", na=False, case=False)]
#
# with open("/Users/selin/Desktop/filtered_neurotoxins.fasta", "w") as fasta_file:
#     for _, row in filtered_df.iterrows():
#         fasta_file.write(f">{row['Entry']}\t{row["Protein families"]}\n{row['Sequence']}\n")
#
#
# filtered_df = df[df["Protein families"].str.contains("Conotoxin", na=False, case=False)]
#
# with open("/Users/selin/Desktop/filtered_conotoxins.fasta", "w") as fasta_file:
#     for _, row in filtered_df.iterrows():
#         fasta_file.write(f">{row['Entry']}\t{row["Protein families"]}\n{row['Sequence']}\n")

In [206]:
df["Protein families"]

0                            Conotoxin family
1       Venom metalloproteinase (M12B) family
2                              Insulin family
3                              Insulin family
4                        Phospholipase family
                        ...                  
6423                                      NaN
6424                                      NaN
6425                                      NaN
6426                                      NaN
6427                                      NaN
Name: Protein families, Length: 6428, dtype: object

In [151]:
scolop = df[df["Entry"].isin(["I6R1R5", "P0DPX5", "P0DPX9", "P0DPY0", "P0DPY1", "P0DPX7", "P0DPX8"])]
scolop

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment
2440,I6R1R5,Scolopendra mutilans (Chinese red-headed centi...,Omega-scoloptoxin(05)-Ssm1a (Omega-SLPTX(05)-S...,Scoloptoxin family,MPSLCIIALFGTLTFYTLIPSIHTLKCVRCDGPMSNYDCKTTYPAA...,
5337,P0DPX5,Scolopendra alternans (Florida Keys giant cent...,U-scoloptoxin(05)-Sa1a (U-SLPTX(05)-Sa1a),Scoloptoxin family,MPSLCIIALFGTLTFYTLIPSIHTLKCVICDSPMGNYDCKTTYPAA...,
5339,P0DPX7,Scolopendra morsitans (Tanzanian blue ringleg ...,U-scoloptoxin(05)-Sm1a (U-SLPTX(05)-Sm1a),Scoloptoxin family,MNVLYTKIFFILILTRTSSALKCVQCFTGYGNIVQDCNKTSYSEPK...,
5340,P0DPX8,Ethmostigmus rubripes (Giant centipede),U-scoloptoxin(05)-Er1a (U-SLPTX(05)-Er1a),Scoloptoxin family,MLSLGVSIFLLVFLIPENSGLECYQCTWMKNSQSPDNCYKNLPNAT...,
5341,P0DPX9,Ethmostigmus rubripes (Giant centipede),U-scoloptoxin(05)-Er2a (U-SLPTX(05)-Er2a),Scoloptoxin family,MTFVVAAVVLLTVVPLATPLKCVQCDGPLTEFDCKTTVPEAKDCPQ...,
5342,P0DPY0,Ethmostigmus rubripes (Giant centipede),U-scoloptoxin(05)-Er3a (U-SLPTX(05)-Er3a),Scoloptoxin family,MRSWFVFVALLAVVFLPSSLDALKCIQCDSQPNRDECKTTLPEARD...,
5343,P0DPY1,Cormocephalus westwoodi (Westwood's green cent...,U-scoloptoxin(05)-Cw1a (U-SLPTX(05)-Cw1a),Scoloptoxin family,MNPLNLSTFIVFTLFAASATTALTCFQCTTSEGSDYCVSSFPKPSQ...,


In [207]:
df = df.loc[df['Fragment'] != "fragment"]
df = df.loc[df['Protein families'].notnull()]
df

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment
1,A0A0B4U9L8,Vipera ammodytes ammodytes (Western sand viper),Zinc metalloproteinase-disintegrin-like protei...,Venom metalloproteinase (M12B) family,MLQVLLVTICLAVFPYQGSSIILESGNVNDYEVVYPQKLTALLKGA...,
2,A0A0B5A8P4,Conus geographus (Geography cone) (Nubecula ge...,Con-Ins G3 (Insulin 3) [Cleaved into: Con-Ins ...,Insulin family,MTTSFYFLLVALGLLLYVCQSSFGNQHTRNSDTPKHRCGSELADQY...,
3,A0A0B5AC95,Conus geographus (Geography cone) (Nubecula ge...,Con-Ins G1a (Insulin 1) [Cleaved into: Con-Ins...,Insulin family,MTTSSYFLLMALGLLLYVCQSSFGNQHTRTFDTPKHRCGSEITNSY...,
5,A0A0D4WV12,Sicarius terrosus (Cave spider),Dermonecrotic toxin StSicTox-betaIB1i (EC 4.6....,Phospholipase family,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRT...,
6,A0A0N7CSQ4,Scolopendra mutilans (Chinese red-headed centi...,Tau-scoloptoxin(04)-Ssm1b (Tau-SLPTX(04)-Ssm1b...,Scoloptoxin family,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCN...,
...,...,...,...,...,...,...
6405,W4VSI7,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-13,Neurotoxin family,MKPTISILIFFALAVAIMGHRLNSGYGIPHIVEKLPNGQWCRTPGD...,
6406,W4VSI8,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-8,Neurotoxin family,MMKLYSLVIIATLAAAAFAATSEEISAAVSEIISQHQEDLERYAKI...,
6407,W4VSI9,Trittame loki (Brush-footed trapdoor spider),U10-barytoxin-Tl1a (U10-BATX-Tl1a) (Toxin ICK-3),Neurotoxin family,MKTLVLVAVLGVASLYLLSSASEVQQLSPAEEEFRAFVSTFGGLFE...,
6408,X5IFY8,Conus geographus (Geography cone) (Nubecula ge...,Contryphan-G,Conotoxin family,MGKLTILVLVAAVLLSTQAMVQGDGDQPAARNAVPRDDNPDGPSAK...,


In [208]:
df["Protein families"].value_counts()

Protein families
Conotoxin family                           989
Neurotoxin family                          959
Snake three-finger toxin family            485
Long (4 C-C) scorpion toxin superfamily    337
Phospholipase family                       319
                                          ... 
DNase II family                              1
Type-B carboxylesterase/lipase family        1
Diuretic hormone class 2 family              1
CART family                                  1
SLPTX(15) family                             1
Name: count, Length: 95, dtype: int64

In [157]:
#df.to_csv('data/foldseek/toxins.csv', index=False)

### back to clustering and redundancy reduction

In [38]:
def write_fasta(df, filename):
    """Writes a DataFrame to a FASTA file."""
    with open(filename, "w") as f:
        for _, row in df.iterrows():
            f.write(f">{row['Entry']}\n{row['Sequence']}\n")

write_fasta(df, "data/toxins.fasta")

In [39]:
!mmseqs easy-cluster data/toxins.fasta data/mmseqs_0.3/toxins_cluster data/mmseqs_0.3/tmp --min-seq-id 0.3

easy-cluster data/toxins.fasta data/mmseqs_0.3/toxins_cluster data/mmseqs_0.3/tmp --min-seq-id 0.3 

MMseqs Version:                     	17.b804f
Substitution matrix                 	aa:blosum62.out,nucl:nucleotide.out
Seed substitution matrix            	aa:VTML80.out,nucl:nucleotide.out
Sensitivity                         	4
k-mer length                        	0
Target search mode                  	0
k-score                             	seq:2147483647,prof:2147483647
Alphabet size                       	aa:21,nucl:5
Max sequence length                 	65535
Max results per query               	20
Split database                      	0
Split mode                          	2
Split memory limit                  	0
Coverage threshold                  	0.8
Coverage mode                       	0
Compositional bias                  	1
Compositional bias                  	1
Diagonal scoring                    	true
Exact k-mer matching                	0
Mask residues 

In [209]:
rep_data = []
with open("data/mmseqs_0.3/toxins_cluster_rep_seq.fasta") as f:
    for record in SeqIO.parse(f, "fasta"):
        rep_data.append([record.id, str(record.seq)])

toxins_rep = pd.DataFrame(rep_data, columns=["Entry", "Sequence"])

toxins_rep = toxins_rep.merge(df[["Entry", "Protein families"]], on="Entry", how="left")
toxins_rep

Unnamed: 0,Entry,Sequence,Protein families
0,Q2I2R5,MKLSVMFIVFLMLTMPMTCAGISRSATNGGEADVRAHDKAANLMAL...,Conotoxin family
1,Q3BER1,MIPVLLVTICLAVFPFQGSSIILESGNINDYEIVYPKKVAVLPTGA...,Disintegrin family
2,B1P1B4,MTRKILAVLLVFTLVACNNAEKYSETDVEDSPMIQERRCEPSGKPC...,Neurotoxin family
3,B1P1H0,MYDEILSAFFEVNDELQSEARCGEKNDRCKTNQDCCSGFRCTKFRR...,Neurotoxin family
4,B1P1H8,MIFLLPPVIFVMLLAESVLILGDSEDADLMEMVQMSRPFFNPIIPA...,Neurotoxin family
...,...,...,...
747,A0A7S8RGC8,MYNMVSLFIVAVLLLTYANVEGSDVTGGFPVNSNNCIYPCYSTQDE...,Long (4 C-C) scorpion toxin superfamily
748,A5X2W8,MKTLLLILGVVAFVYLEPGYTTNCFTCTTWTLSCREFEKCPPDKGT...,Snake three-finger toxin family
749,A6YR42,MKLTCVLVVLLLVLPFGDLITTSNTEDNKRGATPWQNSLKARGVCS...,Conotoxin family
750,A7X3W6,MGQFTVVSLGLLAMFLSLSGAKGDNCPASWISRNGVCNKLFPDRKT...,True venom lectin family


In [210]:
# toxins_rep["Protein families"].value_counts()

In [211]:
cluster_df = pd.read_csv("data/mmseqs_0.3/toxins_cluster_cluster.tsv", sep='\t', header=None, names=['Entry', 'cluster_member'])
cluster_df

Unnamed: 0,Entry,cluster_member
0,A0A0D4WV12,A0A0D4WV12
1,A0A0D4WV12,A0A1L4BJ98
2,A0A0D4WV12,E5D3Z8
3,A0A0D4WV12,Q8I914
4,A0A0D4WV12,Q1KY79
...,...,...
5034,Q9BP83,Q9BH84
5035,Q9BP83,Q9XYZ0
5036,Q9BP83,Q9BP99
5037,Q9BP83,Q9XZL2


### noise

In [212]:
print((cluster_df['Entry'].value_counts() != 1).sum())

403


In [213]:
cluster_df['Cluster'] = cluster_df['Entry'].rank(method='dense').astype(int) - 1
cluster_df

Unnamed: 0,Entry,cluster_member,Cluster
0,A0A0D4WV12,A0A0D4WV12,8
1,A0A0D4WV12,A0A1L4BJ98,8
2,A0A0D4WV12,E5D3Z8,8
3,A0A0D4WV12,Q8I914,8
4,A0A0D4WV12,Q1KY79,8
...,...,...,...
5034,Q9BP83,Q9BH84,731
5035,Q9BP83,Q9XYZ0,731
5036,Q9BP83,Q9BP99,731
5037,Q9BP83,Q9XZL2,731


In [214]:
df = df.merge(cluster_df[['cluster_member', 'Cluster']], left_on='Entry', right_on='cluster_member', how='left').drop(columns='cluster_member')
df

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,Cluster
0,A0A0B4U9L8,Vipera ammodytes ammodytes (Western sand viper),Zinc metalloproteinase-disintegrin-like protei...,Venom metalloproteinase (M12B) family,MLQVLLVTICLAVFPYQGSSIILESGNVNDYEVVYPQKLTALLKGA...,,60
1,A0A0B5A8P4,Conus geographus (Geography cone) (Nubecula ge...,Con-Ins G3 (Insulin 3) [Cleaved into: Con-Ins ...,Insulin family,MTTSFYFLLVALGLLLYVCQSSFGNQHTRNSDTPKHRCGSELADQY...,,7
2,A0A0B5AC95,Conus geographus (Geography cone) (Nubecula ge...,Con-Ins G1a (Insulin 1) [Cleaved into: Con-Ins...,Insulin family,MTTSSYFLLMALGLLLYVCQSSFGNQHTRTFDTPKHRCGSEITNSY...,,7
3,A0A0D4WV12,Sicarius terrosus (Cave spider),Dermonecrotic toxin StSicTox-betaIB1i (EC 4.6....,Phospholipase family,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRT...,,8
4,A0A0N7CSQ4,Scolopendra mutilans (Chinese red-headed centi...,Tau-scoloptoxin(04)-Ssm1b (Tau-SLPTX(04)-Ssm1b...,Scoloptoxin family,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCN...,,12
...,...,...,...,...,...,...,...
5034,W4VSI7,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-13,Neurotoxin family,MKPTISILIFFALAVAIMGHRLNSGYGIPHIVEKLPNGQWCRTPGD...,,748
5035,W4VSI8,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-8,Neurotoxin family,MMKLYSLVIIATLAAAAFAATSEEISAAVSEIISQHQEDLERYAKI...,,19
5036,W4VSI9,Trittame loki (Brush-footed trapdoor spider),U10-barytoxin-Tl1a (U10-BATX-Tl1a) (Toxin ICK-3),Neurotoxin family,MKTLVLVAVLGVASLYLLSSASEVQQLSPAEEEFRAFVSTFGGLFE...,,581
5037,X5IFY8,Conus geographus (Geography cone) (Nubecula ge...,Contryphan-G,Conotoxin family,MGKLTILVLVAAVLLSTQAMVQGDGDQPAARNAVPRDDNPDGPSAK...,,24


In [215]:
rep_seq_ids = {record.id for record in SeqIO.parse("data/mmseqs_0.3/toxins_cluster_rep_seq.fasta", "fasta")}
df['Cluster_Rep'] = df['Entry'].isin(rep_seq_ids)
df

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,Cluster,Cluster_Rep
0,A0A0B4U9L8,Vipera ammodytes ammodytes (Western sand viper),Zinc metalloproteinase-disintegrin-like protei...,Venom metalloproteinase (M12B) family,MLQVLLVTICLAVFPYQGSSIILESGNVNDYEVVYPQKLTALLKGA...,,60,False
1,A0A0B5A8P4,Conus geographus (Geography cone) (Nubecula ge...,Con-Ins G3 (Insulin 3) [Cleaved into: Con-Ins ...,Insulin family,MTTSFYFLLVALGLLLYVCQSSFGNQHTRNSDTPKHRCGSELADQY...,,7,False
2,A0A0B5AC95,Conus geographus (Geography cone) (Nubecula ge...,Con-Ins G1a (Insulin 1) [Cleaved into: Con-Ins...,Insulin family,MTTSSYFLLMALGLLLYVCQSSFGNQHTRTFDTPKHRCGSEITNSY...,,7,False
3,A0A0D4WV12,Sicarius terrosus (Cave spider),Dermonecrotic toxin StSicTox-betaIB1i (EC 4.6....,Phospholipase family,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRT...,,8,True
4,A0A0N7CSQ4,Scolopendra mutilans (Chinese red-headed centi...,Tau-scoloptoxin(04)-Ssm1b (Tau-SLPTX(04)-Ssm1b...,Scoloptoxin family,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCN...,,12,True
...,...,...,...,...,...,...,...,...
5034,W4VSI7,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-13,Neurotoxin family,MKPTISILIFFALAVAIMGHRLNSGYGIPHIVEKLPNGQWCRTPGD...,,748,False
5035,W4VSI8,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-8,Neurotoxin family,MMKLYSLVIIATLAAAAFAATSEEISAAVSEIISQHQEDLERYAKI...,,19,False
5036,W4VSI9,Trittame loki (Brush-footed trapdoor spider),U10-barytoxin-Tl1a (U10-BATX-Tl1a) (Toxin ICK-3),Neurotoxin family,MKTLVLVAVLGVASLYLLSSASEVQQLSPAEEEFRAFVSTFGGLFE...,,581,False
5037,X5IFY8,Conus geographus (Geography cone) (Nubecula ge...,Contryphan-G,Conotoxin family,MGKLTILVLVAAVLLSTQAMVQGDGDQPAARNAVPRDDNPDGPSAK...,,24,False


### Train-Val-Test sets with 70:15:15 split

In [216]:
# Get unique clusters and shuffle them
clusters = df['Cluster'].unique()
np.random.seed(42)
np.random.shuffle(clusters)

# Calculate total number of entries
total_entries = len(df)

# Define target sizes
train_target = 0.7 * total_entries
val_target = 0.15 * total_entries
test_target = 0.15 * total_entries

# Initialize cluster lists
train_clusters, val_clusters, test_clusters = [], [], []

# Initialize counters
train_count, val_count, test_count = 0, 0, 0

# Assign clusters while considering their sizes
for cluster in clusters:
    cluster_size = len(df[df['Cluster'] == cluster])

    if train_count + cluster_size <= train_target:
        train_clusters.append(cluster)
        train_count += cluster_size
    elif val_count + cluster_size <= val_target:
        val_clusters.append(cluster)
        val_count += cluster_size
    else:
        test_clusters.append(cluster)
        test_count += cluster_size

# Filter original df by cluster IDs
train_df = df[df['Cluster'].isin(train_clusters)]
val_df = df[df['Cluster'].isin(val_clusters)]
test_df = df[df['Cluster'].isin(test_clusters)]

# Shuffle each dataset
train_df = shuffle(train_df, random_state=42).reset_index(drop=True)
val_df = shuffle(val_df, random_state=42).reset_index(drop=True)
test_df = shuffle(test_df, random_state=42).reset_index(drop=True)

# Print results
print(f"Total entries: {total_entries}")
print(f"Train entries: {len(train_df)} ({len(train_df)/total_entries:.2%})")
print(f"Validation entries: {len(val_df)} ({len(val_df)/total_entries:.2%})")
print(f"Test entries: {len(test_df)} ({len(test_df)/total_entries:.2%})")

# Output dataframes
train_df, val_df, test_df

Total entries: 5039
Train entries: 3527 (69.99%)
Validation entries: 755 (14.98%)
Test entries: 757 (15.02%)


(           Entry                                           Organism  \
 0         Q4VM07  Macrovipera lebetinus (Levantine viper) (Viper...   
 1         B6DD05  Lycosa singoriensis (Wolf spider) (Aranea sing...   
 2         P0CI57      Lychas mucronatus (Chinese swimming scorpion)   
 3         P10116  Laticauda colubrina (Yellow-lipped sea krait) ...   
 4         Q1ELU5                       Lachesana tarabaevi (Spider)   
 ...          ...                                                ...   
 3522  A0A2P1BRQ0          Scorpaena plumieri (Spotted scorpionfish)   
 3523      C1IC52        Walterinnesia aegyptia (Desert black snake)   
 3524      P82942    Naja kaouthia (Monocled cobra) (Naja siamensis)   
 3525      W4VS70              Conus victoriae (Queen Victoria cone)   
 3526      Q3YEE2                    Conus capitaneus (Captain cone)   
 
                                           Protein names  \
 0     Zinc metalloproteinase-disintegrin-like VLAIP-...   
 1     U10-lyc

In [217]:
write_fasta(train_df, "data/toxins_train.fasta")
write_fasta(val_df, "data/toxins_val.fasta")
write_fasta(test_df, "data/toxins_test.fasta")

In [218]:
!mmseqs easy-cluster data/toxins_train.fasta data/mmseqs_0.5/train_cluster data/mmseqs_0.5/tmp --min-seq-id 0.5

easy-cluster data/toxins_train.fasta data/mmseqs_0.5/train_cluster data/mmseqs_0.5/tmp --min-seq-id 0.5 

MMseqs Version:                     	17.b804f
Substitution matrix                 	aa:blosum62.out,nucl:nucleotide.out
Seed substitution matrix            	aa:VTML80.out,nucl:nucleotide.out
Sensitivity                         	4
k-mer length                        	0
Target search mode                  	0
k-score                             	seq:2147483647,prof:2147483647
Alphabet size                       	aa:21,nucl:5
Max sequence length                 	65535
Max results per query               	20
Split database                      	0
Split mode                          	2
Split memory limit                  	0
Coverage threshold                  	0.8
Coverage mode                       	0
Compositional bias                  	1
Compositional bias                  	1
Diagonal scoring                    	true
Exact k-mer matching                	0
Mask resi

In [219]:
!mmseqs easy-cluster data/toxins_val.fasta data/mmseqs_0.5/val_cluster data/mmseqs_0.5/tmp --min-seq-id 0.5

easy-cluster data/toxins_val.fasta data/mmseqs_0.5/val_cluster data/mmseqs_0.5/tmp --min-seq-id 0.5 

MMseqs Version:                     	17.b804f
Substitution matrix                 	aa:blosum62.out,nucl:nucleotide.out
Seed substitution matrix            	aa:VTML80.out,nucl:nucleotide.out
Sensitivity                         	4
k-mer length                        	0
Target search mode                  	0
k-score                             	seq:2147483647,prof:2147483647
Alphabet size                       	aa:21,nucl:5
Max sequence length                 	65535
Max results per query               	20
Split database                      	0
Split mode                          	2
Split memory limit                  	0
Coverage threshold                  	0.8
Coverage mode                       	0
Compositional bias                  	1
Compositional bias                  	1
Diagonal scoring                    	true
Exact k-mer matching                	0
Mask residues

In [220]:
!mmseqs easy-cluster data/toxins_test.fasta data/mmseqs_0.5/test_cluster data/mmseqs_0.5/tmp --min-seq-id 0.5

easy-cluster data/toxins_test.fasta data/mmseqs_0.5/test_cluster data/mmseqs_0.5/tmp --min-seq-id 0.5 

MMseqs Version:                     	17.b804f
Substitution matrix                 	aa:blosum62.out,nucl:nucleotide.out
Seed substitution matrix            	aa:VTML80.out,nucl:nucleotide.out
Sensitivity                         	4
k-mer length                        	0
Target search mode                  	0
k-score                             	seq:2147483647,prof:2147483647
Alphabet size                       	aa:21,nucl:5
Max sequence length                 	65535
Max results per query               	20
Split database                      	0
Split mode                          	2
Split memory limit                  	0
Coverage threshold                  	0.8
Coverage mode                       	0
Compositional bias                  	1
Compositional bias                  	1
Diagonal scoring                    	true
Exact k-mer matching                	0
Mask residu

In [221]:
rep_seq_files = {
    "Train_Cluster_Rep": "data/mmseqs_0.5/train_cluster_rep_seq.fasta",
    "Val_Cluster_Rep": "data/mmseqs_0.5/val_cluster_rep_seq.fasta",
    "Test_Cluster_Rep": "data/mmseqs_0.5/test_cluster_rep_seq.fasta",
}

for col, file in rep_seq_files.items():
    rep_seq_ids = {record.id for record in SeqIO.parse(file, "fasta")}
    df[col] = df['Entry'].isin(rep_seq_ids)

df

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,Cluster,Cluster_Rep,Train_Cluster_Rep,Val_Cluster_Rep,Test_Cluster_Rep
0,A0A0B4U9L8,Vipera ammodytes ammodytes (Western sand viper),Zinc metalloproteinase-disintegrin-like protei...,Venom metalloproteinase (M12B) family,MLQVLLVTICLAVFPYQGSSIILESGNVNDYEVVYPQKLTALLKGA...,,60,False,False,False,False
1,A0A0B5A8P4,Conus geographus (Geography cone) (Nubecula ge...,Con-Ins G3 (Insulin 3) [Cleaved into: Con-Ins ...,Insulin family,MTTSFYFLLVALGLLLYVCQSSFGNQHTRNSDTPKHRCGSELADQY...,,7,False,False,False,False
2,A0A0B5AC95,Conus geographus (Geography cone) (Nubecula ge...,Con-Ins G1a (Insulin 1) [Cleaved into: Con-Ins...,Insulin family,MTTSSYFLLMALGLLLYVCQSSFGNQHTRTFDTPKHRCGSEITNSY...,,7,False,False,False,False
3,A0A0D4WV12,Sicarius terrosus (Cave spider),Dermonecrotic toxin StSicTox-betaIB1i (EC 4.6....,Phospholipase family,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRT...,,8,True,True,False,False
4,A0A0N7CSQ4,Scolopendra mutilans (Chinese red-headed centi...,Tau-scoloptoxin(04)-Ssm1b (Tau-SLPTX(04)-Ssm1b...,Scoloptoxin family,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCN...,,12,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
5034,W4VSI7,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-13,Neurotoxin family,MKPTISILIFFALAVAIMGHRLNSGYGIPHIVEKLPNGQWCRTPGD...,,748,False,False,False,False
5035,W4VSI8,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-8,Neurotoxin family,MMKLYSLVIIATLAAAAFAATSEEISAAVSEIISQHQEDLERYAKI...,,19,False,False,False,False
5036,W4VSI9,Trittame loki (Brush-footed trapdoor spider),U10-barytoxin-Tl1a (U10-BATX-Tl1a) (Toxin ICK-3),Neurotoxin family,MKTLVLVAVLGVASLYLLSSASEVQQLSPAEEEFRAFVSTFGGLFE...,,581,False,True,False,False
5037,X5IFY8,Conus geographus (Geography cone) (Nubecula ge...,Contryphan-G,Conotoxin family,MGKLTILVLVAAVLLSTQAMVQGDGDQPAARNAVPRDDNPDGPSAK...,,24,False,False,False,False


In [222]:
all_50_pct_seq = (
    df["Train_Cluster_Rep"].sum().item() +
    df["Val_Cluster_Rep"].sum().item() +
    df["Test_Cluster_Rep"].sum().item()
)
print("All representative sequences after 50% redundancy reduction:", all_50_pct_seq)
print(f"Train entries: {df["Train_Cluster_Rep"].sum().item()} ({df["Train_Cluster_Rep"].sum().item()/all_50_pct_seq:.2%})")

print(f"Validation entries: {df["Val_Cluster_Rep"].sum().item()} ({df["Val_Cluster_Rep"].sum().item()/all_50_pct_seq:.2%})")
print(f"Test entries: {df["Test_Cluster_Rep"].sum().item()} ({df["Test_Cluster_Rep"].sum().item()/all_50_pct_seq:.2%})")

All representative sequences after 50% redundancy reduction: 1045
Train entries: 757 (72.44%)
Validation entries: 164 (15.69%)
Test entries: 124 (11.87%)


In [223]:
train_val_test_data = df[df[["Train_Cluster_Rep", "Val_Cluster_Rep", "Test_Cluster_Rep"]].any(axis=1)]
train_val_test_data

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,Cluster,Cluster_Rep,Train_Cluster_Rep,Val_Cluster_Rep,Test_Cluster_Rep
3,A0A0D4WV12,Sicarius terrosus (Cave spider),Dermonecrotic toxin StSicTox-betaIB1i (EC 4.6....,Phospholipase family,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRT...,,8,True,True,False,False
4,A0A0N7CSQ4,Scolopendra mutilans (Chinese red-headed centi...,Tau-scoloptoxin(04)-Ssm1b (Tau-SLPTX(04)-Ssm1b...,Scoloptoxin family,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCN...,,12,True,True,False,False
7,A0A1L4BJ98,Hemiscorpius lepturus (Scorpion),Dermonecrotic toxin Hl-PLD1 (EC 4.6.1.-) (Phos...,Phospholipase family,MAHCYYNSKRGCNRVMKTVALVVLISTVMVEESRGDSQEDKKRPIW...,,8,False,True,False,False
15,A0A5C2A2T2,Conus purpurascens (Purple cone),Conodipine-P1 (Cdpi-P1) (EC 3.1.1.4) (Phosphat...,Phospholipase family,MKLLAPVLWAMAALGVTWLVAVDSKESCTKHSNGCSTPLRLPCQEY...,,35,False,True,False,False
16,A0A6B7FMR5,Vipera ammodytes ammodytes (Western sand viper),Disintegrin-like/cysteine-rich protein MPIII-3...,Venom metalloproteinase (M12B) family,MIQVLLVIICLAVFPYQVSSIILESGNINNYEVVYPQKVTALPKGA...,,36,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
5024,W4VSB6,Conus victoriae (Queen Victoria cone),Conotoxin Vc7.1 (H_Vc7.1),Conotoxin family,MNTAGRLLLLCLALGLVFESLGIPVADDVEAVRDTDPDEKDPSVHN...,,744,False,True,False,False
5026,W4VSB9,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-9,Neurotoxin family,MMKLYSLVIIATLAAAAFAATSEEISAAVSEIISQHQQDLERYAKI...,,19,False,True,False,False
5028,W4VSG7,Conus victoriae (Queen Victoria cone),Conotoxin Vc1 (H_Vc1),Conotoxin family,MRTSGRLLLLCLAVGLLLESQAHPNADAGDATRDVGSDRTSVELSK...,,746,True,True,False,False
5033,W4VSI6,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-18,Neurotoxin family,MKTIFALVFCCAIAVVVLGFGENEGSTIDHDQNNCKGPGSRCSNKN...,,748,True,True,False,False


In [224]:
no_na = train_val_test_data.dropna(subset=["Protein families"]).copy()

print("All representative sequences after 50% redundancy reduction:", len(no_na))
print(f"Train entries: {no_na['Train_Cluster_Rep'].sum()} ({no_na['Train_Cluster_Rep'].sum()/len(no_na):.2%})")
print(f"Validation entries: {no_na['Val_Cluster_Rep'].sum()} ({no_na['Val_Cluster_Rep'].sum()/len(no_na):.2%})")
print(f"Test entries: {no_na['Test_Cluster_Rep'].sum()} ({no_na['Test_Cluster_Rep'].sum()/len(no_na):.2%})")

All representative sequences after 50% redundancy reduction: 1045
Train entries: 757 (72.44%)
Validation entries: 164 (15.69%)
Test entries: 124 (11.87%)


In [225]:
no_na["Protein families"].value_counts()

Protein families
Conotoxin family                                       215
Neurotoxin family                                      167
Scoloptoxin family                                      83
Short scorpion toxin superfamily                        69
Natriuretic, Bradykinin potentiating peptide family     52
                                                      ... 
Actinoporin family                                       1
ConoGAY family                                           1
Opioid neuropeptide precursor family                     1
Diuretic hormone class 2 family                          1
Cystatin family                                          1
Name: count, Length: 93, dtype: int64

In [226]:
print(no_na["Protein families"].value_counts()[no_na["Protein families"].value_counts() <= 10].sum())
no_na["Protein families"].value_counts()[no_na["Protein families"].value_counts() <= 10]

199


Protein families
Long chain scorpion toxin family           10
Bradykinin-related peptide family          10
Teretoxin family                            9
Vasopressin/oxytocin family                 8
FARP (FMRFamide related peptide) family     8
                                           ..
Actinoporin family                          1
ConoGAY family                              1
Opioid neuropeptide precursor family        1
Diuretic hormone class 2 family             1
Cystatin family                             1
Name: count, Length: 77, dtype: int64

In [227]:
print(no_na["Protein families"].value_counts()[no_na["Protein families"].value_counts() > 10].sum())
no_na["Protein families"].value_counts()[no_na["Protein families"].value_counts() > 10]

846


Protein families
Conotoxin family                                       215
Neurotoxin family                                      167
Scoloptoxin family                                      83
Short scorpion toxin superfamily                        69
Natriuretic, Bradykinin potentiating peptide family     52
MCD family                                              41
Long (4 C-C) scorpion toxin superfamily                 39
Snake three-finger toxin family                         38
Cationic peptide family                                 31
Non-disulfide-bridged peptide (NDBP) superfamily        19
Venom Kunitz-type family                                18
Formicidae venom family                                 16
Phospholipase family                                    16
Long (3 C-C) scorpion toxin superfamily                 15
Venom metalloproteinase (M12B) family                   15
Limacoditoxin family                                    12
Name: count, dtype: int64

### some interesting scoloptoxins are still in the data, nice

In [177]:
no_na[no_na["Entry"].isin(["I6R1R5", "P0DPX5", "P0DPX9", "P0DPY0", "P0DPY1", "P0DPX7", "P0DPX8"])]

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,Cluster,Cluster_Rep,Train_Cluster_Rep,Val_Cluster_Rep,Test_Cluster_Rep
4187,P0DPX7,Scolopendra morsitans (Tanzanian blue ringleg ...,U-scoloptoxin(05)-Sm1a (U-SLPTX(05)-Sm1a),Scoloptoxin family,MNVLYTKIFFILILTRTSSALKCVQCFTGYGNIVQDCNKTSYSEPK...,,393,False,False,True,False
4188,P0DPX8,Ethmostigmus rubripes (Giant centipede),U-scoloptoxin(05)-Er1a (U-SLPTX(05)-Er1a),Scoloptoxin family,MLSLGVSIFLLVFLIPENSGLECYQCTWMKNSQSPDNCYKNLPNAT...,,393,False,False,True,False
4189,P0DPX9,Ethmostigmus rubripes (Giant centipede),U-scoloptoxin(05)-Er2a (U-SLPTX(05)-Er2a),Scoloptoxin family,MTFVVAAVVLLTVVPLATPLKCVQCDGPLTEFDCKTTVPEAKDCPQ...,,393,False,False,True,False
4190,P0DPY0,Ethmostigmus rubripes (Giant centipede),U-scoloptoxin(05)-Er3a (U-SLPTX(05)-Er3a),Scoloptoxin family,MRSWFVFVALLAVVFLPSSLDALKCIQCDSQPNRDECKTTLPEARD...,,393,True,False,True,False
4191,P0DPY1,Cormocephalus westwoodi (Westwood's green cent...,U-scoloptoxin(05)-Cw1a (U-SLPTX(05)-Cw1a),Scoloptoxin family,MNPLNLSTFIVFTLFAASATTALTCFQCTTSEGSDYCVSSFPKPSQ...,,393,False,False,True,False


In [228]:
no_na["Protein families"] = no_na["Protein families"].apply(lambda x: "other" if no_na["Protein families"].value_counts()[x] <= 10 else x)
no_na

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,Cluster,Cluster_Rep,Train_Cluster_Rep,Val_Cluster_Rep,Test_Cluster_Rep
3,A0A0D4WV12,Sicarius terrosus (Cave spider),Dermonecrotic toxin StSicTox-betaIB1i (EC 4.6....,Phospholipase family,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRT...,,8,True,True,False,False
4,A0A0N7CSQ4,Scolopendra mutilans (Chinese red-headed centi...,Tau-scoloptoxin(04)-Ssm1b (Tau-SLPTX(04)-Ssm1b...,Scoloptoxin family,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCN...,,12,True,True,False,False
7,A0A1L4BJ98,Hemiscorpius lepturus (Scorpion),Dermonecrotic toxin Hl-PLD1 (EC 4.6.1.-) (Phos...,Phospholipase family,MAHCYYNSKRGCNRVMKTVALVVLISTVMVEESRGDSQEDKKRPIW...,,8,False,True,False,False
15,A0A5C2A2T2,Conus purpurascens (Purple cone),Conodipine-P1 (Cdpi-P1) (EC 3.1.1.4) (Phosphat...,Phospholipase family,MKLLAPVLWAMAALGVTWLVAVDSKESCTKHSNGCSTPLRLPCQEY...,,35,False,True,False,False
16,A0A6B7FMR5,Vipera ammodytes ammodytes (Western sand viper),Disintegrin-like/cysteine-rich protein MPIII-3...,Venom metalloproteinase (M12B) family,MIQVLLVIICLAVFPYQVSSIILESGNINNYEVVYPQKVTALPKGA...,,36,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
5024,W4VSB6,Conus victoriae (Queen Victoria cone),Conotoxin Vc7.1 (H_Vc7.1),Conotoxin family,MNTAGRLLLLCLALGLVFESLGIPVADDVEAVRDTDPDEKDPSVHN...,,744,False,True,False,False
5026,W4VSB9,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-9,Neurotoxin family,MMKLYSLVIIATLAAAAFAATSEEISAAVSEIISQHQQDLERYAKI...,,19,False,True,False,False
5028,W4VSG7,Conus victoriae (Queen Victoria cone),Conotoxin Vc1 (H_Vc1),Conotoxin family,MRTSGRLLLLCLAVGLLLESQAHPNADAGDATRDVGSDRTSVELSK...,,746,True,True,False,False
5033,W4VSI6,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-18,Neurotoxin family,MKTIFALVFCCAIAVVVLGFGENEGSTIDHDQNNCKGPGSRCSNKN...,,748,True,True,False,False


In [229]:
no_na["Protein families"].value_counts()

Protein families
Conotoxin family                                       215
other                                                  199
Neurotoxin family                                      167
Scoloptoxin family                                      83
Short scorpion toxin superfamily                        69
Natriuretic, Bradykinin potentiating peptide family     52
MCD family                                              41
Long (4 C-C) scorpion toxin superfamily                 39
Snake three-finger toxin family                         38
Cationic peptide family                                 31
Non-disulfide-bridged peptide (NDBP) superfamily        19
Venom Kunitz-type family                                18
Phospholipase family                                    16
Formicidae venom family                                 16
Venom metalloproteinase (M12B) family                   15
Long (3 C-C) scorpion toxin superfamily                 15
Limacoditoxin family                   

In [230]:
# Get all unique protein families
unique_protein_families = set(no_na["Protein families"].unique())

# Find unique protein families present in each cluster column separately
train_families = set(no_na.loc[no_na["Train_Cluster_Rep"] > 0, "Protein families"].unique())
val_families = set(no_na.loc[no_na["Val_Cluster_Rep"] > 0, "Protein families"].unique())
test_families = set(no_na.loc[no_na["Test_Cluster_Rep"] > 0, "Protein families"].unique())

# Find missing families in each set
missing_in_train = unique_protein_families - train_families
missing_in_val = unique_protein_families - val_families
missing_in_test = unique_protein_families - test_families

# Check if all unique protein families are present in all three
all_present_in_all = not (missing_in_train or missing_in_val or missing_in_test)

# Print results
print(f"All unique protein families are present in all three columns: {all_present_in_all}")

if not all_present_in_all:
    if missing_in_train:
        print(f"Missing in Train: {missing_in_train}")
    if missing_in_val:
        print(f"Missing in Val: {missing_in_val}")
    if missing_in_test:
        print(f"Missing in Test: {missing_in_test}")

All unique protein families are present in all three columns: False
Missing in Val: {'Venom Kunitz-type family'}
Missing in Test: {'Formicidae venom family', 'Phospholipase family'}


### added the scolops back for visualization only

In [63]:
# find duplicate entries
duplicate_entries = set(no_na['Entry']).intersection(set(scolop['Entry']))

# keep only unique scolop entries (not in no_na)
unique_scolop = scolop[~scolop['Entry'].isin(duplicate_entries)].copy()

# merge no_na with unique scolop entries
merged_df = pd.concat([no_na, unique_scolop], ignore_index=True)
merged_df

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,Cluster,Cluster_Rep,Train_Cluster_Rep,Val_Cluster_Rep,Test_Cluster_Rep
0,A0A0D4WV12,Sicarius terrosus (Cave spider),Dermonecrotic toxin StSicTox-betaIB1i (EC 4.6....,other,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRT...,,8.0,True,True,False,False
1,A0A0N7CSQ4,Scolopendra mutilans (Chinese red-headed centi...,Tau-scoloptoxin(04)-Ssm1b (Tau-SLPTX(04)-Ssm1b...,other,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCN...,,12.0,True,True,False,False
2,A0A1L4BJ98,Hemiscorpius lepturus (Scorpion),Dermonecrotic toxin Hl-PLD1 (EC 4.6.1.-) (Phos...,other,MAHCYYNSKRGCNRVMKTVALVVLISTVMVEESRGDSQEDKKRPIW...,,8.0,False,True,False,False
3,A0A5C2A2T2,Conus purpurascens (Purple cone),Conodipine-P1 (Cdpi-P1) (EC 3.1.1.4) (Phosphat...,Phospholipase A2 family,MKLLAPVLWAMAALGVTWLVAVDSKESCTKHSNGCSTPLRLPCQEY...,,35.0,False,True,False,False
4,A0A6B7FMR5,Vipera ammodytes ammodytes (Western sand viper),Disintegrin-like/cysteine-rich protein MPIII-3...,Venom metalloproteinase (M12B) family,MIQVLLVIICLAVFPYQVSSIILESGNINNYEVVYPQKVTALPKGA...,,36.0,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
1042,W4VSG7,Conus victoriae (Queen Victoria cone),Conotoxin Vc1 (H_Vc1),other,MRTSGRLLLLCLAVGLLLESQAHPNADAGDATRDVGSDRTSVELSK...,,746.0,True,True,False,False
1043,W4VSI6,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-18,other,MKTIFALVFCCAIAVVVLGFGENEGSTIDHDQNNCKGPGSRCSNKN...,,748.0,True,True,False,False
1044,W4VSI9,Trittame loki (Brush-footed trapdoor spider),U10-barytoxin-Tl1a (U10-BATX-Tl1a) (Toxin ICK-3),Neurotoxin 10 (Hwtx-1) family,MKTLVLVAVLGVASLYLLSSASEVQQLSPAEEEFRAFVSTFGGLFE...,,581.0,False,True,False,False
1045,I6R1R5,Scolopendra mutilans (Chinese red-headed centi...,Omega-scoloptoxin(05)-Ssm1a (Omega-SLPTX(05)-S...,Scoloptoxin-05 family,MPSLCIIALFGTLTFYTLIPSIHTLKCVRCDGPMSNYDCKTTYPAA...,,,,,,


In [64]:
merged_df[merged_df["Entry"].isin(["I6R1R5", "P0DPX5", "P0DPX9", "P0DPY0", "P0DPY1", "P0DPX7", "P0DPX8"])]

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,Cluster,Cluster_Rep,Train_Cluster_Rep,Val_Cluster_Rep,Test_Cluster_Rep
874,P0DPX7,Scolopendra morsitans (Tanzanian blue ringleg ...,U-scoloptoxin(05)-Sm1a (U-SLPTX(05)-Sm1a),other,MNVLYTKIFFILILTRTSSALKCVQCFTGYGNIVQDCNKTSYSEPK...,,393.0,False,False,True,False
875,P0DPX8,Ethmostigmus rubripes (Giant centipede),U-scoloptoxin(05)-Er1a (U-SLPTX(05)-Er1a),other,MLSLGVSIFLLVFLIPENSGLECYQCTWMKNSQSPDNCYKNLPNAT...,,393.0,False,False,True,False
876,P0DPX9,Ethmostigmus rubripes (Giant centipede),U-scoloptoxin(05)-Er2a (U-SLPTX(05)-Er2a),other,MTFVVAAVVLLTVVPLATPLKCVQCDGPLTEFDCKTTVPEAKDCPQ...,,393.0,False,False,True,False
877,P0DPY0,Ethmostigmus rubripes (Giant centipede),U-scoloptoxin(05)-Er3a (U-SLPTX(05)-Er3a),other,MRSWFVFVALLAVVFLPSSLDALKCIQCDSQPNRDECKTTLPEARD...,,393.0,True,False,True,False
878,P0DPY1,Cormocephalus westwoodi (Westwood's green cent...,U-scoloptoxin(05)-Cw1a (U-SLPTX(05)-Cw1a),other,MNPLNLSTFIVFTLFAASATTALTCFQCTTSEGSDYCVSSFPKPSQ...,,393.0,False,False,True,False
1045,I6R1R5,Scolopendra mutilans (Chinese red-headed centi...,Omega-scoloptoxin(05)-Ssm1a (Omega-SLPTX(05)-S...,Scoloptoxin-05 family,MPSLCIIALFGTLTFYTLIPSIHTLKCVRCDGPMSNYDCKTTYPAA...,,,,,,
1046,P0DPX5,Scolopendra alternans (Florida Keys giant cent...,U-scoloptoxin(05)-Sa1a (U-SLPTX(05)-Sa1a),Scoloptoxin-05 family,MPSLCIIALFGTLTFYTLIPSIHTLKCVICDSPMGNYDCKTTYPAA...,,,,,,


In [65]:
merged_df.loc[merged_df["Protein families"] == "Scoloptoxin-05 family", "Protein families"] = "other"

In [66]:
merged_df["Protein families"].value_counts()

Protein families
other                                               456
Short scorpion toxin superfamily                     69
Neurotoxin 10 (Hwtx-1) family                        42
MCD family                                           41
Long (4 C-C) scorpion toxin superfamily              39
Snake three-finger toxin family                      38
Bradykinin-potentiating peptide family               37
Conotoxin O1 superfamily                             37
Conotoxin M superfamily                              35
Conotoxin A superfamily                              30
Conotoxin O2 superfamily                             27
Conotoxin T superfamily                              26
Non-disulfide-bridged peptide (NDBP) superfamily     19
Venom Kunitz-type family                             18
Neurotoxin 02 (plectoxin) family                     18
Cationic peptide 04 (cupiennin) family               18
Neurotoxin 14 (magi-1) family                        16
Natriuretic peptide family     

In [67]:
write_fasta(merged_df, "data/toxins_no_na_scolop.fasta")
write_fasta(no_na, "data/toxins_no_na.fasta")

In [68]:
merged_df["Fragment"].value_counts()

Series([], Name: count, dtype: int64)

In [180]:
merged_df

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,Cluster,Cluster_Rep,Train_Cluster_Rep,Val_Cluster_Rep,Test_Cluster_Rep,Scoloptoxin,protein_category
0,A0A0D4WV12,Sicarius terrosus (Cave spider),Dermonecrotic toxin StSicTox-betaIB1i (EC 4.6....,other,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRT...,mature_seq (NO_SP),8.0,True,True,False,False,other,phospholipase_other
1,A0A0N7CSQ4,Scolopendra mutilans (Chinese red-headed centi...,Tau-scoloptoxin(04)-Ssm1b (Tau-SLPTX(04)-Ssm1b...,other,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCN...,full_seq (SP),12.0,True,True,False,False,other,scoloptoxin
2,A0A1L4BJ98,Hemiscorpius lepturus (Scorpion),Dermonecrotic toxin Hl-PLD1 (EC 4.6.1.-) (Phos...,other,MAHCYYNSKRGCNRVMKTVALVVLISTVMVEESRGDSQEDKKRPIW...,full_seq (SP),8.0,False,True,False,False,other,phospholipase_other
3,A0A5C2A2T2,Conus purpurascens (Purple cone),Conodipine-P1 (Cdpi-P1) (EC 3.1.1.4) (Phosphat...,Phospholipase A2 family,MKLLAPVLWAMAALGVTWLVAVDSKESCTKHSNGCSTPLRLPCQEY...,mature_seq (NO_SP),35.0,False,True,False,False,other,phospholipase_a2
4,A0A6B7FMR5,Vipera ammodytes ammodytes (Western sand viper),Disintegrin-like/cysteine-rich protein MPIII-3...,Venom metalloproteinase (M12B) family,MIQVLLVIICLAVFPYQVSSIILESGNINNYEVVYPQKVTALPKGA...,full_seq (SP),36.0,True,False,False,True,other,metalloproteinase
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1042,W4VSG7,Conus victoriae (Queen Victoria cone),Conotoxin Vc1 (H_Vc1),other,MRTSGRLLLLCLAVGLLLESQAHPNADAGDATRDVGSDRTSVELSK...,full_seq (SP),746.0,True,True,False,False,other,conotoxin
1043,W4VSI6,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-18,other,MKTIFALVFCCAIAVVVLGFGENEGSTIDHDQNNCKGPGSRCSNKN...,full_seq (SP),748.0,True,True,False,False,other,neurotoxin
1044,W4VSI9,Trittame loki (Brush-footed trapdoor spider),U10-barytoxin-Tl1a (U10-BATX-Tl1a) (Toxin ICK-3),Neurotoxin 10 (Hwtx-1) family,MKTLVLVAVLGVASLYLLSSASEVQQLSPAEEEFRAFVSTFGGLFE...,mature_seq (NO_SP),581.0,False,True,False,False,other,neurotoxin
1045,I6R1R5,Scolopendra mutilans (Chinese red-headed centi...,Omega-scoloptoxin(05)-Ssm1a (Omega-SLPTX(05)-S...,other,MPSLCIIALFGTLTFYTLIPSIHTLKCVRCDGPMSNYDCKTTYPAA...,full_seq (SP),,,,,,Scoloptoxin,scoloptoxin


In [70]:
merged_df["Scoloptoxin"] = merged_df.apply(
    lambda row: "Scoloptoxin" if row["Entry"] in ["I6R1R5", "P0DPX5", "P0DPX9", "P0DPY0", "P0DPY1", "P0DPX7", "P0DPX8"]
    else "3FTX" if row["Protein families"] == "Snake three-finger toxin family"
    else "other",
    axis=1
)

#merged_df.to_csv("data/scolop.csv", columns=["Entry", "Protein families", "Fragment", "Scoloptoxin"], index=False, header=["identifier", "Protein families", "Fragment", "Scoloptoxin"])

### add regex protein_category

In [231]:
regex_tox = pd.read_csv("data/toxins_regex.csv")
regex_tox

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,protein_category
0,A0A068B6Q6,Conus betulinus (Beech cone),Conotoxin Bt1.8,Conotoxin A superfamily,PDGRNAAAKAFDLITPTVRKGCCSNPACILNNPNQCG,fragment,conotoxin
1,A0A0B4U9L8,Vipera ammodytes ammodytes (Western sand viper),Zinc metalloproteinase-disintegrin-like protei...,"Venom metalloproteinase (M12B) family, P-III s...",MLQVLLVTICLAVFPYQGSSIILESGNVNDYEVVYPQKLTALLKGA...,,metalloproteinase
2,A0A0B5A8P4,Conus geographus (Geography cone) (Nubecula ge...,Con-Ins G3 (Insulin 3) [Cleaved into: Con-Ins ...,Insulin family,MTTSFYFLLVALGLLLYVCQSSFGNQHTRNSDTPKHRCGSELADQY...,,hormone_related
3,A0A0B5AC95,Conus geographus (Geography cone) (Nubecula ge...,Con-Ins G1a (Insulin 1) [Cleaved into: Con-Ins...,Insulin family,MTTSSYFLLMALGLLLYVCQSSFGNQHTRTFDTPKHRCGSEITNSY...,,hormone_related
4,A0A0D4WTV1,Loxosceles arizonica (Arizona brown spider),Dermonecrotic toxin LarSicTox-betaID1 (EC 4.6....,"Arthropod phospholipase D family, Class II sub...",EGAEQDGSERTDGGRPIWNIAHMVNNKQAIDKYLDKGANSVESDVS...,fragment,phospholipase_other
...,...,...,...,...,...,...,...
6423,P0CI18,Hastula hectica (Sea snail) (Impages hectica),Augerpeptide hhe6.3,,VLFTPPELLGCGNRCSDDCCKWGRCQPGCTD,,
6424,P0CI19,Hastula hectica (Sea snail) (Impages hectica),Augerpeptide hhe6.4,,SSLHCGDDPWCPTGCCENEDCDIGCKRDWEKSRSQP,,
6425,P0CI21,Hastula hectica (Sea snail) (Impages hectica),Augerpeptide hhe53,,GLSQSGCQAFTGRWCVGCERLRSRVVWECSPKRVVNSI,,
6426,Q45RU8,Conus striatus (Striated cone),Conotoxin S5.1,,HNDLIRAGLTVCLSENRKRLTCSGLLNMAGSVCCKVDTSCCSSQ,fragment,


In [232]:
merged_df = merged_df.merge(regex_tox[['Entry', 'protein_category']], on='Entry', how='left')
no_na = no_na.merge(regex_tox[['Entry', 'protein_category']], on='Entry', how='left')

merged_df # supposed to be 1393 rows

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,Cluster,Cluster_Rep,Train_Cluster_Rep,Val_Cluster_Rep,Test_Cluster_Rep,Scoloptoxin,protein_category_x,protein_category_y,protein_category
0,A0A0D4WV12,Sicarius terrosus (Cave spider),Dermonecrotic toxin StSicTox-betaIB1i (EC 4.6....,other,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRT...,mature_seq (NO_SP),8.0,True,True,False,False,other,phospholipase_other,phospholipase_other,phospholipase_other
1,A0A0N7CSQ4,Scolopendra mutilans (Chinese red-headed centi...,Tau-scoloptoxin(04)-Ssm1b (Tau-SLPTX(04)-Ssm1b...,other,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCN...,full_seq (SP),12.0,True,True,False,False,other,scoloptoxin,scoloptoxin,scoloptoxin
2,A0A1L4BJ98,Hemiscorpius lepturus (Scorpion),Dermonecrotic toxin Hl-PLD1 (EC 4.6.1.-) (Phos...,other,MAHCYYNSKRGCNRVMKTVALVVLISTVMVEESRGDSQEDKKRPIW...,full_seq (SP),8.0,False,True,False,False,other,phospholipase_other,phospholipase_other,phospholipase_other
3,A0A5C2A2T2,Conus purpurascens (Purple cone),Conodipine-P1 (Cdpi-P1) (EC 3.1.1.4) (Phosphat...,Phospholipase A2 family,MKLLAPVLWAMAALGVTWLVAVDSKESCTKHSNGCSTPLRLPCQEY...,mature_seq (NO_SP),35.0,False,True,False,False,other,phospholipase_a2,phospholipase_a2,phospholipase_a2
4,A0A6B7FMR5,Vipera ammodytes ammodytes (Western sand viper),Disintegrin-like/cysteine-rich protein MPIII-3...,Venom metalloproteinase (M12B) family,MIQVLLVIICLAVFPYQVSSIILESGNINNYEVVYPQKVTALPKGA...,full_seq (SP),36.0,True,False,False,True,other,metalloproteinase,metalloproteinase,metalloproteinase
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1042,W4VSG7,Conus victoriae (Queen Victoria cone),Conotoxin Vc1 (H_Vc1),other,MRTSGRLLLLCLAVGLLLESQAHPNADAGDATRDVGSDRTSVELSK...,full_seq (SP),746.0,True,True,False,False,other,conotoxin,conotoxin,conotoxin
1043,W4VSI6,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-18,other,MKTIFALVFCCAIAVVVLGFGENEGSTIDHDQNNCKGPGSRCSNKN...,full_seq (SP),748.0,True,True,False,False,other,neurotoxin,neurotoxin,neurotoxin
1044,W4VSI9,Trittame loki (Brush-footed trapdoor spider),U10-barytoxin-Tl1a (U10-BATX-Tl1a) (Toxin ICK-3),Neurotoxin 10 (Hwtx-1) family,MKTLVLVAVLGVASLYLLSSASEVQQLSPAEEEFRAFVSTFGGLFE...,mature_seq (NO_SP),581.0,False,True,False,False,other,neurotoxin,neurotoxin,neurotoxin
1045,I6R1R5,Scolopendra mutilans (Chinese red-headed centi...,Omega-scoloptoxin(05)-Ssm1a (Omega-SLPTX(05)-S...,other,MPSLCIIALFGTLTFYTLIPSIHTLKCVRCDGPMSNYDCKTTYPAA...,full_seq (SP),,,,,,Scoloptoxin,scoloptoxin,scoloptoxin,scoloptoxin


In [73]:
merged_df.to_csv("data/scolop.csv", columns=["Entry", "Protein families", "Fragment", "Scoloptoxin", "protein_category"], index=False, header=["identifier", "Protein families", "Fragment", "Scoloptoxin", "protein_category"])

### cut off SP and then see which Entry is full_seq, mature or fragment

In [74]:
write_fasta(merged_df, "data/toxins_no_na_scolop.fasta")

In [233]:
sp6_results = pd.read_csv("data/sp6/output.gff3", sep="\t", usecols=[0]).rename(columns={"## gff-version 3": "Entry"})
sp6_results

Unnamed: 0,Entry
0,A0A0N7CSQ4
1,A0A1L4BJ98
2,A0A6B7FMR5
3,A0S864
4,A8YPR6
...,...
590,W4VSB6
591,W4VSB9
592,W4VSG7
593,W4VSI6


In [234]:
# Merge to check which entries are in sp6_results
merged_df = merged_df.merge(sp6_results[['Entry']], on='Entry', how='left', indicator=True)
no_na = no_na.merge(sp6_results[['Entry']], on='Entry', how='left', indicator=True)

# Replace NaN values in "Fragment" based on presence in sp6_results
merged_df['Fragment'] = np.where(
    merged_df['Fragment'].isna(),
    np.where(merged_df['_merge'] == 'both', 'full_seq (SP)', 'mature_seq (NO_SP)'),
    merged_df['Fragment']
)
no_na['Fragment'] = np.where(
    no_na['Fragment'].isna(),
    np.where(no_na['_merge'] == 'both', 'full_seq (SP)', 'mature_seq (NO_SP)'),
    no_na['Fragment']
)

# Drop the '_merge' column as it's no longer needed
merged_df.drop(columns=['_merge'], inplace=True)
no_na.drop(columns=['_merge'], inplace=True)
merged_df

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,Cluster,Cluster_Rep,Train_Cluster_Rep,Val_Cluster_Rep,Test_Cluster_Rep,Scoloptoxin,protein_category_x,protein_category_y,protein_category
0,A0A0D4WV12,Sicarius terrosus (Cave spider),Dermonecrotic toxin StSicTox-betaIB1i (EC 4.6....,other,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRT...,mature_seq (NO_SP),8.0,True,True,False,False,other,phospholipase_other,phospholipase_other,phospholipase_other
1,A0A0N7CSQ4,Scolopendra mutilans (Chinese red-headed centi...,Tau-scoloptoxin(04)-Ssm1b (Tau-SLPTX(04)-Ssm1b...,other,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCN...,full_seq (SP),12.0,True,True,False,False,other,scoloptoxin,scoloptoxin,scoloptoxin
2,A0A1L4BJ98,Hemiscorpius lepturus (Scorpion),Dermonecrotic toxin Hl-PLD1 (EC 4.6.1.-) (Phos...,other,MAHCYYNSKRGCNRVMKTVALVVLISTVMVEESRGDSQEDKKRPIW...,full_seq (SP),8.0,False,True,False,False,other,phospholipase_other,phospholipase_other,phospholipase_other
3,A0A5C2A2T2,Conus purpurascens (Purple cone),Conodipine-P1 (Cdpi-P1) (EC 3.1.1.4) (Phosphat...,Phospholipase A2 family,MKLLAPVLWAMAALGVTWLVAVDSKESCTKHSNGCSTPLRLPCQEY...,mature_seq (NO_SP),35.0,False,True,False,False,other,phospholipase_a2,phospholipase_a2,phospholipase_a2
4,A0A6B7FMR5,Vipera ammodytes ammodytes (Western sand viper),Disintegrin-like/cysteine-rich protein MPIII-3...,Venom metalloproteinase (M12B) family,MIQVLLVIICLAVFPYQVSSIILESGNINNYEVVYPQKVTALPKGA...,full_seq (SP),36.0,True,False,False,True,other,metalloproteinase,metalloproteinase,metalloproteinase
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1042,W4VSG7,Conus victoriae (Queen Victoria cone),Conotoxin Vc1 (H_Vc1),other,MRTSGRLLLLCLAVGLLLESQAHPNADAGDATRDVGSDRTSVELSK...,full_seq (SP),746.0,True,True,False,False,other,conotoxin,conotoxin,conotoxin
1043,W4VSI6,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-18,other,MKTIFALVFCCAIAVVVLGFGENEGSTIDHDQNNCKGPGSRCSNKN...,full_seq (SP),748.0,True,True,False,False,other,neurotoxin,neurotoxin,neurotoxin
1044,W4VSI9,Trittame loki (Brush-footed trapdoor spider),U10-barytoxin-Tl1a (U10-BATX-Tl1a) (Toxin ICK-3),Neurotoxin 10 (Hwtx-1) family,MKTLVLVAVLGVASLYLLSSASEVQQLSPAEEEFRAFVSTFGGLFE...,mature_seq (NO_SP),581.0,False,True,False,False,other,neurotoxin,neurotoxin,neurotoxin
1045,I6R1R5,Scolopendra mutilans (Chinese red-headed centi...,Omega-scoloptoxin(05)-Ssm1a (Omega-SLPTX(05)-S...,other,MPSLCIIALFGTLTFYTLIPSIHTLKCVRCDGPMSNYDCKTTYPAA...,full_seq (SP),,,,,,Scoloptoxin,scoloptoxin,scoloptoxin,scoloptoxin


In [77]:
merged_df.to_csv("data/scolop.csv", columns=["Entry", "Protein families", "Fragment", "Scoloptoxin", "protein_category"], index=False, header=["identifier", "Protein families", "Fragment", "Scoloptoxin", "protein_category"])

In [235]:
no_na.to_csv("data/toxins_no_na.tsv", sep="\t", index=False)