In [1]:
import pandas as pd
from Bio import SeqIO
import numpy as np
from sklearn.utils import shuffle

# Toxins
UniProtKB query used to download `tox.tsv`: `(taxonomy_id:33208) AND (cc_tissue_specificity:venom) AND (reviewed:true) AND (keyword:KW-0800) AND (fragment:false)`

In [2]:
# Load the toxin table and keep only rows that carry a protein-family
# annotation.
tox = (
    pd.read_csv('../data/raw/tox.tsv', sep='\t')
    .dropna(subset=["Protein families"])
)

tox

Unnamed: 0,Entry,Sequence,Protein families
0,A0A0B4U9L8,MLQVLLVTICLAVFPYQGSSIILESGNVNDYEVVYPQKLTALLKGA...,"Venom metalloproteinase (M12B) family, P-III s..."
1,A0A0B5A8P4,MTTSFYFLLVALGLLLYVCQSSFGNQHTRNSDTPKHRCGSELADQY...,Insulin family
2,A0A0B5AC95,MTTSSYFLLMALGLLLYVCQSSFGNQHTRTFDTPKHRCGSEITNSY...,Insulin family
3,A0A0D4WV12,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRT...,"Arthropod phospholipase D family, Class II sub..."
4,A0A0N7CSQ4,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCN...,Scoloptoxin-04 family
...,...,...,...
5308,W4VSI7,MKPTISILIFFALAVAIMGHRLNSGYGIPHIVEKLPNGQWCRTPGD...,Neurotoxin 21 family
5309,W4VSI8,MMKLYSLVIIATLAAAAFAATSEEISAAVSEIISQHQEDLERYAKI...,"Neurotoxin 25 family, ICK-8 subfamily"
5310,W4VSI9,MKTLVLVAVLGVASLYLLSSASEVQQLSPAEEEFRAFVSTFGGLFE...,"Neurotoxin 10 (Hwtx-1) family, 27 (ICK-3) subf..."
5311,X5IFY8,MGKLTILVLVAAVLLSTQAMVQGDGDQPAARNAVPRDDNPDGPSAK...,"O2 superfamily, Contryphan family"


In [3]:
# Keep only the leading family name: cut the label at the first ',' and then
# at the first ';'.
families = tox['Protein families']
for delimiter in (',', ';'):
    families = families.str.split(delimiter).str[0]
tox['Protein families'] = families

In [4]:
# Bring the conotoxin labels into one 'Conotoxin <X> superfamily' spelling.
# These are exact whole-value replacements; no target is itself a key, so the
# result does not depend on the order of the entries.
conotoxin_aliases = {
    'I1 superfamily': 'Conotoxin I1 superfamily',
    'O1 superfamily': 'Conotoxin O1 superfamily',
    'O2 superfamily': 'Conotoxin O2 superfamily',
    'E superfamily': 'Conotoxin E superfamily',
    'F superfamily': 'Conotoxin F superfamily',
    'Conotoxin M family': 'Conotoxin M superfamily',
    'Conotoxin B2 family': 'Conotoxin B2 superfamily',
    'Conotoxin O1 family': 'Conotoxin O1 superfamily',
    'Conotoxin O2 family': 'Conotoxin O2 superfamily',
}
tox['Protein families'] = tox['Protein families'].replace(conotoxin_aliases)

In [5]:
# Regex -> collapsed family label. The patterns are unanchored and end in
# '.*', so a match replaces everything from the match start to the end of the
# label.
mapping = {
    r'Conotoxin.*': 'Conotoxin family',
    r'Neurotoxin.*': 'Neurotoxin family',
    r'Scoloptoxin.*|Scolopendra.*': 'Scoloptoxin family',
    r'Caterpillar.*': 'Caterpillar family',
    r'Teretoxin.*': 'Teretoxin family',
    r'Limacoditoxin.*': 'Limacoditoxin family',
    r'Scutigerotoxin.*': 'Scutigerotoxin family',
    r'Cationic peptide.*': 'Cationic peptide family',
    r'Formicidae venom.*': 'Formicidae venom family',
    r'Bradykinin-potentiating peptide family|Natriuretic peptide family': 'Natriuretic, Bradykinin potentiating peptide family',
    r'.*phospholipase.*|.*Phospholipase.*': 'Phospholipase family'
}

# Apply the substitutions one after another (dict order) on a working copy of
# the column, then store the result back.
families = tox['Protein families']
for pattern, replacement in mapping.items():
    families = families.str.replace(pattern, replacement, regex=True)
tox['Protein families'] = families

tox['Protein families'].value_counts()

Protein families
Conotoxin family                           990
Neurotoxin family                          959
Three-finger toxin family                  485
Long (4 C-C) scorpion toxin superfamily    339
Phospholipase family                       320
                                          ... 
Xylopin-like family                          1
Gastrin/cholecystokinin family               1
CART family                                  1
Diuretic hormone class 2 family              1
SLPTX(15) family                             1
Name: count, Length: 97, dtype: int64

# Non-Toxins
UniProtKB query used to download `nontox.tsv`: `(taxonomy_id:33208) AND (reviewed:true) AND (fragment:false) NOT (keyword:KW-0800) AND ((existence:1) OR (existence:2))`

In [6]:
# Load the non-toxin table (tab-separated). Unlike the toxin table, rows
# without a protein-family annotation are kept.
nontox = pd.read_table('../data/raw/nontox.tsv')
nontox

Unnamed: 0,Entry,Sequence,Protein families
0,A0A026W182,MMKMKQQGLVADLLPNIRVMKTFGHFVFNYYNDNSSKYLHKVYCCV...,"Insect chemoreceptor superfamily, Heteromeric ..."
1,A0A044RE18,MYWQLVRILVLFDCLQKILAIEHDSICIADVDDACPEPSHTVMRLR...,"Peptidase S8 family, Furin subfamily"
2,A0A061ACU2,MTVPPLLKSCVVKLLLPAALLAAAIIRPSFLSIGYVLLALVSAVLP...,PIEZO (TC 1.A.75) family
3,A0A061I403,MPMASVIAVAEPKWISVWGRFLWLTLLSMALGSLLALLLPLGAVEE...,Fic family
4,A0A075F932,MVSESHHEALAAPPATTVAAAPPSNVTEPASPGGGGGKEDAFSKLK...,Synaptotagmin family
...,...,...,...
83955,Q9W3M2,MAKRGKKGGIPRAEMVQVASANRDENQVTELKKADYLPYLFNLVMP...,DM7 family
83956,Q9WVB7,MKPPMQPLTQALPFSLRDALQGTGLRVPVIKMGTGWEGMYRTLKEV...,
83957,Q9XVA4,MPDNHKDPPDFNNLEMKLEERIELSREDQDIQSTSSSYPHCEALDH...,
83958,Q9Y0Y7,MERRYLKNPFPDFAGGENTPFASDEEHIKNLICTYVDAILEHCHPN...,LanC-like protein family


### Fasta Generation

In [10]:
def write_fasta(df, filename):
    """Dump the ``Entry``/``Sequence`` columns of ``df`` as FASTA records.

    Every row becomes a two-line record: a ``>Entry`` header followed by the
    sequence on one unwrapped line. ``filename`` is opened in write mode, so
    an existing file is overwritten.
    """
    fasta_records = (
        f">{entry}\n{sequence}\n"
        for entry, sequence in zip(df['Entry'], df['Sequence'])
    )
    with open(filename, "w") as handle:
        handle.writelines(fasta_records)

# One FASTA per class. The non-toxin file must be written from `nontox`;
# passing `tox` here would fill nontox.fasta with toxin sequences.
write_fasta(tox, "../data/raw/tox.fasta")
write_fasta(nontox, "../data/raw/nontox.fasta")

## Remove SPs

In [2]:
# Run SignalP 6.0 (fast mode, eukarya) on the toxin FASTA; results are written to ../data/sp6/tox/.
# NOTE(review): --model_dir is an absolute, machine-specific path; adjust it before running elsewhere.
!signalp6 --fastafile ../data/raw/tox.fasta --output_dir ../data/sp6/tox/ --organism eukarya --mode fast --model_dir /Users/selin/Desktop/Uni/signalp6/signalp-6-package/models/

Predicting: 100%|████████████████████| 5051/5051 [03:46<00:00, 22.27sequences/s]
Writing files: 100%|██████████████████████| 5051/5051 [00:01<00:00, 2988.46it/s]


In [3]:
# Run SignalP 6.0 (fast mode, eukarya) on the non-toxin FASTA; results are written to ../data/sp6/nontox/.
# NOTE(review): this reads ../data/nontox.fasta, while the FASTA-generation cell writes
# ../data/raw/nontox.fasta. Confirm which file is intended.
# NOTE(review): --model_dir is an absolute, machine-specific path; adjust it before running elsewhere.
!signalp6 --fastafile ../data/nontox.fasta --output_dir ../data/sp6/nontox/ --organism eukarya --mode fast --model_dir /Users/selin/Desktop/Uni/signalp6/signalp-6-package/models/

Predicting: 100%|██████████████████| 52596/52596 [40:20<00:00, 21.73sequences/s]
Writing files: 100%|████████████████████| 52596/52596 [00:05<00:00, 9470.77it/s]


In [7]:
def fasta_to_dataframe(fasta_file):
    """Read a FASTA file into a two-column DataFrame.

    Parameters
    ----------
    fasta_file
        Path or handle passed straight to ``SeqIO.parse(..., "fasta")``.

    Returns
    -------
    pandas.DataFrame
        One row per record with columns ``identifier`` (the part of the
        record id after its last ``|``) and ``Sequence`` (the sequence as a
        string). Both columns exist even when the file holds no records, so
        a later merge on ``identifier`` still works.
    """
    data = [
        {"identifier": record.id.split('|')[-1], "Sequence": str(record.seq)}
        for record in SeqIO.parse(fasta_file, "fasta")
    ]
    # Explicit columns: an empty record list would otherwise produce a frame
    # with no columns at all.
    return pd.DataFrame(data, columns=["identifier", "Sequence"])

# Load SignalP 6.0's processed_entries.fasta for each class into a DataFrame
# with columns 'identifier' and 'Sequence'.
# NOTE(review): presumably these are the sequences with the predicted signal
# peptide removed; confirm against the SignalP 6.0 output description.
proc_tox = fasta_to_dataframe("../data/sp6/tox/processed_entries.fasta")
proc_nontox = fasta_to_dataframe("../data/sp6/nontox/processed_entries.fasta")

In [10]:
# Preview the processed toxin sequences loaded from SignalP's output.
proc_tox

Unnamed: 0,identifier,Sequence
0,A0A0B4U9L8,IILESGNVNDYEVVYPQKLTALLKGAIQQPEQKYEDAMQYEFKVNG...
1,A0A0B5A8P4,NQHTRNSDTPKHRCGSELADQYVQLCHGKRNDAGKKRGRASPLWQR...
2,A0A0B5AC95,NQHTRTFDTPKHRCGSEITNSYMDLCYRKRNDAGEKRGRASPLWQR...
3,A0A0N7CSQ4,EEISPLKIVVRNSEYLNNPCNGVTCPSGYRCSIVDKQCIKKEK
4,A0A193CHJ5,HLLQFNKMIKFETRKNAIPFYAFYGCYCGWGGRGRPKDATDRCCFV...
...,...,...
3455,W4VSI7,HRLNSGYGIPHIVEKLPNGQWCRTPGDDCSESKQCCKPEDTATYAH...
3456,W4VSI8,ATSEEISAAVSEIISQHQEDLERYAKIVERGEEPKKYIRCSKQLGQ...
3457,W4VSI9,SEVQQLSPAEEEFRAFVSTFGGLFETEERGVDSEDCRAMFGGCGED...
3458,X5IFY8,DGDQPAARNAVPRDDNPDGPSAKFMNVQRRSGCPWEPWCG


In [11]:
# Names for the nine tab-separated GFF3 fields. The first one is called
# 'identifier' so it lines up with the processed-sequence frames.
cols = [
    'identifier', 'source', 'feature_type', 'start', 'end',
    'score', 'strand', 'phase', 'attributes'
]

# Read both SignalP GFF3 outputs (lines starting with '#' are skipped, there
# is no header row) and label their columns.
gff3_tox, gff3_nontox = (
    pd.read_csv(gff3_path, sep='\t', comment='#', header=None).set_axis(cols, axis=1)
    for gff3_path in (
        '../data/sp6/tox/output.gff3',
        '../data/sp6/nontox/output.gff3',
    )
)

def extract_seqid(full_seqid):
    """Return the bare ID: text after the last '|' and before the first space."""
    after_last_pipe = full_seqid.rpartition('|')[2]
    return after_last_pipe.partition(' ')[0]

# Reduce the GFF3 sequence IDs to bare identifiers ...
gff3_tox['identifier'] = gff3_tox['identifier'].map(extract_seqid)
gff3_nontox['identifier'] = gff3_nontox['identifier'].map(extract_seqid)

# ... then attach each entry's processed sequence (inner join on 'identifier').
gff3_tox = gff3_tox.merge(proc_tox, on='identifier')
gff3_nontox = gff3_nontox.merge(proc_nontox, on='identifier')

In [12]:
# Preview the toxin signal-peptide annotations joined with their processed sequences.
gff3_tox

Unnamed: 0,identifier,source,feature_type,start,end,score,strand,phase,attributes,Sequence
0,A0A0B4U9L8,SignalP-6.0,signal_peptide,1,20,0.999781,.,.,.,IILESGNVNDYEVVYPQKLTALLKGAIQQPEQKYEDAMQYEFKVNG...
1,A0A0B5A8P4,SignalP-6.0,signal_peptide,1,24,0.999788,.,.,.,NQHTRNSDTPKHRCGSELADQYVQLCHGKRNDAGKKRGRASPLWQR...
2,A0A0B5AC95,SignalP-6.0,signal_peptide,1,24,0.999788,.,.,.,NQHTRTFDTPKHRCGSEITNSYMDLCYRKRNDAGEKRGRASPLWQR...
3,A0A0N7CSQ4,SignalP-6.0,signal_peptide,1,25,0.999729,.,.,.,EEISPLKIVVRNSEYLNNPCNGVTCPSGYRCSIVDKQCIKKEK
4,A0A193CHJ5,SignalP-6.0,signal_peptide,1,16,0.998810,.,.,.,HLLQFNKMIKFETRKNAIPFYAFYGCYCGWGGRGRPKDATDRCCFV...
...,...,...,...,...,...,...,...,...,...,...
3455,W4VSI7,SignalP-6.0,signal_peptide,1,19,0.999751,.,.,.,HRLNSGYGIPHIVEKLPNGQWCRTPGDDCSESKQCCKPEDTATYAH...
3456,W4VSI8,SignalP-6.0,signal_peptide,1,19,0.999768,.,.,.,ATSEEISAAVSEIISQHQEDLERYAKIVERGEEPKKYIRCSKQLGQ...
3457,W4VSI9,SignalP-6.0,signal_peptide,1,21,0.999693,.,.,.,SEVQQLSPAEEEFRAFVSTFGGLFETEERGVDSEDCRAMFGGCGED...
3458,X5IFY8,SignalP-6.0,signal_peptide,1,23,0.999690,.,.,.,DGDQPAARNAVPRDDNPDGPSAKFMNVQRRSGCPWEPWCG


In [14]:
# Toxin entries whose signal-peptide score is below 0.9.
gff3_tox.loc[gff3_tox['score'].lt(0.9)]

Unnamed: 0,identifier,source,feature_type,start,end,score,strand,phase,attributes,Sequence
7,A0A2U8QPE6,SignalP-6.0,signal_peptide,1,19,0.582636,.,.,.,DDTRPLGECFREADYEEFLEIARNGLKKTSNPKHVVVVGAGMSGLS...
22,A8QL52,SignalP-6.0,signal_peptide,1,18,0.65587,.,.,.,ADDRRSALEECFREADYEEFLEIARNGLKKTSNPKHVVVVGAGMAG...
40,C0HJE7,SignalP-6.0,signal_peptide,1,19,0.622422,.,.,.,HDRNPLEECFRETDYEEFLEIARNGLTVTSNPKHVVIVGAGMAGLS...
79,G8XQX1,SignalP-6.0,signal_peptide,1,18,0.618284,.,.,.,ADDKNPLEECFREDDYEEFLEIAKNGLKKTSNPKHIVIVGAGMSGL...
82,J7H670,SignalP-6.0,signal_peptide,1,19,0.590507,.,.,.,DDRNPLGECFRETDYEEFLEIAKNGLRATSNPKHVVIVGAGMSGLS...
89,O93364,SignalP-6.0,signal_peptide,1,19,0.633526,.,.,.,HDRNPLEECFRETDYEEFLEIAKNGLTATSNPKRVVIVGAGMAGLS...
113,P07231,SignalP-6.0,signal_peptide,1,21,0.735614,.,.,.,TGTLDDGGALTERRSADATALKAEPVLLQKSAARSTDDNGKDRLTQ...
179,P56742,SignalP-6.0,signal_peptide,1,19,0.633526,.,.,.,HDRNPLEECFRETDYEEFLEIAKNGLTATSNPKRVVIVGAGMAGLS...
182,P58806,SignalP-6.0,signal_peptide,1,26,0.500608,.,.,.,HGGALTERRSTDATALKPEPVLLQKSSARSTDDNGNDRLTQMKRIL...
210,P81382,SignalP-6.0,signal_peptide,1,19,0.62441,.,.,.,DDRNPLAECFQENDYEEFLEIARNGLKATSNPKHVVIVGAGMAGLS...


## Clustering
### run mmseqs2 30% sequence similarity clustering

!mmseqs easy-cluster data/toxins.fasta data/mmseqs_0.3/toxins_cluster data/mmseqs_0.3/tmp --min-seq-id 0.3

In [None]:
# Collect [id, sequence] for every representative sequence of the 30% clustering.
with open("./data/mmseqs_0.3/toxins_cluster_rep_seq.fasta") as f:
    rep_data = [[record.id, str(record.seq)] for record in SeqIO.parse(f, "fasta")]

# Build the frame and pull in each representative's protein family from `tox`
# (left join, so representatives missing from `tox` are kept with NaN).
toxins_rep = pd.DataFrame(rep_data, columns=["Entry", "Sequence"]).merge(
    tox[["Entry", "Protein families"]],
    on="Entry",
    how="left",
)
toxins_rep

In [None]:
# Two-column, header-less cluster table from the 30% mmseqs run.
# NOTE(review): presumably column 1 is the cluster representative and
# column 2 a member of that cluster; verify against the mmseqs output format.
cluster_df = pd.read_csv(
    "./data/mmseqs_0.3/toxins_cluster_cluster.tsv",
    sep='\t',
    header=None,
    names=['Entry', 'cluster_member'],
)
cluster_df

### noise

In [212]:
# How many 'Entry' values occur on more than one row, i.e. clusters with
# several members.
rows_per_entry = cluster_df['Entry'].value_counts()
print(rows_per_entry.ne(1).sum())

403


In [1]:
# Integer cluster label per row: dense rank of the 'Entry' value, shifted so
# the labels start at 0.
entry_rank = cluster_df['Entry'].rank(method='dense')
cluster_df['Cluster'] = entry_rank.astype(int) - 1
cluster_df

NameError: name 'cluster_df' is not defined

In [214]:
# Attach each toxin's cluster label by matching its accession against the
# cluster members (left join), then drop the helper key column.
cluster_lookup = cluster_df[['cluster_member', 'Cluster']]
tox = tox.merge(cluster_lookup, left_on='Entry', right_on='cluster_member', how='left')
tox = tox.drop(columns='cluster_member')
tox

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,Cluster
0,A0A0B4U9L8,Vipera ammodytes ammodytes (Western sand viper),Zinc metalloproteinase-disintegrin-like protei...,Venom metalloproteinase (M12B) family,MLQVLLVTICLAVFPYQGSSIILESGNVNDYEVVYPQKLTALLKGA...,,60
1,A0A0B5A8P4,Conus geographus (Geography cone) (Nubecula ge...,Con-Ins G3 (Insulin 3) [Cleaved into: Con-Ins ...,Insulin family,MTTSFYFLLVALGLLLYVCQSSFGNQHTRNSDTPKHRCGSELADQY...,,7
2,A0A0B5AC95,Conus geographus (Geography cone) (Nubecula ge...,Con-Ins G1a (Insulin 1) [Cleaved into: Con-Ins...,Insulin family,MTTSSYFLLMALGLLLYVCQSSFGNQHTRTFDTPKHRCGSEITNSY...,,7
3,A0A0D4WV12,Sicarius terrosus (Cave spider),Dermonecrotic toxin StSicTox-betaIB1i (EC 4.6....,Phospholipase family,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRT...,,8
4,A0A0N7CSQ4,Scolopendra mutilans (Chinese red-headed centi...,Tau-scoloptoxin(04)-Ssm1b (Tau-SLPTX(04)-Ssm1b...,Scoloptoxin family,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCN...,,12
...,...,...,...,...,...,...,...
5034,W4VSI7,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-13,Neurotoxin family,MKPTISILIFFALAVAIMGHRLNSGYGIPHIVEKLPNGQWCRTPGD...,,748
5035,W4VSI8,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-8,Neurotoxin family,MMKLYSLVIIATLAAAAFAATSEEISAAVSEIISQHQEDLERYAKI...,,19
5036,W4VSI9,Trittame loki (Brush-footed trapdoor spider),U10-barytoxin-Tl1a (U10-BATX-Tl1a) (Toxin ICK-3),Neurotoxin family,MKTLVLVAVLGVASLYLLSSASEVQQLSPAEEEFRAFVSTFGGLFE...,,581
5037,X5IFY8,Conus geographus (Geography cone) (Nubecula ge...,Contryphan-G,Conotoxin family,MGKLTILVLVAAVLLSTQAMVQGDGDQPAARNAVPRDDNPDGPSAK...,,24


In [215]:
# IDs of the representative sequences from the 30% clustering.
rep_seq_ids = set()
for record in SeqIO.parse("data/mmseqs_0.3/toxins_cluster_rep_seq.fasta", "fasta"):
    rep_seq_ids.add(record.id)

# True for toxins that are the representative of their cluster.
tox['Cluster_Rep'] = tox['Entry'].isin(rep_seq_ids)
tox

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,Cluster,Cluster_Rep
0,A0A0B4U9L8,Vipera ammodytes ammodytes (Western sand viper),Zinc metalloproteinase-disintegrin-like protei...,Venom metalloproteinase (M12B) family,MLQVLLVTICLAVFPYQGSSIILESGNVNDYEVVYPQKLTALLKGA...,,60,False
1,A0A0B5A8P4,Conus geographus (Geography cone) (Nubecula ge...,Con-Ins G3 (Insulin 3) [Cleaved into: Con-Ins ...,Insulin family,MTTSFYFLLVALGLLLYVCQSSFGNQHTRNSDTPKHRCGSELADQY...,,7,False
2,A0A0B5AC95,Conus geographus (Geography cone) (Nubecula ge...,Con-Ins G1a (Insulin 1) [Cleaved into: Con-Ins...,Insulin family,MTTSSYFLLMALGLLLYVCQSSFGNQHTRTFDTPKHRCGSEITNSY...,,7,False
3,A0A0D4WV12,Sicarius terrosus (Cave spider),Dermonecrotic toxin StSicTox-betaIB1i (EC 4.6....,Phospholipase family,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRT...,,8,True
4,A0A0N7CSQ4,Scolopendra mutilans (Chinese red-headed centi...,Tau-scoloptoxin(04)-Ssm1b (Tau-SLPTX(04)-Ssm1b...,Scoloptoxin family,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCN...,,12,True
...,...,...,...,...,...,...,...,...
5034,W4VSI7,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-13,Neurotoxin family,MKPTISILIFFALAVAIMGHRLNSGYGIPHIVEKLPNGQWCRTPGD...,,748,False
5035,W4VSI8,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-8,Neurotoxin family,MMKLYSLVIIATLAAAAFAATSEEISAAVSEIISQHQEDLERYAKI...,,19,False
5036,W4VSI9,Trittame loki (Brush-footed trapdoor spider),U10-barytoxin-Tl1a (U10-BATX-Tl1a) (Toxin ICK-3),Neurotoxin family,MKTLVLVAVLGVASLYLLSSASEVQQLSPAEEEFRAFVSTFGGLFE...,,581,False
5037,X5IFY8,Conus geographus (Geography cone) (Nubecula ge...,Contryphan-G,Conotoxin family,MGKLTILVLVAAVLLSTQAMVQGDGDQPAARNAVPRDDNPDGPSAK...,,24,False


### Train-Val-Test sets with 70:15:15 split

In [216]:
# Cluster-aware 70:15:15 split: whole clusters are assigned to a split, so all
# members of a cluster end up in the same set.

# Get unique clusters and shuffle them (fixed seed for reproducibility)
clusters = tox['Cluster'].unique()
np.random.seed(42)
np.random.shuffle(clusters)

# Entries per cluster, counted once up front instead of re-filtering the whole
# frame for every cluster inside the loop. Missing cluster labels are not
# counted and therefore get size 0 via .get() below.
cluster_sizes = tox['Cluster'].value_counts().to_dict()

# Calculate total number of entries
total_entries = len(tox)

# Define target sizes
train_target = 0.7 * total_entries
val_target = 0.15 * total_entries
test_target = 0.15 * total_entries  # not enforced: test takes whatever fits nowhere else

# Initialize cluster lists
train_clusters, val_clusters, test_clusters = [], [], []

# Initialize counters
train_count, val_count, test_count = 0, 0, 0

# Greedy assignment in shuffled order: a cluster goes to train if it still
# fits under the train target, otherwise to val if it fits there, otherwise
# to test.
for cluster in clusters:
    cluster_size = cluster_sizes.get(cluster, 0)

    if train_count + cluster_size <= train_target:
        train_clusters.append(cluster)
        train_count += cluster_size
    elif val_count + cluster_size <= val_target:
        val_clusters.append(cluster)
        val_count += cluster_size
    else:
        test_clusters.append(cluster)
        test_count += cluster_size

# Filter original df by cluster IDs
train_df = tox[tox['Cluster'].isin(train_clusters)]
val_df = tox[tox['Cluster'].isin(val_clusters)]
test_df = tox[tox['Cluster'].isin(test_clusters)]

# Shuffle each dataset
train_df = shuffle(train_df, random_state=42).reset_index(drop=True)
val_df = shuffle(val_df, random_state=42).reset_index(drop=True)
test_df = shuffle(test_df, random_state=42).reset_index(drop=True)

# Print results
print(f"Total entries: {total_entries}")
print(f"Train entries: {len(train_df)} ({len(train_df)/total_entries:.2%})")
print(f"Validation entries: {len(val_df)} ({len(val_df)/total_entries:.2%})")
print(f"Test entries: {len(test_df)} ({len(test_df)/total_entries:.2%})")

# Output dataframes
train_df, val_df, test_df

Total entries: 5039
Train entries: 3527 (69.99%)
Validation entries: 755 (14.98%)
Test entries: 757 (15.02%)


(           Entry                                           Organism  \
 0         Q4VM07  Macrovipera lebetinus (Levantine viper) (Viper...   
 1         B6DD05  Lycosa singoriensis (Wolf spider) (Aranea sing...   
 2         P0CI57      Lychas mucronatus (Chinese swimming scorpion)   
 3         P10116  Laticauda colubrina (Yellow-lipped sea krait) ...   
 4         Q1ELU5                       Lachesana tarabaevi (Spider)   
 ...          ...                                                ...   
 3522  A0A2P1BRQ0          Scorpaena plumieri (Spotted scorpionfish)   
 3523      C1IC52        Walterinnesia aegyptia (Desert black snake)   
 3524      P82942    Naja kaouthia (Monocled cobra) (Naja siamensis)   
 3525      W4VS70              Conus victoriae (Queen Victoria cone)   
 3526      Q3YEE2                    Conus capitaneus (Captain cone)   
 
                                           Protein names  \
 0     Zinc metalloproteinase-disintegrin-like VLAIP-...   
 1     U10-lyc

In [None]:
# Write one FASTA file per split (train, validation, test).
for split_frame, fasta_path in (
    (train_df, "./data/toxins_train.fasta"),
    (val_df, "./data/toxins_val.fasta"),
    (test_df, "./data/toxins_test.fasta"),
):
    write_fasta(split_frame, fasta_path)

### actual redundancy reduction, 50% sequence similarity for train, val and test

In [None]:
# Cluster the training split with --min-seq-id 0.5; outputs use the prefix data/mmseqs_0.5/train_cluster.
!mmseqs easy-cluster data/toxins_train.fasta data/mmseqs_0.5/train_cluster data/mmseqs_0.5/tmp --min-seq-id 0.5

In [219]:
# Cluster the validation split with --min-seq-id 0.5; outputs use the prefix data/mmseqs_0.5/val_cluster.
!mmseqs easy-cluster data/toxins_val.fasta data/mmseqs_0.5/val_cluster data/mmseqs_0.5/tmp --min-seq-id 0.5

easy-cluster data/toxins_val.fasta data/mmseqs_0.5/val_cluster data/mmseqs_0.5/tmp --min-seq-id 0.5 

MMseqs Version:                     	17.b804f
Substitution matrix                 	aa:blosum62.out,nucl:nucleotide.out
Seed substitution matrix            	aa:VTML80.out,nucl:nucleotide.out
Sensitivity                         	4
k-mer length                        	0
Target search mode                  	0
k-score                             	seq:2147483647,prof:2147483647
Alphabet size                       	aa:21,nucl:5
Max sequence length                 	65535
Max results per query               	20
Split database                      	0
Split mode                          	2
Split memory limit                  	0
Coverage threshold                  	0.8
Coverage mode                       	0
Compositional bias                  	1
Compositional bias                  	1
Diagonal scoring                    	true
Exact k-mer matching                	0
Mask residues

In [220]:
# Cluster the test split with --min-seq-id 0.5; outputs use the prefix data/mmseqs_0.5/test_cluster.
!mmseqs easy-cluster data/toxins_test.fasta data/mmseqs_0.5/test_cluster data/mmseqs_0.5/tmp --min-seq-id 0.5

easy-cluster data/toxins_test.fasta data/mmseqs_0.5/test_cluster data/mmseqs_0.5/tmp --min-seq-id 0.5 

MMseqs Version:                     	17.b804f
Substitution matrix                 	aa:blosum62.out,nucl:nucleotide.out
Seed substitution matrix            	aa:VTML80.out,nucl:nucleotide.out
Sensitivity                         	4
k-mer length                        	0
Target search mode                  	0
k-score                             	seq:2147483647,prof:2147483647
Alphabet size                       	aa:21,nucl:5
Max sequence length                 	65535
Max results per query               	20
Split database                      	0
Split mode                          	2
Split memory limit                  	0
Coverage threshold                  	0.8
Coverage mode                       	0
Compositional bias                  	1
Compositional bias                  	1
Diagonal scoring                    	true
Exact k-mer matching                	0
Mask residu

In [221]:
# Representative-sequence FASTA of each 50% mmseqs run, keyed by the boolean
# flag column it populates.
rep_seq_files = {
    "Train_Cluster_Rep": "data/mmseqs_0.5/train_cluster_rep_seq.fasta",
    "Val_Cluster_Rep": "data/mmseqs_0.5/val_cluster_rep_seq.fasta",
    "Test_Cluster_Rep": "data/mmseqs_0.5/test_cluster_rep_seq.fasta",
}

# Flag, per split, which toxin entries were kept as cluster representatives.
# The flags are stored on `tox`, the frame the summary cells read them from;
# no notebook-level `df` is defined anywhere.
for col, file in rep_seq_files.items():
    rep_seq_ids = {record.id for record in SeqIO.parse(file, "fasta")}
    tox[col] = tox['Entry'].isin(rep_seq_ids)

tox

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,Cluster,Cluster_Rep,Train_Cluster_Rep,Val_Cluster_Rep,Test_Cluster_Rep
0,A0A0B4U9L8,Vipera ammodytes ammodytes (Western sand viper),Zinc metalloproteinase-disintegrin-like protei...,Venom metalloproteinase (M12B) family,MLQVLLVTICLAVFPYQGSSIILESGNVNDYEVVYPQKLTALLKGA...,,60,False,False,False,False
1,A0A0B5A8P4,Conus geographus (Geography cone) (Nubecula ge...,Con-Ins G3 (Insulin 3) [Cleaved into: Con-Ins ...,Insulin family,MTTSFYFLLVALGLLLYVCQSSFGNQHTRNSDTPKHRCGSELADQY...,,7,False,False,False,False
2,A0A0B5AC95,Conus geographus (Geography cone) (Nubecula ge...,Con-Ins G1a (Insulin 1) [Cleaved into: Con-Ins...,Insulin family,MTTSSYFLLMALGLLLYVCQSSFGNQHTRTFDTPKHRCGSEITNSY...,,7,False,False,False,False
3,A0A0D4WV12,Sicarius terrosus (Cave spider),Dermonecrotic toxin StSicTox-betaIB1i (EC 4.6....,Phospholipase family,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRT...,,8,True,True,False,False
4,A0A0N7CSQ4,Scolopendra mutilans (Chinese red-headed centi...,Tau-scoloptoxin(04)-Ssm1b (Tau-SLPTX(04)-Ssm1b...,Scoloptoxin family,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCN...,,12,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
5034,W4VSI7,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-13,Neurotoxin family,MKPTISILIFFALAVAIMGHRLNSGYGIPHIVEKLPNGQWCRTPGD...,,748,False,False,False,False
5035,W4VSI8,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-8,Neurotoxin family,MMKLYSLVIIATLAAAAFAATSEEISAAVSEIISQHQEDLERYAKI...,,19,False,False,False,False
5036,W4VSI9,Trittame loki (Brush-footed trapdoor spider),U10-barytoxin-Tl1a (U10-BATX-Tl1a) (Toxin ICK-3),Neurotoxin family,MKTLVLVAVLGVASLYLLSSASEVQQLSPAEEEFRAFVSTFGGLFE...,,581,False,True,False,False
5037,X5IFY8,Conus geographus (Geography cone) (Nubecula ge...,Contryphan-G,Conotoxin family,MGKLTILVLVAAVLLSTQAMVQGDGDQPAARNAVPRDDNPDGPSAK...,,24,False,False,False,False


In [222]:
# Representatives kept per split after the 50% redundancy reduction. Each
# count is computed once; .item() turns the numpy integer into a plain int.
split_counts = {
    "Train": tox["Train_Cluster_Rep"].sum().item(),
    "Validation": tox["Val_Cluster_Rep"].sum().item(),
    "Test": tox["Test_Cluster_Rep"].sum().item(),
}
all_50_pct_seq = sum(split_counts.values())

print("All representative sequences after 50% redundancy reduction:", all_50_pct_seq)
# No subscript quotes inside the f-string, so this also parses on Python
# versions older than 3.12.
for split_name, split_count in split_counts.items():
    print(f"{split_name} entries: {split_count} ({split_count/all_50_pct_seq:.2%})")

All representative sequences after 50% redundancy reduction: 1045
Train entries: 757 (72.44%)
Validation entries: 164 (15.69%)
Test entries: 124 (11.87%)


In [223]:
# Keep only the toxins that are a 50%-cluster representative in at least one
# of the three splits.
rep_flag_columns = ["Train_Cluster_Rep", "Val_Cluster_Rep", "Test_Cluster_Rep"]
is_split_representative = tox[rep_flag_columns].any(axis=1)
train_val_test_data = tox[is_split_representative]
train_val_test_data

Unnamed: 0,Entry,Organism,Protein names,Protein families,Sequence,Fragment,Cluster,Cluster_Rep,Train_Cluster_Rep,Val_Cluster_Rep,Test_Cluster_Rep
3,A0A0D4WV12,Sicarius terrosus (Cave spider),Dermonecrotic toxin StSicTox-betaIB1i (EC 4.6....,Phospholipase family,GDSRRPIWNIAHMVNDLDLVDEYLDDGANSLELDVEFSKSGTALRT...,,8,True,True,False,False
4,A0A0N7CSQ4,Scolopendra mutilans (Chinese red-headed centi...,Tau-scoloptoxin(04)-Ssm1b (Tau-SLPTX(04)-Ssm1b...,Scoloptoxin family,MLKSFCILSVFMVLFLAKFPDLCSGEEISPLKIVVRNSEYLNNPCN...,,12,True,True,False,False
7,A0A1L4BJ98,Hemiscorpius lepturus (Scorpion),Dermonecrotic toxin Hl-PLD1 (EC 4.6.1.-) (Phos...,Phospholipase family,MAHCYYNSKRGCNRVMKTVALVVLISTVMVEESRGDSQEDKKRPIW...,,8,False,True,False,False
15,A0A5C2A2T2,Conus purpurascens (Purple cone),Conodipine-P1 (Cdpi-P1) (EC 3.1.1.4) (Phosphat...,Phospholipase family,MKLLAPVLWAMAALGVTWLVAVDSKESCTKHSNGCSTPLRLPCQEY...,,35,False,True,False,False
16,A0A6B7FMR5,Vipera ammodytes ammodytes (Western sand viper),Disintegrin-like/cysteine-rich protein MPIII-3...,Venom metalloproteinase (M12B) family,MIQVLLVIICLAVFPYQVSSIILESGNINNYEVVYPQKVTALPKGA...,,36,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
5024,W4VSB6,Conus victoriae (Queen Victoria cone),Conotoxin Vc7.1 (H_Vc7.1),Conotoxin family,MNTAGRLLLLCLALGLVFESLGIPVADDVEAVRDTDPDEKDPSVHN...,,744,False,True,False,False
5026,W4VSB9,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-9,Neurotoxin family,MMKLYSLVIIATLAAAAFAATSEEISAAVSEIISQHQQDLERYAKI...,,19,False,True,False,False
5028,W4VSG7,Conus victoriae (Queen Victoria cone),Conotoxin Vc1 (H_Vc1),Conotoxin family,MRTSGRLLLLCLAVGLLLESQAHPNADAGDATRDVGSDRTSVELSK...,,746,True,True,False,False
5033,W4VSI6,Trittame loki (Brush-footed trapdoor spider),Toxin ICK-18,Neurotoxin family,MKTIFALVFCCAIAVVVLGFGENEGSTIDHDQNNCKGPGSRCSNKN...,,748,True,True,False,False


In [224]:
# Summarise the redundancy-reduced set. The denominator must be the number of
# representatives (train_val_test_data), not len(tox), which counts every
# toxin regardless of the reduction.
n_representatives = len(train_val_test_data)
print("All representative sequences after 50% redundancy reduction:", n_representatives)
print(f"Train entries: {train_val_test_data['Train_Cluster_Rep'].sum()} ({train_val_test_data['Train_Cluster_Rep'].sum()/n_representatives:.2%})")
print(f"Validation entries: {train_val_test_data['Val_Cluster_Rep'].sum()} ({train_val_test_data['Val_Cluster_Rep'].sum()/n_representatives:.2%})")
print(f"Test entries: {train_val_test_data['Test_Cluster_Rep'].sum()} ({train_val_test_data['Test_Cluster_Rep'].sum()/n_representatives:.2%})")

All representative sequences after 50% redundancy reduction: 1045
Train entries: 757 (72.44%)
Validation entries: 164 (15.69%)
Test entries: 124 (11.87%)


In [225]:
# Family distribution of the redundancy-reduced set.
# NOTE(review): the recorded output of this cell totals 1,045 rows, which is
# the reduced set rather than the full `tox` frame, so the reduced frame is
# used explicitly here.
train_val_test_data["Protein families"].value_counts()

Protein families
Conotoxin family                                       215
Neurotoxin family                                      167
Scoloptoxin family                                      83
Short scorpion toxin superfamily                        69
Natriuretic, Bradykinin potentiating peptide family     52
                                                      ... 
Actinoporin family                                       1
ConoGAY family                                           1
Opioid neuropeptide precursor family                     1
Diuretic hormone class 2 family                          1
Cystatin family                                          1
Name: count, Length: 93, dtype: int64

In [226]:
# Families with at most 10 representatives in the redundancy-reduced set:
# print how many sequences they cover, then list them.
# NOTE(review): the recorded output matches the reduced set (1,045 rows), not
# the full `tox` frame, so the reduced frame is used explicitly.
family_counts = train_val_test_data["Protein families"].value_counts()
rare_families = family_counts[family_counts <= 10]
print(rare_families.sum())
rare_families

199


Protein families
Long chain scorpion toxin family           10
Bradykinin-related peptide family          10
Teretoxin family                            9
Vasopressin/oxytocin family                 8
FARP (FMRFamide related peptide) family     8
                                           ..
Actinoporin family                          1
ConoGAY family                              1
Opioid neuropeptide precursor family        1
Diuretic hormone class 2 family             1
Cystatin family                             1
Name: count, Length: 77, dtype: int64

In [227]:
# Families with more than 10 representatives in the redundancy-reduced set:
# print how many sequences they cover, then list them.
# NOTE(review): the recorded output matches the reduced set (1,045 rows), not
# the full `tox` frame, so the reduced frame is used explicitly.
family_counts = train_val_test_data["Protein families"].value_counts()
common_families = family_counts[family_counts > 10]
print(common_families.sum())
common_families

846


Protein families
Conotoxin family                                       215
Neurotoxin family                                      167
Scoloptoxin family                                      83
Short scorpion toxin superfamily                        69
Natriuretic, Bradykinin potentiating peptide family     52
MCD family                                              41
Long (4 C-C) scorpion toxin superfamily                 39
Snake three-finger toxin family                         38
Cationic peptide family                                 31
Non-disulfide-bridged peptide (NDBP) superfamily        19
Venom Kunitz-type family                                18
Formicidae venom family                                 16
Phospholipase family                                    16
Long (3 C-C) scorpion toxin superfamily                 15
Venom metalloproteinase (M12B) family                   15
Limacoditoxin family                                    12
Name: count, dtype: int64

## Remove SPs
!signalp6 --fastafile data/nontox.fasta --output_dir data/sp6/ --organism eukarya --mode fast --model_dir /Users/selin/Desktop/Uni/signalp6/signalp-6-package/models/

In [None]:
def fasta_to_dataframe(fasta_file):
    """Read a FASTA file into a two-column DataFrame.

    Parameters
    ----------
    fasta_file
        Path or handle passed straight to ``SeqIO.parse(..., "fasta")``.

    Returns
    -------
    pandas.DataFrame
        One row per record with columns ``identifier`` (the part of the
        record id after its last ``|``) and ``Sequence`` (the sequence as a
        string). Both columns exist even when the file holds no records, so
        a later merge on ``identifier`` still works.
    """
    data = [
        {"identifier": record.id.split('|')[-1], "Sequence": str(record.seq)}
        for record in SeqIO.parse(fasta_file, "fasta")
    ]
    # Explicit columns: an empty record list would otherwise produce a frame
    # with no columns at all.
    return pd.DataFrame(data, columns=["identifier", "Sequence"])


# SignalP6 processed sequences (input: 5,181 or 21,484 seqs)
# NOTE(review): this directory is spelled 'SP6', other cells use 'sp6';
# the two differ on case-sensitive file systems.
proc = fasta_to_dataframe("../data/SP6/processed_entries.fasta")

# SignalP GFF3 annotations: tab-separated, '#' lines skipped, no header row.
gff3 = pd.read_csv('../data/SP6/output.gff3', sep='\t', comment='#', header=None)

# Name the nine GFF3 fields; the first is called 'identifier' so it matches
# the key column of `proc`.
gff3.columns = [
    'identifier', 'source', 'feature_type', 'start', 'end',
    'score', 'strand', 'phase', 'attributes'
]


def extract_seqid(full_seqid):
    """Return the bare ID: text after the last '|' and before the first space."""
    after_last_pipe = full_seqid.rpartition('|')[2]
    return after_last_pipe.partition(' ')[0]


# Reduce the GFF3 sequence IDs to bare identifiers, then attach each entry's
# processed sequence (inner join on 'identifier').
gff3['identifier'] = gff3['identifier'].map(extract_seqid)
gff3 = gff3.merge(proc, on='identifier')
gff3