In [1]:
import pandas as pd

In [2]:
train = pd.read_excel('../data/bitter_train.xlsx')
test = pd.read_excel('../data/bitter_test.xlsx')

In [3]:
train.shape, test.shape

((512, 2), (128, 2))

In [4]:
train.head()

Unnamed: 0,sequence,label
0,YYY,0
1,IVY,1
2,PQAFP,1
3,VPPFLE,0
4,KVLAGM,1


In [5]:
test.head()

Unnamed: 0,sequence,label
0,RMLGQTPTK,1
1,PSFQP,1
2,PGP,0
3,FPF,0
4,ALNEINQFYQK,1


# Sequence Similarity Filtering for Train/Test Splits
To ensure a fair evaluation and address reviewer concerns, we will:
- Calculate pairwise sequence identity within the training set, within the test set, and between training and test sets.
- Remove or flag peptides that exceed a chosen similarity threshold (e.g., 80% or 90%) in any of these comparisons.
- This process helps avoid data leakage and ensures that highly similar peptides do not appear in both training and test sets.

In [6]:
# Install Biopython if not already installed
!pip install biopython --quiet

In [7]:
from Bio import pairwise2
import numpy as np
def sequence_identity(seq1, seq2):
    """Calculate sequence identity percentage between two sequences."""
    alignments = pairwise2.align.globalxx(seq1, seq2, one_alignment_only=True)
    aln1, aln2, score, start, end = alignments[0]
    matches = sum(a == b for a, b in zip(aln1, aln2))
    return matches / max(len(seq1), len(seq2)) * 100
def filter_similar_sequences(df1, df2=None, seq_col='Sequence', threshold=80):
    """Remove sequences in df1 that are too similar to each other or to those in df2."""
    keep = np.ones(len(df1), dtype=bool)
    seqs1 = df1[seq_col].tolist()
    # Within df1
    for i in range(len(seqs1)):
        for j in range(i+1, len(seqs1)):
            if keep[j] and sequence_identity(seqs1[i], seqs1[j]) >= threshold:
                keep[j] = False
    filtered_df1 = df1[keep].reset_index(drop=True)
    # Across df1 and df2
    if df2 is not None:
        seqs2 = df2[seq_col].tolist()
        keep = np.ones(len(filtered_df1), dtype=bool)
        for i, s1 in enumerate(filtered_df1[seq_col]):
            for s2 in seqs2:
                if sequence_identity(s1, s2) >= threshold:
                    keep[i] = False
                    break
        filtered_df1 = filtered_df1[keep].reset_index(drop=True)
    return filtered_df1



In [None]:
threshold = 80  # Set your desired threshold here (e.g., 80 or 90)
train_filtered = filter_similar_sequences(train, seq_col='sequence', threshold=threshold)
test_filtered = filter_similar_sequences(test, seq_col='sequence', threshold=threshold)
# Remove test sequences similar to any in train
test_filtered = filter_similar_sequences(test_filtered, df2=train_filtered, seq_col='sequence', threshold=threshold)
print(f"Filtered train: {len(train_filtered)} sequences\nFiltered test: {len(test_filtered)} sequences")

Filtered train: 428 sequences
Filtered test: 86 sequences


In [9]:
train_filtered.shape, test_filtered.shape

((428, 2), (86, 2))

In [10]:
train_filtered.to_excel('../data/bitter_train_filtered_80.xlsx', index=False)
test_filtered.to_excel('../data/bitter_test_filtered_80.xlsx', index=False)

In [1]:
import pandas as pd

In [2]:
train = pd.read_excel('../data/bitter_train_filtered_80.xlsx')
test = pd.read_excel('../data/bitter_test_filtered_80.xlsx')

In [3]:
train.shape, test.shape

((428, 2), (86, 2))

In [4]:
# check label distribution
train['label'].value_counts(), test['label'].value_counts()

(label
 1    219
 0    209
 Name: count, dtype: int64,
 label
 1    44
 0    42
 Name: count, dtype: int64)