In [4]:
!pip install Bio

Collecting Bio
  Downloading bio-1.8.1-py3-none-any.whl.metadata (5.7 kB)
Collecting biopython>=1.80 (from Bio)
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl.metadata (11 kB)
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.4.1-py3-none-any.whl.metadata (10 kB)
Downloading bio-1.8.1-py3-none-any.whl (321 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.3/321.3 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m91.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading g

# Getting Veltri Datasets

In [5]:
from Bio import SeqIO
import pandas as pd

def fasta_to_dataframe(fasta_file, label):
    """Convert a FASTA file into a DataFrame of sequences tagged with one label.

    Args:
        fasta_file: Passed unchanged to ``SeqIO.parse(..., "fasta")``.
        label: Value written to the ``label`` column of every row.

    Returns:
        pd.DataFrame with exactly two columns, ``sequence`` and ``label``, holding
        one row per parsed record in file order. ``sequence`` is
        ``str(record.seq)``. A file with no records yields an empty frame that
        still has both columns.
    """
    records = [
        {'sequence': str(record.seq), 'label': label}
        for record in SeqIO.parse(fasta_file, "fasta")
    ]
    # Name the columns explicitly: without this, an empty record list produces a
    # DataFrame with no columns at all, so the schema would depend on the input.
    return pd.DataFrame(records, columns=['sequence', 'label'])

# Training split: AMP records get label 1, DECOY records label 0; the two frames are
# stacked and every row is then drawn in a seeded random order (a shuffle).
amp_train = fasta_to_dataframe('AMP.tr.fa', label=1)
decoy_train = fasta_to_dataframe('DECOY.tr.fa', label=0)
train_df = pd.concat([amp_train, decoy_train], ignore_index=True)
train_df = train_df.sample(frac=1, random_state=42)

# Validation split: same recipe
amp_val = fasta_to_dataframe('AMP.eval.fa', label=1)
decoy_val = fasta_to_dataframe('DECOY.eval.fa', label=0)
val_df = pd.concat([amp_val, decoy_val], ignore_index=True)
val_df = val_df.sample(frac=1, random_state=42)

# Test split: same recipe
amp_test = fasta_to_dataframe('AMP.te.fa', label=1)
decoy_test = fasta_to_dataframe('DECOY.te.fa', label=0)
test_df = pd.concat([amp_test, decoy_test], ignore_index=True)
test_df = test_df.sample(frac=1, random_state=42)

# Write each shuffled split to its CSV; the (shuffled) index is not written out
for split_frame, csv_name in (
    (train_df, 'veltri_train.csv'),
    (val_df, 'veltri_val.csv'),
    (test_df, 'veltri_test.csv'),
):
    split_frame.to_csv(csv_name, index=False)

# Report how many rows each split ended up with
print(f"Veltri Train: {len(train_df)} samples")
print(f"Veltri Val: {len(val_df)} samples")
print(f"Veltri Test: {len(test_df)} samples")

Veltri Train: 1424 samples
Veltri Val: 708 samples
Veltri Test: 1424 samples


# Getting LMPred Datasets

In [6]:
import pandas as pd

# Load the X and y CSV files for each LMPred split
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')

X_val = pd.read_csv('X_val.csv')
y_val = pd.read_csv('y_val.csv')

X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')

# pd.concat(axis=1) aligns rows on the index and outer-joins by default, so an X/y
# pair with different row counts would be silently padded with NaN and written out.
# Fail loudly instead.
for split_name, features, labels in (
    ('train', X_train, y_train),
    ('val', X_val, y_val),
    ('test', X_test, y_test),
):
    if len(features) != len(labels):
        raise ValueError(
            f"LMPred {split_name}: X has {len(features)} rows but y has {len(labels)} rows"
        )

# Merge side by side (assuming y files have 'label' column and X files have
# 'sequence' — TODO confirm against the actual CSV headers).
# NOTE: train_df / val_df / test_df are also assigned by the Veltri cell earlier in
# this notebook; running this cell rebinds them to the LMPred frames.
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Save
train_df.to_csv('lmpred_train.csv', index=False)
val_df.to_csv('lmpred_val.csv', index=False)
test_df.to_csv('lmpred_test.csv', index=False)

print(f"LMPred Train: {len(train_df)} samples")
print(f"LMPred Val: {len(val_df)} samples")
print(f"LMPred Test: {len(test_df)} samples")

LMPred Train: 3005 samples
LMPred Val: 1504 samples
LMPred Test: 3007 samples
