In [12]:
from Bio import SeqIO
import pandas as pd
import re
import os
import random

In [13]:
### Step1: Obtain positive CPPs
# The file holds one peptide sequence per line. Blank lines (e.g. extra empty
# lines at the end of the file) are skipped so that an empty string is never
# recorded as a positive CPP; such an entry would end up labelled as a
# positive sample and would also be counted when sizing the negative set.
with open('../source_data/positive-cpps.txt', 'r') as info:
    # Strip surrounding whitespace/newlines and keep only non-empty entries.
    positive_cpps = [line.strip() for line in info if line.strip()]

In [37]:
### Step2: Get non-CPPs candidates
# Scan the FASTA file and keep every short, unambiguous sequence that is not
# a known positive CPP. The kept records are written to all_negative_CPPs.csv.

fasta_file = 'uniprot_sprot.fasta'
save_dir = os.getcwd()

# save_dir is the current working directory, so this call does not create
# anything new here; it only matters if save_dir is pointed somewhere else.
os.makedirs(save_dir, exist_ok=True)
print(f"Directory created: {save_dir}")

pattern1 = r'\|(.*?)\|'     # text between the first two '|' of the header (stored as UniprotID)
pattern2 = r'GN=([^ ]+)'    # token following 'GN=' in the header (stored as GeneName)

# Filtering criteria for a candidate sequence.
MAX_LENGTH_EXCLUSIVE = 62                   # keep sequences strictly shorter than this
FORBIDDEN_RESIDUES = frozenset('XB-ZOU')    # sequences containing any of these characters are dropped

# A set gives constant-time membership tests; the list is left untouched for
# the cells that still need it.
positive_cpp_set = set(positive_cpps)

results = []
for record in SeqIO.parse(fasta_file, "fasta"):
    sequence = str(record.seq)
    sequence_length = len(sequence)

    # Apply the cheap sequence-only filters first so that header parsing is
    # only done for records that are actually kept.
    if sequence_length >= MAX_LENGTH_EXCLUSIVE:
        continue
    if not FORBIDDEN_RESIDUES.isdisjoint(sequence):
        continue
    if sequence in positive_cpp_set:
        continue

    header_info = record.description
    # First whitespace-delimited token of the header, third '|'-separated field.
    EntryName = header_info.split()[0].split('|')[2]
    UniqueId = re.findall(pattern1, header_info)[0]

    # The 'GN=' field is optional; fall back to an empty string when absent.
    gene_match = re.search(pattern2, header_info)
    GeneName = gene_match.group(1) if gene_match else ''

    results.append([UniqueId, EntryName, GeneName, sequence, sequence_length])

col_names = ['UniprotID', 'EntryName', 'GeneName','Sequence', 'Length']
prot_df = pd.DataFrame(results, columns=col_names)
# save_dir is the working directory, so this is the same location as the
# bare relative file name.
prot_df.to_csv(os.path.join(save_dir, 'all_negative_CPPs.csv'), index=False)

Directory created: c:\Users\KerryChen\Desktop\CPP_Manuscript_Revise\uniprot_sprot.fasta


In [41]:
### Step3: Obtain negative CPPs
# Build a class-balanced dataset: every positive peptide is paired with one
# randomly drawn, distinct candidate sequence, and the result is written to
# CPPSet1.csv with a binary Label column (1 = positive, 0 = negative).


def _labelled_frame(sequences, label):
    """Return a DataFrame with a 'Sequence' column and a constant 'Label' column."""
    frame = pd.DataFrame(sequences, columns=["Sequence"])
    frame["Label"] = label
    return frame


# Seed the global RNG so the same negatives are drawn on every run.
random.seed(42)

# Sample as many distinct candidate sequences as there are positives.
num_positive_cpps = len(positive_cpps)
unique_candidate_sequences = prot_df['Sequence'].unique().tolist()
random_negative_cpps = random.sample(unique_candidate_sequences, num_positive_cpps)

positive_df = _labelled_frame(positive_cpps, 1)
negative_df = _labelled_frame(random_negative_cpps, 0)

# Positives first, then negatives, with a fresh 0..n-1 index.
combined_df = pd.concat([positive_df, negative_df], ignore_index=True)
combined_df.to_csv('CPPSet1.csv', index=False)