In [11]:
import pandas as pd
import numpy as np
import re
from pathlib import Path
from Bio import SeqIO

## load in SignalP6 results

In [16]:
def fasta_to_dataframe(fasta_file):
    """Load a FASTA file into a two-column DataFrame.

    Parameters
    ----------
    fasta_file
        Path or handle handed to ``SeqIO.parse(..., "fasta")``.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``identifier`` (the last
        ``'|'``-separated field of ``record.id``) and ``Sequence`` (the
        record's sequence as a plain string). Both columns are present even
        when the file contains no records, so downstream merges on
        ``identifier`` keep working on empty input.
    """
    rows = [
        {"identifier": record.id.split('|')[-1], "Sequence": str(record.seq)}
        for record in SeqIO.parse(fasta_file, "fasta")
    ]
    # Explicit columns: without them an empty record list would produce a
    # DataFrame that has no 'identifier'/'Sequence' columns at all.
    return pd.DataFrame(rows, columns=["identifier", "Sequence"])

# SignalP6 processed sequences (input: 5,181 or 21,484 seqs)
# `proc` keeps the column names produced by fasta_to_dataframe
# ('identifier', 'Sequence'); the former rename of 'Sequence' to itself was a
# no-op and has been dropped.
proc = fasta_to_dataframe("../data/SP6/processed_entries.fasta")
proc

Unnamed: 0,identifier,Sequence
0,A0A044RE18,IEHDSICIADVDDACPEPSHTVMRLRERNDKKAHLIAKQHGLEIRG...
1,A0A0B7P9G0,TSVDTSNKLLLQKANDFNLSQNLSSSRTRRTIANSFRIVGIRLEDE...
2,A0A0K2S4Q6,VSGPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIWHEMVE...
3,A0A0K3AWM6,DQRLSSTSISSMNGFSTTRKCEHITIPMCKNLDYNQTVFPNLLGHT...
4,A0A0R4IKU3,QDQSCCVHHAADIPRCRDACEQLASIRSESRLRHLLHRLPSYCPET...
...,...,...
4074,O08675,CQSGINVSDNSAKPTLTIKSFNGGPQNTFEEFPLSDIEGWTGATTT...
4075,O08692,LRQLRYEEIVDRAIEAYNQGRQGRPLFRLLSATPPSSQNPATNIPL...
4076,O08712,ETLPPKYLHYDPETGHQLLCDKCAPGTYLKQHCTVRRKTLCVPCPD...
4077,O08762,DPVSRSPLHRPHPSPPRSQHAHYLPSSRRPPRTPRFPLPLRIPAAQ...


In [17]:
# Read the SignalP6 GFF3 output as a headerless, tab-separated table;
# '#' is treated as the comment character.
gff3 = pd.read_csv(
    '../data/SP6/output.gff3',
    sep='\t',
    comment='#',
    header=None,
)

# Label the nine columns. The first one is called 'identifier' so it can be
# used as the join key against the other tables in this notebook.
gff3.columns = [
    'identifier',
    'source',
    'feature_type',
    'start',
    'end',
    'score',
    'strand',
    'phase',
    'attributes',
]
def extract_seqid(full_seqid):
    """Return the identifier token of a sequence id string.

    Takes the text after the last ``'|'`` in ``full_seqid`` and cuts it off at
    the first space.
    """
    after_last_pipe = full_seqid.rsplit('|', 1)[-1]
    token, _, _ = after_last_pipe.partition(' ')
    return token

# Reduce every GFF3 sequence id to its bare identifier token, then attach the
# matching processed sequence from `proc` (default inner join on 'identifier').
gff3['identifier'] = gff3['identifier'].map(extract_seqid)
gff3 = gff3.merge(proc, on='identifier')
gff3

Unnamed: 0,identifier,source,feature_type,start,end,score,strand,phase,attributes,Sequence
0,A0A044RE18,SignalP-6.0,signal_peptide,1,20,0.844694,.,.,.,IEHDSICIADVDDACPEPSHTVMRLRERNDKKAHLIAKQHGLEIRG...
1,A0A0B7P9G0,SignalP-6.0,signal_peptide,1,20,0.999824,.,.,.,TSVDTSNKLLLQKANDFNLSQNLSSSRTRRTIANSFRIVGIRLEDE...
2,A0A0K2S4Q6,SignalP-6.0,signal_peptide,1,24,0.999847,.,.,.,VSGPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIWHEMVE...
3,A0A0K3AWM6,SignalP-6.0,signal_peptide,1,16,0.999809,.,.,.,DQRLSSTSISSMNGFSTTRKCEHITIPMCKNLDYNQTVFPNLLGHT...
4,A0A0R4IKU3,SignalP-6.0,signal_peptide,1,22,0.999853,.,.,.,QDQSCCVHHAADIPRCRDACEQLASIRSESRLRHLLHRLPSYCPET...
...,...,...,...,...,...,...,...,...,...,...
4074,O08675,SignalP-6.0,signal_peptide,1,18,0.999954,.,.,.,CQSGINVSDNSAKPTLTIKSFNGGPQNTFEEFPLSDIEGWTGATTT...
4075,O08692,SignalP-6.0,signal_peptide,1,21,0.999849,.,.,.,LRQLRYEEIVDRAIEAYNQGRQGRPLFRLLSATPPSSQNPATNIPL...
4076,O08712,SignalP-6.0,signal_peptide,1,21,0.999857,.,.,.,ETLPPKYLHYDPETGHQLLCDKCAPGTYLKQHCTVRRKTLCVPCPD...
4077,O08762,SignalP-6.0,signal_peptide,1,21,0.999831,.,.,.,DPVSRSPLHRPHPSPPRSQHAHYLPSSRRPPRTPRFPLPLRIPAAQ...


### significant SignalP6 results

In [18]:
# Keep only the rows whose SignalP6 score is at least 0.95.
is_significant = gff3['score'].ge(0.95)
significant_sp6_results = gff3[is_significant]
significant_sp6_results

Unnamed: 0,identifier,source,feature_type,start,end,score,strand,phase,attributes,Sequence
1,A0A0B7P9G0,SignalP-6.0,signal_peptide,1,20,0.999824,.,.,.,TSVDTSNKLLLQKANDFNLSQNLSSSRTRRTIANSFRIVGIRLEDE...
2,A0A0K2S4Q6,SignalP-6.0,signal_peptide,1,24,0.999847,.,.,.,VSGPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIWHEMVE...
3,A0A0K3AWM6,SignalP-6.0,signal_peptide,1,16,0.999809,.,.,.,DQRLSSTSISSMNGFSTTRKCEHITIPMCKNLDYNQTVFPNLLGHT...
4,A0A0R4IKU3,SignalP-6.0,signal_peptide,1,22,0.999853,.,.,.,QDQSCCVHHAADIPRCRDACEQLASIRSESRLRHLLHRLPSYCPET...
5,A0A126GUP6,SignalP-6.0,signal_peptide,1,22,0.999823,.,.,.,QEIFGYCRTPDENSGTCINLRECGYLFELLQSEEVTEQDRRFLQAS...
...,...,...,...,...,...,...,...,...,...,...
4074,O08675,SignalP-6.0,signal_peptide,1,18,0.999954,.,.,.,CQSGINVSDNSAKPTLTIKSFNGGPQNTFEEFPLSDIEGWTGATTT...
4075,O08692,SignalP-6.0,signal_peptide,1,21,0.999849,.,.,.,LRQLRYEEIVDRAIEAYNQGRQGRPLFRLLSATPPSSQNPATNIPL...
4076,O08712,SignalP-6.0,signal_peptide,1,21,0.999857,.,.,.,ETLPPKYLHYDPETGHQLLCDKCAPGTYLKQHCTVRRKTLCVPCPD...
4077,O08762,SignalP-6.0,signal_peptide,1,21,0.999831,.,.,.,DPVSRSPLHRPHPSPPRSQHAHYLPSSRRPPRTPRFPLPLRIPAAQ...


## all sequences

In [19]:
# all seqs with their annotation
# NOTE: `all` shadows the Python builtin of the same name; the name is kept
# because the following cells refer to it.
all = pd.read_csv('../data/nontox_all_embeds.csv')

# This same CSV is overwritten by a later `all.to_csv(...)` call in this
# notebook, so on a re-run it may already carry a 'Sequence' column and a
# stray index column ('Unnamed: ...'). Drop such leftovers first; otherwise
# the merge below would emit 'Sequence_x'/'Sequence_y' instead of 'Sequence'
# and the later cells that read all['Sequence'] would fail.
stale_columns = [
    col for col in all.columns
    if col == 'Sequence' or str(col).startswith('Unnamed:')
]
all = all.drop(columns=stale_columns)

# Left join: every row of `all` is kept; rows without a significant SignalP6
# hit get NaN in 'Sequence'.
all = pd.merge(all, significant_sp6_results[['identifier', 'Sequence']], on='identifier', how='left')
all

Unnamed: 0,identifier,combined,Sequence
0,A0A023FDY8,"[0.055755615234375, 0.0185089111328125, 0.1109...",ENTQQEEQDYDYGTDTCPFPVLANKTNKAKFVGCHQKCNGGDQKLT...
1,A0A044RE18,"[0.0186614990234375, 0.0379638671875, 0.036102...",
2,A0A060XQP6,"[0.10888671875, 0.012176513671875, 0.170166015...",GTEADNDERAASLLVHLKGDKDGGGLTGSPDGVSAGTTDGTDSSKE...
3,A0A061ACU2,"[0.036407470703125, -0.059844970703125, 0.0141...",
4,A0A061I403,"[0.0618896484375, 0.0238494873046875, -0.02001...",
...,...,...,...
21479,W6RTA4,"[0.1593017578125, 0.0726318359375, 0.033996582...",
21480,X2JAU8,"[0.0020542144775390625, -0.0240478515625, 0.04...",
21481,X2JDY8,"[0.0010547637939453125, -0.0033245086669921875...",
21482,X4Y2L4,"[0.036163330078125, -0.00980377197265625, 0.02...",


## cleave off signal peptides based on UniProt annotation

In [20]:
# def cleave_signal_peptide(row):
#     if pd.notna(row['Signal peptide']):
#         match = re.search(r'SIGNAL\s+\d+\.\.(\d+)', row['Signal peptide'])
#         if match:
#             end = int(match.group(1))
#             cleaved_sequence = row['Sequence'][end:]
#             return cleaved_sequence
#         else:
#             # if end = ? (cleavage site unknown) -> return NaN so that a fallback sequence is used later
#             return np.nan
#     else:
#         # if there is no SP annotation, return NaN (the uncleaved sequence is used as a fallback later)
#         return np.nan
# 
# 
# #df_copy = all.copy()
# all['annotation cleaved Sequence'] = all.apply(cleave_signal_peptide, axis=1)
# 
# #diff_sp_annotation = df_copy
# #diff_sp_annotation
# all

In [21]:
# with open('data/mature_seqs.fasta', 'w') as f:
#     for _, row in all.iterrows():
#         
#         # if the data has an annotation, use the annotation
#         if not pd.isna(row['annotation cleaved Sequence']):
#             f.write(f">{row['Entry Name']}\n"
#                     f"{row['annotation cleaved Sequence']}\n")
#             continue
#         
#         # if the data has no annotation but SP6 predicted a significant SP, use this
#         elif not pd.isna(row['SP6 Sequence']):
#             f.write(f">{row['Entry Name']}\n"
#                     f"{row['SP6 Sequence']}\n")
#             continue
#         
#         # in any other case, just use the normal sequence
#         else:
#             f.write(f">{row['Entry Name']}\n"
#                     f"{row['Sequence']}\n")
#             continue


In [22]:
# Read all records of the full (uncleaved) FASTA file once.
fasta_records = list(SeqIO.parse('../data/nontox_all.fasta', "fasta"))

# Raw record ids and sequences as parallel lists.
ids = [fasta_record.id for fasta_record in fasta_records]
sequences = [str(fasta_record.seq) for fasta_record in fasta_records]

# Assemble the table, then keep only the second '|'-separated field of each
# record id (whitespace-stripped) as the identifier.
df = pd.DataFrame({
    'identifier': ids,
    'Sequence': sequences
})
df['identifier'] = df['identifier'].map(lambda header: header.split('|')[1].strip())
df

Unnamed: 0,identifier,Sequence
0,A0A026W182,MMKMKQQGLVADLLPNIRVMKTFGHFVFNYYNDNSSKYLHKVYCCV...
1,A0A044RE18,MYWQLVRILVLFDCLQKILAIEHDSICIADVDDACPEPSHTVMRLR...
2,A0A061ACU2,MTVPPLLKSCVVKLLLPAALLAAAIIRPSFLSIGYVLLALVSAVLP...
3,A0A061AE05,MLTPRDENNEGDAMPMLKKPRYSSLSGQSTNITYQEHTISREERAA...
4,A0A061I403,MPMASVIAVAEPKWISVWGRFLWLTLLSMALGSLLALLLPLGAVEE...
...,...,...
52466,Q9JJR9,MFYSGLLTEGGRKETDMREAASLRQQRRMKQAVQFIHKDSADLLPL...
52467,Q9U8G7,MSGFDVTKTFNRFTQRAGELVNKNEKTSYPTRTSDLIHEIDQMKAW...
52468,Q9UF83,MRRPSTASLTRTPSRASPTRMPSRASLKMTPFRASLTKMESTALLR...
52469,Q9VVR0,MPFANEGNDPIAARLSKCYWNLSSPFLKDVIPKKRPSKAFNRKPPT...


In [23]:
# Where `all` has no Sequence yet, fall back to the sequence that `df` lists
# for the same identifier.
sequence_by_identifier = df.set_index('identifier')['Sequence']
fallback_sequences = all['identifier'].map(sequence_by_identifier)
all['Sequence'] = all['Sequence'].fillna(fallback_sequences)
all

Unnamed: 0,identifier,combined,Sequence
0,A0A023FDY8,"[0.055755615234375, 0.0185089111328125, 0.1109...",ENTQQEEQDYDYGTDTCPFPVLANKTNKAKFVGCHQKCNGGDQKLT...
1,A0A044RE18,"[0.0186614990234375, 0.0379638671875, 0.036102...",MYWQLVRILVLFDCLQKILAIEHDSICIADVDDACPEPSHTVMRLR...
2,A0A060XQP6,"[0.10888671875, 0.012176513671875, 0.170166015...",GTEADNDERAASLLVHLKGDKDGGGLTGSPDGVSAGTTDGTDSSKE...
3,A0A061ACU2,"[0.036407470703125, -0.059844970703125, 0.0141...",MTVPPLLKSCVVKLLLPAALLAAAIIRPSFLSIGYVLLALVSAVLP...
4,A0A061I403,"[0.0618896484375, 0.0238494873046875, -0.02001...",MPMASVIAVAEPKWISVWGRFLWLTLLSMALGSLLALLLPLGAVEE...
...,...,...,...
21479,W6RTA4,"[0.1593017578125, 0.0726318359375, 0.033996582...",MSSWFSYFGFSKGPPLEEVREESEEDAQVPEQVVSKNTEEEIADAI...
21480,X2JAU8,"[0.0020542144775390625, -0.0240478515625, 0.04...",MQPPPRKGNYVKFLKNLHTEQVAKLQLKNQHECDLLEDIRQFTIKR...
21481,X2JDY8,"[0.0010547637939453125, -0.0033245086669921875...",MGGGKNVRRGLEPLEFEECIVDSPDFRENLNRHEKELDHTSHQIKR...
21482,X4Y2L4,"[0.036163330078125, -0.00980377197265625, 0.02...",MKEIAVTIDDKNVIASVSESFHGVAFDASLFSPKGLWSFVDITSPK...


In [26]:
# NOTE: this overwrites the CSV that `all` was loaded from earlier in the
# notebook. index=False keeps the positional index out of the file, so the
# next read does not pick up an extra 'Unnamed: 0' column.
all.to_csv('../data/nontox_all_embeds.csv', index=False)

In [27]:
def create_fasta(df, output_file):
    """Write the rows of ``df`` to ``output_file`` as FASTA records.

    Each row becomes a ``>identifier`` header line followed by one line
    holding the sequence.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'identifier'`` and ``'Sequence'``.
    output_file
        Path passed to ``open(..., 'w')``; an existing file is overwritten.

    Raises
    ------
    KeyError
        If ``df`` has rows but lacks one of the required columns.
    ValueError
        If any identifier or sequence is missing (NaN/None). Such values
        would otherwise be written as the literal text ``nan``. The check
        runs before the file is opened, so no partial file is left behind.
    """
    records = []
    if len(df) > 0:
        required = df[['identifier', 'Sequence']]
        n_missing = int(required.isna().any(axis=1).sum())
        if n_missing:
            raise ValueError(
                f"{n_missing} row(s) have a missing identifier or Sequence; "
                "refusing to write them to FASTA"
            )
        # Column-wise iteration avoids building one Series per row, which is
        # what iterrows() does.
        records = zip(required['identifier'], required['Sequence'])

    with open(output_file, 'w') as f:
        f.writelines(
            f">{identifier}\n{sequence}\n" for identifier, sequence in records
        )

In [28]:
# Export the identifier/Sequence pairs held in `all` as a FASTA file.
create_fasta(df=all, output_file='../data/reduced_nontox_noSP.fasta')