In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqUtils import molecular_weight
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import molecular_weight, ProtParam
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
import warnings

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Matplotlib text settings, applied in one call:
#   - "font.sans-serif": use the SimHei font so Chinese labels render
#   - "axes.unicode_minus": draw minus signs with the ASCII hyphen
plt.rcParams.update({
    "font.sans-serif": ["SimHei"],
    "axes.unicode_minus": False,
})

# Project root; every data file in this notebook is resolved relative to it.
path = Path('D:/Coding_in_Python/deep_learning_porject_1')

In [2]:
def read_fasta(fname):
    """Load a FASTA file into a two-column DataFrame.

    Parameters
    ----------
    fname : str or os.PathLike
        Location of the FASTA file to read.

    Returns
    -------
    pandas.DataFrame
        One row per FASTA record with columns ``Id`` (the record id) and
        ``Sequence`` (the record's sequence as a plain ``str``).
    """
    with open(fname, "r") as f:
        # Parse from the handle opened here and materialise the records
        # while it is still open, so the file is fully read and closed
        # before the function returns.
        records = [(rec.id, str(rec.seq)) for rec in SeqIO.parse(f, "fasta")]
    return pd.DataFrame(records, columns=["Id", "Sequence"])

In [3]:
# Load the test-set FASTA (relative to the project root) into an
# Id/Sequence DataFrame, then display it as the cell output.
dic=read_fasta(path / 'data/test1.fasta')
dic

Unnamed: 0,Id,Sequence
0,peptide|1,EITVEPVRHPKKDPSEAE
1,peptide|1,MLKFILALALFLHLTMEASAACKGKKCPPTGFVGMRG
2,peptide|1,CAKKRNWCGKTEDCCCPMKCVYAWYNEQGSCQSTISALWKKC
3,peptide|1,RPPGFTPFRKA
4,peptide|1,GDCHKFLGWCRGEPDPCCEHLSCSRKHGWCVWDWTV
...,...,...
1121,peptide|0,MLRTYQPKKRHRKKVHGFRKRMSTKAGRNVLKRRRLKGRHRLTA
1122,peptide|0,MGQFFAYATVITVKENDHVA
1123,peptide|0,MDIISLGWVFLMVFFSFSLSLVVWARNGL
1124,peptide|0,MKNTVKLEQFVALKEKDLQKIQGGEMRKSNNNFFHFLRRI


In [4]:
def process_data(dic):
    """Derive per-peptide descriptor columns from an ``Id``/``Sequence`` frame.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    dic : pandas.DataFrame
        Must contain an ``Id`` column (text whose second ``|``-separated
        field is the label) and a ``Sequence`` column (amino-acid string).

    Returns
    -------
    pandas.DataFrame
        ``Sequence`` followed by ``toxicity`` (label text taken from ``Id``),
        ``length``, ``mass``, ``hydrophobicity``, ``isoelectric_point``,
        ``charge_at_pH7.4``, ``helix``, ``turn``, ``sheet`` and one
        ``comp_<aa>`` fraction column per letter of ``ARNDCEQGHILKMFPSTWYV``.

    Raises
    ------
    KeyError
        If a sequence contains a letter that is absent from the mass table.
    IndexError
        If an ``Id`` value contains no ``|`` separator.

    Notes
    -----
    Reads ``data/amino_acids.csv`` under the module-level ``path``; that file
    must provide ``Called`` and ``Mass`` columns.  Empty sequences are not
    supported: the composition step divides by the sequence length.
    """
    # 1. Extract the label from Id.  Dropping 'Id' first yields a new frame,
    #    so the column added next never touches the caller's DataFrame.
    features = dic.drop(columns=['Id'])
    features['toxicity'] = dic['Id'].apply(lambda x: x.split('|')[1])

    # 2. Sequence length.
    features['length'] = features['Sequence'].apply(len)

    # 3. Mass, assuming a linear (non-cyclic) peptide.
    #    Mass table source: 'Amino Acids: Formula, Molecular Weight',
    #    WebQC.Org, 10 February 2026, https://zh.webqc.org/aminoacids.php
    mass_table = pd.read_csv(path / 'data/amino_acids.csv')
    # A plain dict lookup per residue is far cheaper than DataFrame.loc.
    # Assumes 'Called' holds unique one-letter codes -- TODO confirm.
    residue_mass = mass_table.set_index('Called')['Mass'].to_dict()
    # One water (18.01528) is subtracted for each of the len(seq) - 1
    # peptide bonds.
    water_mass = 18.01528
    features['mass'] = features['Sequence'].apply(
        lambda seq: sum(residue_mass[aa] for aa in seq) - (len(seq) - 1) * water_mass
    )

    # 4. Build one ProteinAnalysis object per sequence and reuse it for all
    #    descriptors below instead of re-creating it for each one.
    analyzers = features['Sequence'].apply(ProteinAnalysis)

    # 5. Hydrophobicity (GRAVY).
    features['hydrophobicity'] = analyzers.apply(lambda x: x.gravy())

    # 6. Isoelectric point of the whole sequence (not a per-residue average).
    features['isoelectric_point'] = analyzers.apply(lambda x: x.isoelectric_point())

    # 7. Net charge at physiological pH 7.4.
    features['charge_at_pH7.4'] = analyzers.apply(lambda x: x.charge_at_pH(7.4))

    # 8. Secondary-structure propensities.
    ss_fractions = analyzers.apply(lambda x: x.secondary_structure_fraction())
    features['helix'] = ss_fractions.apply(lambda x: x[0])  # alpha-helix
    features['turn'] = ss_fractions.apply(lambda x: x[1])   # turn
    features['sheet'] = ss_fractions.apply(lambda x: x[2])  # beta-sheet

    # 9. Amino-acid composition: fraction of the sequence made up by each letter.
    amino_acids = 'ARNDCEQGHILKMFPSTWYV'
    for aa in amino_acids:
        # `a=aa` binds the current letter at definition time.
        features[f'comp_{aa}'] = features['Sequence'].apply(
            lambda seq, a=aa: seq.count(a) / len(seq)
        )

    return features

In [5]:
# Replace the raw Id/Sequence frame with its engineered-feature version and
# display the result.
# NOTE(review): rebinding `dic` makes this cell non-idempotent. process_data
# reads the 'Id' column, which the returned frame no longer has, so running
# this cell a second time without re-running the load cell raises KeyError.
dic = process_data(dic)
dic

Unnamed: 0,Sequence,toxicity,length,mass,hydrophobicity,isoelectric_point,charge_at_pH7.4,helix,turn,sheet,...,comp_L,comp_K,comp_M,comp_F,comp_P,comp_S,comp_T,comp_W,comp_Y,comp_V
0,EITVEPVRHPKKDPSEAE,1,18,2061.26294,-1.366667,5.101436,-2.295107,0.388889,0.277778,0.222222,...,0.000000,0.111111,0.000000,0.000000,0.166667,0.055556,0.055556,0.000000,0.000000,0.111111
1,MLKFILALALFLHLTMEASAACKGKKCPPTGFVGMRG,1,37,3952.90632,0.697297,9.697931,3.263581,0.513514,0.189189,0.351351,...,0.162162,0.108108,0.081081,0.081081,0.054054,0.027027,0.054054,0.000000,0.000000,0.027027
2,CAKKRNWCGKTEDCCCPMKCVYAWYNEQGSCQSTISALWKKC,1,42,4855.69402,-0.573810,8.701250,3.344044,0.309524,0.214286,0.238095,...,0.023810,0.142857,0.023810,0.000000,0.023810,0.071429,0.047619,0.071429,0.047619,0.023810
3,RPPGFTPFRKA,1,11,1273.49190,-1.036364,11.999968,2.554897,0.181818,0.363636,0.272727,...,0.000000,0.090909,0.000000,0.181818,0.272727,0.000000,0.090909,0.000000,0.000000,0.000000
4,GDCHKFLGWCRGEPDPCCEHLSCSRKHGWCVWDWTV,1,36,4234.80510,-0.577778,6.269761,-1.481168,0.166667,0.305556,0.277778,...,0.055556,0.055556,0.000000,0.027778,0.055556,0.055556,0.027778,0.111111,0.000000,0.055556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1121,MLRTYQPKKRHRKKVHGFRKRMSTKAGRNVLKRRRLKGRHRLTA,0,44,5439.55956,-1.602273,11.999968,18.371941,0.363636,0.136364,0.250000,...,0.090909,0.181818,0.045455,0.022727,0.022727,0.022727,0.068182,0.000000,0.022727,0.045455
1122,MGQFFAYATVITVKENDHVA,0,20,2241.53258,0.290000,5.302646,-1.681929,0.300000,0.150000,0.450000,...,0.000000,0.050000,0.050000,0.100000,0.000000,0.000000,0.100000,0.000000,0.050000,0.150000
1123,MDIISLGWVFLMVFFSFSLSLVVWARNGL,0,29,3349.03476,1.527586,5.587750,-0.714690,0.275862,0.275862,0.586207,...,0.172414,0.000000,0.068966,0.137931,0.000000,0.137931,0.000000,0.068966,0.000000,0.137931
1124,MKNTVKLEQFVALKEKDLQKIQGGEMRKSNNNFFHFLRRI,0,40,4837.65558,-0.787500,10.282917,4.310215,0.400000,0.200000,0.325000,...,0.100000,0.150000,0.050000,0.100000,0.000000,0.025000,0.025000,0.000000,0.000000,0.050000


In [6]:
# Write the feature table to CSV under the project data directory;
# index=False leaves the row index out of the file.
dic.to_csv(path / 'data/processed_test1_done.csv', index=False)