In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqUtils import molecular_weight
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import molecular_weight, ProtParam
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
import warnings

# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Matplotlib text settings: SimHei so Chinese labels render correctly,
# and unicode_minus disabled so minus signs render correctly.
plt.rcParams.update({
    "font.sans-serif": ["SimHei"],
    "axes.unicode_minus": False,
})

# Project root; the data files used by later cells are resolved against it.
path = Path('D:/Coding_in_Python/deep_learning_porject_1')

In [2]:
def read_fasta(fname):
    """Load a FASTA file into a two-column DataFrame.

    Parameters
    ----------
    fname : str or path-like
        Location of the FASTA file.

    Returns
    -------
    pandas.DataFrame
        One row per record, with columns ``"Id"`` (the record id) and
        ``"Sequence"`` (the record sequence converted to ``str``).

    Raises
    ------
    OSError
        If ``fname`` cannot be opened for reading.
    """
    with open(fname, "r") as handle:
        # Parse from the handle we opened and build the list while it is
        # still open, so no lazy iterator outlives the ``with`` block and
        # the file is opened exactly once.
        records = [(rec.id, str(rec.seq))
                   for rec in SeqIO.parse(handle, "fasta")]
    return pd.DataFrame(records, columns=["Id", "Sequence"])

In [9]:
# Read the training FASTA into a DataFrame with "Id" and "Sequence" columns,
# then display it as the cell output.
dic=read_fasta(path / 'data/train_data.fasta')
dic

Unnamed: 0,Id,Sequence
0,peptide|1,APAKRWFGHEECTYWLGPCEVDDTCCSASCESKFCGLW
1,peptide|1,DCCIIAGCPFGCTICC
2,peptide|1,CKGKGAPCTRLMYDCCHGSCSSSKGRCG
3,peptide|1,LAKRADICQPGKTSQRACET
4,peptide|1,RSVCSNGCRPKPFGGCSC
...,...,...
6382,peptide|0,MPTWLTTIFSVVIILGIFAWIGLSIYQKIKQIRGKKKDKKEIERKESNK
6383,peptide|0,MNRVQFNHHHHHHPD
6384,peptide|0,MKIKFVFDLLTPDDILHPSNHVNLIIRPI
6385,peptide|0,MAAACRCLSLLLLSTCVALLL


In [10]:
def process_data(dic, mass_table_csv=None):
    """Derive the label and per-sequence descriptors from a peptide table.

    The input frame is left untouched; all columns are added to a copy.

    Parameters
    ----------
    dic : pandas.DataFrame
        Must contain an ``"Id"`` column whose values have the form
        ``"<name>|<label>"`` and a ``"Sequence"`` column of residue strings.
    mass_table_csv : str or path-like, optional
        CSV with a ``"Called"`` column (looked up by each residue letter)
        and a ``"Mass"`` column. Defaults to ``path / 'data/amino_acids.csv'``,
        where ``path`` is the notebook-level project root.

    Returns
    -------
    pandas.DataFrame
        ``Sequence`` plus, in order: ``toxicity`` (label text taken from
        ``Id``), ``length``, ``mass``, ``hydrophobicity``,
        ``isoelectric_point``, ``charge_at_pH7.4``, ``helix``, ``turn``,
        ``sheet`` and twenty ``comp_<residue>`` fraction columns. ``Id`` is
        dropped.

    Raises
    ------
    KeyError
        If a required column is missing or a residue is absent from the
        mass table.
    IndexError
        If an ``Id`` value contains no ``'|'`` separator.
    """
    # Copy first: assigning to the argument directly would leak a
    # 'toxicity' column into the caller's DataFrame.
    features = dic.copy()

    # 1. Extract the label from the Id column (text after the first '|').
    features['toxicity'] = features['Id'].apply(lambda x: x.split('|')[1])
    features = features.drop(columns=['Id'])

    # 2. Sequence length.
    features['length'] = features['Sequence'].apply(len)

    # 3. Mass, assuming a linear peptide.
    # Mass table source: 'Amino Acids: Formula, Molecular Weight', WebQC.Org,
    # 10 February 2026, https://zh.webqc.org/aminoacids.php
    if mass_table_csv is None:
        mass_table_csv = path / 'data/amino_acids.csv'
    # A plain dict replaces one DataFrame .loc lookup per residue.
    residue_mass = (
        pd.read_csv(mass_table_csv).set_index('Called')['Mass'].to_dict()
    )
    # One water is subtracted per bond between neighbouring residues.
    # NOTE(review): presumes 'Mass' holds free amino-acid masses - confirm.
    water_mass = 18.01528
    features['mass'] = features['Sequence'].apply(
        lambda seq: sum(residue_mass[aa] for aa in seq)
        - (len(seq) - 1) * water_mass
    )

    # 4. Build each ProteinAnalysis object once and reuse it below.
    analyzers = features['Sequence'].apply(ProteinAnalysis)

    # 5. Hydrophobicity (GRAVY).
    features['hydrophobicity'] = analyzers.apply(lambda x: x.gravy())

    # 6. Isoelectric point of the whole sequence (not a per-residue average).
    features['isoelectric_point'] = analyzers.apply(
        lambda x: x.isoelectric_point()
    )

    # 7. Net charge at physiological pH.
    features['charge_at_pH7.4'] = analyzers.apply(lambda x: x.charge_at_pH(7.4))

    # 8. Secondary-structure propensities; the tuple is read as
    #    (helix, turn, sheet).
    ss_fractions = analyzers.apply(lambda x: x.secondary_structure_fraction())
    features['helix'] = ss_fractions.apply(lambda x: x[0])  # alpha-helix
    features['turn'] = ss_fractions.apply(lambda x: x[1])   # turn
    features['sheet'] = ss_fractions.apply(lambda x: x[2])  # beta-sheet

    # 9. Amino-acid composition: fraction of each residue in the sequence.
    amino_acids = 'ARNDCEQGHILKMFPSTWYV'
    for aa in amino_acids:
        # Bind aa as a default argument so each lambda keeps its own residue.
        features[f'comp_{aa}'] = features['Sequence'].apply(
            lambda seq, a=aa: seq.count(a) / len(seq)
        )

    return features

In [11]:
# Replace the raw frame with the feature table and display it.
# NOTE(review): this cell cannot be re-run as-is - process_data reads the
# "Id" column, which its own result no longer contains.
dic = process_data(dic)
dic

Unnamed: 0,Sequence,toxicity,length,mass,hydrophobicity,isoelectric_point,charge_at_pH7.4,helix,turn,sheet,...,comp_L,comp_K,comp_M,comp_F,comp_P,comp_S,comp_T,comp_W,comp_Y,comp_V
0,APAKRWFGHEECTYWLGPCEVDDTCCSASCESKFCGLW,1,38,4316.85164,-0.286842,4.642632,-3.504757,0.289474,0.263158,0.289474,...,0.052632,0.052632,0.000000,0.052632,0.052632,0.078947,0.052632,0.078947,0.026316,0.026316
1,DCCIIAGCPFGCTICC,1,16,1622.01680,1.656250,4.050028,-1.589121,0.062500,0.250000,0.312500,...,0.000000,0.000000,0.000000,0.062500,0.062500,0.000000,0.062500,0.000000,0.000000,0.000000
2,CKGKGAPCTRLMYDCCHGSCSSSKGRCG,1,28,2896.39394,-0.489286,8.921087,3.437433,0.214286,0.392857,0.107143,...,0.035714,0.107143,0.035714,0.000000,0.035714,0.142857,0.035714,0.000000,0.035714,0.000000
3,LAKRADICQPGKTSQRACET,1,20,2176.48878,-0.815000,8.902713,1.504952,0.350000,0.200000,0.200000,...,0.050000,0.100000,0.000000,0.000000,0.050000,0.050000,0.100000,0.000000,0.000000,0.000000
4,RSVCSNGCRPKPFGGCSC,1,18,1858.16484,-0.344444,8.975691,2.456884,0.055556,0.500000,0.111111,...,0.000000,0.055556,0.000000,0.055556,0.111111,0.166667,0.000000,0.000000,0.000000,0.055556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6382,MPTWLTTIFSVVIILGIFAWIGLSIYQKIKQIRGKKKDKKEIERKESNK,0,49,5777.93976,-0.191837,10.170935,6.263591,0.346939,0.183673,0.448980,...,0.061224,0.183673,0.020408,0.040816,0.020408,0.061224,0.061224,0.040816,0.020408,0.040816
6383,MNRVQFNHHHHHHPD,0,15,1943.08438,-2.026667,7.016626,-0.493662,0.066667,0.266667,0.133333,...,0.000000,0.000000,0.066667,0.066667,0.066667,0.000000,0.000000,0.000000,0.000000,0.066667
6384,MKIKFVFDLLTPDDILHPSNHVNLIIRPI,0,29,3400.06136,0.382759,6.693610,-0.645555,0.241379,0.310345,0.482759,...,0.137931,0.068966,0.034483,0.068966,0.103448,0.034483,0.034483,0.000000,0.000000,0.068966
6385,MAAACRCLSLLLLSTCVALLL,0,21,2178.80090,2.114286,7.736378,0.211353,0.619048,0.095238,0.476190,...,0.380952,0.000000,0.047619,0.000000,0.000000,0.095238,0.047619,0.000000,0.000000,0.047619


In [12]:
# Write the feature table to CSV; index=False leaves the row index out of the file.
dic.to_csv(path / 'data/processed_train_data_done.csv', index=False)

In [13]:
# Show the (rows, columns) dimensions of the feature table.
dic.shape

(6387, 30)