In [22]:
import pandas as pd
import numpy as np

from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [None]:
# Load the preprocessed T-cell table from the working directory and preview
# its first rows as the cell output.
df = pd.read_csv(filepath_or_buffer='tcell_full_v3_processed.csv')
df.head()

In [26]:
def calculate_physchem(sequence):
    """Compute global physico-chemical descriptors for one peptide sequence.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence handed to Biopython's ``ProteinAnalysis``.

    Returns
    -------
    dict
        Maps feature name to a numeric value, with keys in this fixed order:
        ``molecular_weight``, ``aromaticity``, ``ss_helix``, ``ss_turn``,
        ``ss_sheet``, ``isoelectric_point``, ``gravy``, ``molar_extinction``,
        ``flexibility``, ``instability_index``. ``flexibility`` is NaN when
        the flexibility profile is empty.
    """
    analysis = ProteinAnalysis(sequence)

    # Evaluate the secondary-structure fractions once and index the result,
    # instead of repeating the whole calculation for each of the three values.
    ss_fractions = analysis.secondary_structure_fraction()

    # The flexibility profile is computed over a sliding window, so it can be
    # empty for short peptides. Return NaN explicitly in that case rather than
    # letting np.mean([]) emit a "Mean of empty slice" RuntimeWarning per row.
    flexibility_profile = analysis.flexibility()
    if len(flexibility_profile) > 0:
        mean_flexibility = np.mean(flexibility_profile)  # mean over the whole sequence
    else:
        mean_flexibility = np.nan

    features = {
        'molecular_weight': analysis.molecular_weight(),
        'aromaticity': analysis.aromaticity(),
        'ss_helix': ss_fractions[0],
        'ss_turn': ss_fractions[1],
        'ss_sheet': ss_fractions[2],
        'isoelectric_point': analysis.isoelectric_point(),
        'gravy': analysis.gravy(),
        'molar_extinction': analysis.molar_extinction_coefficient()[0], # with reduced cysteines
        'flexibility': mean_flexibility,
        'instability_index': analysis.instability_index(),
        # 'charge_at_pH': analysis.charge_at_pH(pH=7.4) -- left out: highly linearly correlated with isoelectric_point
    }
    return features


In [27]:
# Compute the physico-chemical features once per epitope and assemble them in
# a single step: building the frame from a list of dicts avoids constructing
# one pandas Series per row, which is what `.apply(pd.Series)` does.
physchem_features = pd.DataFrame(
    df['Epitope Seq'].map(calculate_physchem).tolist(),
    index=df.index,
).astype('float64')  # keep every feature column float64, including the integer-valued extinction coefficient
df = pd.concat([df, physchem_features], axis=1)

In [29]:
from propy import PyPro
from joblib import Parallel, delayed
from tqdm import tqdm

In [30]:
# Register tqdm's pandas integration (adds the `progress_apply` family of methods).
# NOTE(review): no `progress_*` call is visible in this part of the notebook -- confirm this is still needed.
tqdm.pandas() 

Дескрипторы PAAC, APAAC, QSO и shannon_entropy выбраны на основе статьи Meta IL-4.

In [None]:
def get_PAAC(sequence):
    """Return the PAAC (pseudo amino acid composition) descriptor values for one sequence.

    The sequence is wrapped in propy's ``GetProDes`` and ``GetPAAC`` is called
    with ``lamda=6`` and ``weight=0.05``. Only the values of the returned
    mapping are kept, in the mapping's own iteration order.
    """
    paac_by_name = PyPro.GetProDes(sequence).GetPAAC(lamda=6, weight=0.05)
    return [value for value in paac_by_name.values()]


# Compute the PAAC descriptors for every epitope with 6 joblib workers.
# return_as="generator" yields results in submission order, so tqdm can
# advance as each one arrives while rows stay aligned with df.
paac_rows = list(
    tqdm(
        Parallel(return_as="generator", n_jobs=6)(
            delayed(get_PAAC)(seq) for seq in df['Epitope Seq']
        ),
        total=len(df),
        desc="Processing sequences"
    )
)

# Column count comes from the first result; an empty frame yields no columns
# instead of raising on `.iloc[0]`.
paac_length = len(paac_rows[0]) if paac_rows else 0
paac_columns = [f'PAAC_{i}' for i in range(paac_length)]

# Expand the per-sequence lists directly into numeric columns, without first
# storing them as a temporary object column inside df.
paac_expanded = pd.DataFrame(paac_rows,
                             index=df.index,
                             columns=paac_columns)

# Drop PAAC_* columns left over from an earlier run of this cell so that
# re-running it does not create duplicate column names.
df = pd.concat([df.drop(columns=paac_columns, errors='ignore'), paac_expanded], axis=1)

print(df.columns)
print(df.shape)

In [None]:
def get_APAAC(sequence):
    """Return the APAAC (amphiphilic pseudo amino acid composition) descriptor values for one sequence.

    The sequence is wrapped in propy's ``GetProDes`` and ``GetAPAAC`` is called
    with ``lamda=6`` and ``weight=0.05``. Only the values of the returned
    mapping are kept, in the mapping's own iteration order.
    """
    apaac_by_name = PyPro.GetProDes(sequence).GetAPAAC(lamda=6, weight=0.05)
    return [value for value in apaac_by_name.values()]


# Compute the APAAC descriptors for every epitope with 6 joblib workers.
# return_as="generator" yields results in submission order, so tqdm can
# advance as each one arrives while rows stay aligned with df.
apaac_rows = list(
    tqdm(
        Parallel(return_as="generator", n_jobs=6)(
            delayed(get_APAAC)(seq) for seq in df['Epitope Seq']
        ),
        total=len(df),
        desc="Processing sequences"
    )
)

# Column count comes from the first result; an empty frame yields no columns
# instead of raising on `.iloc[0]`.
apaac_length = len(apaac_rows[0]) if apaac_rows else 0
apaac_columns = [f'APAAC_{i}' for i in range(apaac_length)]

# Expand the per-sequence lists directly into numeric columns, without first
# storing them as a temporary object column inside df.
apaac_expanded = pd.DataFrame(apaac_rows,
                              index=df.index,
                              columns=apaac_columns)

# Drop APAAC_* columns left over from an earlier run of this cell so that
# re-running it does not create duplicate column names.
df = pd.concat([df.drop(columns=apaac_columns, errors='ignore'), apaac_expanded], axis=1)

print(df.columns)
print(df.shape)

In [33]:
def get_QSO(sequence):
    """Return the QSO (quasi-sequence-order) descriptor values for one sequence.

    The sequence is wrapped in propy's ``GetProDes`` and ``GetQSO`` is called
    with its default arguments. Only the values of the returned mapping are
    kept, in the mapping's own iteration order.
    """
    qso_by_name = PyPro.GetProDes(sequence).GetQSO()
    return [value for value in qso_by_name.values()]


# Compute the QSO descriptors for every epitope with 6 joblib workers.
# return_as="generator" yields results in submission order, so tqdm can
# advance as each one arrives while rows stay aligned with df.
qso_rows = list(
    tqdm(
        Parallel(return_as="generator", n_jobs=6)(
            delayed(get_QSO)(seq) for seq in df['Epitope Seq']
        ),
        total=len(df),
        desc="Processing sequences"
    )
)

# Column count comes from the first result; an empty frame yields no columns
# instead of raising on `.iloc[0]`.
qso_length = len(qso_rows[0]) if qso_rows else 0
qso_columns = [f'QSO_{i}' for i in range(qso_length)]

# Expand the per-sequence lists directly into numeric columns, without first
# storing them as a temporary object column inside df.
qso_expanded = pd.DataFrame(qso_rows,
                            index=df.index,
                            columns=qso_columns)

# Drop QSO_* columns left over from an earlier run of this cell so that
# re-running it does not create duplicate column names.
df = pd.concat([df.drop(columns=qso_columns, errors='ignore'), qso_expanded], axis=1)

print(df.columns)
print(df.shape)

Processing sequences: 100%|██████████| 63761/63761 [00:02<00:00, 24280.77it/s]


Index(['MHC Allele', 'Evidence Code', 'MHC Class', 'Epitope ID', 'Epitope Seq',
       'Host', 'Antigen Source', 'IL-2 release', 'IL-4 release',
       'IL-10 release',
       ...
       'QSO_90', 'QSO_91', 'QSO_92', 'QSO_93', 'QSO_94', 'QSO_95', 'QSO_96',
       'QSO_97', 'QSO_98', 'QSO_99'],
      dtype='object', length=121)
(63761, 121)


In [34]:
# Shannon entropy of a sequence
def shannon_entropy(sequence):
    """Return the Shannon entropy (in bits) of the symbols in ``sequence``.

    Parameters
    ----------
    sequence : str
        Sequence whose individual characters are treated as symbols.

    Returns
    -------
    numpy.float64
        ``sum(p * log2(1 / p))`` over the relative frequency ``p`` of each
        distinct symbol. This is ``0.0`` for an empty or single-symbol sequence.
    """
    symbols = list(sequence)
    if not symbols:
        # No symbols means no uncertainty; also avoids dividing by a zero total.
        return np.float64(0.0)

    # Count symbols with numpy rather than building a pandas Series per call,
    # which is costly when this runs once per row of a large frame.
    _, counts = np.unique(symbols, return_counts=True)
    freq = counts / counts.sum()

    # p * log2(1/p) keeps every term non-negative, so the result is never -0.0.
    return np.sum(freq * np.log2(1.0 / freq))


# One entropy value per epitope sequence, stored as a new feature column.
df['shannon_entropy'] = df['Epitope Seq'].map(shannon_entropy)

In [None]:
# Columns removed from the feature table at this point.
cols_to_drop = [
    'MHC Allele',
    'Evidence Code',
    'MHC Class',
    'Host',
    'Antigen Source',
]

df.drop(columns=cols_to_drop, inplace=True)