<a href="https://colab.research.google.com/github/Sievv/AMPs-against-KP/blob/main/Calculation_Feature_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install biopython

import pandas as pd
from Bio.SeqUtils import molecular_weight
from Bio.Seq import Seq

# Define peptide feature calculation functions
def calculate_charge(sequence):
    """Return a simple net-charge estimate for a peptide sequence.

    Every K, R and H counts as +1 and every D and E counts as -1; all
    other characters are ignored. Matching is case-sensitive (uppercase
    one-letter codes only).
    """
    cationic = sum(sequence.count(residue) for residue in 'KRH')
    anionic = sum(sequence.count(residue) for residue in 'DE')
    return cationic - anionic

def calculate_molecular_weight(sequence):
    """Return the molecular weight of a peptide sequence.

    The sequence is wrapped in a Biopython ``Seq`` and handed to
    ``Bio.SeqUtils.molecular_weight`` with ``seq_type='protein'``.
    """
    peptide = Seq(sequence)
    weight = molecular_weight(peptide, seq_type='protein')
    return weight

def calculate_hydrophobicity(sequence):
    """Count the hydrophobic residues in a peptide sequence.

    A residue is treated as hydrophobic when it is one of A, V, I, L, M,
    F, W or P. The result is a plain count of such residues, not a
    scale-weighted score.
    """
    hydrophobic_set = 'AVILMFWP'
    total = 0
    for residue in sequence:
        if residue in hydrophobic_set:
            total += 1
    return total

def calculate_number_of_cysteines(sequence):
    """Return how many times the cysteine code 'C' occurs in the sequence."""
    cysteine_total = sequence.count('C')
    return cysteine_total

def calculate_number_of_disulfide_bridges(sequence):
    """Return the maximum number of cysteine pairs in the sequence.

    This is an upper-bound estimate: the cysteine count floor-divided by
    two, with any unpaired cysteine discarded.
    """
    pairs, _unpaired = divmod(sequence.count('C'), 2)
    return pairs

def calculate_isoelectric_point(sequence):
    """Return a rough isoelectric-point proxy for a peptide sequence.

    The value is the count-weighted mean of fixed side-chain pKa values
    over the ionizable residues D, E, K, R and H found in the sequence.
    Sequences containing none of those residues get the neutral default
    of 7.0.
    """
    acidic_pka = {'D': 3.9, 'E': 4.25}
    basic_pka = {'K': 10.5, 'R': 12.5, 'H': 6.0}

    acidic_hits = {aa: sequence.count(aa) for aa in acidic_pka}
    basic_hits = {aa: sequence.count(aa) for aa in basic_pka}
    ionizable_total = sum(acidic_hits.values()) + sum(basic_hits.values())

    # No acidic/basic residues: fall back to the neutral default.
    if ionizable_total == 0:
        return 7.0

    # Acidic and basic contributions are summed separately and then added,
    # so the floating-point evaluation order stays fixed.
    acidic_sum = sum(acidic_pka[aa] * acidic_hits[aa] for aa in acidic_pka)
    basic_sum = sum(basic_pka[aa] * basic_hits[aa] for aa in basic_pka)
    return (acidic_sum + basic_sum) / ionizable_total

def amino_acid_composition(sequence):
    """Return the percentage of each of the 20 standard amino acids.

    The result maps every one-letter code in 'ACDEFGHIKLMNPQRSTVWY' to
    its share of ``len(sequence)`` expressed in percent. Characters
    outside that alphabet still count towards the length, so the values
    need not add up to 100. An empty sequence yields 0 for every code.
    """
    alphabet = 'ACDEFGHIKLMNPQRSTVWY'
    counts = {aa: sequence.count(aa) for aa in alphabet}
    seq_len = len(sequence)
    if seq_len > 0:
        return {aa: (hits / seq_len) * 100 for aa, hits in counts.items()}
    # Empty input: avoid dividing by zero and report 0 for every residue.
    return dict.fromkeys(counts, 0)

def secondary_structure_features(sequence):
    """Return simple residue-class counts used as structure proxies.

    Returns a tuple ``(helix, turn, sheet, flexibility)`` where
    ``helix`` counts residues in 'ALIVMFYW', ``turn`` counts 'GP',
    ``sheet`` counts 'FYW', and ``flexibility`` is ``helix`` divided by
    the sequence length (0 for an empty sequence).
    """
    def tally(residues):
        # Total occurrences in the sequence of any residue from the group.
        return sum(sequence.count(aa) for aa in residues)

    n_helix = tally('ALIVMFYW')
    n_sheet = tally('FYW')
    n_turn = tally('GP')

    seq_len = len(sequence)
    if seq_len > 0:
        helix_fraction = n_helix / seq_len
    else:
        helix_fraction = 0
    return n_helix, n_turn, n_sheet, helix_fraction

# Load the train and test tables from CSV. Both are expected to include a
# 'Sequence' column, which the feature engineering step below reads.
# NOTE(review): the paths look like a Colab Google Drive mount; no mount call
# is visible in this file, so Drive presumably must be mounted first — confirm.
X_train = pd.read_csv('/content/drive/MyDrive/EC/EC imbalance/X_train.csv')
X_test = pd.read_csv('/content/drive/MyDrive/EC/EC imbalance/X_test.csv')

# Define feature engineering function
def apply_feature_engineering(df, seq_col='Sequence'):
    """Add peptide descriptor columns derived from a sequence column.

    Args:
        df: DataFrame holding the peptide sequences. It is modified in
            place: the descriptor columns (and the intermediate
            'Amino_Acid_Composition' dict column) are assigned onto it.
            Pass a copy if the original must stay untouched.
        seq_col: Name of the column containing the sequences. Defaults to
            'Sequence'.

    Returns:
        A new DataFrame with the row index reset to 0..n-1, containing the
        columns of ``df`` plus 'Length', 'Charge', 'Hydrophobicity',
        'Molecular_Weight', 'Number_of_Cysteines',
        'Number_of_Disulfide_Bridges', 'Isoelectric_Point', 'Helix',
        'Turn', 'Sheet', 'Flexibility' and one percentage column per amino
        acid in 'ACDEFGHIKLMNPQRSTVWY'. The intermediate
        'Amino_Acid_Composition' column is dropped from the returned frame.

    Raises:
        KeyError: If ``seq_col`` is not a column of ``df``.
    """
    sequences = df[seq_col]

    # One output column per (name, function) pair, assigned in this order so
    # the resulting column layout is stable.
    per_sequence_features = (
        ('Length', len),
        ('Charge', calculate_charge),
        ('Hydrophobicity', calculate_hydrophobicity),
        ('Molecular_Weight', calculate_molecular_weight),
        ('Number_of_Cysteines', calculate_number_of_cysteines),
        ('Number_of_Disulfide_Bridges', calculate_number_of_disulfide_bridges),
        ('Isoelectric_Point', calculate_isoelectric_point),
        ('Amino_Acid_Composition', amino_acid_composition),
    )
    for column, feature_func in per_sequence_features:
        df[column] = sequences.apply(feature_func)

    # secondary_structure_features returns a 4-tuple per row; expanding it
    # with pd.Series yields one column per tuple element.
    df[['Helix', 'Turn', 'Sheet', 'Flexibility']] = (
        sequences.apply(secondary_structure_features).apply(pd.Series)
    )

    # Expand amino acid composition dict into individual columns
    aa_columns = pd.DataFrame(df['Amino_Acid_Composition'].tolist(), columns=list('ACDEFGHIKLMNPQRSTVWY'))
    # Both frames get a fresh 0..n-1 index so concat aligns them row by row.
    df = pd.concat([df.reset_index(drop=True), aa_columns.reset_index(drop=True)], axis=1)
    df = df.drop(columns=['Amino_Acid_Composition'])

    return df

# Apply feature engineering. Copies are passed because
# apply_feature_engineering assigns new columns onto the frame it receives;
# this keeps X_train / X_test unchanged.
X_train_fe = apply_feature_engineering(X_train.copy())
X_test_fe = apply_feature_engineering(X_test.copy())

# Save the enhanced feature datasets to the same directory as the inputs.
# index=False keeps the DataFrame row index out of the CSV files.
X_train_fe.to_csv('/content/drive/MyDrive/EC/EC imbalance/X_train_features.csv', index=False)
X_test_fe.to_csv('/content/drive/MyDrive/EC/EC imbalance/X_test_features.csv', index=False)

# Display the first rows (DataFrame.head() default) of both the enhanced
# training and test feature tables as a quick sanity check.
print(X_train_fe.head())
print(X_test_fe.head())
