In [None]:
!pip install xgboost
!pip install imblearn
!pip install --upgrade --force-reinstall tensorflow
!pip install --upgrade --force-reinstall tensorflow-gpu

In [None]:
%%writefile peptide_classification.py

import json
import argparse
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from imblearn.over_sampling import SMOTE
from xgboost import XGBRFClassifier

class PeptideClassifier:
    '''
    Classifies peptide sequences from hand-crafted biochemical features.

    The final score is a weighted average of the positive-class probabilities of four
    models: Random Forest, Gradient Boosting, a dense neural network (DNN), and
    XGBoost Random Forest.
    '''

    def __init__(self, train_path, test_path, properties_path, output_path):
        '''
        Stores the file paths and immediately loads the training and test CSV files.

        Args:
            train_path: CSV providing 'Sequence' and 'Label' columns (after header normalization).
            test_path: CSV providing 'ID' and 'Sequence' columns (after header normalization).
            properties_path: JSON file mapping a property name to an {amino acid: value} table.
            output_path: Destination CSV for the ensemble predictions.
        '''
        self.train_path = train_path
        self.test_path = test_path
        self.properties_path = properties_path
        self.output_path = output_path
        # The 20 amino-acid letters whose per-sequence fractions become 'aa_<letter>' features.
        self.aa_list = "ACDEFGHIKLMNPQRSTVWY"
        self.models = {}
        self.scaler = StandardScaler()
        self.y_test = None
        self.X_test = None
        self.load_data()

    @staticmethod
    def _normalize_columns(df):
        '''Strips surrounding whitespace and the "# " marker from the column names of df; returns df.'''
        df.columns = df.columns.str.strip().str.replace("# ", "", regex=False)
        return df

    @staticmethod
    def _mean_or_zero(values):
        '''Returns the mean of values, or 0.0 for an empty list (instead of NaN plus a warning).'''
        return float(np.mean(values)) if values else 0.0

    @staticmethod
    def _fraction(count, total):
        '''Returns count / total, or 0.0 when total is zero.'''
        return count / total if total else 0.0

    def load_data(self):
        '''
        Loads the training and test CSV files into self.train_df and self.test_df.

        Training rows containing missing values are dropped and the label -1 is mapped to 0.
        Column names of BOTH frames are normalized (e.g. '# ID' -> 'ID', ' Sequence' -> 'Sequence')
        so the rest of the class does not depend on the exact header spelling.
        Test rows are never dropped.
        '''
        self.train_df = self._normalize_columns(pd.read_csv(self.train_path).dropna())
        self.train_df['Label'] = self.train_df['Label'].replace(-1, 0)
        self.test_df = self._normalize_columns(pd.read_csv(self.test_path))

    def extract_features(self, sequences):
        '''
        Computes one row of numeric features per peptide sequence.

        Args:
            sequences: Iterable of peptide sequences. String sequences are stripped of
                surrounding whitespace before any feature is computed, so padding left over
                from CSV parsing is not counted as residues.

        Returns:
            DataFrame with 10 global descriptors followed by 20 'aa_<letter>' composition
            fractions, in a fixed column order. An empty sequence yields 0 for every
            averaged/fractional feature rather than NaN.
        '''
        # Reference for feature extraction:
        # https://www.imgt.org/IMGTeducation/Aide-memoire/_UK/aminoacids/IMGTclasses.html

        with open(self.properties_path, "r", encoding="utf-8") as file:
            properties = json.load(file)

        weights = properties["molecular_weight"]
        hydrophobicity = properties['hydrophobicity']
        hydrophilicity = properties['hydrophilicity']
        polarity = properties['polarity']
        aromaticity = properties['aromaticity']

        descriptor_names = ['length', 'molecular_weight', 'avg_hydrophobicity', 'avg_hydrophilicity',
                            'polar_residue_fraction', 'aromatic_residue_fraction', 'pos_charge',
                            'neg_charge', 'net_charge', 'boman_index']
        composition_names = [f'aa_{aa}' for aa in self.aa_list]

        rows = []
        for raw_seq in sequences:
            seq = raw_seq.strip() if isinstance(raw_seq, str) else raw_seq
            seq_length = len(seq)
            pos_count = sum(seq.count(aa) for aa in "KRH")
            neg_count = sum(seq.count(aa) for aa in "DE")

            row = {
                'length': seq_length,
                # Residues missing from a property table contribute 0.
                # The 18.02 offset is presumably the mass of one water molecule - TODO confirm.
                'molecular_weight': sum(weights.get(aa, 0) for aa in seq) + 18.02,
                'avg_hydrophobicity': self._mean_or_zero([hydrophobicity.get(aa, 0) for aa in seq]),
                'avg_hydrophilicity': self._mean_or_zero([hydrophilicity.get(aa, 0) for aa in seq]),
                'polar_residue_fraction': self._mean_or_zero([polarity.get(aa, 0) for aa in seq]),
                'aromatic_residue_fraction': self._mean_or_zero([aromaticity.get(aa, 0) for aa in seq]),
                'pos_charge': self._fraction(pos_count, seq_length),
                'neg_charge': self._fraction(neg_count, seq_length),
                'net_charge': self._fraction(pos_count - neg_count, seq_length),
                # Computed here as the negated mean hydrophobicity.
                'boman_index': self._mean_or_zero([-hydrophobicity.get(aa, 0) for aa in seq]),
            }
            for name, aa in zip(composition_names, self.aa_list):
                row[name] = self._fraction(seq.count(aa), seq_length)
            rows.append(row)

        return pd.DataFrame(rows, columns=descriptor_names + composition_names)

    def add_gaussian_noise(self, X, noise_level=0.01, random_state=None):
        '''
        Returns X plus zero-mean Gaussian noise of standard deviation noise_level.

        Args:
            X: Array-like or DataFrame of features.
            noise_level: Standard deviation of the noise.
            random_state: Optional seed for a dedicated generator, making the noise
                reproducible. When None, NumPy's global random state is used.
        '''
        if random_state is None:
            noise = np.random.normal(0, noise_level, X.shape)
        else:
            noise = np.random.default_rng(random_state).normal(0, noise_level, X.shape)
        return X + noise

    def _build_dnn(self, n_features):
        '''Builds and compiles the dense softmax network used as the DNN ensemble member.'''
        dnn = Sequential([
            Input(shape=(n_features,)),
            Dense(256, activation='relu'), BatchNormalization(), Dropout(0.3),
            Dense(128, activation='relu'), BatchNormalization(), Dropout(0.3),
            Dense(64, activation='relu'), BatchNormalization(), Dropout(0.2),
            Dense(32, activation='relu'), BatchNormalization(), Dropout(0.1),
            Dense(2, activation='softmax')
        ])
        dnn.compile(optimizer=Adam(learning_rate=0.008), loss='categorical_crossentropy', metrics=['AUC'])
        return dnn

    def train_models(self):
        '''
        Trains the four ensemble members (RF, DNN, GBC, XGBRF) and stores them in self.models.

        The training features are oversampled with SMOTE, perturbed with seeded Gaussian
        noise, and then standardized; self.scaler is fitted here and reused by predict().
        '''
        X = self.extract_features(self.train_df['Sequence'])
        y = self.train_df['Label']
        X_train, y_train = SMOTE(random_state=42).fit_resample(X, y)
        # Seeded so that repeated runs see the same perturbed training set.
        X_train = self.add_gaussian_noise(X_train, noise_level=0.01, random_state=42)
        X_train_scaled = self.scaler.fit_transform(X_train)

        self.models['rf'] = RandomForestClassifier(n_estimators=350, max_depth=10, min_samples_split=32, min_samples_leaf=4, random_state=42, n_jobs=-1, verbose=1)
        self.models['rf'].fit(X_train_scaled, y_train)

        dnn = self._build_dnn(X_train_scaled.shape[1])
        # NOTE(review): both callbacks use patience=10, so the learning-rate reduction may
        # rarely take effect before early stopping ends training - confirm this is intended.
        dnn.fit(X_train_scaled, to_categorical(y_train, num_classes=2), epochs=100, batch_size=32, validation_split=0.1, callbacks=[
            EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
            ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.00001)
        ])
        self.models['dnn'] = dnn

        # random_state added for consistency with the other seeded estimators.
        self.models['gbc'] = GradientBoostingClassifier(n_estimators=170, learning_rate=0.005, max_depth=6, random_state=42)
        self.models['gbc'].fit(X_train_scaled, LabelEncoder().fit_transform(y_train))

        self.models['xgbrf'] = XGBRFClassifier(n_estimators=100, subsample=0.9, colsample_bynode=0.2, random_state=42)
        self.models['xgbrf'].fit(X_train_scaled, y_train)

    def predict(self):
        '''
        Scores the test set with the trained ensemble and writes the result to self.output_path.

        The output CSV has the columns '# ID' and 'Label', where 'Label' is the weighted
        average positive-class probability (0.2 each for RF, GBC and XGBRF, 0.4 for the DNN).
        train_models() must have been called first.
        '''
        test_X = self.extract_features(self.test_df['Sequence'])
        test_X = self.scaler.transform(test_X)
        seq_ids = self.test_df['ID'].values

        weights = {'rf': 0.2, 'gbc': 0.2, 'xgbrf': 0.2}
        ensemble_probs = np.zeros(len(seq_ids), dtype=float)
        for model_name, weight in weights.items():
            ensemble_probs += weight * self.models[model_name].predict_proba(test_X)[:, 1]
        # The Keras model has no predict_proba; its softmax output column 1 is the positive class.
        ensemble_probs += 0.4 * self.models['dnn'].predict(test_X)[:, 1]

        pd.DataFrame({'# ID': seq_ids, 'Label': ensemble_probs}).to_csv(self.output_path, index=False)

if __name__ == "__main__":
    # Script entry point: read the four required paths from the command line,
    # then train the models and write the test-set predictions.
    cli = argparse.ArgumentParser()
    required_options = (
        ("--train", "Path to training CSV file"),
        ("--test", "Path to test CSV file"),
        ("--properties", "Path to peptide properties JSON file"),
        ("--output", "Path to save predictions CSV file"),
    )
    for flag, description in required_options:
        cli.add_argument(flag, required=True, help=description)
    opts = cli.parse_args()

    pipeline = PeptideClassifier(opts.train, opts.test, opts.properties, opts.output)
    pipeline.train_models()
    pipeline.predict()

In [None]:
# Run the script written above: train on train.csv, then write predictions for test.csv to submission.csv.
!python peptide_classification.py --train train.csv --test test.csv --properties peptide_properties.json --output submission.csv