In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import Input
from scikeras.wrappers import KerasClassifier

In [4]:
# Load the feature table and drop the 'Unnamed: 0' column
# (presumably an index column saved with the sheet -- TODO confirm).
df = pd.read_excel('features.xlsx')
df = df.drop(columns=['Unnamed: 0'])

# Prepare the data: every column except 'Label' is a predictor.
features = df.drop(columns=['Label'])
labels = df['Label']

# Perform one-hot encoding on 'First Sequence', 'Second Sequence', and 'HLA Type'
# NOTE(review): the encoder is fit on all rows before the train/test split,
# whereas the scaler below is fit on the training rows only -- confirm this
# asymmetry is intended.
categorical_cols = ['First Sequence', 'Second Sequence', 'HLA Type']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_features = encoder.fit_transform(features[categorical_cols])
# Carry over the original row index: pd.concat(axis=1) aligns on index labels,
# so a fresh RangeIndex here would misalign rows (and introduce NaNs) if
# `features` ever had a non-default index.
encoded_features_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=features.index,
)

# Concatenate encoded features with the rest of the features
features = pd.concat(
    [encoded_features_df, features.drop(columns=categorical_cols)],
    axis=1,
)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Scale features for SVM, Logistic Regression, and Neural Network
# (fit on the training split only, then applied unchanged to the test split).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Show the loaded frame as a quick visual sanity check.
df

Unnamed: 0,First Sequence,Second Sequence,Label,Hydrophobicity Diff,Isoelectric Point Diff,BLOSUM50_Score_1,BLOSUM50_Score_2,BLOSUM50_Score_3,BLOSUM50_Score_4,BLOSUM50_Score_5,...,Hydrophobicity_Diff_AA_8,Hydrophobicity_Diff_AA_9,HLA Type,First Sequence EL_Rank,First Sequence BA_Rank,Second Sequence EL_Rank,Second Sequence BA_Rank,EL_Rank Diff,BA_Rank Diff,BLOSUM50_Alignment_Score
0,LAGIGILTV,AAGIGILTV,1,0.222222,-0.045016,-2,5,8,5,8,...,0.0,0.0,HLAA0201,3.8205,6.7028,2.6577,8.2938,1.1628,-1.5910,46
1,LAGIGILTV,LAGIGTVPI,1,0.600000,0.000000,5,5,8,5,8,...,0.9,-0.3,HLAA0201,3.8205,6.7028,18.3790,11.6335,-14.5585,-4.9307,41
2,LAGIGILTV,VTGITIHFV,1,0.655556,-1.186116,1,0,8,5,-2,...,-3.5,0.0,HLAA0201,3.8205,6.7028,2.2448,3.7754,1.5757,2.9274,27
3,LAGIGILTV,VAGIGLLSV,1,0.044444,0.030011,1,5,8,5,8,...,0.1,0.0,HLAA0201,3.8205,6.7028,3.9211,7.3292,-0.1006,-0.6264,41
4,LAGIGILTV,VAGIGILAI,1,-0.355556,0.030011,1,5,8,5,8,...,-2.5,-0.3,HLAA0201,3.8205,6.7028,11.1377,15.3300,-7.3172,-8.6272,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5452,SLFNTVATL,YASLTTIGT,0,0.577778,-0.284309,0,-2,0,7,-1,...,2.8,-0.7,HLAA0201,0.0925,0.4886,17.9097,12.0875,-17.8172,-11.5989,24
5453,SLFNTVATL,YAVLSEYET,0,1.333333,1.189981,-1,2,-4,-2,-1,...,3.2,-0.4,HLAA0201,0.0925,0.4886,23.5636,17.3838,-23.4711,-16.8952,15
5454,SLFNTVATL,YLFNTVATL,1,0.055556,-0.284309,5,5,8,7,5,...,-2.5,0.0,HLAA0201,0.0925,0.4886,0.0462,0.0640,0.0463,0.4246,45
5455,SLFNTVATL,YLSKEDRII,0,1.711111,-0.828997,1,1,-2,-4,0,...,3.2,-0.7,HLAA0201,0.0925,0.4886,2.2498,5.5595,-2.1573,-5.0709,13


In [11]:

def objective_nn(trial):
    """Optuna objective for a two-hidden-layer dense binary classifier.

    Samples the training schedule (epochs, batch size), the layer widths,
    the dropout rates and the Adam learning rate from ``trial``, then scores
    the resulting network with 3-fold cross-validation on the scaled
    training data.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Mean cross-validated accuracy over the three folds.
    """
    # Sample every hyperparameter exactly once, up front, instead of inside
    # the model builder (which runs once per CV fold). The order matches the
    # order in which the parameters were previously requested.
    epochs = trial.suggest_int('epochs', 10, 50)
    batch_size = trial.suggest_int('batch_size', 16, 64)
    units1 = trial.suggest_int('units1', 32, 128)
    dropout1 = trial.suggest_float('dropout1', 0.2, 0.5)
    units2 = trial.suggest_int('units2', 32, 128)
    dropout2 = trial.suggest_float('dropout2', 0.2, 0.5)
    learning_rate = trial.suggest_float('lr', 1e-5, 1e-2, log=True)

    # Take the input width from the array that is actually fed to the model.
    n_inputs = X_train_scaled.shape[1]

    def create_model():
        """Build and compile a fresh network for one CV fold."""
        model = Sequential([
            Input(shape=(n_inputs,)),
            Dense(units1, activation='relu'),
            Dropout(dropout1),
            Dense(units2, activation='relu'),
            Dropout(dropout2),
            Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                      loss='binary_crossentropy', metrics=['accuracy'])
        return model

    # Drop Keras state left over from earlier trials so that memory use does
    # not keep growing over the course of the study.
    tf.keras.backend.clear_session()

    model = KerasClassifier(model=create_model, epochs=epochs, batch_size=batch_size, verbose=0)
    scores = cross_val_score(model, X_train_scaled, y_train, cv=3, scoring='accuracy')
    return scores.mean()

def objective_rf(trial):
    """Optuna objective for a random forest classifier.

    Samples tree count, depth, split/leaf size limits and the per-split
    feature subset rule from ``trial``, then scores the forest with 3-fold
    cross-validation on the unscaled training data.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Mean cross-validated accuracy over the three folds.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        # 'auto' is intentionally not offered: for classifiers it was an alias
        # of 'sqrt', and scikit-learn 1.3+ rejects it as an invalid value.
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    }
    model = RandomForestClassifier(**params, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
    return scores.mean()

def objective_svm(trial):
    """Optuna objective for a support vector classifier.

    Draws ``C``, ``gamma`` and ``kernel`` from ``trial`` and returns the mean
    3-fold cross-validated accuracy on the scaled training data.
    """
    # Sampled in the order C -> gamma -> kernel.
    c_value = trial.suggest_float('C', 1e-3, 1e2, log=True)
    gamma_value = trial.suggest_float('gamma', 1e-4, 1e-1, log=True)
    kernel_name = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly'])

    classifier = SVC(C=c_value, gamma=gamma_value, kernel=kernel_name, random_state=42)
    fold_accuracies = cross_val_score(
        classifier, X_train_scaled, y_train, cv=3, scoring='accuracy'
    )
    return fold_accuracies.mean()

def objective_xgb(trial):
    """Optuna objective for an XGBoost classifier.

    Draws the booster size, tree depth, learning rate and the row/column
    subsampling fractions from ``trial`` and returns the mean 3-fold
    cross-validated accuracy on the unscaled training data.
    """
    # Sampled in the order n_estimators -> max_depth -> learning_rate ->
    # subsample -> colsample_bytree.
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)

    booster = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=42,
    )
    fold_accuracies = cross_val_score(booster, X_train, y_train, cv=3, scoring='accuracy')
    return fold_accuracies.mean()

def objective_lr(trial):
    """Optuna objective for logistic regression.

    Draws the inverse regularisation strength, the penalty type and the
    solver from ``trial`` and returns the mean 3-fold cross-validated
    accuracy on the scaled training data.
    """
    # Sampled in the order C -> penalty -> solver.
    c_value = trial.suggest_float('C', 1e-5, 1e2, log=True)
    penalty_kind = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver_name = trial.suggest_categorical('solver', ['liblinear', 'saga'])

    classifier = LogisticRegression(
        C=c_value,
        penalty=penalty_kind,
        solver=solver_name,
        random_state=42,
        max_iter=500,
    )
    fold_accuracies = cross_val_score(
        classifier, X_train_scaled, y_train, cv=3, scoring='accuracy'
    )
    return fold_accuracies.mean()

In [12]:
# Search budget shared by every study. `timeout` bounds the total wall-clock
# time of one study.optimize() call (the whole study, not a single trial).
N_TRIALS = 50
STUDY_TIMEOUT_SECONDS = 6000  # 100 minutes

# Create Optuna studies for each model type
study_nn = optuna.create_study(direction='maximize')
study_rf = optuna.create_study(direction='maximize')
study_svm = optuna.create_study(direction='maximize')
study_xgb = optuna.create_study(direction='maximize')
study_lr = optuna.create_study(direction='maximize')

# (label, study, objective) triples, in the order they are run and reported.
search_plan = [
    ('NN', study_nn, objective_nn),
    ('RF', study_rf, objective_rf),
    ('SVM', study_svm, objective_svm),
    ('XGB', study_xgb, objective_xgb),
    ('LR', study_lr, objective_lr),
]

# Optimize hyperparameters, one study after another; each study stops after
# N_TRIALS trials or STUDY_TIMEOUT_SECONDS, whichever comes first.
for model_label, model_study, model_objective in search_plan:
    model_study.optimize(model_objective, n_trials=N_TRIALS, timeout=STUDY_TIMEOUT_SECONDS)

# Print best trials
for model_label, model_study, model_objective in search_plan:
    print(f"Best {model_label} trial: ", model_study.best_trial)

[I 2024-05-22 13:13:53,085] A new study created in memory with name: no-name-e37c3942-22a6-4cd4-9e22-4475a15167dd
[I 2024-05-22 13:13:53,086] A new study created in memory with name: no-name-741760ad-7310-4894-a165-9821afeb7274
[I 2024-05-22 13:13:53,087] A new study created in memory with name: no-name-86fd186e-87f9-4b81-a369-0bb6ae5d3e5f
[I 2024-05-22 13:13:53,088] A new study created in memory with name: no-name-7a824d43-5a0d-4cd2-895f-744cd986715d
[I 2024-05-22 13:13:53,090] A new study created in memory with name: no-name-eaa812c3-77f3-4496-847b-3a206705088a
[I 2024-05-22 13:14:08,935] Trial 0 finished with value: 0.8639175257731959 and parameters: {'epochs': 22, 'batch_size': 62, 'units1': 61, 'dropout1': 0.33834265938271524, 'units2': 97, 'dropout2': 0.4929482884463071, 'lr': 0.0009118431094247623}. Best is trial 0 with value: 0.8639175257731959.
[W 2024-05-22 13:14:18,669] Trial 1 failed with parameters: {'epochs': 49, 'batch_size': 24, 'units1': 95, 'dropout1': 0.2518463752108

KeyboardInterrupt: 