# Datasets Guide 📊

#### You can use the .npy files provided in the ./Data/Esm2 folder, or generate them yourself with the ./Data/Esm2/Esm2.ipynb notebook.

##### MP_esm2.npy contains the Mpfit positive samples and nonMP_esm2.npy contains the Mpfit negative samples.
##### posshirafkan_esm2.npy contains the Shirafkan positive samples and negshirafkan_esm2.npy contains the Shirafkan negative samples.
##### Lastly, clean_esm2.npy contains the cleaned-up dataset.

# Code guide ⌨️

# (python 🐍)

##  📥 Data Import and 🔮 Model Building and Testing on Mpfit

In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import ComplementNB
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SVMSMOTE
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler


# Load the precomputed ESM-2 embeddings of the Mpfit dataset.
# Forward slashes keep these relative paths valid on Windows, Linux and macOS;
# a backslash-separated path only resolves on Windows.
embeddings_class2 = np.load("../Data/Esm2/MP_esm2.npy")     # Mpfit positive samples
embeddings_class1 = np.load("../Data/Esm2/nonMP_esm2.npy")  # Mpfit negative samples

# Stack negatives first, then positives, into a single feature matrix
# (assumes each .npy file stores one embedding per row -- TODO confirm).
X = np.vstack((embeddings_class1, embeddings_class2))

# Labels follow the stacking order: 0 for the negatives, 1 for the positives.
y = np.concatenate((np.zeros(len(embeddings_class1)),
                    np.ones(len(embeddings_class2))))



# Candidate models as (display name, estimator) pairs.
# Every estimator that accepts a seed uses random_state=42 for reproducibility.
classifiers = [
    (
        "Extra Trees Classifier",
        ExtraTreesClassifier(n_estimators=500, random_state=42),
    ),
    (
        "SVC",
        # probability=True makes predict_proba available for AUROC/AUPRC.
        SVC(kernel='rbf', C=10, gamma='auto', probability=True, random_state=42),
    ),
    (
        "MLP",
        MLPClassifier(
            hidden_layer_sizes=(200, 45),  # two hidden layers: 200 and 45 units
            activation='relu',
            solver='adam',
            batch_size=128,  # mini-batch size used by the optimizer
            max_iter=200,  # upper bound on training iterations
            random_state=42,
        ),
    ),
    (
        "Random Forest Classifier",
        RandomForestClassifier(n_estimators=500, random_state=42),
    ),
    (
        "KNN",
        KNeighborsClassifier(n_neighbors=25),
    ),
    (
        "NB",
        ComplementNB(),
    ),
    (
        "GBD",
        GradientBoostingClassifier(
            n_estimators=500, learning_rate=0.1, random_state=42),
    ),
    # Add more classifiers as needed
]


# K-fold cross-validation of every classifier on (X, y).
# For each model the per-fold and mean accuracy, precision, recall, F1,
# AUROC and AUPRC are printed.
n_splits = 5

# The splitter is seeded with an integer, so each call to kf.split(X) yields
# the same folds and every classifier is evaluated on identical partitions.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

for name, clf in classifiers:
    scores = []      # accuracy per fold
    precisions = []
    recalls = []
    f1_scores = []
    aurocs = []      # area under the ROC curve per fold
    auprcs = []      # average precision (area under the PR curve) per fold
    tprs = []        # ROC true-positive rates per fold (not consumed in this cell)
    fprs = []        # ROC false-positive rates per fold (not consumed in this cell)

    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # Fit the scaler on the training fold only and reuse it on the
        # validation fold, so validation statistics never influence training.
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)

        # fit() re-trains the estimator from scratch on every fold.
        clf.fit(X_train_scaled, y_train)
        y_pred = clf.predict(X_val_scaled)

        # Threshold-based metrics on the hard predictions.
        scores.append(accuracy_score(y_val, y_pred))
        precisions.append(precision_score(y_val, y_pred))
        recalls.append(recall_score(y_val, y_pred))
        f1_scores.append(f1_score(y_val, y_pred))

        # Ranking metrics on the predicted probability of the positive class.
        y_pred_proba = clf.predict_proba(X_val_scaled)[:, 1]
        fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
        aurocs.append(roc_auc_score(y_val, y_pred_proba))
        tprs.append(tpr)
        fprs.append(fpr)
        auprcs.append(average_precision_score(y_val, y_pred_proba))

    print(f"{name}:")
    # Iterate over the collected metrics instead of a hard-coded fold count,
    # so the report stays correct if n_splits is changed.
    per_fold = zip(scores, precisions, recalls, f1_scores, aurocs, auprcs)
    for fold, (acc, prec, rec, f1, auroc, auprc) in enumerate(per_fold, start=1):
        print(f"Fold {fold}: Accuracy = {acc:.3f}, Precision = {prec:.3f}, Recall = {rec:.3f}, "
              f"F1 = {f1:.3f}, AUROC = {auroc:.3f}, AUPRC = {auprc:.3f}")
    print(f"Mean Accuracy: {np.mean(scores):.3f}, Mean Precision: {np.mean(precisions):.3f}, "
          f"Mean Recall: {np.mean(recalls):.3f}, Mean F1: {np.mean(f1_scores):.3f}, "
          f"Mean AUROC: {np.mean(aurocs):.3f}, Mean AUPRC: {np.mean(auprcs):.3f}\n")

## Testing on Shirafkan and independent dataset.🎯

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

# Load the independent test embeddings (Shirafkan negative / positive samples).
# All paths use forward slashes: they resolve on every platform and avoid
# backslash escape sequences inside the string literals.
test_class1 = np.load("../Data/Esm2/negshirafkan_esm2.npy")
test_class2 = np.load("../Data/Esm2/posshirafkan_esm2.npy")

# Load the training embeddings (Mpfit positive / negative samples).
embeddings_class2 = np.load("../Data/Esm2/MP_esm2.npy")
embeddings_class1 = np.load("../Data/Esm2/nonMP_esm2.npy")

# Training set: negatives stacked first (label 0), then positives (label 1).
X_train = np.vstack((embeddings_class1, embeddings_class2))
y_train = np.concatenate((np.zeros(len(embeddings_class1)), np.ones(len(embeddings_class2))))

# Test set: same layout and label convention as the training set.
X_test = np.vstack((test_class1, test_class2))
y_test = np.concatenate((np.zeros(len(test_class1)), np.ones(len(test_class2))))

# Models to train on the full training set, as (display name, estimator) pairs.
# Estimators that accept a seed use random_state=42 for reproducibility.
classifiers = [
    (
        "Extra Trees Classifier",
        ExtraTreesClassifier(n_estimators=500, random_state=42),
    ),
    (
        "SVC",
        # probability=True makes predict_proba available for AUROC/AUPRC.
        SVC(kernel='rbf', C=10, gamma='auto', probability=True, random_state=42),
    ),
    (
        "MLP",
        MLPClassifier(
            hidden_layer_sizes=(200, 45),  # two hidden layers: 200 and 45 units
            activation='relu',
            solver='adam',
            batch_size=128,
            max_iter=200,
            random_state=42,
        ),
    ),
    (
        "Random Forest Classifier",
        RandomForestClassifier(n_estimators=500, random_state=42),
    ),
    (
        "KNN",
        KNeighborsClassifier(n_neighbors=25),
    ),
    (
        "NB",
        ComplementNB(),
    ),
    (
        "GBD",
        GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, random_state=42),
    ),
]



# Train each classifier on the full training set and evaluate it once on the
# independent test set.

# Scale features with statistics learned from the training data only, matching
# the Min-Max preprocessing applied inside the cross-validation folds.
# ComplementNB additionally rejects negative feature values when fitting.
# Test values may fall slightly outside [0, 1]; that only affects prediction.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

for name, clf in classifiers:
    # Train the model
    clf.fit(X_train_scaled, y_train)

    # Hard predictions and positive-class probabilities on the test set
    y_test_pred = clf.predict(X_test_scaled)
    y_test_proba = clf.predict_proba(X_test_scaled)[:, 1]

    # Threshold-based metrics use the hard predictions; AUROC and AUPRC use
    # the predicted probabilities.
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_auroc = roc_auc_score(y_test, y_test_proba)
    test_auprc = average_precision_score(y_test, y_test_proba)

    print(f"{name} Test Results:")
    print(f"Accuracy: {test_accuracy:.2f}, Precision: {test_precision:.2f}, Recall: {test_recall:.2f}, "
          f"F1: {test_f1:.2f}, AUROC: {test_auroc:.2f}, AUPRC: {test_auprc:.2f}\n")


Extra Trees Classifier Test Results:
Accuracy: 0.94, Precision: 1.00, Recall: 0.91, F1: 0.95, AUROC: 1.00, AUPRC: 1.00

SVC Test Results:
Accuracy: 0.83, Precision: 0.94, Recall: 0.77, F1: 0.85, AUROC: 0.90, AUPRC: 0.94

MLP Test Results:
Accuracy: 0.85, Precision: 0.98, Recall: 0.78, F1: 0.87, AUROC: 0.95, AUPRC: 0.97

Random Forest Classifier Test Results:
Accuracy: 0.94, Precision: 1.00, Recall: 0.90, F1: 0.95, AUROC: 0.99, AUPRC: 1.00

KNN Test Results:
Accuracy: 0.82, Precision: 0.94, Recall: 0.76, F1: 0.84, AUROC: 0.90, AUPRC: 0.93

NB Test Results:
Accuracy: 0.78, Precision: 0.91, Recall: 0.72, F1: 0.80, AUROC: 0.86, AUPRC: 0.91

GBD Test Results:
Accuracy: 0.94, Precision: 1.00, Recall: 0.91, F1: 0.95, AUROC: 0.98, AUPRC: 0.99

