In [None]:
# only run this cell if you are in colab. Don't run the cell in a local computer.
!unzip qshgm-code-main.zip

***Implementation of Support Vector Classifier, K-Nearest Neighbors, and Random Forest Classifier in the classification of Bacterial Amino Acids (Qs Enzyme or Receptor.)***

In [49]:
import numpy as np
import sklearn
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score

In [50]:
# Define the list of estimators (models) to evaluate
estimators = ["svm", "knn", "rf"]

data_path = "qshgm-code-main/data/train/data.csv"  # Change to the actual data path
label_path = "qshgm-code-main/data/train/label.csv"  # Change to the actual label path


In [51]:
# define models and hyperparameters
def model(model, data, label):
    if model == "svm":
        svm = sklearn.svm.SVC(C=4, gamma=0.125, kernel='rbf', probability=True)
        svm.fit(data, label)
        return svm
    if model == "knn":
        knn = KNeighborsClassifier(n_neighbors=5)
        knn.fit(data, label)
        return knn
    if model == "rf":
        rf = RandomForestClassifier(n_estimators=122, criterion='gini', max_depth=55)
        rf.fit(data, label)
        return rf


In [52]:
# Read data
data = np.loadtxt(data_path, delimiter=',')
label = np.loadtxt(label_path, delimiter=',')

# Min-max scaler
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit(data)
data = scaler.transform(data)
print(data)

[[0.19417476 0.08277977 0.24466019 ... 0.29341963 0.13592233 0.06407767]
 [0.19067797 0.09634255 0.10677966 ... 0.28625236 0.20762712 0.0559322 ]
 [0.2991453  0.0242915  0.32307692 ... 0.1671415  0.14957265 0.09871795]
 ...
 [0.22522523 0.03072546 0.27243243 ... 0.26906907 0.02522523 0.05351351]
 [0.16786571 0.0136312  0.21151079 ... 0.29842793 0.06714628 0.09496403]
 [0.16788767 0.04511278 0.27692308 ... 0.22357889 0.05982906 0.10476191]]


In [53]:
# Split data into training and testing sets with an 80:20 ratio
data_train, data_test, label_train, label_test = train_test_split(data, label, test_size=0.2, random_state=42)

***Train the 3 Machine Learning classifiers on the bacterial amino acids data.***

In [54]:
# Takes approximately 40 mins to train. Be patient.
results = {}

# Evaluate each model using cross-validation
for estimator in estimators:
    model_instance = model(estimator, data_train, label_train)

    # Perform cross-validation and calculate evaluation metrics
    cv_accuracy = cross_val_score(model_instance, data_train, label_train, cv=3, scoring='accuracy')
    cv_precision = cross_val_score(model_instance, data_train, label_train, cv=3, scoring='precision')
    cv_recall = cross_val_score(model_instance, data_train, label_train, cv=3, scoring='recall')
    cv_f1 = cross_val_score(model_instance, data_train, label_train, cv=3, scoring='f1')

    results[estimator] = {
        'Accuracy': np.mean(cv_accuracy),
        'Precision': np.mean(cv_precision),
        'Recall': np.mean(cv_recall),
        'F1': np.mean(cv_f1)
    }

In [55]:
# Print the results for all models
for estimator, metrics in results.items():
    print(f'{estimator}: [Accuracy: {metrics["Accuracy"]:.4f}, Precision: {metrics["Precision"]:.4f}, Recall: {metrics["Recall"]:.4f}, F1: {metrics["F1"]:.4f}]')

svm: [Accuracy: 0.8235, Precision: 0.7925, Recall: 0.8388, F1: 0.8150]
knn: [Accuracy: 0.8541, Precision: 0.7868, Recall: 0.9399, F1: 0.8566]
rf: [Accuracy: 0.8892, Precision: 0.8957, Recall: 0.8594, F1: 0.8779]


***Use the 3 ML classifiers to determine whether a bacterial amino acid is a QS enzyme or a receptor.***

In [57]:
# Takes 5 mins to classify test data.
from sklearn import metrics  # Import the metrics module

# Create a dictionary to store the results and predicted labels
results = {}

# Evaluate each model using cross-validation and make predictions on the test set
for estimator in estimators:
    model_instance = model(estimator, data_train, label_train)

    # Make predictions on the test set
    test_predictions = model_instance.predict(data_test)

    # Calculate evaluation metrics on the test set for this model with zero_division=0
    accuracy = metrics.accuracy_score(label_test, test_predictions)
    precision = metrics.precision_score(label_test, test_predictions, zero_division=0)
    recall = metrics.recall_score(label_test, test_predictions, zero_division=0)
    f1 = metrics.f1_score(label_test, test_predictions, zero_division=0)


    results[estimator] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'Test_Predictions': test_predictions
    }

# Print the results and predicted labels for all models
for estimator, metrics in results.items():
    print(f'{estimator} (Cross-Validation): [Accuracy: {metrics["Accuracy"]:.4f}, Precision: {metrics["Precision"]:.4f}, Recall: {metrics["Recall"]:.4f}, F1: {metrics["F1"]:.4f}]')
    print(f'{estimator} (Test Set Predictions): {metrics["Test_Predictions"]}')


svm (Cross-Validation): [Accuracy: 0.8242, Precision: 0.7927, Recall: 0.8346, F1: 0.8131]
svm (Test Set Predictions): [0. 1. 0. ... 1. 0. 0.]
knn (Cross-Validation): [Accuracy: 0.8631, Precision: 0.7960, Recall: 0.9428, F1: 0.8632]
knn (Test Set Predictions): [0. 1. 0. ... 1. 0. 1.]
rf (Cross-Validation): [Accuracy: 0.8945, Precision: 0.8975, Recall: 0.8690, F1: 0.8830]
rf (Test Set Predictions): [0. 1. 0. ... 1. 0. 1.]


***Print the results and the first 20 predicted labels for all models.***

In [58]:
# Print the results and the first 20 predicted labels for all models
for estimator, metrics in results.items():
    print(f'{estimator} (Cross-Validation): [Accuracy: {metrics["Accuracy"]:.4f}, Precision: {metrics["Precision"]:.4f}, Recall: {metrics["Recall"]:.4f}, F1: {metrics["F1"]:.4f}]')
    print(f'{estimator} (Test Set Predictions): {metrics["Test_Predictions"][:20]}')

svm (Cross-Validation): [Accuracy: 0.8242, Precision: 0.7927, Recall: 0.8346, F1: 0.8131]
svm (Test Set Predictions): [0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1.]
knn (Cross-Validation): [Accuracy: 0.8631, Precision: 0.7960, Recall: 0.9428, F1: 0.8632]
knn (Test Set Predictions): [0. 1. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 0. 1. 1.]
rf (Cross-Validation): [Accuracy: 0.8945, Precision: 0.8975, Recall: 0.8690, F1: 0.8830]
rf (Test Set Predictions): [0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1.]
