In [1]:
import zipfile

zip_file_path = 'qshgm-code-main.zip'
extraction_path = 'extraction_folder'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_path)


***Bacterial Amino Acid Classification using Neural Networks.***

In [2]:
# Import required libraries
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset
from torch.autograd import Variable
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import argparse

In [3]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.l1 = nn.Linear(20, 32)  # Increased the number of neurons in the first layer
        self.l2 = nn.Linear(32, 16)  # Add an additional layer with more neurons
        self.l3 = nn.Linear(16, 8)   # Add another layer
        self.l4 = nn.Linear(8, 4)
        self.l5 = nn.Linear(4, 2)    # Increase the number of neurons in the last layer

    def forward(self, x):
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        x = F.relu(self.l3(x))
        x = F.relu(self.l4(x))
        return self.l5(x)

In [4]:
# code the train functionality
def train(epoch, model, train_loader, criterion, optimizer):
    model.train()
    train_loss = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        data = torch.tensor(data, dtype=torch.float32)
        output = model(data)
        loss = criterion(output, target.long())
        train_loss += loss.data.item()
        loss.backward()
        optimizer.step()

    train_loss /= len(train_loader.dataset)
    print('Train Epoch: {}, Average loss: {:.4f}'.format(epoch, train_loss))

In [6]:
# Define the test functionality.
def test(model, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    tp = 0
    tn = 0
    fn = 0
    fp = 0
    for data, target in test_loader:
        data, target = Variable(data, volatile=True), Variable(target)
        data = torch.tensor(data, dtype=torch.float32)
        output = model(data)
        test_loss += criterion(output, target.long()).data.item()
        pred = output.data.max(1, keepdim=True)[1]
        tp += ((pred == 1) & (target.data.view_as(pred) == 1)).cpu().sum()
        tn += ((pred == 0) & (target.data.view_as(pred) == 0)).cpu().sum()
        fn += ((pred == 0) & (target.data.view_as(pred) == 1)).cpu().sum()
        fp += ((pred == 1) & (target.data.view_as(pred) == 0)).cpu().sum()
        correct += pred.eq(target.data.view_as(pred)).cpu().sum()

    test_loss /= len(test_loader.dataset)
    accuracy = 1. * (tp + tn) / (tp + tn + fp + fn)
    precision = 1. * tp / (tp + fp)
    sensitive = 1. * tp / (tp + fn)
    f1 = 2 * precision * sensitive / (precision + sensitive)

    print('Test set: Average loss: {:.4f}, Accuracy: {:.4f}, '
          'Precision: {:.4f}, Sensitive: {:.4f}, F1: {:.4f}\n'.format(
              test_loss, accuracy, precision, sensitive, f1)
    )

In [7]:
# Data preprocessing
data_path = "extraction_folder/qshgm-code-main/data/train/merge.csv"
data = pd.read_csv(data_path)
data = np.array(data)
scaler = MinMaxScaler(feature_range=(0, 1))
data = scaler.fit_transform(data)

In [8]:
# Train-test split
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

X_train = train_set[:, 0:-1]
X_train_label = train_set[:, [-1]]
X_train_label = X_train_label.reshape(train_set.shape[0], )
X_train = torch.from_numpy(X_train.astype(float))
X_train_label = torch.from_numpy(X_train_label.astype(float))
train_dataset = TensorDataset(X_train, X_train_label)

X_test = test_set[:, 0:-1]
X_test_label = test_set[:, [-1]]
X_test_label = X_test_label.reshape(test_set.shape[0], )
X_test = torch.from_numpy(X_test.astype(float))
X_test_label = torch.from_numpy(X_test_label.astype(float))
test_dataset = TensorDataset(X_test, X_test_label)

In [9]:
# Training settings
batch_size = 64
epochs = 30
learning_rate = 0.01

In [10]:
# Data loaders
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)


In [11]:
# Initialize model, loss, and optimizer
model = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.5)

In [12]:
# Training and testing
for epoch in range(1, epochs + 1):
    train(epoch, model, train_loader, criterion, optimizer)
    test(model, test_loader, criterion)

    # Save the model's state dictionary after each epoch
    torch.save(model.state_dict(), f"model_{epoch}.pth")

  data = torch.tensor(data, dtype=torch.float32)


Train Epoch: 1, Average loss: 0.0108
Test set: Average loss: 0.0108, Accuracy: 0.5420, Precision: nan, Sensitive: 0.0000, F1: nan



  data, target = Variable(data, volatile=True), Variable(target)
  data = torch.tensor(data, dtype=torch.float32)


Train Epoch: 2, Average loss: 0.0108
Test set: Average loss: 0.0108, Accuracy: 0.5420, Precision: nan, Sensitive: 0.0000, F1: nan

Train Epoch: 3, Average loss: 0.0108
Test set: Average loss: 0.0108, Accuracy: 0.5420, Precision: nan, Sensitive: 0.0000, F1: nan

Train Epoch: 4, Average loss: 0.0108
Test set: Average loss: 0.0107, Accuracy: 0.5420, Precision: nan, Sensitive: 0.0000, F1: nan

Train Epoch: 5, Average loss: 0.0107
Test set: Average loss: 0.0107, Accuracy: 0.5420, Precision: nan, Sensitive: 0.0000, F1: nan

Train Epoch: 6, Average loss: 0.0107
Test set: Average loss: 0.0107, Accuracy: 0.5420, Precision: nan, Sensitive: 0.0000, F1: nan

Train Epoch: 7, Average loss: 0.0106
Test set: Average loss: 0.0104, Accuracy: 0.6232, Precision: 0.7349, Sensitive: 0.2772, F1: 0.4026

Train Epoch: 8, Average loss: 0.0100
Test set: Average loss: 0.0095, Accuracy: 0.7209, Precision: 0.6623, Sensitive: 0.7971, F1: 0.7235

Train Epoch: 9, Average loss: 0.0088
Test set: Average loss: 0.0082, Ac

**To determine whether a bacterial amino acid is a QS enzyme or a receptor using the classifier The code takes an amino acid sequence as input and outputs a binary classification result:
●	1 for QS enzyme
●	0 for the receptor.
** *bold text*

In [13]:
# Load the saved model (specify the path)
model_path = 'model_30.pth'  # Adjust the filename if needed
model = Net()
model.load_state_dict(torch.load(model_path))
model.eval()

# Data loader for test dataset
batch_size = 64  # Adjust the batch size as needed
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

predictions = []

# Use the model to make predictions on the test dataset
with torch.no_grad():
    for data, target in test_loader:
        data = data.to(torch.float32)
        output = model(data)

        # Assuming you're using softmax for classification
        probabilities = torch.nn.functional.softmax(output, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1)

        # Append the predicted classes to the predictions list
        predictions.extend(predicted_class.cpu().numpy())

# 'predictions' now contains the predicted classes for your test dataset
print(predictions)


[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 

In [14]:
# Assuming you have the actual labels for your test data in 'true_labels'
true_labels = X_test_label.numpy()

# Calculate the accuracy by comparing predicted classes to true labels
correct_predictions = (predictions == true_labels).sum()
total_predictions = len(true_labels)
accuracy = correct_predictions / total_predictions

# Print the accuracy
print(f'Accuracy of predicted amino acid sequences: {accuracy:.4f}')


Accuracy of predicted amino acid sequences: 0.7866
