<a href="https://colab.research.google.com/github/castudil/bacteria-multi-label/blob/main/multilabel_bac.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Libraries used

In [12]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.multioutput import ClassifierChain
from sklearn.metrics import (f1_score, multilabel_confusion_matrix,
                             accuracy_score, hamming_loss, jaccard_score, make_scorer)

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from joblib import dump, load
import joblib

In [2]:
# The cells below read files through paths relative to the working directory
# ("data/processed/raw/..."). Step up one directory only when "data" is not
# already visible, so re-running this cell does not climb one level higher each time.
# NOTE(review): assumes the notebook sits one level below the folder holding "data" - confirm.
if not os.path.isdir("data"):
    os.chdir("..")

In [3]:
# Read the training split from CSV; the path is relative to the current working directory.
train_file = "data/processed/raw/train_s_aureus_driams.csv"
train_bac = pd.read_csv(train_file)
# Bare last expression so the notebook renders the loaded frame.
train_bac

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,9993,9994,9995,9996,9997,9998,9999,Oxacillin,Clindamycin,Fusidic acid
0,0.018721,0.016147,0.016983,0.021218,0.020846,0.019784,0.019405,0.023356,0.026224,0.026569,...,0.037966,0.030364,0.037545,0.040851,0.034176,0.046110,0.025638,0.0,0.0,0.0
1,0.009001,0.007475,0.006874,0.008575,0.009539,0.007894,0.008314,0.008013,0.008664,0.008923,...,0.014496,0.024966,0.027437,0.026541,0.022940,0.020572,0.032504,0.0,0.0,0.0
2,0.022354,0.020220,0.020910,0.024631,0.021436,0.021197,0.020229,0.018818,0.018637,0.018815,...,0.024620,0.022942,0.026715,0.032045,0.030431,0.029085,0.013117,0.0,0.0,0.0
3,0.017619,0.016073,0.016407,0.018011,0.019364,0.018950,0.017607,0.019116,0.023623,0.024492,...,0.051312,0.047458,0.049338,0.055039,0.054541,0.058643,0.058919,0.0,0.0,0.0
4,0.008264,0.008229,0.006753,0.006657,0.010107,0.007039,0.008250,0.010670,0.008134,0.006513,...,0.236769,0.217499,0.187244,0.216243,0.221910,0.226531,0.221965,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2819,0.056616,0.039011,0.040380,0.048517,0.050865,0.047771,0.049312,0.048257,0.049417,0.049000,...,0.021169,0.023617,0.033694,0.021037,0.018727,0.010641,0.009238,0.0,1.0,0.0
2820,0.125837,0.107712,0.109186,0.107613,0.109855,0.105060,0.099640,0.104169,0.120303,0.125067,...,0.082375,0.083446,0.096510,0.084883,0.092228,0.085599,0.042142,0.0,1.0,0.0
2821,0.000000,0.000000,0.035603,0.039994,0.042372,0.046666,0.045781,0.043914,0.039875,0.037170,...,0.006903,0.008322,0.011071,0.010274,0.004682,0.003547,0.001744,0.0,0.0,0.0
2822,0.005443,0.005998,0.003670,0.005588,0.006124,0.005019,0.004853,0.005400,0.004169,0.005151,...,0.049241,0.039586,0.050542,0.039139,0.046816,0.043036,0.037402,1.0,1.0,0.0


In [4]:
# Read the test split from CSV; the path is relative to the current working directory.
test_file = "data/processed/raw/test_s_aureus_driams.csv"
test_bac = pd.read_csv(test_file)
# Bare last expression so the notebook renders the loaded frame.
test_bac

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,9993,9994,9995,9996,9997,9998,9999,Oxacillin,Clindamycin,Fusidic acid
0,0.044453,0.032486,0.032540,0.034223,0.037528,0.039503,0.031378,0.035506,0.037688,0.035658,...,0.247814,0.263833,0.279904,0.264432,0.241573,0.266020,0.231517,0.0,0.0,0.0
1,0.004318,0.001881,0.001274,0.000902,0.000892,0.000049,0.002188,0.002001,0.003081,0.003384,...,0.033364,0.042735,0.066426,0.057485,0.052903,0.050839,0.036631,0.0,0.0,0.0
2,0.026184,0.026459,0.025393,0.028609,0.031314,0.031739,0.033337,0.028051,0.028047,0.028978,...,0.086746,0.087719,0.094103,0.080969,0.073970,0.069047,0.070988,0.0,1.0,0.0
3,0.000000,0.015010,0.017782,0.014582,0.015084,0.018046,0.014461,0.014400,0.018769,0.018272,...,0.005062,0.006748,0.004573,0.007583,0.008427,0.004729,0.002394,0.0,0.0,1.0
4,0.060499,0.043034,0.030296,0.032635,0.031272,0.032762,0.031154,0.030382,0.030233,0.029438,...,0.029452,0.033288,0.039230,0.046477,0.037219,0.022227,0.019920,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,0.013092,0.011730,0.009902,0.013513,0.015317,0.011370,0.013338,0.010117,0.010994,0.009222,...,0.050851,0.046334,0.042840,0.061644,0.066948,0.061007,0.056178,0.0,0.0,0.0
703,0.021222,0.015896,0.015512,0.017995,0.018663,0.019427,0.018667,0.014589,0.016765,0.015071,...,0.047285,0.056455,0.058002,0.055528,0.043539,0.039962,0.034761,1.0,0.0,0.0
704,0.045613,0.041040,0.046177,0.050216,0.046525,0.048843,0.045920,0.044283,0.043983,0.046330,...,0.104234,0.094692,0.090012,0.096624,0.092228,0.100024,0.043682,0.0,0.0,1.0
705,0.015193,0.011922,0.010877,0.009975,0.010474,0.012171,0.010417,0.010783,0.013170,0.014157,...,0.051772,0.058255,0.074609,0.068249,0.049157,0.070229,0.043615,1.0,0.0,0.0


In [5]:
# Feature matrices: keep only the columns whose name is made of digits.
# filter(regex='[^0-9]') selects every column whose name contains at least one
# non-digit character; those columns are removed and the rest is kept.
train_x = train_bac.drop(
    columns=list(train_bac.filter(regex='[^0-9]'))
)
test_x = test_bac.drop(
    columns=list(test_bac.filter(regex='[^0-9]'))
)

In [6]:
# Label column names: every column of the training frame that was not kept as a feature.
# NOTE(review): any other non-digit-named column (e.g. a saved index column) would land here too - verify.
antibiotics = train_bac.columns.drop(train_x.columns)

In [7]:
# Label frames for each split: the columns listed in `antibiotics`.
train_y = train_bac[antibiotics]
test_y = test_bac[antibiotics]

In [8]:
# Convert the pandas data to tensors: features as 32-bit floats, and the
# "Oxacillin" label column as 64-bit integer class indices (only this one
# label column is turned into a target here).
train_x_tensor = torch.tensor(train_x.to_numpy(), dtype=torch.float32)
train_y_tensor = torch.tensor(train_y["Oxacillin"].to_numpy(), dtype=torch.int64)
test_x_tensor = torch.tensor(test_x.to_numpy(), dtype=torch.float32)
test_y_tensor = torch.tensor(test_y["Oxacillin"].to_numpy(), dtype=torch.int64)

In [17]:
class BacteriaDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    ``x`` and ``y`` are stored exactly as given. Both must support integer
    indexing, and ``y`` must support ``len()``.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        # The sample count is taken from the labels.
        n_samples = len(self.y)
        return n_samples

    def __getitem__(self, idx):
        # Return the (features, label) pair stored at position ``idx``.
        features = self.x[idx]
        label = self.y[idx]
        return features, label

In [19]:
# Wrap the feature/label tensors of each split so they can be indexed sample by sample.
train_dataset = BacteriaDataset(train_x_tensor, train_y_tensor)
test_dataset = BacteriaDataset(test_x_tensor, test_y_tensor)

In [28]:
class NeuralNetworkClassificationModel(nn.Module):
    """Fully connected classifier with two 512-unit ReLU hidden layers.

    The forward pass returns the output of the last linear layer as-is;
    no softmax or other activation is applied inside the model.

    Args:
        input_dim: number of input features per sample.
        output_dim: number of outputs produced per sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_dim = 512
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        # The attribute name is part of the state_dict keys, so it is kept unchanged.
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        return self.linear_relu_stack(x)

In [29]:
# One network input per feature column.
input_dim = len(train_x.columns)
# CrossEntropyLoss needs one output per class, with targets in [0, C-1]. Size the
# output layer from the label tensor the model is actually trained on, instead of
# counting the distinct values found anywhere in the multi-label frame `train_y`.
output_dim = int(train_y_tensor.max().item()) + 1
model = NeuralNetworkClassificationModel(input_dim, output_dim)

In [40]:
# Optimisation settings.
learning_rate = 1e-3  # step size handed to the SGD optimizer below
batch_size = 64  # samples per mini-batch (used when building the DataLoaders)

epochs = 100  # number of train/evaluate rounds run by the epoch loop

# Cross-entropy over the model's raw outputs and integer class targets.
loss_fn = nn.CrossEntropyLoss()
# Plain SGD over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [31]:
# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# The evaluation metrics do not depend on sample order, so the test loader is
# left unshuffled: batch order stays stable and evaluation does not draw from
# the random number generator.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [47]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    For each batch the model output and loss are computed, the loss is
    back-propagated, the optimizer takes a step and the gradients are reset.
    A progress line with the batch loss and the number of samples processed
    so far is printed on every 10th batch.

    Args:
        dataloader: iterable of ``(X, y)`` batches exposing a ``dataset`` attribute.
        model: module to train; it is switched to training mode.
        loss_fn: callable taking ``(prediction, target)`` and returning a scalar tensor.
        optimizer: optimizer bound to the parameters of ``model``.

    Returns:
        None.
    """
    size = len(dataloader.dataset)
    # Training mode matters for batch normalization and dropout layers;
    # this model has neither, but it is set explicitly as good practice.
    model.train()
    seen = 0  # running count of processed samples, correct even for a short last batch
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        seen += len(X)
        if batch % 10 == 0:
            print(f"Loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [48]:
# Each epoch runs one training pass followed by one evaluation pass on the test loader.
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
0
Loss: 0.036095  [   64/ 2824]
10
Loss: 0.055754  [  704/ 2824]
20
Loss: 0.104066  [ 1344/ 2824]
30
Loss: 0.105920  [ 1984/ 2824]
40
Loss: 0.082711  [ 2624/ 2824]
Test Error: 
 Accuracy: 90.1%, Avg loss: 0.361223 

Epoch 2
-------------------------------
0
Loss: 0.053719  [   64/ 2824]
10
Loss: 0.061618  [  704/ 2824]
20
Loss: 0.068064  [ 1344/ 2824]
30
Loss: 0.040853  [ 1984/ 2824]
40
Loss: 0.072162  [ 2624/ 2824]
Test Error: 
 Accuracy: 90.1%, Avg loss: 0.418862 

Epoch 3
-------------------------------
0
Loss: 0.025805  [   64/ 2824]
10
Loss: 0.042629  [  704/ 2824]
20
Loss: 0.088553  [ 1344/ 2824]
30
Loss: 0.070769  [ 1984/ 2824]
40
Loss: 0.044931  [ 2624/ 2824]
Test Error: 
 Accuracy: 90.1%, Avg loss: 0.337535 

Epoch 4
-------------------------------
0
Loss: 0.038643  [   64/ 2824]
10
Loss: 0.059210  [  704/ 2824]
20
Loss: 0.041099  [ 1344/ 2824]
30
Loss: 0.036805  [ 1984/ 2824]
40
Loss: 0.039695  [ 2624/ 2824]
Test Error: 
 Accuracy: 90.1

10
Loss: 0.105259  [  704/ 2824]
20
Loss: 0.125956  [ 1344/ 2824]
30
Loss: 0.112142  [ 1984/ 2824]
40
Loss: 0.061841  [ 2624/ 2824]
Test Error: 
 Accuracy: 90.1%, Avg loss: 0.437109 

Epoch 34
-------------------------------
0
Loss: 0.073299  [   64/ 2824]
10
Loss: 0.068720  [  704/ 2824]
20
Loss: 0.040497  [ 1344/ 2824]
30
Loss: 0.037913  [ 1984/ 2824]
40
Loss: 0.040317  [ 2624/ 2824]
Test Error: 
 Accuracy: 90.1%, Avg loss: 0.338874 

Epoch 35
-------------------------------
0
Loss: 0.054370  [   64/ 2824]
10
Loss: 0.035698  [  704/ 2824]
20
Loss: 0.037612  [ 1344/ 2824]
30
Loss: 0.045491  [ 1984/ 2824]
40
Loss: 0.067575  [ 2624/ 2824]
Test Error: 
 Accuracy: 90.1%, Avg loss: 0.336391 

Epoch 36
-------------------------------
0
Loss: 0.079877  [   64/ 2824]
10
Loss: 0.061422  [  704/ 2824]
20
Loss: 0.146791  [ 1344/ 2824]
30
Loss: 0.051612  [ 1984/ 2824]
40
Loss: 0.055910  [ 2624/ 2824]
Test Error: 
 Accuracy: 90.1%, Avg loss: 0.449353 

Epoch 37
-------------------------------
0
Lo

40
Loss: 0.059665  [ 2624/ 2824]
Test Error: 
 Accuracy: 90.1%, Avg loss: 0.338148 

Epoch 66
-------------------------------
0
Loss: 0.111756  [   64/ 2824]
10
Loss: 0.096104  [  704/ 2824]
20
Loss: 0.038139  [ 1344/ 2824]
30
Loss: 0.072330  [ 1984/ 2824]
40
Loss: 0.015943  [ 2624/ 2824]
Test Error: 
 Accuracy: 90.0%, Avg loss: 0.339235 

Epoch 67
-------------------------------
0
Loss: 0.064416  [   64/ 2824]
10
Loss: 0.045532  [  704/ 2824]
20
Loss: 0.172003  [ 1344/ 2824]
30
Loss: 0.129772  [ 1984/ 2824]
40
Loss: 0.047040  [ 2624/ 2824]
Test Error: 
 Accuracy: 90.0%, Avg loss: 0.401866 

Epoch 68
-------------------------------
0
Loss: 0.114997  [   64/ 2824]
10
Loss: 0.027294  [  704/ 2824]
20
Loss: 0.048927  [ 1344/ 2824]
30
Loss: 0.041594  [ 1984/ 2824]
40
Loss: 0.066924  [ 2624/ 2824]
Test Error: 
 Accuracy: 90.2%, Avg loss: 0.422296 

Epoch 69
-------------------------------
0
Loss: 0.043304  [   64/ 2824]
10
Loss: 0.020999  [  704/ 2824]
20
Loss: 0.078229  [ 1344/ 2824]
30
Lo

20
Loss: 0.036672  [ 1344/ 2824]
30
Loss: 0.033114  [ 1984/ 2824]
40
Loss: 0.066379  [ 2624/ 2824]
Test Error: 
 Accuracy: 90.0%, Avg loss: 0.341119 

Epoch 99
-------------------------------
0
Loss: 0.021369  [   64/ 2824]
10
Loss: 0.038266  [  704/ 2824]
20
Loss: 0.055385  [ 1344/ 2824]
30
Loss: 0.039993  [ 1984/ 2824]
40
Loss: 0.037112  [ 2624/ 2824]
Test Error: 
 Accuracy: 89.8%, Avg loss: 0.421687 

Epoch 100
-------------------------------
0
Loss: 0.090615  [   64/ 2824]
10
Loss: 0.068235  [  704/ 2824]
20
Loss: 0.144229  [ 1344/ 2824]
30
Loss: 0.061213  [ 1984/ 2824]
40
Loss: 0.055019  [ 2624/ 2824]
Test Error: 
 Accuracy: 90.0%, Avg loss: 0.522031 

Done!


In [49]:
# Put the model in evaluation mode explicitly so this cell does not rely on
# which loop happened to run last.
model.eval()
# Raw model outputs for the complete train and test sets, computed without
# gradient tracking.
with torch.no_grad():
    predictions_train = model(train_x_tensor)
    predictions_test = model(test_x_tensor)

In [50]:
def get_accuracy_multiclass(pred_arr, original_arr):
    """Return the fraction of samples whose arg-max prediction equals the label.

    Args:
        pred_arr: tensor of shape (n_samples, n_classes) holding one score per class.
        original_arr: tensor of n_samples integer class labels.

    Returns:
        The accuracy as a float in [0, 1], or ``False`` when the two inputs
        have different lengths (kept for backward compatibility).

    Raises:
        ZeroDivisionError: if n_samples is 0.
    """
    if len(pred_arr) != len(original_arr):
        return False
    # detach()/cpu() make the numpy conversion work for tensors that still
    # track gradients or live on another device.
    scores = pred_arr.detach().cpu().numpy()
    labels = original_arr.detach().cpu().numpy()

    # Predicted class = index of the highest score in each row.
    predicted = scores.argmax(axis=1)
    # Align the label layout with the predictions, e.g. (n, 1) -> (n,).
    labels = labels.reshape(predicted.shape)
    n_correct = int((predicted == labels).sum())
    return n_correct / len(predicted)

In [51]:
# Accuracy of the arg-max predictions against the label tensors, for each split.
train_acc = get_accuracy_multiclass(predictions_train,train_y_tensor)
test_acc  = get_accuracy_multiclass(predictions_test,test_y_tensor)

In [52]:
# Report both accuracies as percentages rounded to three decimals.
print(f"Training Accuracy: {round(train_acc*100,3)}")
print(f"Test Accuracy: {round(test_acc*100,3)}")

Training Accuracy: 98.123
Test Accuracy: 89.958


In [None]:
# bayesopt = BayesSearchCV(
#     ClassifierChain(xgb.XGBClassifier(), random_state=0),
#     {
#         "base_estimator__objective": Categorical(["binary:logistic"]),
#         "base_estimator__max_depth": Integer(1, 10),
#         "base_estimator__min_child_weight": Real(1e-6, 10, prior="log-uniform"),
#         "base_estimator__max_delta_step": Real(1e-6, 10, prior="log-uniform"),
#         "base_estimator__subsample": Real(1e-6, 1, prior="log-uniform"),
#         "base_estimator__tree_method": Categorical(["exact", "approx", "hist"]),
#         "base_estimator__scale_pos_weight": Real(1e-6, 10, prior="log-uniform"),
#         "base_estimator__gamma": Real(1e-6, 10, prior="log-uniform"),
#         "base_estimator__eta": Real(1e-6, 1, prior="log-uniform")
#     },
#     n_iter=250,
#     cv=5,
#     random_state=0,
#     n_jobs=1,
#     n_points=1,
#     scoring=make_scorer(multilabel_f1_wrapper),
#     verbose=1,
# )

# bayesopt.fit(train_x, train_y)

In [None]:
# best_iteration = 0
# for i in range(0, 250):
#     if bayesopt.cv_results_["mean_test_score"][i] == bayesopt.best_score_:
#         best_iteration = i
# print("Best iteration:", best_iteration)
# print("Split scores:")
# for i in range(0, 5):
#     print("", i, bayesopt.cv_results_["split"+str(i)+"_test_score"][best_iteration])
    
# print("Mean score:", bayesopt.best_score_)
# print("Best parameter combination found:", bayesopt.best_params_)

In [None]:
# model = bayesopt.best_estimator_
# model.fit(train_x, train_y) 
# pred = model.predict_proba(test_x)
# model_hl, model_acc, model_f1 = report(test_y, (pred > 0.5))

In [None]:
# dump(model, 'nn_s_aureus_raw.joblib') 

In [None]:
# fig, axes = plt.subplots(1, len(antibiotics), figsize=(len(antibiotics)*5, 5))
# fig.supxlabel("Predicted Label")
# fig.supylabel("True Label")

# cm_svm_c = multilabel_confusion_matrix(test_y, (pred > 0.5))

# for i in range(len(antibiotics)):
#   sns.heatmap(ax=axes[i], data=cm_svm_c[i], annot=True, fmt='d', cbar=None, cmap="Blues", xticklabels=["S", "R"], yticklabels=["S", "R"]).set(title=antibiotics[i])