⚠️ If you are mounting your google drive in Colab, run the following cell.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
# ! cp /content/drive/MyDrive/FYP/FYP/test/output/data.npz ./output/data.npz
# ! cp /content/drive/MyDrive/FYP/FYP/test/output/labels.npy ./output/labels.npy
! cp /content/drive/MyDrive/FYP/FYP/test/output/train_idx.npy ./output/train_idx.npy
! cp /content/drive/MyDrive/FYP/FYP/test/output/test_idx.npy ./output/test_idx.npy

In [2]:
import pandas as pd
import numpy as np

In [3]:
import torch

In [4]:
tv = torch.__version__
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-"$tv".html
!pip install torch-summary
!pip install biopython

Looking in links: https://data.pyg.org/whl/torch-2.2.1+cu121.html


In [5]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv, GCNConv, GATConv
from torch_geometric.data import Data, Dataset
from torch_geometric.data import NeighborSampler
import torch_geometric.transforms as T
from torch_cluster import random_walk
import torch.optim as optim
from torchsummary import summary

import os
import numpy as np
from tqdm import tqdm
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [6]:
result_path = f"output"
result_path

'output'

In [7]:
import os
os.listdir(result_path)

['data.npz',
 'model.pkl',
 'train_idx.npy',
 'ground_truth.txt',
 'sample_weights.npy',
 'test_idx.npy',
 'labels.npy',
 'read_ids']

In [8]:
labels = np.load(f"{result_path}/labels.npy",allow_pickle=True)
data = np.load(f'{result_path}/data.npz')

In [9]:
ground_truth = np.array(open(f"{result_path}/ground_truth.txt").read().strip().split("\n"))
ground_truth.shape

(179244,)

In [10]:
sample_weights = np.load(f"{result_path}/sample_weights.npy")
sample_weights.shape

(173603,)

In [11]:
def get_idx_maps(read_ids_file_path, truth):
    reads_truth = {}
    read_id_idx = {}
    # global read_id_idx
    with open(read_ids_file_path) as read_ids_file:
        for t, rid in tqdm(zip(truth, read_ids_file)):
            rid = rid.strip().split()[0][1:]
            reads_truth[rid] = t
            read_id_idx[rid] = len(read_id_idx)

    return reads_truth, read_id_idx

In [12]:
reads_truth, read_id_idx = get_idx_maps(f"{result_path}/read_ids", labels)

179244it [00:00, 390646.27it/s]


In [13]:
all(np.array(list(reads_truth.values())) == labels)

True

In [14]:
edges = data['edges']
comp = data['scaled']

In [15]:
comp = torch.from_numpy(comp).float()

In [16]:
id_list = np.array(list(read_id_idx.items()))
id_list

array([['08628297-d792-4b1b-8d58-40e7232f28d0', '0'],
       ['86fd9e27-f495-4b6e-8124-0c3dbdcd2c9b', '1'],
       ['62cf08b0-4463-479f-b041-f4cdbaa1c3ed', '2'],
       ...,
       ['a6741036-5494-4df6-a42c-7174d639d50c', '179241'],
       ['9a8fa5ea-bb75-44de-9789-4289e804c35f', '179242'],
       ['95e17a7a-19aa-4018-8618-ef6c4dca75d8', '179243']], dtype='<U36')

In [17]:
edges.shape

(3412328, 2)

In [18]:
edge_index = torch.tensor(edges, dtype=torch.long)
edge_index.shape

torch.Size([3412328, 2])

In [19]:
train_idx = np.load(f"{result_path}/train_idx.npy")
test_idx = np.load(f"{result_path}/test_idx.npy")
train_idx.shape, test_idx.shape

((173603,), (5641,))

In [20]:
def get_train_data(truth, mask):
    lb = LabelEncoder()
    lb.fit(truth[mask])

    y = np.full(len(truth), -1)

    y[train_idx] = lb.transform(truth[train_idx])
    y = torch.tensor(y, dtype=torch.long)

    no_classes = len(set(truth[train_idx]))

    return y, no_classes, lb

y, no_classes, encoder = get_train_data(labels, train_idx)

In [21]:
no_classes

328

In [22]:
np.unique(y[test_idx])

array([-1])

In [23]:
np.unique(y[train_idx])

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [24]:
train_idx, val_idx, weight_idx_train, weight_idx_val = train_test_split(train_idx, np.arange(len(train_idx)), test_size=0.1, random_state=42)
train_idx.shape, val_idx.shape, weight_idx_train.shape, weight_idx_val.shape

((156242,), (17361,), (156242,), (17361,))

In [25]:
sample_weights_train = sample_weights[weight_idx_train]
sample_weights_val = sample_weights[weight_idx_val]
sample_weights_train.shape, sample_weights_val.shape

((156242,), (17361,))

In [26]:
def get_graph_data(features, edges,y,train_idx,test_idx,val_idx):
    edge_index = torch.tensor(edges, dtype=torch.long)
    edge_index = edge_index.t().contiguous()

    train_indices = torch.tensor(train_idx, dtype=torch.long)
    test_indices = torch.tensor(test_idx, dtype=torch.long)
    val_indices = torch.tensor(val_idx, dtype=torch.long)

    data = Data(x=features, edge_index=edge_index, y=y)

    train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)

    train_mask[train_indices] = True
    test_mask[test_indices] = True
    val_mask[val_indices] = True

    data.train_mask = train_mask
    data.test_mask = test_mask
    data.val_mask = val_mask


    # split_ = T.RandomNodeSplit(num_val=0.1, num_test=0.2)
    # data = split_(data)
    # data = split_(data)

    return data

data = get_graph_data(comp, edges,y,train_idx,test_idx,val_idx)

In [27]:
data

Data(x=[179244, 136], edge_index=[2, 3412328], y=[179244], train_mask=[179244], test_mask=[179244], val_mask=[179244])

In [51]:
def test_accuracy(model, graph):
  model.eval()
  pred = model(data).argmax(dim=1)
  y_pred = encoder.inverse_transform(pred[data.test_mask].cpu())
  y_true = ground_truth[test_idx]
  mask_ = y_true != 'None'
  y_pred = y_pred[mask_]
  y_true = y_true[mask_]
  correct = (y_pred == y_true).sum()
  acc = int(correct) / int(mask_.sum())
  return acc

In [52]:
def train_node_classifier(model, graph, optimizer, criterion, n_epochs=200):
    train_losses = []
    val_losses = []

    for epoch in range(1, n_epochs + 1):
        model.train()
        optimizer.zero_grad()
        out = model(graph)
        loss = criterion(out[graph.train_mask], graph.y[graph.train_mask])
        # print(loss)
        weighted_train_loss = torch.mean(loss * torch.tensor(sample_weights_train))
        weighted_train_loss.backward()
        optimizer.step()

        pred = out.argmax(dim=1)
        acc = eval_node_classifier(model, graph, graph.val_mask)

        val_loss = criterion(out[graph.val_mask], graph.y[graph.val_mask])
        weighted_val_loss = torch.mean(val_loss * torch.tensor(sample_weights_val))

        train_losses.append(weighted_train_loss.item())
        val_losses.append(weighted_val_loss.item())


        weighted_train_loss_np = weighted_train_loss.detach().numpy()
        weighted_val_loss_np = weighted_val_loss.detach().numpy()

        if epoch % 1 == 0:
            print(f'Epoch: {epoch:03d}, Train Loss: {weighted_train_loss_np:.4f}, Val Acc: {acc:.4f}, Val Loss: {weighted_val_loss_np:.4f}')

        if epoch % 10 == 0:
            print(test_accuracy(model, graph))

        if acc > 0.995:
            break

    return model, train_losses, val_losses


def eval_node_classifier(model, graph, mask):
    model.eval()
    pred = model(graph).argmax(dim=1)
    correct = (pred[mask] == graph.y[mask]).sum()
    acc = int(correct) / int(mask.sum())
    return acc

In [29]:
# def train_node_classifier(model, graph, optimizer, criterion, batch_size=128, n_epochs=200):
#     train_losses = []
#     val_losses = []

#     for epoch in range(1, n_epochs + 1):
#         model.train()
#         epoch_train_loss = 0.0
#         num_batches = len(graph.train_mask) // batch_size + 1

#         for batch_start in range(0, len(graph.train_mask), batch_size):
#             batch_end = min(batch_start + batch_size, len(graph.train_mask))
#             batch_mask = graph.train_mask[batch_start:batch_end]

#             optimizer.zero_grad()
#             out = model(graph)
#             print(out.shape)
#             loss = criterion(out, graph.y[batch_mask])
#             weighted_train_loss = torch.mean(loss * torch.tensor(sample_weights_train[batch_start:batch_end]))
#             weighted_train_loss.backward()
#             optimizer.step()

#             epoch_train_loss += weighted_train_loss.item()

#         train_losses.append(epoch_train_loss / num_batches)

#         acc = eval_node_classifier(model, graph, graph.val_mask)
#         val_loss = criterion(model(graph)[graph.val_mask], graph.y[graph.val_mask])
#         weighted_val_loss = torch.mean(val_loss * torch.tensor(sample_weights_val))

#         val_losses.append(weighted_val_loss.item())

#         if epoch % 1 == 0:
#             print(f'Epoch: {epoch:03d}, Train Loss: {train_losses[-1]:.4f}, Val Acc: {acc:.4f}, Val Loss: {val_losses[-1]:.4f}')

#         if acc > 0.995:
#             break

#     return model, train_losses, val_losses


# def eval_node_classifier(model, graph, mask):
#     model.eval()
#     pred = model(graph).argmax(dim=1)
#     correct = (pred[mask] == graph.y[mask]).sum()
#     acc = int(correct) / int(mask.sum())
#     return acc

In [30]:
class GNNModel(torch.nn.Module):
    def __init__(self, in_channels, out_channels, num_layers, device):
        super(GNNModel,self).__init__()

        self.num_layers = num_layers
        # hidden_channels = (in_channels + out_channels)//2
        hidden_channels = 128

        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)

        self.fc1 = torch.nn.Linear(hidden_channels, hidden_channels//2)
        self.fc2 = torch.nn.Linear(hidden_channels//2, out_channels)

        self.device = device

        self.to(device)

    def reset_parameters(self):
        for conv in self.convs:
            conv.reset_parameters()

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = x.to(self.device)
        edge_index = edge_index.to(self.device)

        # print(x.shape, edge_index.shape)

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.fc1(x)
        x = F.relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.fc2(x)
        x = F.log_softmax(x, dim=1)
        return x

In [31]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_layers = 2

model = GNNModel(data.x.shape[1], no_classes, num_layers, device)

In [32]:
criterion = nn.CrossEntropyLoss(reduction='none')
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=10e-6)

In [33]:
data.to(device)

Data(x=[179244, 136], edge_index=[2, 3412328], y=[179244], train_mask=[179244], test_mask=[179244], val_mask=[179244])

In [34]:
model

GNNModel(
  (conv1): GCNConv(136, 128)
  (conv2): GCNConv(128, 128)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=328, bias=True)
)

In [None]:
history = train_node_classifier(model, data, optimizer, criterion, n_epochs=50)

Epoch: 001, Train Loss: 0.1320, Val Acc: 0.5888, Val Loss: 0.1315
Epoch: 002, Train Loss: 0.1310, Val Acc: 0.5960, Val Loss: 0.1305
Epoch: 003, Train Loss: 0.1296, Val Acc: 0.5949, Val Loss: 0.1298
Epoch: 004, Train Loss: 0.1280, Val Acc: 0.5916, Val Loss: 0.1284
Epoch: 005, Train Loss: 0.1272, Val Acc: 0.5986, Val Loss: 0.1266
Epoch: 006, Train Loss: 0.1263, Val Acc: 0.6004, Val Loss: 0.1249
Epoch: 007, Train Loss: 0.1254, Val Acc: 0.5980, Val Loss: 0.1248
Epoch: 008, Train Loss: 0.1243, Val Acc: 0.6033, Val Loss: 0.1250
Epoch: 009, Train Loss: 0.1236, Val Acc: 0.6026, Val Loss: 0.1220
Epoch: 010, Train Loss: 0.1226, Val Acc: 0.6015, Val Loss: 0.1221
0.11870120652945351
Epoch: 011, Train Loss: 0.1221, Val Acc: 0.6061, Val Loss: 0.1224
Epoch: 012, Train Loss: 0.1212, Val Acc: 0.6116, Val Loss: 0.1215
Epoch: 013, Train Loss: 0.1201, Val Acc: 0.6090, Val Loss: 0.1198
Epoch: 014, Train Loss: 0.1197, Val Acc: 0.6119, Val Loss: 0.1185
Epoch: 015, Train Loss: 0.1187, Val Acc: 0.6148, Val Los

In [36]:
torch.save(model, f'{result_path}/model.pkl')

In [37]:
# plot_losses(history[1], history[2])

In [38]:
test_acc = eval_node_classifier(model, data, data.test_mask)
print(f'Test Acc: {test_acc:.4f}')

Test Acc: 0.0000


In [39]:
model.eval()
pred = model(data).argmax(dim=1)

In [40]:
pred[data.test_mask], data.y[data.test_mask]

(tensor([ 23,  95,  90,  ..., 269,  23,  23]),
 tensor([-1, -1, -1,  ..., -1, -1, -1]))

In [41]:
encoder.inverse_transform(pred[data.test_mask].cpu())

array(['Bacillus spizizenii', 'Escherichia coli', 'Enterococcus faecalis',
       ..., 'Staphylococcus aureus', 'Bacillus spizizenii',
       'Bacillus spizizenii'], dtype=object)

In [42]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [43]:
mask = ground_truth != 'None'

In [44]:
y_true = ground_truth[test_idx]

In [45]:
y_pred = encoder.inverse_transform(pred[data.test_mask].cpu())
# y_true = encoder.inverse_transform(data.y[data.test_mask].cpu())

In [46]:
mask_ = y_true != 'None'
y_pred = y_pred[mask_]
y_true = y_true[mask_]

In [47]:
print(classification_report(y_true, y_pred,digits=4))

                               precision    recall  f1-score   support

          Bacillus spizizenii     0.1370    0.1598    0.1475       751
      Cryptococcus neoformans     0.0000    0.0000    0.0000       130
        Enterococcus faecalis     0.1026    0.4139    0.1644       604
             Escherichia coli     0.1405    0.0990    0.1162       788
Limosilactobacillus fermentum     0.0000    0.0000    0.0000       417
       Listeria monocytogenes     1.0000    0.0014    0.0028       722
       Pseudomonas aeruginosa     0.1140    0.0438    0.0633       707
     Saccharomyces cerevisiae     0.0000    0.0000    0.0000       103
          Salmonella enterica     0.1297    0.0917    0.1074       709
        Staphylococcus aureus     0.1368    0.1929    0.1601       705

                     accuracy                         0.1208      5636
                    macro avg     0.1761    0.1002    0.0762      5636
                 weighted avg     0.2247    0.1208    0.0954      5636



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
