In [1]:
from biodatasets import list_datasets, load_dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from torch.utils.tensorboard import SummaryWriter
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Loading data into numpy array
pathogen = load_dataset("pathogen")

X, y = pathogen.to_npy_arrays(input_names=["sequence"], target_names=["class"])

pathogen.display_description()

Dataset(name=pathogen, available_columns=['sequence', 'sequence_id', 'class'], available_embeddings={('sequence', 'protbert', 'cls')})
# "Human vs pathogen" dataset


## Dataset Description
Human vs pathogen clasification dataset.

### Dataset Summary

Human vs pathogen clasification dataset. 96k protein sequences (50% human, 50% pathogens). Extracted from Uniprot. Embeddings available calculated with ProtBert

Features:
 - sequence
 - sequence_id

Embeddings:
 - CLS embeddings - 1024-dim

Label:
 - class
  - 0: human
  - 1: pathogen

### Usage
```
from biodatasets import load_dataset

pathogen_dataset = load_dataset("pathogen")

X, y = pathogen_dataset.to_npy_array(input_names=["sequence"], target_names=["class"])
cls_embeddings = pathogen_dataset.get_embeddings("sequence", "protbert", "cls")
```

### Supported Tasks
 - clasification
 - inmunogenecity

### Model used to calculate Embeddings
 - ProtBert

### Libraries used to calculate embeddings
 - Pytorch


### Source Data

[Uniprot]

In [3]:
# Encoding Amino Acids to number
def get_seq_column_map(X):
    unique = set()
    for idx, sequence in enumerate(X[0]):
        unique.update(list(sequence))
    
    return dict(zip(unique, list(range(len(unique)))))
    
pathogen_map = get_seq_column_map(X)
print(pathogen_map)

{'T': 0, 'Y': 1, 'M': 2, 'L': 3, 'K': 4, 'H': 5, 'Z': 6, 'R': 7, 'A': 8, 'P': 9, 'F': 10, 'C': 11, 'V': 12, 'I': 13, 'S': 14, 'U': 15, 'Q': 16, 'D': 17, 'N': 18, 'B': 19, 'E': 20, 'G': 21, 'X': 22, 'W': 23}


In [4]:
class PathogenDataset(Dataset):
    
    def __init__(self, pathogen_map, data):
        self.pathogen_map = pathogen_map
        self.X = data[0]
        self.Y = self.__one_hot(data[1])
        
    def __one_hot(self, Y):
        one_hot_Y = np.zeros((Y.size, Y.max() + 1))
        one_hot_Y[np.arange(Y.size), Y] = 1
        one_hot_Y = one_hot_Y
        return one_hot_Y.astype(np.float64)
    
    def __len__(self):
        return len(self.Y)
    
    def __getitem__(self, idx):
        X = torch.as_tensor([self.pathogen_map[e] for e in list(self.X[idx])]) 
        Y = self.Y[idx]
        return X, Y

def collate_padd(batch):
        x = [row[0] for row in batch]
        y = [row[1] for row in batch]
        x =  pad_sequence(x, batch_first=True)
        return torch.as_tensor(x).to(torch.float32), torch.as_tensor(y).to(torch.float32)
    
# Split ~ 80% 10% 10%
training_set = PathogenDataset(pathogen_map,(X[0][:80000], y[0][:80000]))
training_loader = DataLoader(training_set, batch_size=8, shuffle=True, collate_fn=collate_padd)

validation_set = PathogenDataset(pathogen_map,(X[0][80000:90000], y[0][80000:90000]))
validation_loader = DataLoader(validation_set, batch_size=8, collate_fn=collate_padd)

testing_set = PathogenDataset(pathogen_map,(X[0][90000:], y[0][90000:]))
testing_loader = DataLoader(testing_set, batch_size=8, collate_fn=collate_padd)

next(iter(training_loader))

(tensor([[ 2.,  4., 17.,  ...,  0.,  0.,  0.],
         [ 2., 13., 18.,  ...,  0.,  0.,  0.],
         [ 2.,  3., 12.,  ...,  0.,  0.,  0.],
         ...,
         [ 2.,  2.,  3.,  ...,  0.,  0.,  0.],
         [17., 10., 12.,  ...,  0.,  0.,  0.],
         [ 2., 20.,  8.,  ..., 20.,  4.,  9.]]),
 tensor([[0., 1.],
         [0., 1.],
         [1., 0.],
         [0., 1.],
         [0., 1.],
         [0., 1.],
         [1., 0.],
         [1., 0.]]))

In [18]:
class Net(nn.Module):
    
    def __init__(self, input_dim=len(pathogen_map)):
        super().__init__()
        
        self.embed = nn.Embedding(
            num_embeddings=input_dim,
            embedding_dim=512,
        )
        
        self.lstm = nn.LSTM(
            input_size=512,
            hidden_size=256,
            num_layers=1,
            batch_first=True,
        )
        
        self.linear_1 = nn.Linear(
            in_features=256,
            out_features=128,
        )
        
        self.dropout = nn.Dropout(p=0.25)
        
        self.linear_2 = nn.Linear(
            in_features=128,
            out_features=2,
        )
        
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):  
        embed = self.embed(x)
        
        lstm_1_seq, (lstm_1_h, lstm1_c) = self.lstm(embed)
        
        linear_1 = self.linear_1(torch.squeeze(lstm_1_h))
        dropout = self.dropout(linear_1)
        linear_2 = self.linear_2(dropout)
        
        softmax = self.softmax(linear_2)
        
        return softmax

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"device : {device}")
torch.cuda.get_device_name()

device : cuda


'NVIDIA GeForce RTX 2060 SUPER'

In [19]:
model = Net().cuda()
print(model)

writer = SummaryWriter()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(5):
    model.train()
    tqdm_bar = tqdm(training_loader, desc=f"epoch {epoch}", position=0)
    
    # Training
    for idx, (inputs, labels) in enumerate(tqdm_bar):
        inputs = inputs.cuda()
        labels = labels.cuda()
        
        optimizer.zero_grad()
        
        outputs = model(inputs.to(torch.int32))
        
        loss = criterion(outputs, labels).to(torch.float32)
        loss.backward()
        
        
        writer.add_scalar('Loss/train', loss, idx)
        
        # Training Accuracy
        correct, total = 0, 0
        _, predicted = torch.max(outputs, 1)
        _, y = torch.max(labels, 1)

        total += labels.size(0)
        correct += (predicted == y).sum().item()
        writer.add_scalar('accuracy/train', correct/total, idx)
        
        optimizer.step()
        
    # Validation Accuracy
    model.eval()
    with torch.no_grad():
        for idx, (inputs, labels) in enumerate(validation_loader):
            correct, total = 0, 0
            inputs = inputs.cuda()
            labels = labels.cuda()

            outputs = model(inputs.to(torch.int32))

            _, predicted = torch.max(outputs, 1)
            _, y = torch.max(labels, 1)

            total += labels.size(0)
            correct += (predicted == y).sum().item()
            writer.add_scalar('accuracy/validation', correct/total, idx)

writer.close()

Net(
  (embed): Embedding(24, 512)
  (lstm): LSTM(512, 256, batch_first=True)
  (linear_1): Linear(in_features=256, out_features=128, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
  (linear_2): Linear(in_features=128, out_features=2, bias=True)
  (softmax): Softmax(dim=1)
)


epoch 0: 100%|████████████████████████████| 10000/10000 [06:15<00:00, 26.65it/s]
epoch 1: 100%|████████████████████████████| 10000/10000 [05:39<00:00, 29.43it/s]
epoch 2: 100%|████████████████████████████| 10000/10000 [05:36<00:00, 29.74it/s]
epoch 3: 100%|████████████████████████████| 10000/10000 [05:36<00:00, 29.71it/s]
epoch 4: 100%|████████████████████████████| 10000/10000 [05:36<00:00, 29.71it/s]


In [20]:
!tensorboard --logdir=runs

2022-07-18 13:42:04.056767: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-18 13:42:04.056787: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-07-18 13:42:05.143385: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-18 13:42:05.143653: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-18 13:42:05.143687: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.s

In [22]:
PATH = './pathogen_net.pth'
torch.save(model.state_dict(), PATH)

In [23]:
model = Net(input_dim).cuda()
model.load_state_dict(torch.load(PATH))
model.eval()


# Testing Accuracy
correct, total = 0, 0
with torch.no_grad():
    for (inputs, labels) in testing_loader:
        inputs = inputs.cuda()
        labels = labels.cuda()
        
        outputs = model(inputs.to(torch.int32))
        
        _, predicted = torch.max(outputs, 1)
        _, y = torch.max(labels, 1)
        
        total += labels.size(0)
        correct += (predicted == y).sum().item()
        
print(f'Accuracy of nn: {correct / total}')

Accuracy of nn: 0.8643934632306726
