In [1]:
from biodatasets import list_datasets, load_dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from torch.utils.tensorboard import SummaryWriter
import pandas as pd
import numpy as np
from tqdm import tqdm
import math
import time
from sklearn.metrics import confusion_matrix

In [2]:
# Loading data into numpy array
pathogen = load_dataset("pathogen")

X, y = pathogen.to_npy_arrays(input_names=["sequence"], target_names=["class"])

pathogen.display_description()

# "Human vs pathogen" dataset


## Dataset Description
Human vs pathogen clasification dataset.

### Dataset Summary

Human vs pathogen clasification dataset. 96k protein sequences (50% human, 50% pathogens). Extracted from Uniprot. Embeddings available calculated with ProtBert

Features:
 - sequence
 - sequence_id

Embeddings:
 - CLS embeddings - 1024-dim

Label:
 - class
  - 0: human
  - 1: pathogen

### Usage
```
from biodatasets import load_dataset

pathogen_dataset = load_dataset("pathogen")

X, y = pathogen_dataset.to_npy_array(input_names=["sequence"], target_names=["class"])
cls_embeddings = pathogen_dataset.get_embeddings("sequence", "protbert", "cls")
```

### Supported Tasks
 - clasification
 - inmunogenecity

### Model used to calculate Embeddings
 - ProtBert

### Libraries used to calculate embeddings
 - Pytorch


### Source Data

[Uniprot](https://www.uniprot.org/)


### Dataset Curators

[DeepChain team](https://deepchain.bio)

### Licensing Information
[Creative Commons

In [3]:
# Encoding Amino Acids to number
def get_seq_column_map(X):
    unique = set()
    for idx, sequence in enumerate(X[0]):
        unique.update(list(sequence))
    
    return dict(zip(unique, list(range(len(unique)))))
    
pathogen_map = get_seq_column_map(X)
print(pathogen_map)

{'G': 0, 'Q': 1, 'H': 2, 'D': 3, 'Y': 4, 'B': 5, 'F': 6, 'E': 7, 'Z': 8, 'X': 9, 'S': 10, 'U': 11, 'A': 12, 'M': 13, 'N': 14, 'V': 15, 'W': 16, 'T': 17, 'L': 18, 'P': 19, 'I': 20, 'K': 21, 'R': 22, 'C': 23}


In [4]:
class PathogenDataset(Dataset):
    
    def __init__(self, pathogen_map, data):
        self.pathogen_map = pathogen_map
        self.X = data[0]
        self.Y = data[1]
    
    def __len__(self):
        return len(self.Y)
    
    def __getitem__(self, idx):
        X = torch.as_tensor([self.pathogen_map[e] for e in list(self.X[idx])]) 
        Y = self.Y[idx]
        return X, Y

def collate_padd(batch):
        x = [row[0] for row in batch]
        y = [row[1] for row in batch]
        
        sequence_len = [len(row) for row in x]
        x =  pad_sequence(x, batch_first=True)
        return (torch.as_tensor(x).to(torch.float32), torch.as_tensor(sequence_len)), torch.as_tensor(y).to(torch.float32)
    
# Split ~ 80% 10% 10%
training_set = PathogenDataset(pathogen_map,(X[0][:80000], y[0][:80000]))
training_loader = DataLoader(training_set, batch_size=4, shuffle=True, collate_fn=collate_padd)

validation_set = PathogenDataset(pathogen_map,(X[0][80000:90000], y[0][80000:90000]))
validation_loader = DataLoader(validation_set, batch_size=8, collate_fn=collate_padd)

testing_set = PathogenDataset(pathogen_map,(X[0][90000:], y[0][90000:]))
testing_loader = DataLoader(testing_set, batch_size=8, collate_fn=collate_padd)

next(iter(training_loader))

((tensor([[13., 18., 10.,  ...,  0.,  0.,  0.],
          [13.,  0., 12.,  ...,  0.,  0.,  0.],
          [ 3., 20., 15.,  ...,  0.,  0.,  0.],
          [13., 18., 18.,  ...,  3., 21., 12.]]),
  tensor([ 186,  507,  113, 1145])),
 tensor([1., 1., 0., 1.]))

In [5]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [6]:
class Net(nn.Module):
    """
    Text classifier based on a pytorch TransformerEncoder.
    """

    def __init__(
        self,
        vocab_size,
        d_model,
        nhead=8,
        dim_feedforward=512,
        num_layers=6,
        activation="relu",
        dropout=0.1,
    ):

        super().__init__()

        #vocab_size, d_model = embeddings.size()
        assert d_model % nhead == 0, "nheads must divide evenly into d_model"

        #self.emb = nn.Embedding.from_pretrained(embeddings, freeze=False)

        self.embed = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=d_model,
        )
        
        self.pos_encoder = PositionalEncoding(
            d_model=d_model,
            max_len=11000,
        )

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers,
        )
        
        self.dropout = nn.Dropout(p=0.25)
        
        self.classifier = nn.Linear(d_model, 1)
        
        self.d_model = d_model

    def forward(self, x):
        x = self.embed(x) * math.sqrt(self.d_model)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)
        
        x = self.dropout(x)
        
        x = self.classifier(x)

        return x


In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"device : {device}")
torch.cuda.get_device_name()

device : cuda


'NVIDIA GeForce RTX 2060 SUPER'

In [8]:
model = Net(
    vocab_size=len(pathogen_map),
    d_model=1024,
    nhead=8,  
    dim_feedforward=50,
    num_layers=6,
    dropout=0.5,
).to(device)

print(model)

writer = SummaryWriter()

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(3):
    tqdm_bar = tqdm(training_loader, desc=f"epoch {epoch}", position=0)
    
    long = []
    
    # Training
    model.train()
    for idx, ((inputs, sequence_len), labels) in enumerate(tqdm_bar):
        long.extend([i for i in sequence_len])
        
        inputs = inputs.cuda()
        labels = labels.cuda()
        
        optimizer.zero_grad()
        #sigmoid = nn.Sigmoid()
        outputs = model(inputs.to(torch.int32))
        
        #print(outputs.flatten(), labels)
        
        loss = criterion(outputs.flatten(), labels).to(torch.float32)
        loss.backward()
        
        optimizer.step()
        
        writer.add_scalar('Loss/train', loss, idx)
        
    # Training Accuracy
    correct, total = 0, 0
    predicted = torch.round(outputs.flatten())
    y = labels

    total += labels.size(0)
    correct += (predicted == y).sum().item()
    writer.add_scalar('accuracy/train', correct/total, idx)

    """
    # Validation Accuracy
    model.eval()
    with torch.no_grad():
        for idx, (inputs, labels) in enumerate(validation_loader):
            correct, total = 0, 0
            inputs = inputs.cuda()
            labels = labels.cuda()

            outputs = model(inputs.to(torch.int32))

            _, predicted = torch.max(outputs, 1)
            _, y = torch.max(labels, 1)

            total += labels.size(0)
            correct += (predicted == y).sum().item()
            writer.add_scalar('accuracy/validation', correct/total, idx)
    """
print(long)
writer.close()

Net(
  (embed): Embedding(24, 1024)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
        )
        (linear1): Linear(in_features=1024, out_features=50, bias=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (linear2): Linear(in_features=50, out_features=1024, bias=True)
        (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.5, inplace=False)
        (dropout2): Dropout(p=0.5, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=

2022-07-22 11:26:42.779584: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-22 11:26:42.779603: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
epoch 0:   3%|▉                             | 596/20000 [01:04<35:02,  9.23it/s]


RuntimeError: CUDA out of memory. Tried to allocate 112.00 MiB (GPU 0; 7.79 GiB total capacity; 5.64 GiB already allocated; 135.62 MiB free; 6.13 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
!tensorboard --logdir=runs

In [None]:
PATH = './pathogen_net_transformer.pth'
torch.save(model.state_dict(), PATH)

In [None]:
model = Net(
    vocab_size=len(pathogen_map),
    d_model=512,
    nhead=8,  
    dim_feedforward=50,
    num_layers=6,
    dropout=0.25
).to(device)
model.load_state_dict(torch.load(PATH))
model.eval()


# Testing Accuracy
correct, total = 0, 0
with torch.no_grad():
    all_predicted, all_y = [], []
    for ((inputs, sequence_len), labels) in testing_loader:
        inputs = inputs.cuda()
        labels = labels.cuda()
        
        sigmoid = nn.Sigmoid()
        outputs = sigmoid(model(inputs.to(torch.int32)))
        
        predicted = torch.round(outputs.flatten())
        y = labels
                
        all_predicted.extend(predicted.tolist())
        all_y.extend(y.tolist())
        
        total += labels.size(0)
        correct += (predicted == y).sum().item()

print(confusion_matrix(all_y, all_predicted))
print(f'Accuracy of nn: {correct / total}')