In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torchvision.transforms as transforms


from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from einops import rearrange  # ! pip install einops

In [2]:
class ImageDataset(Dataset):
    def __init__(self, csv, train, test):

        self.csv = csv
        self.train = train
        self.test = test
        self.all_image_names = self.csv[:]["filename"]
        self.all_labels = np.array(self.csv.drop(["filename", "ind"], axis=1))
        self.train_ratio = int(0.85 * len(self.csv))
        self.valid_ratio = len(self.csv) - self.train_ratio

        # set the training data images and labels
        if self.train == True:
            print(f"Number of training images: {self.train_ratio}")
            self.image_names = list(self.all_image_names[: self.train_ratio])
            self.labels = list(self.all_labels[: self.train_ratio])
            # define the training transforms
            self.transform = transforms.Compose([transforms.ToTensor(),])

        # set the validation data images and labels
        elif self.train == False and self.test == False:
            print(f"Number of validation images: {self.valid_ratio}")
            self.image_names = list(self.all_image_names[-self.valid_ratio : -10])
            self.labels = list(self.all_labels[-self.valid_ratio :])
            # define the validation transforms
            self.transform = transforms.Compose([transforms.ToTensor(),])

        # set the test data images and labels, only last 10 images
        # this, we will use in a separate inference script
        elif self.test == True and self.train == False:
            self.image_names = list(self.all_image_names[-10:])
            self.labels = list(self.all_labels[-10:])
            # define the test transforms
            self.transform = transforms.Compose([transforms.ToTensor(),])

    def __len__(self):
        return len(self.image_names)

    def __getitem__(self, index):
        image = cv2.imread(f"../data/raw_images/{self.image_names[index]}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # apply image transforms
        image = self.transform(image)
        targets = self.labels[index]

        return {
            "image": torch.tensor(image, dtype=torch.float32),
            "label": torch.tensor(targets, dtype=torch.float32),
        }

In [3]:
# Used in creating data loaders as well as in train loops
BATCH_SIZE = 1

In [4]:
train_csv = pd.read_csv("../data/processed/train.csv")
# train dataset
train_data = ImageDataset(train_csv[:1000], train=True, test=False)
# validation dataset
valid_data = ImageDataset(train_csv[:1000], train=False, test=False)
# train data loader
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
# validation data loader
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False)

Number of training images: 850
Number of validation images: 150


### Model definition

In [5]:
N_EPHOCS = 50


LR = 0.001
PATCH_SIZE = 16

In [6]:
class Transformer_encoder(nn.Module):
    def __init__(self, hidden_size, nb_heads):
        super(Transformer_encoder, self).__init__()
        assert hidden_size == nb_heads * (
            hidden_size // nb_heads
        )  # check if hidden_size is divisible by nb_heads
        self.MHA = nn.MultiheadAttention(hidden_size, nb_heads)
        self.LLcat = nn.Linear(2 * hidden_size, hidden_size)
        self.LL1 = nn.Linear(hidden_size, hidden_size)
        self.LL2 = nn.Linear(hidden_size, hidden_size)
        self.LN1 = nn.LayerNorm(hidden_size)
        self.LN2 = nn.LayerNorm(hidden_size)

    def forward(self, h_cat, pos=None):
        #         seq_length = g_seq.size(0)
        #         bs = g_seq.size(1)
        #         pos = pos.unsqueeze(dim=1).repeat_interleave(
        #             bs, dim=1
        #         )  # size=(seq_length, bs, hidden_dim)
        #         h_cat = self.LLcat(
        #             torch.cat((g_seq, pos), dim=2)
        #         )  # size=(seq_length, bs, hidden_dim)

        h_MHA_seq, _ = self.MHA(
            h_cat, h_cat, h_cat
        )  # size=(seq_length, bs, hidden_dim)
        h = self.LN1(h_cat + h_MHA_seq)  # size=(1, bs, hidden_dim) 2
        h_MLP = self.LL2(torch.relu(self.LL1(h)))  # size=(1, bs, hidden_dim)
        h_seq = self.LN2(h + h_MLP)  # size=(1, bs, hidden_dim)
        return h_seq


class ANN(nn.Module):
    def __init__(self, hidden_size, nb_heads):
        super(ANN, self).__init__()
        self.encoder = Transformer_encoder(hidden_size, nb_heads)

    def forward(self, g_seq, pos=None):
        h_enc_seq = self.encoder(g_seq, pos)  # size=(seq_length, bs, hidden_dim)
        return h_enc_seq


class attention_net(nn.Module):
    def __init__(self, hidden_size, nb_heads, no_classes):
        super(attention_net, self).__init__()

        self.emb_dropout = nn.Dropout(0)
        self.cls_token = nn.Parameter(torch.randn(1, 1, hidden_size))
        self.pos_emb1D = nn.Parameter(torch.randn(144 + 1, hidden_size))
        self.mlp_head = nn.Linear(hidden_size, no_classes)

        self.learnable_patches = nn.Linear(hidden_size, hidden_size)
        self.layer2 = ANN(hidden_size, nb_heads)
        self.layer3 = nn.Linear(hidden_size, no_classes)

    def expand_cls_to_batch(self, batch):
        """
        Args:
            batch: batch size
        Returns: cls token expanded to the batch size
        """
        return self.cls_token.expand([batch, -1, -1])

    def forward(self, img, pos=None):
        bs = img.shape[0]
        img_patches = rearrange(
            img,
            "b c (patch_x x) (patch_y y) -> b (x y) (patch_x patch_y c)",
            patch_x=PATCH_SIZE,
            patch_y=PATCH_SIZE,
        )

        # positional embedding
        img_patches = self.learnable_patches(img_patches)
        img_patches = torch.cat((self.expand_cls_to_batch(bs), img_patches), dim=1)

        patch_embeddings = self.emb_dropout(img_patches + self.pos_emb1D)

        h_seq = self.layer2(patch_embeddings, pos)  # size=(seq_length, bs, hidden_dim)
        score_seq = self.layer3(h_seq[:, 0, :])  # size=(seq_length, bs, vocab_size)
        return score_seq

### Check if inference works correctly

In [7]:
nb_heads = 16
hidden_size = 768
no_classes = 6
net = attention_net(hidden_size, nb_heads, no_classes)
net

attention_net(
  (emb_dropout): Dropout(p=0, inplace=False)
  (mlp_head): Linear(in_features=768, out_features=6, bias=True)
  (learnable_patches): Linear(in_features=768, out_features=768, bias=True)
  (layer2): ANN(
    (encoder): Transformer_encoder(
      (MHA): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
      )
      (LLcat): Linear(in_features=1536, out_features=768, bias=True)
      (LL1): Linear(in_features=768, out_features=768, bias=True)
      (LL2): Linear(in_features=768, out_features=768, bias=True)
      (LN1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (LN2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
  )
  (layer3): Linear(in_features=768, out_features=6, bias=True)
)

In [8]:
dataiter = iter(train_loader)
images, labels = dataiter.next().values()
seq_length = 144
scores = net(images.view(1, 3, 144, 256))
scores.shape
print(scores)
scores = torch.sigmoid(scores)
scores

tensor([[ 0.1850, -0.8191,  0.5299, -0.1271,  0.7203, -0.7072]],
       grad_fn=<AddmmBackward>)




tensor([[0.5461, 0.3059, 0.6295, 0.4683, 0.6727, 0.3302]],
       grad_fn=<SigmoidBackward>)

### Model instantiation

In [9]:
import torch.optim as optim

# Device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Model instance
model = attention_net(hidden_size, nb_heads, no_classes)
model = model.to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=LR)


def get_error(outputs, labels, batch_size):
    lab = torch.where(outputs >= 0.5, 1, 0)
    indicator = torch.where(lab != labels, 1, 0)
    non_matches = torch.sum(indicator, axis=0)
    error = non_matches.float() / batch_size
    error = error.sum() / 6
    return error.item()

### Train

In [10]:
for epoch in range(50):  # loop over the dataset multiple times
    train_running_loss = 0.0
    train_err = 0.0
    model.train()

    # TRAINING ROUND
    for i, data in enumerate(train_loader):
        # zero the parameter gradients
        optimizer.zero_grad()

        # get the inputs
        inputs, labels = data.values()
        inputs = inputs.view(BATCH_SIZE, 3, 144, 256)
        inputs = inputs.to(device)
        labels = labels.to(device)
        # forward + backward + optimize
#         outputs = model(inputs.view(BATCH_SIZE, 3, 144, 256),)
        outputs = model(inputs,)
        outputs = torch.sigmoid(outputs)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_running_loss += loss.detach().item()
        train_err += get_error(outputs.detach(), labels, BATCH_SIZE)

    model.eval()
    print(
        "Epoch:  %d | Loss: %.4f | Train Error: %.4f"
        % (epoch, train_running_loss / i, train_err / i)
    )



Epoch:  0 | Loss: 0.4931 | Train Error: 0.2348
Epoch:  1 | Loss: 0.4563 | Train Error: 0.2236
Epoch:  2 | Loss: 0.4504 | Train Error: 0.2197
Epoch:  3 | Loss: 0.4481 | Train Error: 0.2216
Epoch:  4 | Loss: 0.4492 | Train Error: 0.2173
Epoch:  5 | Loss: 0.4456 | Train Error: 0.2187
Epoch:  6 | Loss: 0.4458 | Train Error: 0.2173
Epoch:  7 | Loss: 0.4452 | Train Error: 0.2152
Epoch:  8 | Loss: 0.4455 | Train Error: 0.2173
Epoch:  9 | Loss: 0.4431 | Train Error: 0.2155
Epoch:  10 | Loss: 0.4438 | Train Error: 0.2132
Epoch:  11 | Loss: 0.4458 | Train Error: 0.2167
Epoch:  12 | Loss: 0.4434 | Train Error: 0.2140
Epoch:  13 | Loss: 0.4435 | Train Error: 0.2163
Epoch:  14 | Loss: 0.4432 | Train Error: 0.2197
Epoch:  15 | Loss: 0.4423 | Train Error: 0.2167
Epoch:  16 | Loss: 0.4431 | Train Error: 0.2173
Epoch:  17 | Loss: 0.4432 | Train Error: 0.2159
Epoch:  18 | Loss: 0.4427 | Train Error: 0.2140
Epoch:  19 | Loss: 0.4426 | Train Error: 0.2126
Epoch:  20 | Loss: 0.4429 | Train Error: 0.2155
Ep

### Test Accuracy

In [11]:
test_err = 0.0
for i, data in enumerate(valid_loader, 0):
    inputs, labels = data.values()

    outputs = model(inputs.view(BATCH_SIZE, 3, 144, 256).to(device))
    outputs = torch.sigmoid(outputs)
    #     print((outputs.detach() > 0.5).float(), labels)

    test_err += get_error(outputs.detach(), labels, BATCH_SIZE)

print("Test Error: %.4f" % (test_err / i))



RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking arugment for argument mat2 in method wrapper_mm)