In [1]:
from pathlib import Path
from collections import OrderedDict as odict

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
from torch.utils.data import DataLoader, Subset
from torch import nn, optim
from torchvision import transforms, models

from utils import ProteinDataset


import resource
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (2048, rlimit[1]))

In [2]:
root_dir = Path('/home/xmiler/projects/human-protein-atlas-image-classification/input/')
arch = models.resnet50(pretrained=True)
device_id = 0
score_threshold = 0.5

In [3]:
# DATALOADER
# split
dataset_size = pd.read_csv(root_dir/'train.csv').shape[0]
idxs = {'train': None, 'val': None}
idxs['train'], idxs['val'] = train_test_split(np.arange(dataset_size), random_state=0, test_size=0.1)

# transforms
data_transform = transforms.Compose([    
    transforms.Resize(224),
    transforms.ToTensor()
])

# datasets
datasets = {phase: ProteinDataset(root_dir / 'train.csv', root_dir / 'train', colors=['red', 'green', 'blue'],
                                  idxs=idxs[phase], transforms=data_transform) for phase in ['train', 'val']}

# dataloaders

dataloaders = {phase: DataLoader(datasets[phase], batch_size=16, num_workers=2, shuffle=(phase=='train')) 
               for phase in ['train', 'val']}

In [4]:
# MODEL

class Flatten(nn.Module):
    "Flatten `x` to a single dimension, often used at the end of a model."
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x.view((x.size(0), -1)) 

body = nn.Sequential(*list(arch.children())[:-1])
head = nn.Sequential(Flatten(), nn.Linear(2048, 28))

model = nn.Sequential(odict([('body', body), ('head', head)]))
model.to(device_id)
pass

In [5]:
num_epochs = 1
lrs = [0., 0.1]

# optimizer
assert len(lrs) == 2
optimizer = optim.SGD([{'params': model.body.parameters(), 'lr': None},
                       {'params': model.head.parameters(), 'lr': None}],
                      momentum=0.9)

criterion = nn.MultiLabelSoftMarginLoss()

In [6]:
class Trainer:
    def __init__(self, dataloaders, model, optimizer, criterion):
        self._dataloaders = dataloaders
        
        self._model=model
        
        self._optimizer = optimizer
        self._criterion = criterion
        
        self._iter_num = -1
        
        self.statistics = {
            'batch': {'train': {'iter': [], 'lrs': [], 'loss': []}},
            'epoch': {'train': {'iter': [], 'loss': [], 'f1_macro': []},
                      'val': {'iter': [], 'loss': [], 'f1_macro': []}}
        }
            
    def get_iteration(self):
        return self._iter_num
    
    def train_iteration(self, inputs, labels, lrs):
        self._iter_num += 1
        
        inputs = inputs.to(device_id)
        labels = labels.float().to(device_id)
        
        for i in range(2):
            optimizer.param_groups[i]['lr'] = lrs[i]
        
        self._optimizer.zero_grad()
        
        with torch.set_grad_enabled(True):
            outputs = self._model(inputs)
            loss = self._criterion(outputs, labels)

            loss.backward()
            self._optimizer.step()
            
            self.statistics['batch']['train']['iter'].append(self._iter_num)
            self.statistics['batch']['train']['lrs'].append(lrs)
            self.statistics['batch']['train']['loss'].append(loss.item())
            
    def test_epoch(self, phase):
        self.statistics['epoch'][phase]['iter'].append(self.get_iteration())
        
        loss_epoch = 0
        gt_epoch = []
        preds_epoch = []
        with torch.set_grad_enabled(False):
            for inputs, labels in self._dataloaders[phase]:
                gt_epoch.append(labels.data.numpy())                
                
                inputs = inputs.to(device_id)
                labels = labels.float().to(device_id)
                
                outputs = model(inputs)
                
                preds_epoch.append(outputs.sigmoid().cpu().data.numpy())
                
                loss = criterion(outputs, labels)
                
                loss_epoch += loss.item() * inputs.size(0)
                
        loss_epoch /= len(self._dataloaders[phase].dataset)                
        self.statistics['epoch'][phase]['loss'].append(loss_epoch)
        
        preds_epoch = np.concatenate(preds_epoch)
        gt_epoch = np.concatenate(gt_epoch)        
        self.statistics['epoch'][phase]['f1_macro'].append(f1_score(gt_epoch, preds_epoch > score_threshold, average='macro'))
        

In [7]:
trainer = Trainer(dataloaders, model, optimizer, criterion)

In [8]:
%%time

trainer.test_epoch('train')

CPU times: user 59.2 s, sys: 19.9 s, total: 1min 19s
Wall time: 2min 11s


In [9]:
trainer.statistics

{'batch': {'train': {'iter': [], 'loss': [], 'lrs': []}},
 'epoch': {'train': {'f1_macro': [0.0895852309352233],
   'iter': [-1],
   'loss': [0.684914509066013]},
  'val': {'f1_macro': [], 'iter': [], 'loss': []}}}

In [10]:
%%time

trainer.test_epoch('val')

CPU times: user 6.3 s, sys: 2.26 s, total: 8.56 s
Wall time: 18.1 s


In [11]:
trainer.statistics

{'batch': {'train': {'iter': [], 'loss': [], 'lrs': []}},
 'epoch': {'train': {'f1_macro': [0.0895852309352233],
   'iter': [-1],
   'loss': [0.684914509066013]},
  'val': {'f1_macro': [0.08973321109446879],
   'iter': [-1],
   'loss': [0.6846367205125178]}}}