In [1]:
import numpy as np
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
%matplotlib inline
from torchsummary import summary

import scipy as sp
from functools import partial
from sklearn import metrics
from collections import Counter
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, f1_score
import pretrainedmodels
import torch
from torch.utils.data import TensorDataset, DataLoader,Dataset
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
from torch.optim import lr_scheduler
import time 
from tqdm import tqdm_notebook as tqdm
from PIL import Image
train_on_gpu = True
from torch.utils.data.sampler import SubsetRandomSampler
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR

import cv2
import albumentations
from albumentations import torch as AT

device = torch.device("cuda:0")

In [2]:
print(os.listdir('../'))

['aptos2019-blindness-detection.zip', 'test_images.zip', 'resnet50-regression.bin', 'train_images.zip', 'aptos']


In [3]:
PATH = "../aptos/"
input_size = (3, 256, 256)

In [4]:
train = pd.read_csv(PATH+'train.csv')
test = pd.read_csv(PATH+'test.csv')
sample_submission = pd.read_csv(PATH+'sample_submission.csv')

In [5]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
def prepare_labels(y):
    # From here: https://www.kaggle.com/pestipeti/keras-cnn-starter
    values = np.array(y)
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)

    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

    y = onehot_encoded
    return y, label_encoder

In [6]:
y, le = prepare_labels(train['diagnosis'])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [7]:
class GlassDataset(Dataset):
    def __init__(self, df, datatype='train', transform = transforms.Compose([transforms.CenterCrop(32),transforms.ToTensor()]), y = None):
        self.df = df
        self.datatype = datatype
        self.image_files_list = [f'../aptos/{self.datatype}_images/{i}.png' for i in df['id_code'].values]
        if self.datatype == 'train':
            self.labels = y
        else:
            self.labels = np.zeros((df.shape[0], 5))
        self.transform = transform

    def __len__(self):
        return len(self.image_files_list)

    def __getitem__(self, idx):
        img_name = self.image_files_list[idx]
        img = cv2.imread(img_name)
        if img.shape[2] == 1:
            img = np.stack((img[..., 0],) * 3, axis=-1)
            
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        image = self.transform(image=img)
        image = image['image']

        img_name_short = self.image_files_list[idx].split('.')[0]

        label = self.labels[idx]
        if self.datatype == 'test':
            return image, label, img_name
        else:
            return image, label

In [8]:
class model_(nn.Module):
    def __init__(self, feature=2048, num_classes=5, inchannels=3, model_name='resnet101',pooling='concat'):
        super().__init__()
        self.model_name = model_name
        self.feature = feature
        self.pooling = pooling
        
        if model_name in pretrainedmodels.model_names:
            self.base = pretrainedmodels.__dict__[self.model_name](num_classes=1000, pretrained='imagenet')
            #self.base = torchvision.models.resnet50(pretrained=False)
            self.base = torch.nn.Sequential(*(list(self.base.children())[:-2]))
            #self.base.load_state_dict(torch.load(PATH+"checkpoint/resnet50-regression-0.7.bin"))
        else:
            assert False, "{} is error".format(model_name)
        
        if self.pooling == 'concat':
            print('Concatenated pooling')
            self.ap = nn.AdaptiveAvgPool2d((1,1))
            self.mp = nn.AdaptiveMaxPool2d((1,1))
            self.bn0 = nn.BatchNorm1d(self.feature*2,eps=1e-05, momentum=0.1, affine=True)
            self.dropout0 = nn.Dropout(0.35)
            self.fc1 = nn.Linear(self.feature*2, int(self.feature/2))
            self.bn1 = nn.BatchNorm1d(int(self.feature/2),eps=1e-05, momentum=0.1, affine=True)
            self.dropout1 = nn.Dropout(0.35)
            self.fc2 = nn.Linear(int(self.feature/2), num_classes)
        else:
            if self.pooling == 'average': 
                self.ap = nn.AdaptiveAvgPool2d((1,1))
                print('Average pooling')
            if self.pooling == 'max': 
                self.mp = nn.AdaptiveMaxPool2d((1,1))
                print('Max pooling')
            self.bn0 = nn.BatchNorm1d(self.feature,eps=1e-05, momentum=0.1, affine=True)
            self.dropout0 = nn.Dropout(0.35)
            self.fc1 = nn.Linear(self.feature, int(self.feature/2))
            self.bn1 = nn.BatchNorm1d(int(self.feature/2),eps=1e-05, momentum=0.1, affine=True)
            self.dropout1 = nn.Dropout(0.35)
            self.fc2 = nn.Linear(int(self.feature/2), num_classes)
            
    def forward(self, x):
        x = self.base(x)
        
        if self.pooling == 'concat':
            ap = self.ap(x)
            mp = self.mp(x)
            x = torch.cat((ap,mp),dim=1)
            x = x.view(x.size(0), -1)  #Flatten
            
        if self.pooling == 'max':
            x = self.mp(x)
            x = x.view(x.size(0), -1)
            
        if self.pooling == 'average':
            x = self.ap(x)
            x = x.view(x.size(0), -1)
    
        x = self.bn0(x)
        x = self.dropout0(x)
        x = F.relu(self.fc1(x))
        x = self.bn1(x)
        x = self.dropout1(x)         
        x = self.fc2(x)
        
        return x


In [9]:
model_name = 'resnet50'
model = model_(num_classes=5,model_name=model_name,pooling='concat')
#print(model)
#model.load_state_dict(torch.load(PATH+"checkpoint/Classification-resnet50-0.7250374948870608.pt"))
model = model.to(device)
summary(model, input_size=input_size)

Concatenated pooling
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 128, 128]           9,408
       BatchNorm2d-2         [-1, 64, 128, 128]             128
              ReLU-3         [-1, 64, 128, 128]               0
         MaxPool2d-4           [-1, 64, 64, 64]               0
            Conv2d-5           [-1, 64, 64, 64]           4,096
       BatchNorm2d-6           [-1, 64, 64, 64]             128
              ReLU-7           [-1, 64, 64, 64]               0
            Conv2d-8           [-1, 64, 64, 64]          36,864
       BatchNorm2d-9           [-1, 64, 64, 64]             128
             ReLU-10           [-1, 64, 64, 64]               0
           Conv2d-11          [-1, 256, 64, 64]          16,384
      BatchNorm2d-12          [-1, 256, 64, 64]             512
           Conv2d-13          [-1, 256, 64, 64]          16,384
      BatchNorm2d-

In [11]:
print(model)

model_(
  (base): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace)
        (downsample): Sequential(
          (0): Conv2d(64, 256, kernel_si

In [10]:
data_transforms = albumentations.Compose([
    albumentations.Resize(input_size[1], input_size[2]),
    albumentations.HorizontalFlip(),
    albumentations.RandomBrightness(),
    albumentations.ShiftScaleRotate(rotate_limit=15, scale_limit=0.10),
    albumentations.JpegCompression(80),
    albumentations.HueSaturationValue(),
    albumentations.Normalize(),
    AT.ToTensor()
    ])
data_transforms_test = albumentations.Compose([
    albumentations.Resize(input_size[1], input_size[2]),
    albumentations.Normalize(),
    AT.ToTensor()
    ])

dataset = GlassDataset(df=train, datatype='train', transform=data_transforms, y=y)
test_set = GlassDataset(df=test, datatype='test', transform=data_transforms_test)
tr, val = train_test_split(train.diagnosis, stratify=train.diagnosis, test_size=0.15)
train_sampler = SubsetRandomSampler(list(tr.index))
valid_sampler = SubsetRandomSampler(list(val.index))
batch_size = 24
test_batch_size = 32
num_workers = 0
# prepare data loaders (combine dataset and sampler)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler, num_workers=num_workers)
valid_loader = torch.utils.data.DataLoader(dataset, batch_size=1, sampler=valid_sampler, num_workers=num_workers)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=test_batch_size, num_workers=num_workers)

In [11]:
model.cuda()
criterion = nn.BCEWithLogitsLoss()


#optimizer = optim.SGD(model_conv.fc.parameters(), lr=0.005, momentum=0.99)
#scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=2, )

plist = [
         {'params': model.base.parameters(), 'lr': 1e-4},
         {'params': model.fc1.parameters(), 'lr': 1e-3},
         {'params': model.fc2.parameters(), 'lr': 1e-3}
         ]

optimizer = optim.Adam(plist, weight_decay=0.0002)
scheduler = lr_scheduler.StepLR(optimizer, step_size = 10, gamma = 0.5)

In [12]:
valid_loss_min = np.Inf
kappa_max = 0.70
patience = 8
# current number of epochs, where validation loss didn't increase
p = 0
# whether training should be stopped
stop = False

# number of epochs to train the model
n_epochs = 20

for epoch in range(1, n_epochs+1):
    print(time.ctime(), 'Epoch:', epoch)
    model.train()
    
    if epoch==4:
        optimizer = optim.Adam(plist, lr=0.001)
        scheduler = lr_scheduler.StepLR(optimizer, step_size = 2, gamma = 0.5)
    if epoch==12:
        optimizer = optim.SGD(model.parameters(), lr=1e-6)
        scheduler = lr_scheduler.StepLR(optimizer, step_size = 2, gamma = 0.5)

    train_loss = []
    train_auc = []
    tk0 = tqdm(train_loader, total=int(len(train_loader)))
    for batch_i, (data, target) in enumerate(tk0):

        data, target = data.cuda(), target.cuda()

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target.float())
        train_loss.append(loss.item())
        
        a = target.data.cpu().numpy()
        b = output.detach().cpu().numpy()
        
        #b = output[:,-1].detach().cpu().numpy()
        
        loss.backward()
        optimizer.step()
    
    model.eval()
    val_loss = []
    val_auc = []
    
    val_target = np.zeros(len(val))
    val_pred = np.zeros(len(val))
    tk1 = tqdm(valid_loader)
    for batch_i, (data, target) in enumerate(tk1):
        data, target = data.cuda(), target.cuda()
        output = model(data)

        loss = criterion(output, target.float())

        val_loss.append(loss.item()) 
        a = target.data.cpu().numpy()
        b = output.detach().cpu().numpy()
        val_target[batch_i] = le.inverse_transform([np.argmax(a)])
        val_pred[batch_i] = le.inverse_transform([np.argmax(b)])
        
    accuracy = accuracy_score(val_target, val_pred) 
    print('Accuracy : '+str(accuracy))
    f1 = f1_score(val_target, val_pred, labels=None, pos_label=1, average='weighted', sample_weight=None)
    print('F1 score: '+str(f1))  
    kappa = metrics.cohen_kappa_score(val_target, val_pred, labels=None, weights=None, sample_weight=None)
    print('Pour Hugo Kappa score: '+str(kappa))
    print(f'Epoch {epoch}, train loss: {np.mean(train_loss):.4f}, valid loss: {np.mean(val_loss):.4f}.')
    
    valid_loss = np.mean(val_loss)
    scheduler.step(valid_loss)
    if kappa_max <= kappa:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        kappa_max,
        kappa))
        torch.save(model.state_dict(), "../aptos/checkpoint/Classification-"+format(model_name)+"-"+format(kappa)+".pt")
        kappa_max = kappa
        p = 0

    # check if validation loss didn't improve
    if kappa_max > kappa:
        p += 1
        print(f'{p} epochs oµf increasing val loss')
        if p > patience:
            print('Stopping training')
            stop = True
            break        
            
    if stop:
        break

Thu Jul 11 15:36:08 2019 Epoch: 1


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.7981818181818182
F1 score: 0.7775764998435042
Pour Hugo Kappa score: 0.6889156123088525
Epoch 1, train loss: 0.5193, valid loss: 0.4822.
1 epochs oµf increasing val loss
Thu Jul 11 15:40:00 2019 Epoch: 2


  'precision', 'predicted', average, warn_for)


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.8
F1 score: 0.7879752383668518
Pour Hugo Kappa score: 0.6974954624319365
Epoch 2, train loss: 0.4671, valid loss: 0.4388.
2 epochs oµf increasing val loss
Thu Jul 11 15:43:54 2019 Epoch: 3


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.7927272727272727
F1 score: 0.7913979875051727
Pour Hugo Kappa score: 0.6858418386519759
Epoch 3, train loss: 0.4309, valid loss: 0.4004.
3 epochs oµf increasing val loss
Thu Jul 11 15:47:48 2019 Epoch: 4


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.7963636363636364
F1 score: 0.7915240006708989
Pour Hugo Kappa score: 0.6919722775049755
Epoch 4, train loss: 0.4047, valid loss: 0.3760.
4 epochs oµf increasing val loss
Thu Jul 11 15:51:42 2019 Epoch: 5


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.7727272727272727
F1 score: 0.76832608184007
Pour Hugo Kappa score: 0.656728863235786
Epoch 5, train loss: 0.3697, valid loss: 0.3866.
5 epochs oµf increasing val loss
Thu Jul 11 15:55:36 2019 Epoch: 6


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.7836363636363637
F1 score: 0.7776335551276846
Pour Hugo Kappa score: 0.6724879903923139
Epoch 6, train loss: 0.3443, valid loss: 0.3551.
6 epochs oµf increasing val loss
Thu Jul 11 15:59:32 2019 Epoch: 7


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.8163636363636364
F1 score: 0.8023653166833622
Pour Hugo Kappa score: 0.7153282053121652
Epoch 7, train loss: 0.3222, valid loss: 0.3181.
Validation loss decreased (0.700000 --> 0.715328).  Saving model ...
Thu Jul 11 16:03:26 2019 Epoch: 8


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.8090909090909091
F1 score: 0.7936631849341703
Pour Hugo Kappa score: 0.7114736079538357
Epoch 8, train loss: 0.3043, valid loss: 0.3060.
Validation loss decreased (0.700000 --> 0.711474).  Saving model ...
Thu Jul 11 16:07:20 2019 Epoch: 9


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.8163636363636364
F1 score: 0.8129459451242992
Pour Hugo Kappa score: 0.7208935426171192
Epoch 9, train loss: 0.2846, valid loss: 0.2801.
Validation loss decreased (0.700000 --> 0.720894).  Saving model ...
Thu Jul 11 16:11:14 2019 Epoch: 10


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.7872727272727272
F1 score: 0.788638288099518
Pour Hugo Kappa score: 0.6789899282154634
Epoch 10, train loss: 0.2627, valid loss: 0.2883.
1 epochs oµf increasing val loss
Thu Jul 11 16:15:08 2019 Epoch: 11


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.8163636363636364
F1 score: 0.8055636900184552
Pour Hugo Kappa score: 0.7171790198254707
Epoch 11, train loss: 0.2540, valid loss: 0.2643.
Validation loss decreased (0.700000 --> 0.717179).  Saving model ...
Thu Jul 11 16:19:03 2019 Epoch: 12


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.8018181818181818
F1 score: 0.7881560866612678
Pour Hugo Kappa score: 0.6962080480797004
Epoch 12, train loss: 0.2395, valid loss: 0.2727.
1 epochs oµf increasing val loss
Thu Jul 11 16:22:49 2019 Epoch: 13


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.8018181818181818
F1 score: 0.7857984261963666
Pour Hugo Kappa score: 0.6963050019756638
Epoch 13, train loss: 0.2408, valid loss: 0.2714.
2 epochs oµf increasing val loss
Thu Jul 11 16:26:35 2019 Epoch: 14


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.8218181818181818
F1 score: 0.808377225692402
Pour Hugo Kappa score: 0.7254538416087692
Epoch 14, train loss: 0.2403, valid loss: 0.2600.
Validation loss decreased (0.700000 --> 0.725454).  Saving model ...
Thu Jul 11 16:30:21 2019 Epoch: 15


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.8145454545454546
F1 score: 0.7993826352683798
Pour Hugo Kappa score: 0.713172587275293
Epoch 15, train loss: 0.2424, valid loss: 0.2675.
Validation loss decreased (0.700000 --> 0.713173).  Saving model ...
Thu Jul 11 16:34:06 2019 Epoch: 16


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.82
F1 score: 0.8034874095474499
Pour Hugo Kappa score: 0.7230388915451835
Epoch 16, train loss: 0.2429, valid loss: 0.2651.
Validation loss decreased (0.700000 --> 0.723039).  Saving model ...
Thu Jul 11 16:37:53 2019 Epoch: 17


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.8
F1 score: 0.7835897634529843
Pour Hugo Kappa score: 0.692122926918634
Epoch 17, train loss: 0.2389, valid loss: 0.2727.
1 epochs oµf increasing val loss
Thu Jul 11 16:41:39 2019 Epoch: 18


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.8218181818181818
F1 score: 0.8066865932562234
Pour Hugo Kappa score: 0.7249873718691164
Epoch 19, train loss: 0.2438, valid loss: 0.2729.
Validation loss decreased (0.700000 --> 0.724987).  Saving model ...
Thu Jul 11 16:49:11 2019 Epoch: 20


HBox(children=(IntProgress(value=0, max=130), HTML(value='')))




HBox(children=(IntProgress(value=0, max=550), HTML(value='')))


Accuracy : 0.8090909090909091
F1 score: 0.7952283625271181
Pour Hugo Kappa score: 0.706669646530575
Epoch 20, train loss: 0.2397, valid loss: 0.2764.
Validation loss decreased (0.700000 --> 0.706670).  Saving model ...


In [None]:
torch.cuda.empty_cache()

In [None]:
sub = pd.read_csv(PATH+'sample_submission.csv')

model.load_state_dict(torch.load(PATH+'model.pt'))
model.eval()

pred = np.zeros(len(test_loader),5)
for i,(data, target, name) in enumerate(test_loader):
    data = data.cuda()
    output = model(data)
    output = output.cpu().detach().numpy()
    valid_preds[i * test_batch_size:(i + 1) * test_batch_size]
    for i, (e, n) in enumerate(list(zip(output, name))):
        sub.loc[sub['id_code'] == n.split('/')[-1].split('.')[0], 'diagnosis'] = le.inverse_transform([np.argmax(e)])
        
sub.to_csv('submission.csv', index=False)

In [None]:
sub.head()