In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in

import numpy as np # linear algebra
import torch, torchvision
import albumentations
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
import pickle
from skimage import io
import random
import copy
import cv2

from tqdm import tqdm, tqdm_notebook
from PIL import Image
from pathlib import Path
from time import sleep

from torchvision import transforms
from multiprocessing.pool import ThreadPool
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models

from matplotlib import colors, pyplot as plt
%matplotlib inline

# sklearn is not entirely smooth here; to keep image output readable in Colab
# we ignore these warnings
import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [None]:
# Side length (in pixels) that images are resized to before entering the network.
RESCALE_SIZE = 224
train_on_gpu = torch.cuda.is_available()

# Select the compute device once; the training/inference helpers move tensors to DEVICE.
if train_on_gpu:
    print('CUDA is available!  Training on GPU ...')
    DEVICE = torch.device("cuda")
else:
    print('CUDA is not available.  Training on CPU ...')
    DEVICE = torch.device("cpu")

In [None]:
from albumentations import (
    HorizontalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90, Resize, RandomCrop,
    Transpose, ShiftScaleRotate, Blur, OpticalDistortion, GridDistortion, HueSaturationValue,
    IAAAdditiveGaussianNoise, GaussNoise, MotionBlur, MedianBlur, RandomBrightnessContrast, IAAPiecewiseAffine,
    IAASharpen, IAAEmboss, Flip, OneOf, Compose, Rotate, RandomScale, RandomGridShuffle,
    RandomContrast, RandomGamma, RandomBrightness, CenterCrop, VerticalFlip,
    ChannelShuffle, InvertImg, RGBShift, ElasticTransform, Equalize, RandomResizedCrop, ChannelDropout
)

In [None]:
def train_aug1(p=.5):
    """Build the first training augmentation pipeline.

    Geometric flips/rotations, colour perturbations (channel shuffle, RGB
    shift, channel dropout), a random crop, a padded rotation and CLAHE.

    Args:
        p: probability of applying the whole composed pipeline.

    Returns:
        An albumentations ``Compose`` callable as ``aug(image=ndarray)['image']``.
    """
    return Compose([
        RandomRotate90(),
        HorizontalFlip(),
        VerticalFlip(),
        Transpose(),
        ChannelShuffle(p=0.5),
        RGBShift(r_shift_limit=127, g_shift_limit=127, b_shift_limit=127, p=0.5),
        # Crop to the network input size instead of a duplicated literal, so the
        # pipeline follows RESCALE_SIZE if that constant is ever changed.
        RandomCrop(RESCALE_SIZE, RESCALE_SIZE),
        # cv2.INTER_LANCZOS4 has the integer value 4 that was previously hard-coded.
        Rotate(border_mode=cv2.BORDER_CONSTANT, limit=45,
               interpolation=cv2.INTER_LANCZOS4, p=.8, value=0),
        ChannelDropout(),
        CLAHE(p=1.),
    ], p=p)
train_aug_v1 = train_aug1(1.)

In [None]:
def train_aug2(p=.5):
    """Build the second training augmentation pipeline.

    Same flips and colour perturbations as ``train_aug1`` but without the
    padded rotation; the random crop is taken from whatever image is passed in.

    Args:
        p: probability of applying the whole composed pipeline.

    Returns:
        An albumentations ``Compose`` callable as ``aug(image=ndarray)['image']``.
    """
    return Compose([
        RandomRotate90(),
        HorizontalFlip(),
        VerticalFlip(),
        Transpose(),
        # The crop must match the size produced by the resize-based branches of
        # the dataset, otherwise samples of different sizes end up in one batch.
        RandomCrop(RESCALE_SIZE, RESCALE_SIZE),
        ChannelShuffle(p=0.5),
        RGBShift(r_shift_limit=127, g_shift_limit=127, b_shift_limit=127, p=0.5),
        ChannelDropout(),
        CLAHE(p=1.),
    ], p=p)
train_aug_v2 = train_aug2(1.)

In [None]:
def make_val_aug(p=.5):
    """Build the validation augmentation pipeline (flips/rotations + CLAHE).

    Args:
        p: probability of applying the whole composed pipeline.

    Returns:
        An albumentations ``Compose`` callable as ``aug(image=ndarray)['image']``.
    """
    return Compose([
        RandomRotate90(),
        HorizontalFlip(),
        VerticalFlip(),
        Transpose(),
        CLAHE(p=1.),
    ], p=p)

# The factory has its own name so that `val_aug` always refers to the built
# pipeline and is never a function at one moment and a transform at the next.
val_aug = make_val_aug(1.)

In [None]:
def make_test_aug():
    """Build the test-time transform: CLAHE applied to every image (p=1)."""
    return CLAHE(p=1.)

# The factory has its own name so that `test_aug` always refers to the built
# transform rather than being rebound from a function to its result.
test_aug = make_test_aug()

In [None]:
class DvsCDataset(Dataset):
    """Image dataset for the binary 'cleaned' vs. other-label plate task.

    Each item is loaded from disk, augmented according to ``mode`` and
    converted to a normalised ``float32`` tensor.

    Args:
        files: sequence of ``pathlib.Path`` objects; the name of each file's
            parent directory is stored as its label.
        mode: one of ``'train'``, ``'val'`` or ``'test'``. It selects the
            augmentation pipeline, and in ``'test'`` mode items are returned
            without a target.

    Raises:
        ValueError: if ``mode`` is not one of the supported modes.
    """

    # Modes handled by `_prepare_sample`.
    MODES = ('train', 'val', 'test')

    def __init__(self, files, mode):
        super().__init__()
        if mode not in self.MODES:
            raise ValueError(
                "mode must be one of {}, got {!r}".format(self.MODES, mode))
        self.files = files
        # operating mode
        self.mode = mode
        self.labels = [path.parent.name for path in self.files]
        self.len_ = len(self.files)
        # Built once here instead of on every __getitem__ call.
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return self.len_

    def load_sample(self, file):
        """Read ``file`` and return it as a fully loaded RGB ``PIL.Image``."""
        # convert() forces the pixel data to be read and guarantees three
        # channels, which the 3-channel Normalize in the transform requires;
        # the context manager closes the underlying file handle.
        with Image.open(file) as image:
            return image.convert('RGB')

    def _get_label(self, idx):
        """Return the stored label for ``idx``, or ``None`` in test mode."""
        if self.mode != 'test':
            return self.labels[idx]

    def __getitem__(self, index):
        """Return ``x`` in test mode, otherwise ``(x, y)`` with ``y`` in {0, 1}."""
        x = self.load_sample(self.files[index])
        x = self._prepare_sample(x)
        # Scale to [0, 1] by hand: the array is float32 when it reaches ToTensor.
        x = np.array(x / 255., dtype='float32')
        x = self.transform(x)
        if self.mode == 'test':
            return x
        # Target is 1 for the 'cleaned' directory and 0 for any other label.
        y = 1 if self.labels[index] == 'cleaned' else 0
        return x, y

    def _prepare_sample(self, image):
        """Resize/augment a PIL image according to ``self.mode``; returns an ndarray."""
        size = (RESCALE_SIZE, RESCALE_SIZE)
        if self.mode == 'train':
            # Pick one of the two training pipelines with equal probability.
            if random.random() < 0.5:
                image = np.array(image.resize(size))
                image = train_aug_v1(image=image)['image']
            else:
                # No resize in this branch: train_aug_v2 is applied to the
                # image at its original resolution.
                image = np.array(image)
                image = train_aug_v2(image=image)['image']
        elif self.mode == 'val':
            image = np.array(image.resize(size))
            image = val_aug(image=image)['image']
        elif self.mode == 'test':
            image = np.array(image.resize(size))
            image = test_aug(image=image)['image']
        return image

In [None]:
def imshow(inp, title=None, plt_ax=None, default=False):
    """Display a normalised image tensor.

    Args:
        inp: tensor laid out as (channels, height, width) that was normalised
            with the mean/std used below; it must support ``.numpy()``.
        title: optional title; skipped when ``None``.
        plt_ax: object to draw on, either a matplotlib ``Axes`` or the
            ``pyplot`` module. ``None`` (the default) means ``pyplot``.
        default: unused; kept for backward compatibility.
    """
    if plt_ax is None:
        plt_ax = plt
    image = inp.numpy().transpose((1, 2, 0))
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    # Undo the normalisation and clamp to the displayable [0, 1] range.
    image = np.clip(std * image + mean, 0, 1)
    plt_ax.imshow(image)
    if title is not None:
        # Axes objects expose set_title(); the pyplot module only has title().
        if hasattr(plt_ax, 'set_title'):
            plt_ax.set_title(title)
        else:
            plt_ax.title(title)
    plt_ax.grid(False)

In [None]:
from os.path import exists
if not exists('plates'):
  !unzip -q /kaggle/input/platesv2/plates.zip

In [None]:
# Locations of the unpacked train and test images.
TRAIN_DIR = Path('plates/train')
TEST_DIR = Path('plates/test')

# Recursively collect every .jpg path in sorted order, so the file order
# is the same on every run.
train_val_files = sorted(TRAIN_DIR.rglob('*.jpg'))
test_files = sorted(TEST_DIR.rglob('*.jpg'))

In [None]:
# Let's look at the effect of the chosen transformations

In [None]:
# Two views over the same files, differing only in the augmentation mode;
# the next cells use them to preview the transformations.
train_dataset = DvsCDataset(train_val_files, mode='train')
val_dataset = DvsCDataset(train_val_files, mode='val')

In [None]:
# 5x5 grid of randomly chosen samples drawn through the training augmentations.
fig, ax = plt.subplots(nrows=5, ncols=5, figsize=(12, 12),
                       sharey=True, sharex=True)
for fig_x in ax.flatten():
    # Uniform draw in [0, len) truncated to an integer index.
    random_plate = int(np.random.uniform(0, len(train_val_files)))
    im_val, label = train_dataset[random_plate]
    img_label = label
    imshow(im_val.data.cpu(), title=img_label, plt_ax=fig_x)

In [None]:
# 3x3 grid of randomly chosen samples drawn through the validation augmentations.
fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(8, 8),
                       sharey=True, sharex=True)
for fig_x in ax.flatten():
    # Uniform draw in [0, len) truncated to an integer index.
    random_characters = int(np.random.uniform(0, len(train_val_files)))
    im_val, label = val_dataset[random_characters]
    img_label = label
    imshow(im_val.data.cpu(), title=img_label, plt_ax=fig_x)

In [None]:
def fit_epoch(model, train_loader, criterion, optimizer):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: network whose output is rounded to get the hard prediction; it
            must expose an ``fc`` sub-module, whose parameters get an L1 penalty.
        train_loader: iterable of ``(inputs, labels)`` batches.
        criterion: loss called as ``criterion(outputs, labels)`` with labels
            cast to float and given a trailing dimension.
        optimizer: optimizer stepped once per batch.

    Returns:
        ``(train_loss, train_acc)``: the sample-weighted mean of the per-batch
        loss (L1 penalty included) and the fraction of correct predictions.
    """
    # eval_epoch() leaves the model in eval mode; switch back explicitly so
    # every epoch (not only the first one) is trained in training mode.
    model.train()

    running_loss = 0.0
    running_corrects = 0
    processed_data = 0

    for inputs, labels in train_loader:
        inputs = inputs.to(DEVICE)
        labels = labels.float().unsqueeze(1).to(DEVICE)
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        # L1 regularisation of the classification head.
        for param in model.fc.parameters():
            loss += 1e-4*torch.sum(torch.abs(param))
        loss.backward()
        optimizer.step()
        preds = torch.round(outputs).long()
        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(preds == labels.data)
        processed_data += inputs.size(0)

    train_loss = running_loss / processed_data
    train_acc = running_corrects.cpu().numpy() / processed_data
    return train_loss, train_acc

In [None]:
def eval_epoch(model, val_loader, criterion):
    """Evaluate ``model`` on ``val_loader`` without computing gradients.

    Puts the model in eval mode (and leaves it there).

    Returns:
        ``(val_loss, val_acc)``: the sample-weighted mean loss as a float and
        the accuracy as a double-precision tensor.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for batch, targets in val_loader:
        batch = batch.to(DEVICE)
        targets = targets.float().unsqueeze(1).to(DEVICE)

        with torch.no_grad():
            probabilities = model(batch)
            batch_loss = criterion(probabilities, targets)
            hard_preds = torch.round(probabilities).long()

        batch_len = batch.size(0)
        loss_sum += batch_loss.item() * batch_len
        n_correct += torch.sum(hard_preds == targets.data)
        n_seen += batch_len

    return loss_sum / n_seen, n_correct.double() / n_seen

In [None]:
def train(train_dataset, val_dataset, model, epochs, batch_size, lr=1e-3, patience=15):
    """Train ``model`` with Adam, LR-on-plateau scheduling and early stopping.

    Args:
        train_dataset: dataset yielding ``(x, y)`` pairs for optimisation.
        val_dataset: dataset yielding ``(x, y)`` pairs for validation.
        model: network to train (modified in place).
        epochs: maximum number of epochs.
        batch_size: batch size for both loaders.
        lr: initial Adam learning rate.
        patience: stop after this many consecutive epochs without a new best
            validation loss.

    Returns:
        ``(best_model, history)``: ``best_model`` is a deep copy of the model
        at the lowest validation loss, and ``history`` is a list of
        ``(train_loss, train_acc, val_loss, val_acc)`` tuples, one per epoch.
        If no epoch ever improved on the initial ``inf`` (for example
        ``epochs == 0``), the passed-in model is returned instead of ``None``.
    """
    best_model = None
    epochs_without_improvement = 0
    best_val_loss = np.inf

    train_loader = DataLoader(train_dataset, batch_size=batch_size , shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    history = []
    log_template = "\nEpoch {ep:03d} train_loss: {t_loss:0.4f} \
    train_acc {t_acc:0.4f} \
    val_loss {v_loss:0.4f} val_acc {v_acc:0.4f}"

    opt = torch.optim.Adam(model.parameters(), lr=lr)
    # Cut the learning rate tenfold when the validation loss stops improving.
    scheduler  = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min',
                                    factor=0.1, patience=5, threshold=0.0001, verbose = True,
                                    threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08)
    criterion = nn.BCELoss()

    for epoch in range(epochs):
        train_loss, train_acc = fit_epoch(model, train_loader, criterion, opt)

        val_loss, val_acc = eval_epoch(model, val_loader, criterion)
        scheduler.step(val_loss)
        history.append((train_loss, train_acc, val_loss, val_acc))
        if val_loss < best_val_loss:
            epochs_without_improvement = 0
            # Snapshot the weights: training continues to mutate `model`.
            best_model = copy.deepcopy(model)
            best_val_loss = val_loss
        else:
            epochs_without_improvement += 1

        print(log_template.format(ep=epoch+1, t_loss=train_loss
                                       , v_loss=val_loss, t_acc=train_acc
                                       , v_acc=val_acc))
        if epochs_without_improvement >= patience:
            break

    if best_model is None:
        # Nothing ever beat the initial best loss; hand back a usable model
        # rather than None so the caller can still run inference.
        best_model = model
    return best_model, history

In [None]:
def predict(model, test_loader, plot_preds = False):
    """Run ``model`` over ``test_loader`` and collect its outputs.

    Args:
        model: network whose output has one value per sample in its second
            dimension (element ``[0]`` of each output row is collected).
        test_loader: iterable of input batches (no labels).
        plot_preds: when true, display every input image and print its
            prediction.

    Returns:
        A flat list with one float per input sample, in loader order.
    """
    # Mode switch is loop-invariant, so do it once up front.
    model.eval()
    probs = []
    with torch.no_grad():
        for inputs in test_loader:
            inputs = inputs.to(DEVICE)
            outputs = [row[0] for row in model(inputs).tolist()]
            probs += outputs
            if plot_preds:
                for n in range(inputs.shape[0]):
                    print('True pictute')
                    imshow(inputs[n].cpu())
                    plt.show(block=False)
                    sleep(0.1)
                    print('prediction:')
                    print(outputs[n])
    return probs

In [None]:
# Batch size shared by the train, validation and test loaders.
BATCH_SIZE = 32
# Upper bound on the number of training epochs passed to train().
EPOCHS = 200

In [None]:
# Test images are served in file order (no shuffling) so that predictions
# line up with `test_dataset.files`.
test_dataset = DvsCDataset(test_files, mode="test")
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [None]:
# Empty submission frame; its 'label' and 'id' columns are filled in by later cells.
submit = pd.DataFrame(columns=['id'])

In [None]:
# Seed every random number generator used below with the same value.
rs = 0
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(rs)
torch.backends.cudnn.deterministic = True

# Both datasets are built from the same file list repeated 100 times; only
# the augmentation mode differs between them.
val_set = np.array(train_val_files)
train_dataset = DvsCDataset(np.tile(val_set, 100), mode='train')
val_dataset = DvsCDataset(np.tile(val_set, 100), mode='val')

# Pretrained ResNet-34 used as a frozen feature extractor.
model = models.resnet34(pretrained=True)
for param in model.parameters():
    param.requires_grad = False
# Replace the classifier with a single sigmoid unit. The input width is read
# from the layer being replaced, so swapping the backbone does not require
# editing a hard-coded feature count. The new layer is trainable by default.
model.fc = nn.Sequential(nn.Linear(model.fc.in_features, 1), nn.Sigmoid())
# Only referenced by the commented-out load below.
SAVE_PATH = 'best_model_fold.pth'

model = model.to(DEVICE)
#model.load_state_dict(torch.load(SAVE_PATH))
model, history = train(train_dataset, val_dataset, model=model, epochs=EPOCHS, batch_size=BATCH_SIZE)

# Score every training image once (test mode: no random augmentation branch,
# no labels) and show each picture next to the model's output for it.
after_train_dataset = DvsCDataset(val_set, mode="test")
after_train_loader = DataLoader(after_train_dataset, shuffle=False, batch_size=BATCH_SIZE)

after_train_probs = predict(model, after_train_loader)

for i, plate_path in enumerate(val_set):
    plt.imshow(plt.imread(plate_path))
    plt.show(block=False)
    sleep(0.1)
    print(plate_path)
    print(after_train_probs[i])

# Plot the per-epoch training and validation loss recorded in `history`.
loss, acc, val_loss, val_acc = zip(*history)
plt.figure(figsize=(15, 9))
for curve, curve_label in ((loss, "train_loss"), (val_loss, "val_loss")):
    plt.plot(curve, label=curve_label)
plt.legend(loc='best')
plt.xlabel("epochs")
plt.ylabel("loss")
plt.show()
plt.close()

# Threshold the model outputs: anything at or below 0.5 is 'dirty',
# everything else is 'cleaned'.
probs = predict(model, test_loader)
preds = ['dirty' if prob <= 0.5 else 'cleaned' for prob in probs]

submit['label'] = preds

In [None]:
# The id of each row is the part of the file name before the first '.',
# parsed as an integer (vectorised string ops instead of a row-wise apply).
test_filenames = [path.name for path in test_dataset.files]
submit['id'] = test_filenames
submit['id'] = submit['id'].str.split('.').str[0].astype('int')
submit.to_csv('submission.csv', index=False)
# Last expression of the cell so the preview is actually rendered.
submit.head()