## Classifier: dynamic balance + albumentations

В этом ноутбуке производится обучение классификатора, который по изображению коктейля предсказывает набор ингредиентов. Использованы семплер для динамической балансировки классов из библиотеки Catalyst и аугментации из библиотеки albumentations.

In [1]:
from pprint import pprint
import random
from PIL import Image
import json

import pickle
import math
from math import log
from glob import glob
from tqdm.notebook import tqdm
from pathlib import Path

import numpy as np
import pandas as pd
from openvino.runtime import Core
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import ImageFolder
import torchvision.transforms as tt
from torchvision.utils import make_grid
from torch.optim import lr_scheduler
import timm
from catalyst.data.sampler import DynamicBalanceClassSampler
import albumentations as A
from albumentations.pytorch import ToTensorV2

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")
sns.set(style='darkgrid', font_scale=1.2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)
random.seed(0)
np.random.seed(0)

cuda


### Подготовка данных для обучения:

Получаем список коктейлей (наименование коктейля соответствует наименованию каталога)

In [2]:
DATA_DIR = '/home/maksim/Cocktails/Images/Coctails_raw/'
files = sorted(list(Path(DATA_DIR).rglob('*.*')))
full_labels = [path.parent.name for path in files]
cockt_list = sorted(set(full_labels))

Читаем из конфига (json заполняется руками) наименования ингредиентов на двух языках.

In [3]:
# Opening ingredients JSON config
with open('config/ingredients.json', 'r') as f:
    ingedients_config = json.load(f)

class_labels = ingedients_config["idx"]
id2rus_genitive = ingedients_config["id2rus_genitive"]
class_labels_ru = np.array([id2rus_genitive[idx] for idx in class_labels])

class_dict = dict()
for i in range(len(class_labels)):
    class_dict[class_labels[i]] = i

Проверяем, что русскоязычные наименования соответствуют идентификаторам.

In [None]:
for idx, rus in zip(class_labels, class_labels_ru):
    print(idx, ' = ', rus)

Читаем из конфига (json заполняется руками) рецепты коктейлей.

In [5]:
# Opening recipes JSON config
with open('config/recipes.json', 'r') as f:
    text_recipes = json.load(f)

Проверяем, что перечень ингредиентов в двух конфигах совпадает:

In [6]:
ing_set = set()
for rec in text_recipes:
    ing_set.update(text_recipes[rec])
    
print(all(a == b for a, b in zip(sorted(ing_set), sorted(class_labels))))
print(len(ing_set) == len(class_labels))

for a, b in zip(sorted(ing_set), sorted(class_labels)):
    if a != b:
        print(a, b)
print(f'Количество ингредиентов: {len(ing_set)}')

True
True
Количество ингредиентов: 45


Проверяем, что список коктейлей в каталоге на диске совпадает со списком в конфиге:

In [7]:
print(all(folder == conf for folder, conf in zip(cockt_list, text_recipes.keys())))
print(len(cockt_list) == len(text_recipes))

for folder, conf in zip(cockt_list, text_recipes.keys()):
    if folder!=conf:
        print(folder, conf)
print(f'Количество напитков: {len(cockt_list)}')

True
True
Количество напитков: 153


Формируем векторные представления рецептов (0 - ингредиент отсутствует в коктейле, 1 - ингредиент входит в состав).

In [8]:
recipes = dict()
for cocktail, recipe in text_recipes.items():
    arr = torch.zeros(len(class_labels), dtype=torch.int)
    arr[[class_dict[ingr] for ingr in recipe]] = 1
    recipes[cocktail] = arr

Задаём размер изображения, каталог с изображениями (убучающей выборкой) и константы нормализации:

In [9]:
# Opening model JSON config
with open('config/model_classifier.json', 'r') as f:
    model_conf = json.load(f)

In [10]:
image_size = model_conf['IMAGE_SIZE']
crop_size = model_conf['CROP_SIZE']
stats = (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)
print(f'Imge size = {image_size}x{image_size}')
print(f'Crop size = {crop_size}x{crop_size}')

Imge size = 160x160
Crop size = 180x180


In [11]:
def show_generated_images(generated):
    clear_output(wait=True)
    plt.figure(figsize=(24, 8))
    for k in range(len(generated)):
        plt.subplot(1, len(generated), k+1)
        plt.imshow(denorm(np.rollaxis(generated[k].numpy(), 0, 3)))
        plt.axis('off')
    plt.show()

def denorm(img_tensors):
    return img_tensors * stats[1][0] + stats[0][0]

def show_images(images, nmax=200):
    fig, ax = plt.subplots(figsize=(60, 60))
    ax.set_xticks([]); ax.set_yticks([])
    ax.imshow(make_grid(denorm(images.detach()[:nmax]), nrow=8).permute(1, 2, 0))

def show_batch(dl, nmax=64):
    for images, _ in dl:
        show_images(images, nmax)
        break

#### Датасет и даталоадер:

In [12]:
# разные режимы датасета 
DATA_MODES = ['train', 'val', 'test']

class CocktailsDataset(Dataset):

    def __init__(self, files, recipes: dict[str: np.array], mode, transforms):
        super().__init__()
        self.transforms = transforms
        # список файлов для загрузки
        self.files = files
        # режим работы
        self.mode = mode
        if self.mode not in DATA_MODES:
            print(f"{self.mode} is not correct; correct modes: {DATA_MODES}")
            raise NameError
            
        str_labels = [path.parent.name for path in files]
        self.labels = [recipes[label] for label in str_labels]    
            
        return None
                      
    def __len__(self):
        return len(self.files)
      
    def load_sample(self, file):
        image = Image.open(file, formats=["JPEG", "PNG", "GIF", "WEBP"])
        image.draft('RGB', (crop_size*2, crop_size*2)) 
        return np.array(image.convert("RGB"))

    def __getitem__(self, index):
        x = self.load_sample(self.files[index])
        x = self.transforms(image=x)['image']
        if self.mode == 'test':
            return x
        else:
            y = self.labels[index]
            return x, y


In [13]:
files = sorted(list(Path(DATA_DIR).rglob('*.*')))

full_labels = [path.parent.name for path in files]
train_files, val_files = train_test_split(files, test_size=600, \
                                          stratify=full_labels)
train_labels = [path.parent.name for path in train_files]

transforms=A.Compose([A.SmallestMaxSize(max_size=crop_size*2),
                   A.CenterCrop(crop_size*2, crop_size*2),
                   A.RandomResizedCrop(image_size, 
                                       image_size, 
                                       scale=(image_size/crop_size, 1.0), 
                                       ratio=(0.85, 1.176)
                                      ),
                   A.HorizontalFlip(p=0.5),
                   A.ColorJitter(brightness=(0.96, 1.0),
                                 contrast=(0.96, 1.0),
                                 saturation=(0.93, 1.03), 
                                 hue=0.015, 
                                 p=0.7),
                   A.RandomToneCurve(scale=0.07, p=0.7),
                   A.Normalize(*stats),
                   ToTensorV2() ])

train_dataset = CocktailsDataset(train_files, 
                                 mode='train', 
                                 recipes=recipes,
                                 transforms=transforms)

val_dataset = CocktailsDataset(val_files, 
                               mode='val', 
                               recipes=recipes, 
                               transforms=A.Compose([A.SmallestMaxSize(max_size=crop_size),
                                                     A.CenterCrop(image_size, image_size),
                                                     A.Normalize(*stats),
                                                     ToTensorV2() ]) )

full_dataset = CocktailsDataset(files, 
                                mode='train', 
                                recipes=recipes,
                                transforms=transforms)


In [14]:
def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""
    def __init__(self, dataloader, device):
        self.dataloader = dataloader
        self.device = device
        
    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for batch in self.dataloader: 
            yield to_device(batch, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dataloader)

    
def get_dataloaders(image_size, batch_size, train_set, test_set, sampler=None):
    shuffle = sampler is None
    train_dataloader = DataLoader(train_set, batch_size, shuffle=shuffle, sampler=sampler, num_workers=6, pin_memory=True)
    test_dataloader = DataLoader(test_set, batch_size, shuffle=True, num_workers=6, pin_memory=True)
    return DeviceDataLoader(train_dataloader, device), DeviceDataLoader(test_dataloader, device)

Взглянем на аугментированные изображения:

In [15]:
batch_size = 200
train_loader, val_loader = get_dataloaders(image_size, 
                                           batch_size, 
                                           train_set=train_dataset, 
                                           test_set=val_dataset, 
                                           sampler=None)

first_batch = next(iter(train_loader))

In [None]:
show_images(first_batch[0].cpu())

### Обучение нейронной сети.

In [17]:
def fit_epoch(model, train_loader, criterion, optimizer, sheduler, threshold, label_smoothing):
    running_loss = 0.0
    running_recall = 0
    running_precision = 0
    processed_data = 0
  
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels * (1 - label_smoothing) + label_smoothing / 20
        labels = labels.to(device)
        
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        preds = (outputs > -log(1 / threshold - 0.999)) * 1
        running_loss += loss.item() * inputs.size(0)
        running_recall += (torch.sum(preds.data * labels.data) + 0.1) / (torch.sum(labels.data) + 0.1) * inputs.size(0)
        running_precision += (torch.sum(preds.data * labels.data) + 0.1) / (torch.sum(preds.data) + 0.1) * inputs.size(0)
        processed_data += inputs.size(0)
    sheduler.step()
    train_loss = running_loss / processed_data
    recall = running_recall.double() / processed_data
    precision = running_precision.double() / processed_data
    train_acc = (2*recall*precision) / (precision+recall) 
    return train_loss, train_acc

def eval_epoch(model, val_loader, criterion, threshold, label_smoothing):
    model.eval()
    running_loss = 0.0
    running_recall = 0
    running_precision = 0
    processed_size = 0

    for inputs, labels in val_loader:
        inputs = inputs.to(device)
        labels = labels * (1 - label_smoothing) + label_smoothing / 20
        labels = labels.to(device)

        with torch.set_grad_enabled(False):
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            preds = (outputs > -log(1 / threshold - 0.999)) * 1

        running_loss += loss.item() * inputs.size(0)
        running_recall += (torch.sum(preds.data * labels.data) + 0.1) / (torch.sum(labels.data) + 0.1) * inputs.size(0)
        running_precision += (torch.sum(preds.data * labels.data) + 0.1) / (torch.sum(preds.data) + 0.1) * inputs.size(0)
        processed_size += inputs.size(0)
    val_loss = running_loss / processed_size
    recall = running_recall.double() / processed_size
    precision = running_precision.double() / processed_size
    val_acc = (2*recall*precision) / (precision+recall) 
    return val_loss, val_acc

def train(train_loader, val_loader, model, epochs, optimizer, gamma=0.95, threshold=0.5, label_smoothing=0.):

    model.to(device)
    history = []
    log_template = "\nEpoch {ep:03d} train_loss: {t_loss:0.4f} \
    val_loss {v_loss:0.4f} train_acc {t_acc:0.4f} val_acc {v_acc:0.4f}"

    with tqdm(desc="epoch", total=epochs) as pbar_outer:
        sheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)
        criterion = nn.BCEWithLogitsLoss()

        for epoch in range(epochs):
            train_loss, train_acc = fit_epoch(model, 
                                              train_loader, 
                                              criterion, 
                                              optimizer, 
                                              sheduler, 
                                              threshold, 
                                              label_smoothing)
            print("loss", train_loss)
            
            val_loss, val_acc = eval_epoch(model, val_loader, criterion, threshold, label_smoothing)
            history.append((train_loss, train_acc, val_loss, val_acc))
            
            pbar_outer.update(1)
            tqdm.write(log_template.format(ep=epoch+1, t_loss=train_loss,\
                                           v_loss=val_loss, t_acc=train_acc, v_acc=val_acc))
    return history


В качестве нейронной сети для обучения выбрана предобученная MobilenetV3. \
Немного меняем структуру сети: 
1. Послендий полносвязный слой заменяем двумя слоями, bottleneck размерности 14 и финальный классификатор с размерностью, равной количеству ингредиентов.
2. Немного уменьшаем размерность 6-го блока для того, чтобы снизить сложность сети и увеличить её стойкость к переобучению.

In [18]:
model = timm.create_model('mobilenetv3_large_100_miil', pretrained=True).to(device) 

model.blocks[6][0].conv = nn.Conv2d(160, 256, 1, 1).to(device)
model.blocks[6][0].bn1 = nn.BatchNorm2d(256).to(device)
model.conv_head = nn.Sequential(nn.Conv2d(256, 600, 1, 1) ).to(device)

model.classifier = nn.Sequential(
    nn.Dropout(0.75),
    nn.Linear(in_features=600, out_features=28, bias=False),
    nn.BatchNorm1d(28),
    nn.Dropout(0.3),
    nn.LeakyReLU(0.4),
    nn.Linear(in_features=28, out_features=len(class_labels))).to(device)


for name, param in model.named_parameters():
    param.requires_grad_(False)

Поучившаяся структура модели:

Размораживаем часть градиентов:

In [None]:
#model.blocks[1].requires_grad_(True)
model.blocks[2].requires_grad_(True)
model.blocks[3].requires_grad_(True)
model.blocks[4].requires_grad_(True)
model.blocks[5].requires_grad_(True)
model.blocks[6].requires_grad_(True)

model.conv_head.requires_grad_(True)
model.classifier.requires_grad_(True)

for name, param in model.named_parameters():
    print(name, param.requires_grad)

Для того чтобы максимально сохранить информацию в предобученной сети, устанавливаем разные скорости обучения для различных групп слоёв: чем ближе в выходу сети, тем выше скорость обучения. Вводим новый гиперпараметр lr_decay, который определяет, насколько будут отличаться веса на разных уровнях.

In [20]:
def get_optimizer(lr: float, lr_decay: float) -> object:
    optimizer = torch.optim.Adam([
        {'params': model.conv_stem.parameters()},
        {'params': model.bn1.parameters()},
        {'params': model.blocks[0].parameters(), 'lr': lr/lr_decay**7},
        {'params': model.blocks[1].parameters(), 'lr': lr/lr_decay**6},
        {'params': model.blocks[2].parameters(), 'lr': lr/lr_decay**5},
        {'params': model.blocks[3].parameters(), 'lr': lr/lr_decay**4},
        {'params': model.blocks[4].parameters(), 'lr': lr/lr_decay**3},
        {'params': model.blocks[5].parameters(), 'lr': lr/lr_decay**2},
        {'params': model.blocks[6].parameters(), 'lr': lr/lr_decay**1},
        {'params': model.conv_head.parameters(), 'lr': lr/lr_decay**1},
        {'params': model.classifier.parameters(), 'lr': lr},
    ], lr=lr/lr_decay**8)
    return optimizer
    

Обучаем сеть в несколько циклов с различными значениями lr, lr_decay, batch_size (сначала обучаем "голову" сети, потом учим "в глубину"):

In [21]:
lr = 7.0e-3
lr_decay = 6.0
batch_size = 200
l_sm=0.01
sampler = DynamicBalanceClassSampler(labels=full_labels, exp_lambda=0.98)

train_loader, val_loader = get_dataloaders(image_size, batch_size, full_dataset, val_dataset, sampler=sampler)
optimizer = get_optimizer(lr, lr_decay)

history1 = train(train_loader, val_loader, model, epochs=6, optimizer=optimizer, gamma=0.85, label_smoothing=l_sm)

the smallest class contains only 9 examples. At the end of training, epochs will contain only 1377 examples


epoch:   0%|          | 0/6 [00:00<?, ?it/s]

loss 0.26750840482825844

Epoch 001 train_loss: 0.2675     val_loss 0.1671 train_acc 0.3540 val_acc 0.5193
loss 0.14658140365470498

Epoch 002 train_loss: 0.1466     val_loss 0.1299 train_acc 0.6103 val_acc 0.6500
loss 0.12133446454513232

Epoch 003 train_loss: 0.1213     val_loss 0.1109 train_acc 0.6965 val_acc 0.7211
loss 0.10687974665650572

Epoch 004 train_loss: 0.1069     val_loss 0.0998 train_acc 0.7396 val_acc 0.7600
loss 0.09691511203343729

Epoch 005 train_loss: 0.0969     val_loss 0.0874 train_acc 0.7697 val_acc 0.7986
loss 0.0897391755313414

Epoch 006 train_loss: 0.0897     val_loss 0.0814 train_acc 0.7889 val_acc 0.8121


In [None]:
lr = 4e-3
lr_decay = 2.5
batch_size = 80
l_sm=0.03
sampler = DynamicBalanceClassSampler(labels=full_labels, exp_lambda=0.98)

train_loader, val_loader = get_dataloaders(image_size, batch_size, full_dataset, val_dataset, sampler=sampler)
optimizer = get_optimizer(lr, lr_decay)

history2 = train(train_loader, val_loader, model, epochs=5, optimizer=optimizer, gamma=0.85, label_smoothing=l_sm)

In [None]:
lr = 1.3e-3
lr_decay = 1.3
batch_size = 80
l_sm=0.03
sampler = DynamicBalanceClassSampler(labels=full_labels, exp_lambda=0.97)

train_loader, val_loader = get_dataloaders(image_size, batch_size, full_dataset, val_dataset, sampler=sampler)
optimizer = get_optimizer(lr, lr_decay)

history3 = train(train_loader, val_loader, model, epochs=24, optimizer=optimizer, gamma=0.9, label_smoothing=l_sm)

Сохраняем модель, веса. При необходимости, загружаем веса.

In [24]:
torch.save(model.state_dict(), 'classifier_state_dict_ls.pt')
torch.save(model, 'classifier_model_ls.pt')

Проверим работу модели на случайном изображении:

In [25]:
def predict_ingredients(path: str, model: callable, classes: np.array) -> list:
    try:
        image = Image.open(path)
    except:
        return []
    width, height = image.size  # Get dimensions
    size = min(width, height)
    
    left = (width - size) / 2
    top = (height - size) / 2
    right = (width + size) / 2
    bottom = (height + size) / 2

    # Crop the center of the image
    image = image.crop((left, top, right, bottom))
    img = np.asarray(image.resize((image_size, image_size))) / 127.5 - 1.0
    
    plt.figure(figsize=(2.5, 2.5))
    plt.imshow(denorm(img))
    plt.axis('off')
    plt.show()
    
    logits = model(torch.tensor(np.rollaxis(img, 2, 0)[None, :, :, :], dtype=torch.float).to(device))
    result = (logits.detach().cpu() > 0).nonzero()[:, 1].numpy()
    return classes[result]

In [None]:
predict_ingredients('/home/maksim/Cocktails/Images/Coctails_raw/Bloody_mary/AD4A0735.jpg', model, class_labels_ru)

### Конвертация в ONNX

Загружаем модель в pytorch и экспортируем в формат ONNX.

In [27]:
model.to('cpu')
model.load_state_dict(torch.load('classifier_state_dict_ls.pt'))
# Evaluate the model to switch some operations from training mode to inference.
model.eval()
# Create dummy input for the model. It will be used to run the model inside export function.
dummy_input = torch.randn(1, 3, image_size, image_size)
# Call the export function
torch.onnx.export(model,               
                  dummy_input,                         
                  'classifier_ls.onnx',   
                  export_params=True,        
                  opset_version=11,          
                  do_constant_folding=True,  
                  input_names = ['input'],   
                  output_names = ['logits']
                 )

[W NNPACK.cpp:51] Could not initialize NNPACK! Reason: Unsupported hardware.


Загружаем модель ONNX для проверки:

In [28]:
ie = Core()

Доступные для ONNX устройства:

In [29]:
devices = ie.available_devices

for dev in devices:
    device_name = ie.get_property(device_name=dev, name="FULL_DEVICE_NAME")
    print(f"{dev}: {device_name}")
    
onnx_model_path = 'classifier_ls.onnx'
model_onnx = ie.read_model(model=onnx_model_path)
compiled_model_onnx = ie.compile_model(model=model_onnx, device_name="CPU")

CPU: AMD FX(tm)-6300 Six-Core Processor             


Вход модели:

In [30]:
input_layer = compiled_model_onnx.input(0)

print(f"input precision: {input_layer.element_type}")
print(f"input shape: {input_layer.shape}")

input precision: <Type: 'float32'>
input shape: {1, 3, 160, 160}


Выход модели:

In [31]:
output_layer = compiled_model_onnx.output(0)

print(f"output precision: {output_layer.element_type}")
print(f"output shape: {output_layer.shape}")

output precision: <Type: 'float32'>
output shape: {1, 45}


Вся загрузка в одной ячейке:

In [32]:
ie = Core()
onnx_model_path = 'classifier_ls.onnx'
model_onnx = ie.read_model(model=onnx_model_path)
compiled_model_onnx = ie.compile_model(model=model_onnx, device_name="CPU")

input_layer = compiled_model_onnx.input(0)
output_layer = compiled_model_onnx.output(0)

Пробуем инференс на случайном изображении:

In [33]:
def predict_ingredients_onnx(path: str, class_labels: list) -> list:
    img = np.asarray(Image.open(path).resize((image_size, image_size))) / 255
    logits = compiled_model_onnx([np.rollaxis(img, 2, 0)[None, :, :, :]])[output_layer]
    result = (logits > 0.5).nonzero()[1]
    return class_labels_ru[result]

def generate_recipe(ingredients: list) -> str:
    return ', '.join(ingredients)

In [34]:
img_path = '/home/maksim/Cocktails/Images/Coctails_raw/Mojito/0346a20835_1000.jpg'
print(generate_recipe(predict_ingredients_onnx(img_path, class_labels=class_labels_ru)))

спрайт, ром, сахарный сироп, лайм, мяту, лёд


In [35]:
def predict_vector(path: str, class_labels: list) -> list:
    img = np.asarray(Image.open(path).resize((image_size, image_size))) / 255
    logits = compiled_model_onnx([np.rollaxis(img, 2, 0)[None, :, :, :]])[output_layer][0]
    probs = 1 / (1 + np.exp(-logits))
    pos_ind = (probs > 0.5).nonzero()[0]
    neg_ind = (probs < 0.5).nonzero()[0]
    
    return np.prod(probs[pos_ind])*np.prod(1-probs[neg_ind])