## Classifier Baseline

В этом ноутбуке производится обучение классификатора, который по изображению коктейля предсказывает набор ингредиентов.

In [None]:
from pprint import pprint
import random
from PIL import Image
import json

import pickle
import math
from math import log
from glob import glob
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
from openvino.runtime import Core
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
import torchvision.transforms as tt
from torchvision.utils import make_grid
from torch.optim import lr_scheduler
import timm

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output
%matplotlib inline

import warnings

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings("ignore")
sns.set(style='darkgrid', font_scale=1.2)
# Prefer the GPU when one is available; later cells move tensors to `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)
# Seed every RNG the notebook relies on. Layer initialisation, Dropout and
# DataLoader shuffling draw from torch's generator, so seeding only `random`
# and NumPy left the training runs unreproducible.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

### Подготовка данных для обучения:

Получаем список коктейлей (наименование коктейля соответствует наименованию каталога)

In [None]:
import os

# Every entry directly under the dataset root is one cocktail folder.
cockt_list = glob('/home/maksim/Cocktails/Images/Coctails_raw/*', )
# Keep only the folder name. basename() replaces the former `x[43:]` slice,
# which was tied to the exact length of this particular absolute path.
cockt_list = sorted(os.path.basename(x) for x in cockt_list)

Читаем из конфига (json заполняется руками) наименования ингредиентов на двух языках.

In [None]:
# Opening ingredients JSON config
# The encoding is pinned because the file carries Russian ingredient names and
# the platform default encoding is not guaranteed to be UTF-8.
with open('config/ingredients.json', 'r', encoding='utf-8') as f:
    ingedients_config = json.load(f)

# Ordered list of ingredient ids; the position in this list is the class index.
class_labels = ingedients_config["idx"]
id2rus_genitive = ingedients_config["id2rus_genitive"]
# Russian names aligned position-by-position with `class_labels`.
class_labels_ru = np.array([id2rus_genitive[idx] for idx in class_labels])

# Reverse lookup: ingredient id -> class index.
class_dict = {label: position for position, label in enumerate(class_labels)}

Проверяем, что русскоязычные наименования соответствуют идентификаторам.

In [None]:
# Show each ingredient id next to its Russian name for a visual sanity check.
for idx, rus in zip(class_labels, class_labels_ru):
    print(idx, rus, sep='  =  ')

Читаем из конфига (json заполняется руками) рецепты коктейлей.

In [None]:
# Opening recipes JSON config
# Encoding pinned so the read does not depend on the platform default.
with open('config/recipes.json', 'r', encoding='utf-8') as f:
    text_recipes = json.load(f)

Проверяем, что перечень ингредиентов в двух конфигах совпадает:

In [None]:
# Gather every ingredient id that occurs in at least one recipe.
ing_set = set()
for ingredients in text_recipes.values():
    ing_set |= set(ingredients)

# First check: the sorted ids agree pairwise over the common prefix.
# Second check: both collections have the same number of entries.
recipe_ids, config_ids = sorted(ing_set), sorted(class_labels)
shared = min(len(recipe_ids), len(config_ids))
print(recipe_ids[:shared] == config_ids[:shared])
print(len(ing_set) == len(class_labels))

Проверяем, что список коктейлей в каталоге совпадает со списком в конфиге:

In [None]:
# Folder names are compared position-by-position with the recipe keys, so the
# config has to list the cocktails in the same (sorted) order as the folders.
print(all(map(lambda pair: pair[0] == pair[1], zip(cockt_list, text_recipes))))
print(len(text_recipes) == len(cockt_list))

Формируем векторные представления рецептов.

In [None]:
# Turn every textual recipe into a 0/1 vector with one slot per ingredient class.
recipes = {}
for cocktail, recipe in text_recipes.items():
    one_hot = torch.zeros(len(class_labels), dtype=torch.int)
    for ingr in recipe:
        one_hot[class_dict[ingr]] = 1
    recipes[cocktail] = one_hot

Задаём размер изображения, каталог с изображениями (обучающей выборкой) и константы нормализации:

In [None]:
# Opening model JSON config
# Encoding pinned for consistency with the other config reads, so the result
# does not depend on the platform default.
with open('config/model.json', 'r', encoding='utf-8') as f:
    model_conf = json.load(f)

In [None]:
# Both sizes come from `model_conf`: `crop_size` is the square produced by the
# resize/centre-crop step, `image_size` the side of the crop fed to the network.
image_size, crop_size = model_conf['image_size'], model_conf['crop_size']
DATA_DIR = '/home/maksim/Cocktails/Images/Coctails_raw/'
# (per-channel means, per-channel stds) passed to tt.Normalize.
stats = ((0.5,) * 3, (0.5,) * 3)
print(f'Imge size = {image_size}x{image_size}')
print(f'Crop size = {crop_size}x{crop_size}')

In [None]:
def show_generated_images(generated):
    """Draw the given channel-first image tensors side by side in a single row.

    The previous cell output is cleared first; every image is moved to
    channel-last layout and de-normalised before being shown.
    """
    clear_output(wait=True)
    plt.figure(figsize=(24, 8))
    total = len(generated)
    for position, picture in enumerate(generated, start=1):
        plt.subplot(1, total, position)
        channel_last = np.rollaxis(picture.numpy(), 0, 3)
        plt.imshow(denorm(channel_last))
        plt.axis('off')
    plt.show()

def denorm(img_tensors):
    """Undo the normalisation: multiply by the std and add the mean.

    Only the first channel's mean/std from the global ``stats`` is used, so the
    same scalar pair is applied to every element regardless of layout.
    """
    mean, std = stats[0][0], stats[1][0]
    return img_tensors * std + mean

def show_images(images, nmax=200):
    """Plot at most ``nmax`` images of a batch as one grid with 8 images per row."""
    fig, ax = plt.subplots(figsize=(60, 60))
    ax.set_xticks([])
    ax.set_yticks([])
    subset = images.detach()[:nmax]
    grid = make_grid(denorm(subset), nrow=8)
    # make_grid returns a channel-first tensor; imshow wants channel-last.
    ax.imshow(grid.permute(1, 2, 0))

def show_batch(dl, nmax=64):
    """Display the first batch produced by ``dl``; do nothing if ``dl`` is empty."""
    first = next(iter(dl), None)
    if first is None:
        return
    images, _ = first
    show_images(images, nmax)

In [None]:
# Share of positive entries over all recipe vectors:
# total number of ones / vector length / number of recipes.
summ = sum(vector.sum() for vector in recipes.values())
summ = summ / recipes[next(iter(recipes))].shape[0] / len(recipes)
summ

In [None]:
def to_device(data, device):
    """Move a tensor, or a (nested) list/tuple of tensors, to ``device``.

    Lists and tuples are rebuilt as lists; anything else must provide ``.to``.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

class DeviceDataLoader():
    """Thin wrapper that hands out the wrapped loader's batches on a given device."""

    def __init__(self, dataloader, device):
        self.dataloader, self.device = dataloader, device

    def __len__(self):
        """Return the number of batches of the wrapped loader."""
        return len(self.dataloader)

    def __iter__(self):
        """Lazily yield each batch after passing it through ``to_device``."""
        yield from (to_device(chunk, self.device) for chunk in self.dataloader)

    
def get_dataloaders(image_size, batch_size):
    """
    Build the training and hold-out loaders plus the class-index -> recipe table.

    Every image under DATA_DIR goes through one shared pipeline: resize and
    centre-crop to the global ``crop_size``, colour jitter, normalisation with
    ``stats``, a random ``image_size`` crop and a random horizontal flip.
    The hold-out subset uses the same dataset object, hence the same pipeline.

    :param image_size: side of the random square crop produced for every image
    :param batch_size: batch size used by both loaders
    :returns: tuple ``(train_loader, test_loader, recipe_labels)``; both loaders
              are DeviceDataLoader objects and row ``i`` of ``recipe_labels`` is
              ``recipes[name]`` for the ImageFolder class whose index is ``i``
    """
    jitter = tt.ColorJitter(brightness=(0.8, 1.1),
                            contrast=(0.8, 1.1),
                            saturation=(0.8, 1.1),
                            hue=0.015)
    pipeline = tt.Compose([
        tt.Resize(crop_size),
        tt.CenterCrop(crop_size),
        jitter,
        tt.ToTensor(),
        tt.Normalize(*stats),
        tt.RandomCrop(image_size),
        tt.RandomHorizontalFlip(),
    ])
    full_set = ImageFolder(DATA_DIR, transform=pipeline)

    # One row per folder class, filled with that cocktail's ingredient vector.
    recipe_labels = torch.zeros((len(full_set.classes), len(class_labels)))
    for cockt, idx in full_set.class_to_idx.items():
        recipe_labels[idx] = recipes[cockt]

    # Fixed-seed shuffle, so the same 3.2% of the images is held out on every call.
    n = len(full_set)
    n_test = int(0.032 * n)
    permutation = list(range(n))
    random.Random(42).shuffle(permutation)
    held_out_ids, train_ids = permutation[:n_test], permutation[n_test:]
    train_set = torch.utils.data.Subset(full_set, train_ids)
    test_set = torch.utils.data.Subset(full_set, held_out_ids)

    loader_options = dict(shuffle=True, num_workers=5, pin_memory=True)
    train_dataloader = DataLoader(train_set, batch_size, **loader_options)
    test_dataloader = DataLoader(test_set, batch_size, **loader_options)

    return (DeviceDataLoader(train_dataloader, device),
            DeviceDataLoader(test_dataloader, device),
            recipe_labels)

Взглянем на аугментированные изображения:

In [None]:
batch_size = 200
train_loader, val_loader, recipe_labels = get_dataloaders(image_size, batch_size)
# Take one batch from the hold-out loader and plot its images from host memory.
sample_batch = next(iter(val_loader))
show_images(sample_batch[0].cpu())

### Обучение нейронной сети.

In [None]:
def fit_epoch(model, train_loader, criterion, optimizer, sheduler, threshold, label_smoothing):
    """Train ``model`` for one pass over ``train_loader``.

    :param model: network being optimised; it is switched to train mode here
    :param train_loader: iterable of ``(inputs, labels_idx)`` batches, where
                         ``labels_idx`` selects rows of the global ``recipe_labels``
    :param criterion: loss called as ``criterion(logits, targets)``
    :param optimizer: optimiser, stepped once per batch
    :param sheduler: learning-rate scheduler, stepped once after the last batch
    :param threshold: probability cut-off used to binarise predictions for the metric
    :param label_smoothing: ``s`` in the target transform ``y * (1 - s) + s / 20``
    :returns: ``(train_loss, train_acc)`` - the mean per-sample loss as a float and
              the harmonic mean of the batch-averaged precision and recall as a
              0-dim tensor
    """
    # eval_epoch() leaves the model in eval mode. Without this call every epoch
    # after the first one would train with Dropout disabled and BatchNorm frozen.
    model.train()

    # Keep the lookup table on the same device as the batch indices; indexing a
    # CPU tensor with CUDA indices is rejected by recent PyTorch releases.
    label_table = recipe_labels.to(device)
    # Probability threshold mapped to logit space: -log(1/t - 1) is logit(t).
    # The original constant 0.999 is kept; presumably it guards against log(0) at t = 1.
    logit_threshold = -log(1 / threshold - 0.999)

    running_loss = 0.0
    running_recall = 0
    running_precision = 0
    processed_data = 0

    for inputs, labels_idx in train_loader:
        inputs = inputs.to(device)
        labels_idx = labels_idx.to(device)
        labels = label_table[labels_idx] * (1 - label_smoothing) + label_smoothing / 20

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_len = inputs.size(0)
        preds = (outputs.detach() > logit_threshold) * 1
        # NOTE(review): the metric is computed against the smoothed targets, as before.
        hits = torch.sum(preds * labels)
        running_loss += loss.item() * batch_len
        # The +0.1 terms keep both ratios defined when a batch has no positives.
        running_recall += (hits + 0.1) / (torch.sum(labels) + 0.1) * batch_len
        running_precision += (hits + 0.1) / (torch.sum(preds) + 0.1) * batch_len
        processed_data += batch_len
    sheduler.step()
    train_loss = running_loss / processed_data
    recall = running_recall.double() / processed_data
    precision = running_precision.double() / processed_data
    train_acc = (2*recall*precision) / (precision+recall)
    return train_loss, train_acc

def eval_epoch(model, val_loader, criterion, threshold, label_smoothing):
    """Evaluate ``model`` on ``val_loader`` without updating any weights.

    The model is switched to eval mode and is left in that mode on return.

    :param model: network to evaluate
    :param val_loader: iterable of ``(inputs, labels_idx)`` batches, where
                       ``labels_idx`` selects rows of the global ``recipe_labels``
    :param criterion: loss called as ``criterion(logits, targets)``
    :param threshold: probability cut-off used to binarise predictions for the metric
    :param label_smoothing: ``s`` in the target transform ``y * (1 - s) + s / 20``
    :returns: ``(val_loss, val_acc)`` - the mean per-sample loss as a float and the
              harmonic mean of the batch-averaged precision and recall as a
              0-dim tensor
    """
    model.eval()

    # Keep the lookup table on the same device as the batch indices; indexing a
    # CPU tensor with CUDA indices is rejected by recent PyTorch releases.
    label_table = recipe_labels.to(device)
    # Probability threshold mapped to logit space (see the 0.999 constant: the
    # original value is kept; presumably it guards against log(0) at t = 1).
    logit_threshold = -log(1 / threshold - 0.999)

    running_loss = 0.0
    running_recall = 0
    running_precision = 0
    processed_size = 0

    for inputs, labels_idx in val_loader:
        inputs = inputs.to(device)
        labels_idx = labels_idx.to(device)
        labels = label_table[labels_idx] * (1 - label_smoothing) + label_smoothing / 20

        with torch.no_grad():
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            preds = (outputs > logit_threshold) * 1

        batch_len = inputs.size(0)
        hits = torch.sum(preds * labels)
        running_loss += loss.item() * batch_len
        # The +0.1 terms keep both ratios defined when a batch has no positives.
        running_recall += (hits + 0.1) / (torch.sum(labels) + 0.1) * batch_len
        running_precision += (hits + 0.1) / (torch.sum(preds) + 0.1) * batch_len
        processed_size += batch_len
    val_loss = running_loss / processed_size
    recall = running_recall.double() / processed_size
    precision = running_precision.double() / processed_size
    val_acc = (2*recall*precision) / (precision+recall)
    return val_loss, val_acc

def train(train_loader, val_loader, model, epochs, optimizer, gamma=0.95, threshold=0.5, label_smoothing=0.):
    """Run ``epochs`` rounds of training followed by validation.

    A StepLR scheduler (step size 1, factor ``gamma``) and a BCE-with-logits loss
    are created here and shared by all epochs.

    :returns: list with one ``(train_loss, train_acc, val_loss, val_acc)`` tuple
              per epoch
    """
    log_template = (
        "\nEpoch {ep:03d} train_loss: {t_loss:0.4f} "
        "    val_loss {v_loss:0.4f} train_acc {t_acc:0.4f} val_acc {v_acc:0.4f}"
    )
    history = []

    with tqdm(desc="epoch", total=epochs) as pbar_outer:
        sheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)
        criterion = nn.BCEWithLogitsLoss()

        for epoch_no in range(1, epochs + 1):
            train_loss, train_acc = fit_epoch(
                model, train_loader, criterion, optimizer, sheduler, threshold, label_smoothing
            )
            print("loss", train_loss)

            val_loss, val_acc = eval_epoch(model, val_loader, criterion, threshold, label_smoothing)
            epoch_stats = (train_loss, train_acc, val_loss, val_acc)
            history.append(epoch_stats)

            pbar_outer.update(1)
            message = log_template.format(
                ep=epoch_no, t_loss=train_loss, v_loss=val_loss, t_acc=train_acc, v_acc=val_acc
            )
            tqdm.write(message)
    return history


В качестве нейронной сети для обучения выбрана предобученная MobilenetV3. \
Немного меняем структуру сети: 
1. Последний полносвязный слой заменяем двумя слоями: bottleneck размерности 30 и финальный классификатор с размерностью, равной количеству ингредиентов.
2. Немного уменьшаем размерность 6-го блока для того, чтобы снизить сложность сети и увеличить её стойкость к переобучению.

In [None]:
# Pretrained backbone, moved to the working device.
model = timm.create_model('mobilenetv3_large_100_miil', pretrained=True).to(device)

# Replace the 1x1 convolution and its BatchNorm in block 6 with 300-channel
# versions, and rebuild the conv head so that it maps 300 -> 700 channels.
last_block = model.blocks[6][0]
last_block.conv = nn.Conv2d(160, 300, kernel_size=1, stride=1).to(device)
last_block.bn1 = nn.BatchNorm2d(300).to(device)
model.conv_head = nn.Conv2d(300, 700, kernel_size=1, stride=1).to(device)

# New head: dropout -> 30-unit bottleneck -> batch norm -> dropout -> one logit per ingredient.
head_layers = [
    nn.Dropout(0.75),
    nn.Linear(in_features=700, out_features=30, bias=False),
    nn.BatchNorm1d(30),
    nn.Dropout(0.5),
    nn.Linear(in_features=30, out_features=len(class_labels)),
]
model.classifier = nn.Sequential(*head_layers).to(device)

# Freeze every parameter of the network.
model.requires_grad_(False)

Получившаяся структура модели:

In [None]:
# Bare expression: the notebook renders the module tree of `model` as the cell output.
model

Размораживаем параметры части слоёв:

In [None]:
# Re-enable gradients for blocks 2..6 (block 1 is deliberately left untouched),
# then for the conv head and the classifier.
for block_no in range(2, 7):
    model.blocks[block_no].requires_grad_(True)

for trainable in (model.conv_head, model.classifier):
    trainable.requires_grad_(True)

# Print the resulting trainable flag of every parameter for inspection.
for name, param in model.named_parameters():
    print(name, param.requires_grad)

Для того чтобы максимально сохранить информацию в предобученной сети, устанавливаем разные скорости обучения для различных групп слоёв: чем ближе к выходу сети, тем выше скорость обучения. Вводим новый гиперпараметр lr_decay, который определяет, во сколько раз отличаются скорости обучения соседних групп слоёв.

In [None]:
def get_optimizer(lr: float, lr_decay: float) -> object:
    """Create an Adam optimiser with layer-wise learning rates for the global ``model``.

    The classifier trains at ``lr``; block 6 and the conv head at
    ``lr / lr_decay``; each earlier block divides the rate by ``lr_decay`` once
    more (block 0 gets ``lr / lr_decay**7``). The stem convolution and its
    BatchNorm fall back to the optimiser default ``lr / lr_decay**8``.
    """
    param_groups = [
        {'params': model.conv_stem.parameters()},
        {'params': model.bn1.parameters()},
    ]
    for depth in range(7):
        param_groups.append({
            'params': model.blocks[depth].parameters(),
            'lr': lr / lr_decay ** (7 - depth),
        })
    param_groups.append({'params': model.conv_head.parameters(), 'lr': lr / lr_decay ** 1})
    param_groups.append({'params': model.classifier.parameters(), 'lr': lr})

    default_lr = lr / lr_decay ** 8
    return torch.optim.Adam(param_groups, lr=default_lr)
    

Обучаем сеть в несколько циклов с различными значениями lr, lr_decay, batch_size:

In [None]:
# Stage 1: batches of 200 with a steep layer-wise decay (factor 6), 5 epochs.
lr, lr_decay, batch_size, l_sm = 5.0e-3, 6.0, 200, 0.03

train_loader, val_loader, recipe_labels = get_dataloaders(image_size, batch_size)
optimizer = get_optimizer(lr, lr_decay)

history1 = train(
    train_loader, val_loader, model,
    epochs=5, optimizer=optimizer, gamma=0.9, label_smoothing=l_sm,
)

In [None]:
# Stage 2: batches of 64 with a flatter layer-wise decay (factor 2), 6 epochs.
lr, lr_decay, batch_size, l_sm = 3.0e-3, 2.0, 64, 0.03

train_loader, val_loader, recipe_labels = get_dataloaders(image_size, batch_size)
optimizer = get_optimizer(lr, lr_decay)

history4 = train(
    train_loader, val_loader, model,
    epochs=6, optimizer=optimizer, gamma=0.85, label_smoothing=l_sm,
)

In [None]:
# Stage 3: batches of 64, layer-wise decay factor 2.2, 12 epochs.
lr, lr_decay, batch_size, l_sm = 3.5e-3, 2.2, 64, 0.03

train_loader, val_loader, recipe_labels = get_dataloaders(image_size, batch_size)
optimizer = get_optimizer(lr, lr_decay)

history3 = train(
    train_loader, val_loader, model,
    epochs=12, optimizer=optimizer, gamma=0.9, label_smoothing=l_sm,
)

Сохраняем модель, веса. При необходимости, загружаем веса.

In [None]:
# Write two files: the weights only (state dict) and the whole module object.
for payload, target in ((model.state_dict(), 'classifier_state_dict_ls.pt'),
                        (model, 'classifier_model_ls.pt')):
    torch.save(payload, target)

Проверим работу модели на случайном изображении:

In [None]:
def predict_ingredients(path: str, model: callable, classes: np.array) -> list:
    """Predict the ingredients of the cocktail shown in the image at ``path``.

    The image is centre-cropped to a square, resized to the global
    ``image_size``, scaled to [-1, 1], displayed, and passed through ``model``.

    :param path: location of the image file
    :param model: callable mapping a ``(1, 3, H, W)`` float tensor to logits; its
                  train/eval mode is not changed here, so the caller should pass
                  it in eval mode
    :param classes: array of labels indexed by class id
    :returns: the labels whose logit is positive, or ``[]`` when the image
              cannot be opened or decoded
    """
    try:
        # convert() forces exactly three channels; RGBA, palette or greyscale
        # files would otherwise yield an array the network cannot consume.
        # The context manager releases the file handle once pixels are loaded.
        with Image.open(path) as source:
            image = source.convert('RGB')
    except (OSError, ValueError):
        # Missing, unreadable or undecodable file: keep the best-effort contract.
        return []
    width, height = image.size  # Get dimensions
    size = min(width, height)

    # Integer box of the centred square (avoids fractional crop coordinates).
    left = (width - size) // 2
    top = (height - size) // 2

    # Crop the center of the image
    image = image.crop((left, top, left + size, top + size))
    img = np.asarray(image.resize((image_size, image_size))) / 127.5 - 1.0

    plt.figure(figsize=(2.5, 2.5))
    plt.imshow(denorm(img))
    plt.axis('off')
    plt.show()

    # Channel-last image -> channel-first batch of one.
    batch = torch.tensor(np.rollaxis(img, 2, 0)[None, :, :, :], dtype=torch.float).to(device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(batch)
    # A positive logit corresponds to a probability above 0.5.
    result = (logits.cpu() > 0).nonzero()[:, 1].numpy()
    return classes[result]

In [None]:
# Run the PyTorch model on one image from the dataset folder; the returned
# Russian ingredient names become the cell output.
predict_ingredients('/home/maksim/Cocktails/Images/Coctails_raw/Bloody_mary/AD4A0735.jpg', model, class_labels_ru)

### Конвертация в ONNX

Загружаем модель в pytorch и экспортируем в формат ONNX.

In [None]:
model.to('cpu')
# map_location makes the checkpoint loadable on a machine without the GPU it
# was saved from; the weights end up on the CPU either way.
model.load_state_dict(torch.load('classifier_state_dict_ls.pt', map_location='cpu'))
# Evaluate the model to switch some operations from training mode to inference.
model.eval()
# Create dummy input for the model. It will be used to run the model inside export function.
dummy_input = torch.randn(1, 3, image_size, image_size)
# Call the export function
torch.onnx.export(model,
                  dummy_input,
                  'classifier_ls.onnx',
                  export_params=True,
                  opset_version=11,
                  do_constant_folding=True,
                  input_names = ['input'],
                  output_names = ['logits']
                 )

Загружаем модель ONNX для проверки:

In [None]:
# OpenVINO runtime entry point, used below to list devices and to read/compile the model.
ie = Core()

Доступные для OpenVINO устройства:

In [None]:
# Report the full name of every device the runtime exposes.
devices = ie.available_devices

for dev in devices:
    device_name = ie.get_property(name="FULL_DEVICE_NAME", device_name=dev)
    print(f"{dev}: {device_name}")

# Read the exported ONNX file and compile it for the CPU device.
onnx_model_path = 'classifier_ls.onnx'
model_onnx = ie.read_model(model=onnx_model_path)
compiled_model_onnx = ie.compile_model(device_name="CPU", model=model_onnx)

Вход модели:

In [None]:
# First (and only inspected) input port of the compiled model.
input_layer = compiled_model_onnx.input(0)

precision, shape = input_layer.element_type, input_layer.shape
print(f"input precision: {precision}")
print(f"input shape: {shape}")

Выход модели:

In [None]:
# First output port of the compiled model.
output_layer = compiled_model_onnx.output(0)

precision, shape = output_layer.element_type, output_layer.shape
print(f"output precision: {precision}")
print(f"output shape: {shape}")

Вся загрузка в одной ячейке:

In [None]:
# Self-contained loader: runtime -> ONNX model -> CPU-compiled model -> I/O ports.
onnx_model_path = 'classifier_ls.onnx'
ie = Core()
model_onnx = ie.read_model(model=onnx_model_path)
compiled_model_onnx = ie.compile_model(model=model_onnx, device_name="CPU")

input_layer, output_layer = compiled_model_onnx.input(0), compiled_model_onnx.output(0)

Пробуем инференс на случайном изображении:

In [None]:
def predict_ingredients_onnx(path: str, class_labels: list) -> list:
    """Predict ingredient labels for the image at ``path`` with the compiled ONNX model.

    :param path: location of the image file
    :param class_labels: labels indexed by class id (previously ignored in
                         favour of the global ``class_labels_ru``)
    :returns: array with the labels whose logit is positive
    """
    # NOTE(review): unlike predict_ingredients(), the image is resized without a
    # centre crop, so non-square photos are stretched - confirm this is intended.
    with Image.open(path) as source:
        picture = source.convert('RGB').resize((image_size, image_size))
    # Apply the same normalisation the network was trained with
    # (tt.Normalize(*stats) after scaling to [0, 1]); plain `/ 255` fed the
    # model inputs in [0, 1] instead of [-1, 1].
    img = ((np.asarray(picture) / 255 - stats[0][0]) / stats[1][0]).astype(np.float32)
    batch = np.ascontiguousarray(np.rollaxis(img, 2, 0)[None, :, :, :])
    logits = compiled_model_onnx([batch])[output_layer]
    # The outputs are logits, so probability 0.5 corresponds to 0, matching the
    # `> 0` cut-off used by predict_ingredients().
    result = (logits > 0).nonzero()[1]
    return np.asarray(class_labels)[result]

def generate_recipe(ingredients: list) -> str:
    """Join the ingredient names into one comma-separated line."""
    separator = ', '
    return separator.join(ingredients)

In [None]:
# Run the ONNX pipeline on one image and print the predicted ingredients as text.
img_path = '/home/maksim/Cocktails/Images/Coctails_raw/Mojito/0346a20835_1000.jpg'
predicted = predict_ingredients_onnx(img_path, class_labels=class_labels_ru)
print(generate_recipe(predicted))

In [None]:
def predict_vector(path: str, class_labels: list) -> float:
    """Return the model's confidence in its own thresholded prediction.

    Each class contributes ``p`` when its probability ``p`` is above 0.5 and
    ``1 - p`` when it is below 0.5; the result is the product of these factors
    (classes at exactly 0.5 contribute nothing).

    :param path: location of the image file
    :param class_labels: unused; kept so existing calls keep working
    :returns: the scalar product described above
    """
    with Image.open(path) as source:
        picture = source.convert('RGB').resize((image_size, image_size))
    # Apply the same normalisation the network was trained with
    # (tt.Normalize(*stats) after scaling to [0, 1]); plain `/ 255` fed the
    # model inputs in [0, 1] instead of [-1, 1].
    img = ((np.asarray(picture) / 255 - stats[0][0]) / stats[1][0]).astype(np.float32)
    batch = np.ascontiguousarray(np.rollaxis(img, 2, 0)[None, :, :, :])
    logits = compiled_model_onnx([batch])[output_layer][0]
    probs = 1 / (1 + np.exp(-logits))
    pos_ind = (probs > 0.5).nonzero()[0]
    neg_ind = (probs < 0.5).nonzero()[0]

    return np.prod(probs[pos_ind])*np.prod(1-probs[neg_ind])

In [None]:
# Despite its name, `ingr` receives the single scalar returned by predict_vector().
ingr = predict_vector(img_path, class_labels=class_labels_ru)

In [None]:
# Bare expression: the notebook shows the value of `ingr` as the cell output.
ingr