In [164]:
import os
import json
import random
from tqdm.auto import tqdm
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision.models as models

from torch.utils.data import Dataset, WeightedRandomSampler
from sklearn.model_selection import train_test_split

import PIL
from PIL import Image, ImageDraw

# Presumably a workaround for the "duplicate OpenMP runtime" abort that can occur
# when several libraries each ship their own OpenMP copy -- confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [117]:
# --- Reproducibility -------------------------------------------------------
SEED = 42

# --- Data ------------------------------------------------------------------
# Folder with the frame images and the CSV that labels every frame.
IMAGE_FOLDER = './data/train/images'
DATA_MARKUP_FILE = './image_result_info.csv'
# Number of frames sampled from each clip and fed to the model as one sequence.
SIZE_PACK_IMAGES = 8

# --- Model -----------------------------------------------------------------
# Width of the per-frame feature vector handed to the LSTM (its input size);
# it has to match what the CNN backbone outputs.
CNN_EMBEDDING_SIZE = 1280
# Recurrent head: one output logit, two stacked LSTM layers of 256 units.
RNN_CLASSES = 1
RNN_NUM_LAYERS = 2
RNN_HIDDEN_SIZE = 256

# --- Optimisation ----------------------------------------------------------
BATCH_SIZE = 8
LEARNING_RATE = 0.001


def fix_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), then puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick a different convolution algorithm on each
    # run; disable it so that `deterministic` above is actually effective.
    torch.backends.cudnn.benchmark = False

#### MODEL

In [177]:
class VideoCNN(nn.Module):
    """Frame-wise CNN encoder followed by an LSTM and a linear head.

    Every frame of a pack is encoded independently by the CNN, the resulting
    feature sequence is run through the LSTM, and the LSTM output at the last
    time step is projected to ``num_classes`` raw logits.

    Args:
        input_size: Size of the per-frame feature vector produced by the CNN
            (the LSTM input size).
        hidden_size: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()

        # ImageNet-pretrained EfficientNet-B0. The last classifier layer is
        # replaced by Identity so the backbone returns features, not class scores.
        self.CNN = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
        self.CNN.classifier[-1] = nn.Identity()

        self.RNN = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.FC = nn.Linear(hidden_size, num_classes)

    def forward(self, batch_size, pack_size, input):
        """Return logits of shape ``(batch_size, num_classes)``.

        Args:
            batch_size: Number of frame packs in ``input``.
            pack_size: Number of frames per pack.
            input: Frame tensor whose last three dimensions are
                ``(channels, height, width)`` and whose leading dimensions
                hold ``batch_size * pack_size`` frames in total, e.g.
                ``(batch_size, pack_size, C, H, W)``.
        """
        # Merge the batch and time axes so the CNN sees a plain image batch.
        # The frame geometry is read from the tensor instead of being fixed to
        # 3x256x256, and reshape (unlike view) also accepts non-contiguous input.
        frames = input.reshape(-1, *input.shape[-3:])

        features = self.CNN(frames).reshape(batch_size, pack_size, -1)

        sequence, _ = self.RNN(features)
        # Only the output at the final time step feeds the classifier.
        last_step = sequence[:, -1, :]
        return self.FC(last_step)

#### DATALOADER

In [189]:
class VideoDataset(Dataset):
    """Dataset of fixed-size frame packs built from a per-frame markup CSV.

    The CSV must provide the columns ``image_name`` and ``target``. Consecutive
    rows that share the same ``target`` are treated as one clip; from every clip
    ``size_pack_images`` frames are picked at evenly spaced positions.

    Args:
        data_markup_file: Path of the markup CSV.
        image_folder: Folder that contains the files named in ``image_name``.
        size_pack_images: Number of frames sampled per clip.
        transform: Callable applied to each frame (an ``H x W x 3`` uint8
            array); it must return a tensor so the frames can be stacked.
        device: Device the returned tensors are moved to.

    Attributes:
        dataset: DataFrame with the columns ``pack_images`` (list of file
            names) and ``target``; one row per sample.
    """

    def __init__(self, data_markup_file, image_folder, size_pack_images, transform, device='cpu'):
        self.size_pack_images = size_pack_images
        self.image_folder = image_folder
        self.transform = transform
        self.device = device

        data = pd.read_csv(data_markup_file)
        # A new group id starts on every row whose target differs from the
        # previous row, so each run of equal targets becomes one clip.
        data['group'] = (data['target'] != data['target'].shift()).cumsum()

        result = data.groupby('group').agg({'image_name': list, 'target': 'first'}).reset_index(drop=True)
        result['pack_images'] = result['image_name'].apply(self.select_evenly)

        self.dataset = result[['pack_images', 'target']]

    def __getitem__(self, idx):
        """Return ``(frames, target)`` for the sample at position ``idx``."""
        # Positional lookup: stays correct even when `dataset` has been replaced
        # by a frame whose index is not 0..n-1 (e.g. a train/test split).
        image_paths = self.dataset['pack_images'].iloc[idx]
        label = self.dataset['target'].iloc[idx]

        targets = torch.tensor([float(label)], dtype=torch.float32)

        tensors = torch.stack([self.load_image(path) for path in image_paths])
        # Kept for backward compatibility: only has an effect for a pack size of 1.
        tensors = tensors.squeeze(0)

        # Use the device this dataset was configured with, not a notebook global.
        # NOTE(review): returning CUDA tensors from __getitem__ is likely to
        # clash with DataLoader worker processes (num_workers > 0) -- confirm
        # before enabling workers.
        return tensors.to(self.device), targets.to(self.device)

    def select_evenly(self, lst):
        """Pick ``size_pack_images`` items of ``lst`` at evenly spaced indices."""
        assert len(lst) >= self.size_pack_images, "The minimum number of frames is less than SIZE_PACK_IMAGES"
        indices = np.linspace(0, len(lst) - 1, self.size_pack_images, dtype=int)
        return [lst[i] for i in indices]

    def load_image(self, filename):
        """Read one frame from ``image_folder`` and apply ``self.transform``."""
        with Image.open(os.path.join(self.image_folder, filename)) as img:
            # Force three channels so grayscale / RGBA / palette files do not
            # break a transform that expects RGB input.
            image = np.array(img.convert('RGB'))
        return self.transform(image)

    def __len__(self):
        return len(self.dataset)

In [190]:
# Per-frame preprocessing, applied in the listed order: convert to a tensor,
# normalise each channel with the given mean/std, then resize to 256x256.
# Resize comes last, so it operates on the already-normalised tensor.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        transforms.Resize(size=(256, 256)),
    ]
)

In [253]:
# Both datasets are first built from the complete markup file; their sample
# tables are then replaced by the two parts of one seeded train/test split.
train_data = VideoDataset(DATA_MARKUP_FILE, IMAGE_FOLDER, SIZE_PACK_IMAGES, transform, device=device)
eval_data = VideoDataset(DATA_MARKUP_FILE, IMAGE_FOLDER, SIZE_PACK_IMAGES, transform, device=device)

train, test = train_test_split(train_data.dataset[['pack_images', 'target']], random_state=SEED)
# Reset the index so every split is numbered 0..n-1 again.
train_data.dataset = train.reset_index(drop=True)
eval_data.dataset = test.reset_index(drop=True)

# Only the training batches are shuffled.
train_dataloader = torch.utils.data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
eval_dataloader = torch.utils.data.DataLoader(dataset=eval_data, batch_size=BATCH_SIZE, shuffle=False)

print(f'Length train dataset: {len(train_data)}, length evaluate dataset: {len(eval_data)}')

Length train dataset: 15, length evaluate dataset: 6


#### TRAIN LOOP

In [250]:
def evaluate(model, criterion, dataloader, verbose = False) -> torch.Tensor:
    """Compute the thresholded (0.5) accuracy of ``model`` over ``dataloader``.

    The model is switched to eval mode and stays in that mode afterwards.
    Relies on the module-level ``SIZE_PACK_IMAGES`` for the pack size passed
    to the model.

    Args:
        model: Module called as ``model(batch_size, pack_size, images)`` and
            returning raw logits.
        criterion: Unused; accepted to keep the call signature stable.
        dataloader: Iterable of ``(images, labels)`` batches.
        verbose: Unused; accepted to keep the call signature stable.

    Returns:
        A 0-dim float tensor (callers use ``.item()``) holding the fraction of
        label elements predicted correctly, or NaN if there were no labels.
    """
    model.eval()
    num_correct, num_all = 0, 0
    with torch.no_grad():
        for images, labels in dataloader:
            logits = model(images.shape[0], SIZE_PACK_IMAGES, images)
            predictions = (torch.sigmoid(logits) >= 0.5).float()

            num_correct += torch.sum(predictions == labels)
            # Count every compared element, not just rows, so the ratio stays
            # within [0, 1] when the model has more than one output column.
            num_all += labels.numel()

    if num_all == 0:
        # Nothing to score: return NaN instead of dividing by zero.
        return torch.tensor(float('nan'))

    return num_correct / num_all

def train_model(model, criterion, optimizer, num_epoch, verbose=False):
    """Train ``model`` for ``num_epoch`` epochs and track loss / accuracy.

    Batches come from the module-level ``train_dataloader`` and
    ``eval_dataloader``; the pack size passed to the model is the module-level
    ``SIZE_PACK_IMAGES``. Whenever the validation accuracy strictly exceeds the
    best value seen so far (starting from 0), the whole model object is saved
    to ``CNN-RNN.pth``. One summary line is printed per epoch.

    Args:
        model: Module called as ``model(batch_size, pack_size, images)``.
        criterion: Loss applied to ``(logits, labels)``.
        optimizer: Optimizer that updates the parameters of ``model``.
        num_epoch: Number of passes over the training data.
        verbose: Forwarded to ``evaluate``.

    Returns:
        ``((losses_train, losses_val), (accuracy_train, accuracy_val), model)``
        where each list holds one value per epoch and ``model`` is the model in
        its state after the last epoch.
    """
    losses_train, losses_val = [], []
    accuracy_train, accuracy_val = [], []
    best_val_accuracy = 0

    for epoch in range(num_epoch):
        # ---- optimisation pass over the training batches ----
        train_loss = 0.
        model.train()
        for images, labels in train_dataloader:
            optimizer.zero_grad()

            logits = model(images.shape[0], SIZE_PACK_IMAGES, images)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            # Running mean of the per-batch losses.
            train_loss += loss.item() / len(train_dataloader)

        # ---- validation loss, no gradient tracking ----
        val_loss = 0.
        model.eval()
        with torch.no_grad():
            for images, labels in eval_dataloader:
                logits = model(images.shape[0], SIZE_PACK_IMAGES, images)
                loss = criterion(logits, labels)
                val_loss += loss.item() / len(eval_dataloader)

        # ---- accuracy on both splits, measured in eval mode ----
        current_accuracy_train = evaluate(model, criterion, train_dataloader, verbose).item()
        current_accuracy_val = evaluate(model, criterion, eval_dataloader, verbose).item()

        losses_train.append(train_loss)
        losses_val.append(val_loss)
        accuracy_train.append(current_accuracy_train)
        accuracy_val.append(current_accuracy_val)

        # Checkpoint on a strict improvement of the validation accuracy.
        if current_accuracy_val > best_val_accuracy:
            torch.save(model, 'CNN-RNN.pth')
            best_val_accuracy = current_accuracy_val

        print(f'Epoch {epoch + 1} : train_loss: {train_loss:.4f}, eval_loss: {val_loss:.4f}, train_accuracy: {accuracy_train[-1]:.4f}, val_accuracy: {accuracy_val[-1]:.4f}')

    history_loss = (losses_train, losses_val)
    history_accuracy = (accuracy_train, accuracy_val)
    return history_loss, history_accuracy, model

In [251]:
# Seed all RNGs before any randomly initialised layer is created, so the
# LSTM / linear weights and the later shuffle order are repeatable across
# runs (SEED and fix_seed were defined above but never used).
fix_seed(SEED)

model = VideoCNN(
    input_size=CNN_EMBEDDING_SIZE,
    hidden_size=RNN_HIDDEN_SIZE,
    num_layers=RNN_NUM_LAYERS,
    num_classes=RNN_CLASSES
).to(device)

# The model returns raw logits, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, betas=(0.9, 0.999), weight_decay=5e-4)

In [252]:
# Train for 10 epochs; `model` is rebound to the model object returned by train_model.
losses, accuracy, model = train_model(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    num_epoch=10,
)

Epoch 1 : train_loss: 0.6950, eval_loss: 0.6876, train_accuracy: 0.4000, val_accuracy: 0.6667
Epoch 2 : train_loss: 0.6963, eval_loss: 0.6878, train_accuracy: 0.3333, val_accuracy: 0.6667
Epoch 3 : train_loss: 0.6967, eval_loss: 0.6887, train_accuracy: 0.4667, val_accuracy: 0.5000
Epoch 4 : train_loss: 0.6903, eval_loss: 0.6908, train_accuracy: 0.8000, val_accuracy: 0.3333
Epoch 5 : train_loss: 0.6885, eval_loss: 0.6935, train_accuracy: 0.8000, val_accuracy: 0.1667
Epoch 6 : train_loss: 0.6784, eval_loss: 0.6964, train_accuracy: 0.8000, val_accuracy: 0.3333
Epoch 7 : train_loss: 0.6716, eval_loss: 0.6998, train_accuracy: 0.9333, val_accuracy: 0.3333
Epoch 8 : train_loss: 0.6632, eval_loss: 0.7036, train_accuracy: 1.0000, val_accuracy: 0.3333
Epoch 9 : train_loss: 0.6528, eval_loss: 0.7077, train_accuracy: 1.0000, val_accuracy: 0.3333
Epoch 10 : train_loss: 0.6363, eval_loss: 0.7127, train_accuracy: 1.0000, val_accuracy: 0.3333
