# Find Pikachu in images

In [None]:
from argparse import Namespace

import json

from datasets import Dataset
from datasets import Image

from PIL import ImageDraw
import PIL

import torch
from torchvision import transforms
from torch.optim.lr_scheduler import ExponentialLR

import wandb

import pandas as pd
import numpy as np

# from sklearn.model_selection import StratifiedKFold

In [None]:
from accelerate import Accelerator
from accelerate.utils import GradientAccumulationPlugin

In [None]:
from accelerate.utils import set_seed
from accelerate.utils import write_basic_config

# Ask Accelerate to write its basic configuration before any Accelerator is
# created further down in this notebook.
# NOTE(review): presumably this writes to Accelerate's default config
# location and leaves an existing config untouched -- confirm against the
# installed accelerate version.
write_basic_config()

In [None]:
# Pick the best available backend: CUDA first, then Apple MPS, else CPU.
# The MPS probe only runs when CUDA is unavailable.
DEVICE = torch.device(
    'cuda' if torch.cuda.is_available()
    else ('mps' if torch.backends.mps.is_available() else 'cpu')
)
# DEVICE = 'cpu'

# Run-wide settings. NOTE(review): `beta_schedule`, `lr_warmup_steps` and
# `train_limit` are not read by any live code visible in this notebook.
CONFIG = Namespace(
    # Run / artifact naming
    run_name='pikachu-detector',
    model_name='pikachu-detector-baseline-model',
    # Model input size and width
    image_size=256,
    hidden_dims=256,
    # Training-time augmentation
    horizontal_flip_prob=0.5,
    gaussian_blur_kernel_size=3,
    # Batching and schedule
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=15,
    learning_rate=4e-4,
    seed=1,
    beta_schedule='squaredcos_cap_v2',
    lr_exp_schedule_gamma=0.85,
    lr_warmup_steps=500,
    train_limit=-1,
    # Output / Accelerate options
    save_model=True,
    mixed_precision=None,
    grad_accumulation_steps=4,
)
# Attached after construction so `device` stays the last attribute.
CONFIG.device = DEVICE

In [None]:
def build_dataset():
    """
    Build the HuggingFace dataset from the annotation file.

    Reads ``archive/annotations.json``, prefixes every image name with
    ``archive/images/``, splits ``loc`` into ``x_1``/``y_1``/``x_2``/``y_2``
    columns and derives an integer ``orientation`` column from the signs of
    ``x_2 - x_1`` and ``y_2 - y_1``.

    Returns:
        A ``(dataframe, dataset)`` tuple: the intermediate pandas frame and
        the ``Dataset`` built from it, with the ``image`` column cast to an
        ``Image`` feature.
    """

    def orientation_of(delta_x, delta_y):
        # 0: both positive, 1: x negative / y positive, 2: both negative,
        # 3: x positive / y negative, -1: any delta is zero (or not comparable).
        if delta_x > 0 and delta_y > 0:
            return 0
        if delta_x < 0 and delta_y > 0:
            return 1
        if delta_x < 0 and delta_y < 0:
            return 2
        if delta_x > 0 and delta_y < 0:
            return 3
        return -1

    with open('archive/annotations.json', 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # The JSON is keyed by record; transpose so that each record is a row.
    dataframe = pd.DataFrame.from_dict(json_data).T.reset_index()
    dataframe.image = dataframe.image.map(lambda x: f"archive/images/{x}")
    dataframe = dataframe.drop(['id', 'index'], axis=1)

    # Unpack the four box coordinates stored in `loc`.
    for position, column in enumerate(('x_1', 'y_1', 'x_2', 'y_2')):
        dataframe[column] = dataframe['loc'].map(
            lambda loc, position=position: loc[position])

    dataframe['orientation_x'] = dataframe.x_2 - dataframe.x_1
    dataframe['orientation_y'] = dataframe.y_2 - dataframe.y_1

    # Orientation of the Pikachu box: which way corner 2 lies from corner 1.
    dataframe['orientation'] = [
        orientation_of(delta_x, delta_y)
        for delta_x, delta_y in zip(dataframe['orientation_x'].values,
                                    dataframe['orientation_y'].values)]

    dataset = Dataset.from_pandas(dataframe).cast_column('image', Image())
    # The first element used to be an undefined `_` (a NameError outside
    # IPython); callers that unpack `_, dataset = build_dataset()` still work.
    return dataframe, dataset

def prepare_dataloader(config: Namespace):
    """
    Build the training and validation dataloaders.

    The dataset from ``build_dataset`` is filtered to images larger than
    100x100, split 80/20 into train/validation with a generator seeded from
    ``config.seed``, and given on-the-fly transforms. Each transformed batch
    holds ``image`` (model input), ``bounding_box`` (stacked ``loc`` values)
    and ``original-image`` (a 512x512, un-normalised copy for W&B).

    Training images are resized, randomly mirrored, blurred and normalised.
    The mirror is applied to the image, the visualisation copy and the
    bounding box together so the label keeps matching the pixels.

    Args:
        config: reads ``image_size``, ``horizontal_flip_prob``,
            ``gaussian_blur_kernel_size``, ``seed``,
            ``per_device_train_batch_size`` and ``per_device_eval_batch_size``.

    Returns:
        ``(train_dataloader, val_dataloader)``.
    """

    target_size = (config.image_size, config.image_size)
    resize = transforms.Resize(target_size)

    # Training steps that run after the (optional) horizontal flip.
    train_postprocess = transforms.Compose(
        [
            transforms.GaussianBlur(kernel_size=config.gaussian_blur_kernel_size),
            transforms.ToTensor(),  # Convert to tensor (0, 1)
            transforms.Normalize([0.5], [0.5]),  # Map to (-1, 1)
        ])

    val_preprocess = transforms.Compose(
        [
            transforms.Resize(target_size),  # Resize
            transforms.ToTensor(),  # Convert to tensor (0, 1)
            transforms.Normalize([0.5], [0.5]),  # Map to (-1, 1)
        ])

    # For pre-processing the original image for visualization in W&B
    preprocess_original = transforms.Compose(
        [
            transforms.Resize((512, 512)),  # Resize
            transforms.ToTensor(),  # Convert to tensor (0, 1)
        ])

    _, dataset = build_dataset()

    # Remove images that are 100x100 or below.
    dataset = dataset.filter(
        lambda example: example['image'].size[0] > 100 and example['image'].size[1] > 100)

    def mirror_box(box):
        # assumes loc is (x1, y1, x2, y2) with x normalised by image width,
        # as add_bounding_box_to_image scales it by the image size -- TODO confirm
        mirrored = box.clone()
        mirrored[0] = 1.0 - box[0]
        mirrored[2] = 1.0 - box[2]
        return mirrored

    def train_transform(examples):
        images = []
        original_images = []
        bounding_boxes = []
        for image, loc in zip(examples['image'], examples['loc']):
            rgb_image = image.convert('RGB')
            model_image = resize(rgb_image)
            box = torch.tensor(loc)

            # Same draw as RandomHorizontalFlip, but shared with the label.
            if torch.rand(1).item() < config.horizontal_flip_prob:
                model_image = transforms.functional.hflip(model_image)
                rgb_image = transforms.functional.hflip(rgb_image)
                box = mirror_box(box)

            images.append(train_postprocess(model_image))
            original_images.append(preprocess_original(rgb_image))
            bounding_boxes.append(box)

        return {'image': images,
                'bounding_box': torch.stack(bounding_boxes, dim=0),
                'original-image': original_images
                }

    def val_transform(examples):
        rgb_images = [image.convert('RGB') for image in examples['image']]
        bounding_boxes = torch.stack(
            [torch.tensor(loc) for loc in examples['loc']], dim=0)
        return {'image': [val_preprocess(image) for image in rgb_images],
                'bounding_box': bounding_boxes,
                'original-image': [preprocess_original(image) for image in rgb_images]
                }

    # Plain random 80/20 split; no stratification, since the target is a
    # regression over box coordinates.
    np_generator = np.random.default_rng(config.seed)
    train_val_dataset = dataset.train_test_split(test_size=0.2, shuffle=True,
                                                 generator=np_generator)
    train_dataset = train_val_dataset['train']
    val_dataset = train_val_dataset['test']

    train_dataset.set_transform(train_transform)
    val_dataset.set_transform(val_transform)

    train_gen = torch.Generator().manual_seed(config.seed)
    val_gen = torch.Generator().manual_seed(config.seed)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.per_device_train_batch_size,
        shuffle=True, generator=train_gen)

    val_dataloader = torch.utils.data.DataLoader(
        val_dataset, batch_size=config.per_device_eval_batch_size,
        shuffle=False, generator=val_gen)

    return train_dataloader, val_dataloader

In [None]:
# What types of models are used for detection?
# Input: image
# Output: (x, y), length, width OR the diagonal corners of the rectangular window: (x1, y1), (x2, y2)

import torch
import numpy as np

class PikachuDetector(torch.nn.Module):
    """
    Convolutional bounding-box regressor.

    Four convolution + max-pool stages are followed by a flatten, a lazily
    sized linear projection to ``4*dims`` features and a linear head that
    emits four values per image: (x1, y1, x2, y2).
    """

    def __init__(self, in_channels: int, dims: int):
        """
        Args:
            in_channels: number of channels of the input images.
            dims: width of the first convolution; later stages use ``2*dims``
                channels and the projection uses ``4*dims`` features.
        """
        super().__init__()

        nn = torch.nn
        wide = 2 * dims
        projected = 4 * dims

        # Stage 1: large receptive field, aggressive pooling
        self.conv_1 = nn.Conv2d(in_channels, dims, kernel_size=12)
        self.max_pool_1 = nn.MaxPool2d(kernel_size=3)

        # Stage 2
        self.conv_2 = nn.Conv2d(dims, wide, kernel_size=5)
        self.max_pool_2 = nn.MaxPool2d(kernel_size=3)

        # Stage 3
        self.conv_3 = nn.Conv2d(wide, wide, kernel_size=3)
        self.max_pool_3 = nn.MaxPool2d(kernel_size=2)

        # Stage 4
        self.conv_4 = nn.Conv2d(wide, wide, kernel_size=3)
        self.max_pool_4 = nn.MaxPool2d(kernel_size=2)

        self.flatten = nn.Flatten()
        # Input width is inferred on the first forward pass.
        self.projection = nn.LazyLinear(projected)

        # Two box corners: (x1, y1) and (x2, y2)
        self.regression_head = nn.Linear(projected, 4)
        # Kept as an attribute; forward() does not apply it.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x: torch.Tensor):
        """
        Run the network on a batch of images and return the raw regression
        output, one row of four values per image.
        """

        stages = (
            (self.conv_1, self.max_pool_1),
            (self.conv_2, self.max_pool_2),
            (self.conv_3, self.max_pool_3),
            (self.conv_4, self.max_pool_4),
        )

        features = x
        for convolution, pooling in stages:
            features = pooling(convolution(features))

        embedding = self.projection(self.flatten(features))
        return self.regression_head(embedding)

def create_model(in_dimensions: int, dims: int):
    """
    Instantiate a PikachuDetector taking ``in_dimensions`` input channels
    with base width ``dims``.
    """

    return PikachuDetector(in_dimensions, dims)

In [None]:
import PIL.Image


def compute_loss(preds: torch.Tensor, gts: torch.Tensor):
    """
    Return the mean squared error between predicted and ground-truth
    bounding boxes, averaged over every element.
    """

    return torch.nn.functional.mse_loss(preds, gts, reduction='mean')

def add_bounding_box_to_image(img: PIL.Image.Image, bounding_box: torch.Tensor,
                              img_height: int, img_width: int):
    """
    Draw a bounding box on an image and return that image.

    Args:
        img: image to draw on; it is modified in place.
        bounding_box: four values ``(x1, y1, x2, y2)``. Each is scaled by the
            matching image dimension, so they are presumably fractions of the
            image size -- TODO confirm against the annotation format.
        img_height: height used to scale the y coordinates.
        img_width: width used to scale the x coordinates.

    Returns:
        ``img`` with the rectangle outline drawn on it.
    """

    # x runs along the width and y along the height (the original scaled
    # them the other way round, which only worked for square images).
    # float() also turns 0-dim tensors into plain numbers for PIL.
    x_1 = float(bounding_box[0]) * img_width
    y_1 = float(bounding_box[1]) * img_height
    x_2 = float(bounding_box[2]) * img_width
    y_2 = float(bounding_box[3]) * img_height

    # The two corners may come in any order; PIL wants top-left first.
    left, right = min(x_1, x_2), max(x_1, x_2)
    top, bottom = min(y_1, y_2), max(y_1, y_2)

    ImageDraw.Draw(img).rectangle(((left, top), (right, bottom)))

    return img

@torch.no_grad()
def eval_loop(epoch: int, model, dataloader,
              wandb_run, accelerator: Accelerator):
    """
    Evaluate the model on a dataloader and log the results.

    For every batch the MSE loss is accumulated and each example is rendered
    three ways: the model input, the original image with the ground-truth box
    and the original image with the predicted box. The average loss and a
    table of all examples are logged to W&B.

    Args:
        epoch: epoch number stored in every table row.
        model: network to evaluate; it runs in eval mode here and its
            previous train/eval mode is restored afterwards.
        dataloader: yields batches with ``image``, ``bounding_box`` and
            ``original-image`` entries.
        wandb_run: W&B run to log to, or ``None`` to skip logging.
        accelerator: used for printing; ``None`` falls back to ``print``.
    """

    emit = print if accelerator is None else accelerator.print
    tensor_to_pil = transforms.ToPILImage()

    columns = ['epoch', 'img_gt_bounding_box', 'img_pred_bounding_box',
               'model_image',  'pred_x1', 'pred_y1', 'pred_x2', 'pred_y2',
               'gt_x1', 'gt_y1', 'gt_x2', 'gt_y2']

    batch_frames = []
    total_loss = 0.0

    was_training = model.training
    model.eval()
    try:
        for batch in dataloader:
            pred_bounding_box = model(batch['image'])
            gt_bounding_box = batch['bounding_box']

            total_loss += compute_loss(pred_bounding_box, gt_bounding_box).item()

            # Model inputs were normalised to (-1, 1) by Normalize([0.5], [0.5]);
            # map them back to (0, 1) so ToPILImage renders real colours.
            model_inputs = (batch['image'] * 0.5 + 0.5).clamp(0.0, 1.0)

            model_images = []
            gt_box_images = []
            pred_box_images = []
            for model_input, original, gt_box, pred_box in zip(
                    model_inputs, batch['original-image'],
                    gt_bounding_box, pred_bounding_box):
                model_images.append(wandb.Image(tensor_to_pil(model_input)))

                # Boxes are drawn in place, so each overlay gets its own copy.
                gt_canvas = tensor_to_pil(original)
                pred_canvas = tensor_to_pil(original)

                gt_box_images.append(wandb.Image(add_bounding_box_to_image(
                    gt_canvas, gt_box, gt_canvas.height, gt_canvas.width)))
                pred_box_images.append(wandb.Image(add_bounding_box_to_image(
                    pred_canvas, pred_box, pred_canvas.height, pred_canvas.width)))

            batch_frames.append(pd.DataFrame(
                {
                    'epoch': [epoch] * len(model_images),
                    'img_gt_bounding_box': gt_box_images,
                    'img_pred_bounding_box': pred_box_images,
                    'model_image': model_images,
                    'pred_x1': pred_bounding_box[:, 0].tolist(),
                    'pred_y1': pred_bounding_box[:, 1].tolist(),
                    'pred_x2': pred_bounding_box[:, 2].tolist(),
                    'pred_y2': pred_bounding_box[:, 3].tolist(),
                    'gt_x1': gt_bounding_box[:, 0].tolist(),
                    'gt_y1': gt_bounding_box[:, 1].tolist(),
                    'gt_x2': gt_bounding_box[:, 2].tolist(),
                    'gt_y2': gt_bounding_box[:, 3].tolist(),
                },
                columns=columns))
    finally:
        model.train(was_training)

    if not batch_frames:
        # Nothing to average or tabulate.
        emit("Val MSE loss: n/a (empty dataloader)")
        return

    dataframe = pd.concat(batch_frames, axis=0, ignore_index=True)
    avg_loss = total_loss / len(batch_frames)

    emit(
        f"Val MSE loss: {avg_loss}")

    if wandb_run is not None:
        table = wandb.Table(dataframe=dataframe)
        # Log the epoch average, not the loss tensor of the last batch.
        wandb_run.log({'val-mse-loss': avg_loss}, commit=False)
        wandb_run.log({'eval-table': table})

def training_loop(config: Namespace):
    """
    Train the Pikachu detector and log everything to W&B.

    Sets the seed, builds the dataloaders, model, AdamW optimizer and
    exponential LR schedule, hands them to Accelerate, then for every epoch
    trains with gradient accumulation, steps the scheduler once and runs
    ``eval_loop`` on the validation set. If ``config.save_model`` is set, the
    final weights are written to ``model.pt`` and logged as a W&B artifact.

    Args:
        config: reads ``run_name``, ``model_name``, ``seed``,
            ``grad_accumulation_steps``, ``mixed_precision``, ``device``,
            ``hidden_dims``, ``learning_rate``, ``lr_exp_schedule_gamma``,
            ``num_train_epochs`` and ``save_model``, plus whatever
            ``prepare_dataloader`` reads.
    """

    # As a context manager the run is closed even if training raises.
    with wandb.init(project='Pikachu-Detector', entity=None,
                    job_type='training',
                    name=config.run_name,
                    config=config) as wandb_run:

        set_seed(config.seed)

        grad_accumulation_plugin = GradientAccumulationPlugin(
            num_steps=config.grad_accumulation_steps,
            adjust_scheduler=True,
            sync_with_dataloader=True)

        # config.device may be a torch.device or a string; normalise before
        # comparing, since a torch.device compared to a str is not reliable.
        force_cpu = torch.device(config.device).type == 'cpu'

        accelerator = Accelerator(
            mixed_precision=config.mixed_precision,
            gradient_accumulation_plugin=grad_accumulation_plugin,
            cpu=force_cpu)

        train_dataloader, val_dataloader = prepare_dataloader(config)
        model = create_model(3, config.hidden_dims)

        optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
        scheduler = ExponentialLR(
            optimizer,
            config.lr_exp_schedule_gamma)

        model, optimizer, train_dataloader, val_dataloader, scheduler = accelerator.prepare(
            model, optimizer, train_dataloader, val_dataloader, scheduler)

        num_steps = 0
        for epoch in range(config.num_train_epochs):
            model.train()

            accelerator.print(f"Epoch {epoch}")

            epoch_loss = 0.0
            num_iters = 0

            for batch in train_dataloader:
                with accelerator.accumulate(model):
                    pred_bounding_box = model(batch['image'])
                    gt_bounding_box = batch['bounding_box']

                    loss = compute_loss(pred_bounding_box, gt_bounding_box)

                    accelerator.backward(loss)

                    # Clip only when the accumulated gradient is complete.
                    if accelerator.sync_gradients:
                        accelerator.clip_grad_norm_(model.parameters(), 1.0)

                    # Update the model parameters with the optimizer
                    optimizer.step()
                    # Zero AFTER the step: zeroing at the top of the block
                    # clears the accumulated gradients on the sync step.
                    optimizer.zero_grad()

                loss_value = loss.item()
                epoch_loss += loss_value

                wandb_run.log(
                    {'loss': loss_value, 'lr': scheduler.get_last_lr()[0]},
                    commit=False, step=num_steps)

                num_steps += 1
                num_iters += 1

            # One LR decay per epoch.
            scheduler.step()

            # Queue the epoch loss so the commit inside eval_loop publishes
            # it. NOTE(review): a separate committed log here appears to push
            # W&B's internal step past `num_steps`, so the next explicit
            # `step=` would be rejected -- confirm against the wandb version.
            wandb_run.log({'epoch-loss': epoch_loss / max(num_iters, 1)},
                          commit=False)

            # Validate model
            accelerator.print("Evaluating model")
            eval_loop(epoch, model, val_dataloader, wandb_run, accelerator)

        if config.save_model:
            # Save model to W&B
            model_art = wandb.Artifact(config.model_name, type='model')
            # Save the underlying module so the keys carry no wrapper prefix.
            torch.save(accelerator.unwrap_model(model).state_dict(), 'model.pt')

            model_art.add_file('model.pt')
            wandb_run.log_artifact(model_art)

In [None]:
# # For debugging
# MODEL = create_model(3, CONFIG.hidden_dims)
# train_dataloader, val_dataloader = prepare_dataloader(CONFIG)
# eval_loop(0, MODEL, val_dataloader, None, None)

In [None]:
from accelerate import notebook_launcher

# Start training from the notebook: calls training_loop(CONFIG) through
# Accelerate's notebook launcher with a single process.
notebook_launcher(training_loop, (CONFIG, ), num_processes=1)