# Experiment

Here, I want to try to re-implement the whole WordDetectorNN in a single Jupyter Notebook to keep things simple. Let's see if I get that done :-D

In [None]:
# Reload imported modules automatically before each cell is executed, so that
# edits to `my_code` are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path

import torch
from torch.utils.data import Subset
import numpy as np
from torch.utils.data import DataLoader
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt

from my_code import IAM_Dataset
from my_code import ImageDimensions
from my_code import custom_collate_fn
from my_code import count_parameters
from my_code import fg_by_cc
from my_code import cluster_aabbs
from my_code import binary_classification_metrics
from my_code import draw_bboxes_on_image
from my_code import MapOrdering
from my_code import encode, decode, BoundingBox, ImageDimensions
from my_code import ModifiedResNet18
from my_code import WordDetectorNet
from my_code import compute_loss

## Architecture

Here, I note down how I build the project to remind myself and others in the future. Here we go:

```mermaid
graph TD
    A[IAM Folder] --> B[Train Dataset = Dtr]
    A --> C[Val Dataset = Dval]
    B --> D[Dtr w transform, img&aabb = Dtr']
    C --> E[Dval w transform, img&aabb = Dval']
    D --> F[Train DataLoader] 
    E --> G[Val DataLoader] 
    E --> H[no transform except normalisation]
    D --> I[geometric & photo]
```

- transform in Dataset
- encode in collate of DataLoader

## Encoding & Decoding

In [None]:
# Boxes used to round-trip through encode/decode. Only one box is active; the
# two commented-out entries can be re-enabled to try more boxes.
aabbs = [
    # BoundingBox( 0,  0, 10, 10, label='word1'),
    # BoundingBox( 5,  5, 15, 15, label='word2'),
    BoundingBox(20, 20, 30, 30, label='word3'),
]

In [None]:
# Sizes handed to `encode`: the output size is half the input size in both
# dimensions (presumably input image vs. encoded map — confirm in `encode`).
input_size = ImageDimensions(width=100, height=100)
output_size = ImageDimensions(width=50, height=50)

In [None]:
# Encode the boxes; the result is indexed by `MapOrdering` channels in the
# plotting cell below.
aabbs_encoded = encode(input_size, output_size, aabbs)

In [None]:
# Show the `SEG_WORD` channel of the encoded result as a grayscale image.
plt.figure()
seg_word_map = aabbs_encoded[MapOrdering.SEG_WORD]
plt.imshow(seg_word_map, cmap='gray')
plt.show()

In [None]:
# `decode` below is given a single scale factor, so width and height must be
# downscaled by the same ratio. Cross-multiplying compares the two ratios
# exactly instead of testing two float quotients for equality (and cannot
# divide by zero).
assert (
    input_size.width * output_size.height
    == input_size.height * output_size.width
), "input_size and output_size must have the same aspect ratio"

In [None]:
# Decode the encoded result back into boxes. `scale` is the input/output width
# ratio (presumably used to map output coordinates back to input
# coordinates — confirm in `decode`).
decoded_aabbs = decode(aabbs_encoded, scale=input_size.width / output_size.width)

In [None]:
# Display the decoded boxes for a manual comparison with `aabbs`.
decoded_aabbs

## Dataset

First, create the dataset:

In [None]:
# Experiment w/ dataset class
# NOTE: the data location is hard-coded relative to the user's home directory.
data_path = Path.home() / 'Development/WordDetectorNN/data/train'
dataset = IAM_Dataset(
    root_dir=data_path,
    # Alternative input size kept for reference:
    # input_size=ImageDimensions(width=640, height=448),
    input_size=ImageDimensions(width=400, height=600),
    # Output size is half the input size in both dimensions.
    output_size=ImageDimensions(width=200, height=300),
    # NOTE(review): presumably regenerates the dataset cache on every run of
    # this cell — confirm in IAM_Dataset and switch off if it is slow.
    force_rebuild_cache=True,
    transform=None,
)

Next, access an element:

In [None]:
# Index of the sample to inspect (other indices tried before: 578 and 0).
idx = 325
sample = dataset[idx]

Then, plot a sample:

In [None]:
# Write sample `idx` to `test.png` (relative to the current working directory).
# NOTE(review): the two flags presumably add the bounding boxes and the encoded
# ground truth to the stored output — confirm in IAM_Dataset.
dataset.store_element_as_image(idx, Path('test.png'), draw_bboxes=True, store_gt_encoded=True)

Next, let's split the dataset into training and val datasets:

In [None]:
# This way to create the train and val datasets seems convoluted but is necessary to ensure
# that train and val get only their transforms. I know that it could be implemented more efficiently
# but that's not necessary given the small dataset.
#
# An alternative way to implement it is to build a TransformSubset which not only Subset's but also
# applies a separate transform.
#
# Note that it is not a good idea to hardcode these transforms b/c one might want to use the plain dataset,
# even if only for inspection


def _make_iam_dataset(transform):
    """Build an `IAM_Dataset` over `data_path` with the settings shared by the split.

    Only `transform` differs between the train and val datasets; every other
    argument is fixed here so the two instances cannot drift apart.
    """
    return IAM_Dataset(
        root_dir=data_path,
        # input_size=ImageDimensions(width=640, height=448),
        input_size=ImageDimensions(width=400, height=600),
        output_size=ImageDimensions(width=200, height=300),
        force_rebuild_cache=False,
        transform=transform,
    )


# Create datasets with different transforms
train_transform = None
val_transform = None
# TODO: ^ Implement the augmentations, w/ each changing at every batch

train_dataset = _make_iam_dataset(train_transform)
val_dataset = _make_iam_dataset(val_transform)

percent_train_data = 80

# The same index list is used to slice both datasets, so they must be equally long.
assert len(train_dataset) == len(val_dataset), \
    "train and val datasets must have the same length"

# Shuffle the indices reproducibly, then cut them into a train and a val part.
# NOTE: `np.random.seed` reseeds NumPy's global random state.
indices = list(range(len(train_dataset)))
np.random.seed(42)
np.random.shuffle(indices)
split = int(percent_train_data / 100 * len(indices))

train_indices = indices[:split]
val_indices = indices[split:]

train_subset = Subset(train_dataset, train_indices)
val_subset = Subset(val_dataset, val_indices)

# NOTE: this loads every sample of both subsets once just to read its filename.
train_filenames = [sample['filename'] for sample in train_subset]
val_filenames = [sample['filename'] for sample in val_subset]
# Check that no train samples are in val
assert len(set(train_filenames + val_filenames)) == len(train_filenames) + len(val_filenames), \
    "a filename occurs more than once across the train and val subsets"

# Every sample must end up in exactly one subset. Compared against
# `train_dataset` so this cell does not depend on the separate `dataset`
# variable, which other cells rebind.
assert len(train_dataset) == len(train_subset) + len(val_subset), \
    "train and val subsets do not cover the whole dataset"

## Dataloader

In [None]:
# Settings passed to the DataLoaders created in the next cell.
shuffle_data_loader = True
batch_size = 32
num_workers = 1  # number of worker subprocesses used for loading

In [None]:
# Training loader: sample order is reshuffled every epoch when
# `shuffle_data_loader` is set.
dataloader_train = DataLoader(
    train_subset,
    batch_size=batch_size,
    shuffle=shuffle_data_loader,
    num_workers=num_workers,
    collate_fn=custom_collate_fn,  # or custom_collate_fn_with_padding
    pin_memory=True  # For faster GPU transfer
)

# Validation loader: never shuffled, so validation batches (and anything
# computed or plotted per batch) come in the same order on every pass.
dataloader_val = DataLoader(
    val_subset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    collate_fn=custom_collate_fn,  # or custom_collate_fn_with_padding
    pin_memory=True  # For faster GPU transfer
)

Check lengths:

In [None]:
# Number of batches, number of samples, and samples / batch size. `drop_last`
# is not set, so the batch count should be the ratio rounded up.
len(dataloader_train), len(train_subset), len(train_subset) / batch_size

In [None]:
# Number of batches, number of samples, and samples / batch size. `drop_last`
# is not set, so the batch count should be the ratio rounded up.
len(dataloader_val), len(val_subset), len(val_subset) / batch_size

Load a single batch for testing & inspect it:

In [None]:
# Fetch the first batch of each loader. Each call creates a fresh iterator.
batch_train = next(iter(dataloader_train))
batch_val = next(iter(dataloader_val))

In [None]:
# Compare the keys produced by the collate function for train and val.
batch_train.keys(), batch_val.keys()

In [None]:
# Shapes of the batched images for train and val.
batch_train['images'].shape, batch_val['images'].shape

In [None]:
# Shapes of the batched encoded ground truth for train and val.
batch_train['gt_encoded'].shape, batch_val['gt_encoded'].shape

In [None]:
# Number of `bounding_boxes` entries per batch (presumably one entry per
# sample — confirm in `custom_collate_fn`).
len(batch_train['bounding_boxes']), len(batch_val['bounding_boxes'])

Iterate through whole dataloader once:

In [None]:
# Smoke test: pull every training batch once to check that loading and
# collation work for the whole subset. The batches themselves are discarded.
for batch in dataloader_train:
    pass

In [None]:
# Smoke test: pull every validation batch once to check that loading and
# collation work for the whole subset. The batches themselves are discarded.
for batch in dataloader_val:
    pass

## Neural network

Try it out:

In [None]:
# Smoke-test the backbone on one random single-channel input.
backbone = ModifiedResNet18()

# Input height/width under test (previously tried: 400x500 and 448x448).
H, W = 600, 600
test_input = torch.randn(1, 1, H, W)

output = backbone(test_input)
# Unpacking doubles as a check that the backbone returns exactly five outputs.
out5, out4, out3, out2, out1 = output

print("Print output sizes:")
for o in output:
    print("\t", o.shape)

# Report the parameter counts of the backbone alone.
nr_params = count_parameters(backbone)
print(f"Total params: {nr_params['total_params']}")
print(f"Trainable params: {nr_params['trainable_params']}")

Now off to the `WordDetectorNet` (for now just copied from an external repo):

Now test it:

In [None]:
# Smoke-test the full network on one random single-channel input of the
# network's own input size.
net = WordDetectorNet()

# NOTE(review): this assumes `net.input_size` unpacks as (height, width) —
# confirm, since `ImageDimensions` is built with `width` first elsewhere.
H, W = net.input_size
test_input = torch.randn(1, 1, H, W)

output = net(test_input)

print("Print output sizes:", output.shape)

# Report the parameter counts of the whole network.
nr_params = count_parameters(net)
print(f"Total params: {nr_params['total_params']}")
print(f"Trainable params: {nr_params['trainable_params']}")

Test neural network with dataloader item:

In [None]:
# Take one batch from the training loader to feed through the network.
batch_item = next(iter(dataloader_train))

In [None]:
# DataLoader settings for this experiment.
shuffle_data_loader = True
batch_size = 32
num_workers = 1

# Un-split dataset with a square 448x448 input and 224x224 output size.
# NOTE(review): `force_rebuild_cache=True` is presumably needed because the
# sizes differ from the datasets created earlier — confirm in IAM_Dataset.
transform = None
dataset = IAM_Dataset(
    root_dir=data_path,
    input_size=ImageDimensions(width=448, height=448),
    output_size=ImageDimensions(width=224, height=224),
    force_rebuild_cache=True,
    transform=transform,
)

dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=shuffle_data_loader,
    num_workers=num_workers,
    collate_fn=custom_collate_fn,  # or custom_collate_fn_with_padding
    pin_memory=True  # For faster GPU transfer
)

In [None]:
# Take one batch from the 448x448 loader created above.
batch_item = next(iter(dataloader))

In [None]:
# Forward one real batch through a freshly constructed network, without moving
# anything to the GPU.
net = WordDetectorNet()

output = net(batch_item['images'])

# Print the network output shape next to the ground-truth shape for comparison.
print("Print output sizes:", output.shape)
print("`gt_encoded` shape:", batch_item['gt_encoded'].shape)


It turns out that we need that `1` dimension in the input because the backbone's convolution expects it.

Next, confirm that the above forward pass also works on GPU:

In [None]:
# Same forward pass as before, but with the network and the images on the GPU.
# This cell requires a CUDA device.
net = WordDetectorNet().to('cuda')

images = batch_item['images'].to('cuda')

output = net(images)

# Print the network output shape next to the ground-truth shape for comparison.
print("Print output sizes:", output.shape)
print("`gt_encoded` shape:", batch_item['gt_encoded'].shape)


Yes, this seems to work, great! Interestingly, it is much faster than on CPU: 14.0s (CPU) vs 0.1s (GPU).

## Loss

For now, the loss is just copied over to make it work first; I will (maybe) improve the implementation later:

In [None]:
# Compute the loss between the network output and the encoded ground truth,
# with both moved to the GPU first.
# When the cells are run in order, `output` comes from the GPU forward pass
# above and is already on the GPU.
y = output.to('cuda')
gt_map = batch_item['gt_encoded'].to('cuda')
# NOTE(review): `l` is easy to misread as `1` or `I`; consider renaming it to
# `loss` once it is clear that nothing else relies on this name.
l = compute_loss(y, gt_map)

OK, this seems to work.