In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from icevision import show_records
from icevision.imports import *
from icevision.utils import *
from icevision.data import *
from icevision.metrics.metric import *
from icevision.models.utils import get_dataloaders
from fastai.callback.tracker import SaveModelCallback
from fastai.callback.tensorboard import TensorBoardCallback
from icevision.metrics import SimpleConfusionMatrix # make sure you have the SkyTruth fork of icevision installed
import torch
torch.cuda.empty_cache()

In [None]:
from ceruleanml.learner_config import (
    run_list,
    classes_to_keep,
    model_type,
    model,
    get_tfms,
    wd,
    record_collection_train,
    record_collection_val,
    record_collection_test,
    model_name,
    num_workers,
)
run_list

# Training Parameters

In [None]:
# Per-image-size training hyperparameters. Keys are the image sizes that appear in `run_list`.
bs_d = {
    512: 16,
    256: 16,
    224: 16,
    128: 32,
    64: 64,
}  # batch size for each image size
lr_d = dict.fromkeys((512, 256, 224, 128, 64), 1e-3)  # learning rate for each image size
# Presumably minutes per training epoch at each image size (TODO confirm): the training
# loop divides a stage's time budget by this value to derive its epoch count.
mins_d = {
    512: 3.5,
    256: 2,
    224: 1.5,
    128: 1.5,
    64: 1.2,
}

### Important! 

Make sure you have copied the dataset to the local SSD of the VM at /root. Loading the data from a GCP bucket takes a full 2 minutes compared to 17 seconds when data is on the SSD.

You can run the following for example to copy a dataset from the bucket to the vm quickly.

In [None]:
# # looking at area distribution to find area threshold
# from ceruleanml import preprocess
# from ceruleanml.learner_config import (
#     coco_json_path_train,
#     tiled_images_folder_train,
# )
# print(coco_json_path_train, tiled_images_folder_train)
# df = preprocess.get_area_df(coco_json_path_train, tiled_images_folder_train, class_names=classes_to_keep)
# df['area']= df['area'].astype(float)
# df.plot.hist(bins=100)

The `show_records` function used below is adapted in the SkyTruth icevision fork so that `draw_sample` shows only the first channel of the three-channel dataset.

In [None]:
# # Make sure the records look reasonable
# show_records(record_collection_train[0:2], ncols=1, class_map=classes_to_keep, display_mask=True, display_bbox=False)

In [None]:
# # Confirm transforms are working for training data
# import skimage.io as skio
# import cv2
# from random import randint
# train_tfms, _ = get_tfms()
# train_ds = Dataset(record_collection_train, train_tfms)
# j = randint(0,len(record_collection_train)-1)
# test_record = train_ds[j]

# # show the memtile first
# skio.imshow_collection([cv2.imread(str(record_collection_train[j].common.filepath))[:,:,2], record_collection_train[j].detection.masks[0].to_mask(record_collection_train[j].common.img_size[1],record_collection_train[j].common.img_size[0]).data[0,:,:]], interpolation="nearest")

# # then show the rrctiles
# skio.imshow_collection([train_ds[j].img[:,:,0] for _ in range(4)], interpolation="nearest")
# skio.imshow_collection([test_record.img[:,:,0]]+[test_record.detection.mask_array[i].data[0,:,:] for i, _ in enumerate(test_record.detection.labels)], interpolation="nearest")
# print(test_record.detection.labels)

sourced from: https://airctic.com/0.8.1/getting_started_instance_segmentation/

In [None]:
# Build the initial dataloaders for the first stage listed in `run_list`, so the batch size,
# the transforms and the worker count all agree with what the training loop uses later.
initial_size = run_list[0][0]
_, dls = get_dataloaders(
    model_type,
    [record_collection_train, record_collection_val],
    get_tfms(reduced_resolution_tile_size=initial_size),
    batch_size=bs_d[initial_size],
    num_workers=num_workers,
)

# Single-element list so the confusion matrix can be read back later via `metrics[0]`.
metrics = [SimpleConfusionMatrix(print_summary=True)]

# The SaveModelCallback is configured with min_delta=.01; see the fastai tracker docs for
# which quantity it monitors and the checkpoint name it writes by default.
learner = model_type.fastai.learner(
    dls=dls,
    model=model,
    cbs=[SaveModelCallback(min_delta=.01)],
    metrics=metrics,
    wd=wd,
)

One training epoch takes about 4 minutes. One validation epoch of 76 samples takes about a minute.

In [None]:
# learner.lr_find()

## Progressive Resizing

When the `SaveModelCallback` is in use, we save the model first; otherwise training fails with an error saying that `model.pth` does not exist.

In [None]:
# Echo the active class list and the checkpoint name, one per line, as a sanity check.
print(classes_to_keep, model_name, sep="\n")

In [None]:
# Choose where the learner's weights come from before training:
#   start_new=True                          -> keep the freshly constructed weights
#   start_new=False, load_model_name=True   -> load the checkpoint named `model_name`
#   both False                              -> reload the "model" checkpoint
start_new = False
load_model_name = True

if start_new:
    print("Starting from scratch")
elif load_model_name:
    print(f"Loading {model_name}")
    learner.load(model_name)
else:
    print("Continuing current training session")
    learner.load("model")

# Write the "model" checkpoint exactly once, whichever branch ran, so that it exists
# before training starts (the notebook notes an error when model.pth is missing).
learner.save("model")

In [None]:
# Imports are grouped at the top of the cell so that a missing dependency is reported
# before the (long) training loop runs rather than after it.
from datetime import datetime

from ceruleanml.inference import save_icevision_model_state_dict_and_tracing, load_tracing_model, test_tracing_model_one_batch, logits_to_classes
from icevision.engines.fastai import convert_dataloaders_to_fastai

# Epochs trained so far, keyed by image size.
running_total_epochs = {}

# Progressive resizing: each (size, total_train_time) stage rebuilds the dataloaders for
# that image size and continues training the same learner.
for size, total_train_time in run_list:
    # Convert the stage's time budget into an epoch count, always training at least once.
    epochs = max(int(total_train_time / mins_d[size]), 1)

    _, dls = get_dataloaders(
        model_type,
        [record_collection_train, record_collection_val],
        get_tfms(reduced_resolution_tile_size=size),
        batch_size=bs_d[size],
        num_workers=num_workers,
    )
    learner.dls = convert_dataloaders_to_fastai(dls=dls)
    print(f"Training time is: {total_train_time} minutes")
    print("starting from running total", running_total_epochs)
    print("image size", size)
    print("epochs", epochs)

    # freeze_epochs=0 requests no epochs in fine_tune's frozen phase.
    learner.fine_tune(epochs, lr_d[size], freeze_epochs=0) # cbs=cbs

    running_total_epochs[size] = running_total_epochs.get(size, 0) + epochs
    # Checkpoint after every stage so an interrupted run keeps the finished stages.
    learner.save(model_name)

# Export the trained model into a fresh, timestamped experiment directory.
dateTimeObj = datetime.now()
timestampStr = dateTimeObj.strftime("%Y_%m_%d_%H_%M_%S")
experiment_dir = Path(f'/root/experiments/cv2/{timestampStr}_{model_name}_maskrcnn/')
# parents=True: the experiments/cv2 folders may not exist yet on a fresh VM.
experiment_dir.mkdir(parents=True, exist_ok=True)
print(experiment_dir)
save_template = "model.pt"
state_dict_pth, tracing_model_cpu_pth = save_icevision_model_state_dict_and_tracing(learner, save_template, experiment_dir)
print(metrics[0].confusion_matrix)

# Instance Evaluation

In [None]:
from datetime import datetime

from ceruleanml.inference import save_icevision_model_state_dict_and_tracing, load_tracing_model, test_tracing_model_one_batch, logits_to_classes

# Reload the checkpoint saved under `model_name`, then export it into a fresh,
# timestamped experiment directory.
learner.load(model_name)
dateTimeObj = datetime.now()
timestampStr = dateTimeObj.strftime("%Y_%m_%d_%H_%M_%S")
experiment_dir = Path(f'/root/experiments/cv2/{timestampStr}_{model_name}_maskrcnn/')
# parents=True: the experiments/cv2 folders may not exist yet on a fresh VM.
experiment_dir.mkdir(parents=True, exist_ok=True)
print(experiment_dir)
save_template = "model.pt"
state_dict_pth, tracing_model_cpu_pth = save_icevision_model_state_dict_and_tracing(learner, save_template, experiment_dir)

In [None]:
# Save the learner's current state as a checkpoint named "7hr".
learner.save("7hr")

In [None]:
# Load the checkpoint named "60min" into the learner.
# NOTE(review): assumes a "60min" checkpoint already exists on disk — confirm.
learner.load("60min")

In [None]:
# Restore the "model" checkpoint. The return value is deliberately discarded: binding it
# to `model_type` would replace the model-type object imported from
# `ceruleanml.learner_config`, which later cells still need (e.g. `model_type.show_results`).
# The trailing semicolon suppresses the notebook's output for this cell.
learner.load("model");

In [None]:
# `valid_ds` is not defined by any earlier cell, so build the validation dataset here from
# the validation records and the validation transforms.
# Assumes get_tfms() returns (train_tfms, valid_tfms) — TODO confirm.
_, valid_tfms = get_tfms()
valid_ds = Dataset(record_collection_val, valid_tfms)
model_type.show_results(model, valid_ds, detection_threshold=.5)

TODO: the `SaveModelCallback` causes a bug in this step.

In [None]:
# Run validation with the learner and keep the returned results in `validation`.
validation = learner.validate()

# Exporting the model

In [None]:
from datetime import datetime

# Create a fresh, timestamped directory to hold the exported model.
dateTimeObj = datetime.now()
timestampStr = dateTimeObj.strftime("%d_%b_%Y_%H_%M_%S")
# `mount_path` is not defined anywhere in this notebook; use the same /root prefix as the
# other export cells.
experiment_dir = Path(f'/root/experiments/cv2/{timestampStr}_icevision_maskrcnn/')
# parents=True: the experiments/cv2 folders may not exist yet on a fresh VM.
experiment_dir.mkdir(parents=True, exist_ok=True)
print(experiment_dir)

In [None]:
from ceruleanml.inference import (
    save_icevision_model_state_dict_and_tracing,
    load_tracing_model,
    test_tracing_model_one_batch,
    logits_to_classes,
)

# Export the learner's model into `experiment_dir`, keeping the two returned paths.
save_template = "model.pt"
state_dict_pth, tracing_model_cpu_pth = save_icevision_model_state_dict_and_tracing(
    learner,
    save_template,
    experiment_dir,
)

# Tips

* `nvidia-smi -lms` reports at millisecond frequency and can reveal big gpu spikes
* TODO: debug the COCOMetric. It should not be -1 now that we are achieving detections that intersect with the ground truth. The icevision issues document that the COCOMetric doesn't work for torchvision models because of a bounding box coordinate conversion error. They say it works for mmdet, but mmdet does not support negative samples and raised an error saying there were negative samples even when none were included and no data transformations were applied.
* For the icevision trainer, a class mismatch from the preprocess remap causes long pauses and then a training failure. Restarting the kernel causes (consequence not recorded — TODO: fill in). Rebooting causes driver removal and the need to redeploy.

# TODO

* save model, run inference
* remove classes from json and resave
* dicemulti metric for icevision