In [1]:
%load_ext autoreload
%autoreload 2
import json
import random as rd
import matplotlib.image as mpimg
import cv2
import wandb
import torch
import numpy as np
import math

from detectron2.utils.visualizer import Visualizer
from detectron2.utils.logger import setup_logger
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.config.config import CfgNode as CN
from detectron2.modeling import build_model
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader

from src.globals import *
from src.visualization.show_image import show_image
from src.register_datasets import register_datasets, register_by_ids
from src.test import do_test
from src.train import do_train
from src.predict import predict_image_in_acdc

In [2]:
def build_config(config_name):
    """Create a pipeline configuration and write it to a YAML file.

    Starts from the model-zoo Mask R-CNN R50-FPN 3x config, adds the custom
    keys used by this notebook (``NAME``, ``AL.*``, ``WARMUP_ITERS``,
    ``EARLY_STOPPING_ROUNDS``), overrides dataset/solver/model settings and
    dumps the result to ``./src/pipeline_configs/<config_name>.yaml``.

    Args:
        config_name: Name of the configuration. Used for ``cfg.NAME``, for the
            output directory ``./output/<config_name>`` and for the YAML file
            name.

    Returns:
        None. The configuration is only written to disk.
    """
    import os  # local import so this cell does not depend on the import cell

    model_zoo_yaml = "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"
    config_dir = "./src/pipeline_configs/"

    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(model_zoo_yaml))
    cfg.NAME = config_name

    # Active-learning settings live in their own sub-node.
    cfg.AL = CN()
    cfg.AL.DATASETS = CN()
    cfg.AL.DATASETS.TRAIN_UNLABELED = TRAIN_DATASET_FULL
    cfg.AL.MAX_LOOPS = 20
    cfg.AL.INIT_SIZE = 20
    cfg.AL.INCREMENT_SIZE = 20
    cfg.AL.QUERY_STRATEGY = RANDOM

    cfg.DATASETS.TRAIN = (TRAIN_DATASET_FULL,)
    cfg.DATASETS.TEST = (VALIDATION_DATASET_SLIM,)
    cfg.DATALOADER.NUM_WORKERS = 2
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(model_zoo_yaml)  # Let training initialize from model zoo
    cfg.SOLVER.IMS_PER_BATCH = 2  # This is the real "batch size" commonly known to deep learning people
    cfg.SOLVER.BASE_LR = 0.0003  # pick a good LR
    cfg.SOLVER.MAX_ITER = 300    # 300 iterations seems good enough for this toy dataset; you will need to train longer for a practical dataset
    cfg.SOLVER.STEPS = []        # do not decay learning rate
    # NOTE(review): these two are custom top-level keys, not the keys under
    # cfg.SOLVER; presumably they are read by src.train - confirm.
    cfg.WARMUP_ITERS = 1
    cfg.EARLY_STOPPING_ROUNDS = 2
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128   # The "RoIHead batch size". 128 is faster, and good enough for this toy dataset (default: 512)
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1
    cfg.OUTPUT_DIR = "./output/" + cfg.NAME
    cfg.TEST.EVAL_PERIOD = 100

    # Make sure the target directory exists before writing the YAML.
    os.makedirs(config_dir, exist_ok=True)
    with open(config_dir + cfg.NAME + ".yaml", "w") as file:
        file.write(cfg.dump())

def get_config(config_name):
    """Load the pipeline configuration stored under ``config_name``.

    The custom keys of this notebook are first declared with placeholder
    values on top of the detectron2 defaults (presumably so that the merge
    accepts them - confirm against yacs), then the YAML file
    ``src/pipeline_configs/<config_name>.yaml`` is merged on top.

    Args:
        config_name: Name of the stored configuration, without the ``.yaml``
            extension.

    Returns:
        The merged config node.
    """
    # Placeholder node for the active-learning settings.
    al_node = CN()
    al_node.DATASETS = CN()
    al_node.DATASETS.TRAIN_UNLABELED = ""
    al_node.MAX_LOOPS = 0
    al_node.INIT_SIZE = 0
    al_node.INCREMENT_SIZE = 0
    al_node.QUERY_STRATEGY = ""

    config = get_cfg()
    config.NAME = " "
    config.AL = al_node
    config.WARMUP_ITERS = 1
    config.EARLY_STOPPING_ROUNDS = 2

    yaml_path = "".join(["src/pipeline_configs/", config_name, ".yaml"])
    config.merge_from_file(yaml_path)
    return config



In [3]:
import random as rd

from detectron2.data import MetadataCatalog, DatasetCatalog

from src.register_datasets import register_datasets, register_by_ids

class ActiveLearingDataset:
    """Bookkeeping for the labeled and unlabeled image pools of the AL loop.

    Two lists of image ids are kept (``labeled_ids`` and ``unlabeled_ids``).
    After every change both pools are re-registered under fixed temporary
    dataset names, and ``cfg.DATASETS.TRAIN`` and
    ``cfg.AL.DATASETS.TRAIN_UNLABELED`` are pointed at those names.
    """

    def __init__(self, cfg):
        """Collect all image ids and draw the initial labeled sample.

        Args:
            cfg: Config node. Reads ``AL.DATASETS.TRAIN_UNLABELED``,
                ``AL.INIT_SIZE`` and ``AL.INCREMENT_SIZE``; the node is
                modified in place when the pools are registered.
        """
        self.cfg = cfg

        register_datasets()

        # get ids of all images
        self.unlabeled_ids = [image["image_id"] for image in DatasetCatalog.get(cfg.AL.DATASETS.TRAIN_UNLABELED)]
        self.labeled_ids = []

        self.unlabeled_data_name = "temp_unlabeled_data_al"
        self.labeled_data_name = "temp_labeled_data_al"

        self.init_size = cfg.AL.INIT_SIZE
        self.increment_size = cfg.AL.INCREMENT_SIZE

        # Fixed seed so the initial sample is reproducible. Note that this
        # seeds the module-level generator of `random`.
        rd.seed(1337)
        sample_ids = rd.sample(self.unlabeled_ids, self.init_size)
        # update_labeled_data re-registers both pools itself, so no further
        # registration is needed here.
        self.update_labeled_data(sample_ids)

    def remove_data_from_catalog(self, name):
        """Remove ``name`` from the dataset and metadata catalogs if present.

        Each catalog is checked on its own so that a missing metadata entry
        does not make the removal fail.
        """
        if name in DatasetCatalog:
            DatasetCatalog.remove(name)
        if name in MetadataCatalog:
            MetadataCatalog.remove(name)

    def get_labeled_dataset(self):
        """Re-register the labeled pool and make it the training dataset."""
        self.remove_data_from_catalog(self.labeled_data_name)
        register_by_ids(self.cfg, self.labeled_data_name, self.labeled_ids)
        self.cfg.DATASETS.TRAIN = (self.labeled_data_name,)

    def get_unlabled_dataset(self):
        """Re-register the unlabeled pool and store its name in the config."""
        self.remove_data_from_catalog(self.unlabeled_data_name)
        register_by_ids(self.cfg, self.unlabeled_data_name, self.unlabeled_ids)
        self.cfg.AL.DATASETS.TRAIN_UNLABELED = self.unlabeled_data_name

    def update_labeled_data(self, sample_ids):
        """Move ``sample_ids`` from the unlabeled pool to the labeled pool.

        Args:
            sample_ids: Image ids to label; all of them must currently be in
                the unlabeled pool.

        Raises:
            ValueError: If an id is not part of the unlabeled pool. Nothing is
                modified in that case.
        """
        print("update_labeled_data")
        sample_ids = list(sample_ids)
        sampled = set(sample_ids)

        # check if sample_ids are in unlabeled_ids
        missing = list(sampled - set(self.unlabeled_ids))
        if missing:
            raise ValueError("Some ids ({}) in sample_ids are not contained in unlabeled data pool: {}".format(len(missing), missing[:5]))

        self.labeled_ids += sample_ids
        # Keep the original order of the pool (and drop duplicates) so that
        # later seeded sampling does not depend on set iteration order.
        self.unlabeled_ids = list(dict.fromkeys(
            image_id for image_id in self.unlabeled_ids if image_id not in sampled
        ))

        self.get_labeled_dataset()
        self.get_unlabled_dataset()
        



In [4]:
%%capture
class QueryStrategy(object):
    """Base class for strategies that choose which images to label next."""

    def __init__(self, cfg):
        """Store the config node that subclasses read their settings from."""
        self.cfg = cfg

    def sample(self, model, ids):
        """Select image ids from ``ids`` to be labeled next.

        Args:
            model: The current model; strategies may use it to score images.
            ids: Image ids of the unlabeled pool.

        Returns:
            A list of selected image ids (in subclasses).

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        # Fail loudly instead of silently returning None to the caller.
        raise NotImplementedError("QueryStrategy subclasses must implement sample()")
    
class RandomSampler(QueryStrategy):
    """Query strategy that picks images at random from the pool."""

    def sample(self, model, ids):
        """Draw up to ``cfg.AL.INCREMENT_SIZE`` ids from ``ids`` without replacement.

        Args:
            model: Unused; accepted for interface compatibility.
            ids: Image ids of the unlabeled pool.

        Returns:
            A list of sampled ids. It is shorter than ``INCREMENT_SIZE`` when
            the pool holds fewer ids than requested.
        """
        # Clamp so that a nearly exhausted pool does not raise ValueError.
        num_samples = min(self.cfg.AL.INCREMENT_SIZE, len(ids))
        return rd.sample(ids, num_samples)

class GTknownSampler(QueryStrategy):
    """Query strategy that uses ground truth to pick the worst-predicted images.

    A random subset of the pool is registered as a temporary dataset, the
    model is run on it, and the images with the lowest segmentation AP are
    returned.
    """

    def sample(self, model, ids):
        """Select the ``cfg.AL.INCREMENT_SIZE`` candidates with the lowest segm AP.

        Args:
            model: Model to run inference with.
            ids: Image ids of the unlabeled pool.

        Returns:
            A list of image ids, ordered from lowest to highest AP. Empty when
            ``ids`` is empty.
        """
        num_samples = self.cfg.AL.INCREMENT_SIZE
        dataset_name = "GTknownSampler_DS"

        if len(ids) == 0:
            return []

        # Evaluate at most 600 randomly chosen candidates per call.
        id_pool = rd.sample(ids, min(600, len(ids)))

        # The dataset name is fixed, so drop the registration left behind by
        # a previous call before registering the new candidate pool.
        # NOTE(review): register_by_ids is not visible here; it may already
        # handle re-registration - confirm.
        if dataset_name in DatasetCatalog:
            DatasetCatalog.remove(dataset_name)
        if dataset_name in MetadataCatalog:
            MetadataCatalog.remove(dataset_name)
        register_by_ids(self.cfg, dataset_name, id_pool)

        evaluator = COCOEvaluator(dataset_name, output_dir=self.cfg.OUTPUT_DIR)
        data_loader = build_detection_test_loader(self.cfg, dataset_name)
        inference_on_dataset(model, data_loader, evaluator)

        image_ids = [image["image_id"] for image in DatasetCatalog.get(dataset_name)]
        # Score every image on its own; evaluate() takes a list of image ids.
        aps = np.array([
            evaluator.evaluate([image_id])['segm']['AP'] for image_id in image_ids
        ])

        # Ascending sort: the first entries are the worst-predicted images.
        ranked = np.argsort(aps)[:num_samples]
        if len(ranked) > 0:
            print("min aps: ", aps[ranked[0]])
            print("max aps: ", aps[ranked[-1]])

        return [image_ids[index] for index in ranked]
    

    
    

In [5]:

class ActiveLearningTrainer:
    """Runs the active-learning loop: train, evaluate, query, relabel."""

    def __init__(self, cfg):
        """Start a wandb run and build logger, data pools, model and strategy.

        Args:
            cfg: Config node shared by all components; it is modified in
                place during the loop.
        """
        self.cfg = cfg

        # initialize weights and biases
        wandb.init(project="activeCell-ACDC", sync_tensorboard=True)

        self.logger = setup_logger(output="./log/main.log")
        self.logger.setLevel(10)  # 10 is the numeric value of logging.DEBUG

        self.al_dataset = ActiveLearingDataset(cfg)
        self.model = build_model(cfg)
        # NOTE(review): cfg.AL.QUERY_STRATEGY is not consulted here; the
        # ground-truth based sampler is always used.
        self.query_strategy = GTknownSampler(cfg)

    def __del__(self):
        self._finish_wandb_run()

    @staticmethod
    def _finish_wandb_run():
        """Finish the active wandb run, if there is one."""
        # Presumably wandb.run is None once the run was finished; guard so a
        # second call does not fail.
        if wandb.run is not None:
            wandb.run.finish()

    def step(self, resume):
        """Run one active-learning iteration.

        Trains on the current labeled pool, evaluates, logs the AP values to
        wandb and moves the queried samples to the labeled pool.

        Args:
            resume: Passed on to ``do_train``. When false, the model weights
                path is reset to the model-zoo checkpoint first.
        """
        len_ds_train = len(DatasetCatalog.get(self.cfg.DATASETS.TRAIN[0]))
        print("lenght of train data set: {}".format(len_ds_train))
        # Scale the number of iterations with the labeled pool, capped at 1000.
        max_iter = min(400 + len_ds_train*5, 1000)
        self.cfg.SOLVER.MAX_ITER = max_iter
        # Two solver steps, at one third and two thirds of training.
        self.cfg.SOLVER.STEPS = [math.ceil(max_iter/3), math.ceil(2*max_iter/3)]

        if not resume:
            # Use the trainer's own config, not a notebook-global `cfg`.
            self.cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")

        do_train(self.cfg, self.model, self.logger, resume=resume)
        result = do_test(self.cfg, self.model, self.logger)
        wandb.log(
            {
                "active_step_bbox_ap": result['bbox']['AP'],
                "active_step_segm_ap": result['segm']['AP']
            })

        sample_ids = self.query_strategy.sample(self.model, self.al_dataset.unlabeled_ids)
        self.al_dataset.update_labeled_data(sample_ids)

    def run(self):
        """Run ``cfg.AL.MAX_LOOPS`` steps; finish the wandb run on failure."""
        try:
            for _ in range(self.cfg.AL.MAX_LOOPS):
                self.step(resume=False)    #(i>0))
        except Exception:
            self._finish_wandb_run()
            # Bare raise keeps the original traceback.
            raise
        

In [6]:
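# Uncomment and run once to (re)generate src/pipeline_configs/al_pipeline_config2.yaml,
# which is read by get_config in the next cell.
#build_config("al_pipeline_config2")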

In [None]:

# Load the stored pipeline configuration, build the trainer (this starts a
# wandb run and sets up the labeled/unlabeled pools) and run the full
# active-learning loop.
cfg = get_config("al_pipeline_config2")
al_trainer = ActiveLearningTrainer(cfg)
al_trainer.run()


[34m[1mwandb[0m: Currently logged in as: [33mflorian-bridges[0m. Use [1m`wandb login --relogin`[0m to force relogin


[32m[11/10 17:22:04 d2.data.datasets.coco]: [0mLoading ./data/dataInCOCO/train/cell_acdc_coco_ds.json takes 3.39 seconds.
Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you.

[32m[11/10 17:22:04 d2.data.datasets.coco]: [0mLoaded 5784 images in COCO format from ./data/dataInCOCO/train/cell_acdc_coco_ds.json
update_labeled_data
[Errno 2] No such file or directory: './output/al_pipeline_config2/temp_labeled_data_al_coco_format.json'
[Errno 2] No such file or directory: './output/al_pipeline_config2/temp_unlabeled_data_al_coco_format.json'
[Errno 2] No such file or directory: './output/al_pipeline_config2/temp_labeled_data_al_coco_format.json'
[Errno 2] No such file or directory: './output/al_pipeline_config2/temp_unlabeled_data_al_coco_format.json'
[32m[11/10 17:22:12 d2.data.datasets.coco]: [0mLoading ./data/dataInCOCO/train/cell_acdc_coco_ds.json takes 3.41 seconds.
Category ids in annotations are not in [1, #categories]! We'll apply a mapping f

  wandb.config.update(yaml.load(cfg.dump()))
Skip loading parameter 'roi_heads.box_predictor.cls_score.weight' to the model due to incompatible shapes: (81, 1024) in the checkpoint but (2, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.cls_score.bias' to the model due to incompatible shapes: (81,) in the checkpoint but (2,) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.weight' to the model due to incompatible shapes: (320, 1024) in the checkpoint but (4, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.bias' to the model due to incompatible shapes: (320,) in the checkpoint but (4,) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.mask_head.predictor.weight' to the model due to incompatible shapes: (80, 256, 

[32m[11/10 17:22:16 d2.data.datasets.coco]: [0mLoading ./data/dataInCOCO/train/cell_acdc_coco_ds.json takes 2.64 seconds.
Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you.

[32m[11/10 17:22:17 d2.data.datasets.coco]: [0mLoaded 5784 images in COCO format from ./data/dataInCOCO/train/cell_acdc_coco_ds.json
[32m[11/10 17:22:17 d2.data.build]: [0mRemoved 0 images with no usable annotations. 20 images left.
[32m[11/10 17:22:17 d2.data.build]: [0mDistribution of instances among all 1 categories:
[36m|  category  | #instances   |
|:----------:|:-------------|
|    cell    | 242          |
|            |              |[0m
[32m[11/10 17:22:17 d2.data.dataset_mapper]: [0m[DatasetMapper] Augmentations used in training: [ResizeShortestEdge(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style='choice'), RandomFlip()]
[32m[11/10 17:22:17 d2.data.build]: [0mUsing training sampler TrainingSampler
[32m[11/10 17:22:17 d2.data.

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


[32m[11/10 17:22:28 d2.utils.events]: [0m iter: 19  total_loss: 1.959  loss_cls: 0.4534  loss_box_reg: 0.8879  loss_mask: 0.5438  loss_rpn_cls: 0.05388  loss_rpn_loc: 0.03706  lr: 0.0003  max_mem: 2759M
[32m[11/10 17:22:37 d2.utils.events]: [0m eta: 0:03:20  iter: 39  total_loss: 1.317  loss_cls: 0.2297  loss_box_reg: 0.7228  loss_mask: 0.284  loss_rpn_cls: 0.009751  loss_rpn_loc: 0.03441  lr: 0.0003  max_mem: 2759M
[32m[11/10 17:22:55 d2.utils.events]: [0m eta: 0:06:37  iter: 59  total_loss: 0.7468  loss_cls: 0.1249  loss_box_reg: 0.3935  loss_mask: 0.1698  loss_rpn_cls: 0.005578  loss_rpn_loc: 0.02947  lr: 0.0003  max_mem: 2759M
[32m[11/10 17:23:35 d2.utils.events]: [0m eta: 0:14:01  iter: 79  total_loss: 0.5194  loss_cls: 0.0938  loss_box_reg: 0.2471  loss_mask: 0.1427  loss_rpn_cls: 0.004753  loss_rpn_loc: 0.02784  lr: 0.0003  max_mem: 2759M
Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you.

[32m[11/10 17:24:17 d2.data.datasets.coco]: 