#### This notebook was inspired by [this Kaggle notebook](https://www.kaggle.com/code/ammarnassanalhajali/layout-parser-model-training)

# Detectron2

[Detectron2](https://detectron2.readthedocs.io/en/latest/index.html) is a popular open-source software library developed by Facebook AI Research (FAIR) for building computer vision models. It serves as a powerful framework for object detection, instance segmentation, and keypoint detection tasks. Detectron2 is built on top of PyTorch and is geared towards building modular, flexible pipelines for specific computer vision tasks such as object detection and instance segmentation.

Detectron2 has a collection of trained models for these tasks in their [model zoo](https://github.com/facebookresearch/detectron2/blob/main/MODEL_ZOO.md). We can also use detectron2 to train pre-implemented state-of-the-art models from scratch for new datasets, as we do in this notebook. 

Read the [documentation](https://detectron2.readthedocs.io/en/latest/index.html).

# 1 Install detectron2

## 1.1 Recommended Way (does not work on Kaggle)

In [None]:
# !python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'

In [1]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

## 1.2 Fast Way
Ignore the warnings.

In [2]:
%%capture
import sys, os, distutils.core
# Note: This is a faster way to install detectron2 in Colab, but it does not include all functionalities (e.g. compiled operators).
# See https://detectron2.readthedocs.io/tutorials/install.html for full installation instructions
!git clone 'https://github.com/facebookresearch/detectron2'
dist = distutils.core.run_setup("./detectron2/setup.py")
!python -m pip install {' '.join([f"'{x}'" for x in dist.install_requires])}
sys.path.insert(0, os.path.abspath('./detectron2'))

# 2 Notebook Config

In [3]:

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"


from datetime import datetime

# if False, model is set to `PRETRAINED_PATH` model
is_train = True

# if True, evaluate on validation dataset
is_evaluate = True

# if True, run inference on test dataset
is_inference = True

# if True and `is_train` == True, `PRETRAINED_PATH` model is trained further
is_resume_training = False

# Perform augmentation
is_augment = False

SEED = 42
import random
import os
import numpy as np
import torch
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy and PyTorch
            (CPU and every CUDA device). It is also exported as
            ``PYTHONHASHSEED``.
    """
    random.seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this export only
    # influences child processes started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs rather than only the current one; this is silently ignored
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner can choose different kernels from run to run, which
    # undoes the deterministic flag above, so benchmarking must stay disabled.
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

"""## 2.2 Paths"""

from pathlib import Path

TRAIN_IMG_DIR = Path("/kaggle/input/binarizedbadlad/binTrain")

TRAIN_COCO_PATH = Path("/kaggle/input/dlsprint2/badlad/labels/coco_format/train/badlad-train-coco.json")

TEST_IMG_DIR = Path("/kaggle/input/dlsprint2/badlad/images/test")

TEST_METADATA_PATH = Path("/kaggle/input/dlsprint2/badlad/badlad-test-metadata.json")

# Training output directory
OUTPUT_DIR = Path("./output")
OUTPUT_MODEL = OUTPUT_DIR/"model_final.pth"

# Path to your pretrained model weights
PRETRAINED_PATH = Path("")

"""## 2.3 imports"""

# detectron2
from detectron2.utils.memory import retry_if_cuda_oom
from detectron2.utils.logger import setup_logger
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.modeling import build_model
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
import detectron2.data.transforms as T
from detectron2.data import detection_utils as utils
from detectron2.data import DatasetCatalog, MetadataCatalog, build_detection_test_loader, build_detection_train_loader, DatasetMapper
from detectron2.utils.visualizer import Visualizer
from detectron2.structures import BoxMode
from detectron2.engine import DefaultPredictor, DefaultTrainer
from detectron2.config import get_cfg
from detectron2 import model_zoo

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm  # progress bar
import matplotlib.pyplot as plt
import json
import cv2
import copy
from typing import Optional

from IPython.display import FileLink

# torch
import torch
import os

import gc

import warnings
# Ignore "future" warnings and Data-Frame-Slicing warnings.
warnings.filterwarnings('ignore')

setup_logger()

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

"""# 3 COCO Annotations Data

## 3.1 Load
"""

with TEST_METADATA_PATH.open() as f:
    test_dict = json.load(f)


print("#### LABELS AND METADATA LOADED ####")

"""## 3.2 Observe"""

def organize_coco_data(data_dict: dict):
    """Split a COCO-style dictionary into class names, image records and annotations.

    Args:
        data_dict: Parsed COCO JSON. Must contain ``categories`` (each entry
            with a ``name``) and ``images``. ``annotations`` is optional so
            that label-free (inference-only) metadata can be passed too.

    Returns:
        A ``(thing_classes, images_metadata, data_annotations)`` tuple:
        the category names in file order, the ``images`` list unchanged, and
        one flat dict per annotation with the keys ``id``, ``gt_masks``,
        ``image_id``, ``category_id``, ``x_min``, ``y_min``, ``x_max`` and
        ``y_max``.
    """
    # Category names in the order they appear in the file.
    thing_classes: list[str] = [cat['name'] for cat in data_dict['categories']]

    print(thing_classes)

    # Images
    images_metadata: list[dict] = data_dict['images']

    # Convert COCO annotations to a flat per-annotation record.
    data_annotations = []
    # A missing or null `annotations` entry is treated as "no labels" instead
    # of raising KeyError/TypeError.
    for ann in data_dict.get('annotations') or []:
        left, top, width, height = ann['bbox'][:4]
        data_annotations.append({
            # Annotation ID
            "id": ann['id'],

            # Segmentation polygon (x, y) coords
            "gt_masks": ann['segmentation'],

            # Image this annotation belongs to
            "image_id": ann['image_id'],

            # Category label
            "category_id": ann['category_id'],

            # COCO stores (left, top, width, height); expose corners instead.
            "x_min": left,
            "y_min": top,
            "x_max": left + width,
            "y_max": top + height,
        })

    return thing_classes, images_metadata, data_annotations


# Only class names and image records are needed for inference; the annotation
# list returned as the third element is discarded.
thing_classes_test, images_metadata_test, _ = organize_coco_data(
    test_dict
)

# `thing_classes` is the global class list read by get_masks() and run_inference().
thing_classes = thing_classes_test
print("THINGS CLASSES")
print(thing_classes)
test_metadata = pd.DataFrame(images_metadata_test)
# Column order matters: convert_coco_to_detectron2_format() unpacks each row
# positionally as (image_id, filename, width, height).
test_metadata = test_metadata[['id', 'file_name', 'width', 'height']]
test_metadata = test_metadata.rename(columns={"id": "image_id"})
print("test_metadata size=", len(test_metadata))
test_metadata.head(5)

def convert_coco_to_detectron2_format(
    imgdir: Path,
    metadata_df: pd.DataFrame,
    annot_df: Optional[pd.DataFrame] = None,
    target_indices: Optional[np.ndarray] = None,
):
    """Build the list of per-image record dicts registered with detectron2.

    Args:
        imgdir: Directory that the ``file_name`` entries are joined onto.
        metadata_df: One row per image. Each row is unpacked positionally, so
            the columns must be exactly (image_id, file_name, width, height)
            in that order.
        annot_df: Optional annotation rows with the columns ``image_id``,
            ``category_id``, ``gt_masks``, ``x_min``, ``y_min``, ``x_max`` and
            ``y_max``. When omitted, every record gets an empty annotation list.
        target_indices: Optional positions into the full record list; when
            given, only those records are returned, in the order given.

    Returns:
        A list of dicts with the keys ``file_name``, ``image_id``, ``width``,
        ``height`` and ``annotations``.
    """
    # Bucket the annotations by image once, instead of scanning the whole
    # annotation frame with `query` for every single image.
    annots_by_image = {}
    if annot_df is not None:
        annots_by_image = {
            key: group for key, group in annot_df.groupby("image_id", sort=False)
        }

    dataset_dicts = []
    for _, meta_row in tqdm(metadata_df.iterrows(), total=len(metadata_df)):
        # Iterate over each image
        image_id, filename, width, height = meta_row.values

        annotations = []

        # Train/validation data carries annotations; inference data does not.
        image_annots = annots_by_image.get(image_id)
        if image_annots is not None:
            for _, ann in image_annots.iterrows():
                bbox_xyxy = [
                    float(ann["x_min"]),
                    float(ann["y_min"]),
                    float(ann["x_max"]),
                    float(ann["y_max"]),
                ]

                annotations.append({
                    "bbox": bbox_xyxy,
                    "bbox_mode": BoxMode.XYXY_ABS,
                    "segmentation": ann["gt_masks"],
                    "category_id": ann["category_id"],
                })

        # coco format -> detectron2 format dict
        dataset_dicts.append({
            "file_name": str(imgdir/filename),
            "image_id": image_id,
            "width": width,
            "height": height,
            "annotations": annotations
        })

    if target_indices is not None:
        dataset_dicts = [dataset_dicts[i] for i in target_indices]

    return dataset_dicts


"""## 4.3 Registering and Loading Data for `detectron2`"""
# Name under which the test split is registered in the detectron2 catalogs.
DATA_REGISTER_TEST     = "badlad_test"

# Register Test Inference data
# A callable is registered rather than a prebuilt list; no annotation frame is
# passed, so every record it builds has an empty `annotations` list.
DatasetCatalog.register(
    DATA_REGISTER_TEST,
    lambda: convert_coco_to_detectron2_format(
        TEST_IMG_DIR,
        test_metadata,
    )
)

# Set Test data categories
MetadataCatalog.get(DATA_REGISTER_TEST).set(
    thing_classes=thing_classes_test
)

# dataset_dicts_test = DatasetCatalog.get(DATA_REGISTER_TEST)
# Handle to the metadata entry configured above.
metadata_dicts_test = MetadataCatalog.get(DATA_REGISTER_TEST)

#### LABELS AND METADATA LOADED ####
['paragraph', 'text_box', 'image', 'table']
THINGS CLASSES
['paragraph', 'text_box', 'image', 'table']
test_metadata size= 13000


In [4]:
! git clone https://github.com/microsoft/unilm.git --depth=1 --quiet
! sed -i 's/from collections import Iterable/from collections.abc import Iterable/' unilm/dit/object_detection/ditod/table_evaluation/data_structure.py

In [5]:
import sys
sys.path.append("unilm")

import cv2

from unilm.dit.object_detection.ditod import add_vit_config

In [6]:
%%writefile cascade_dit_base.yaml
_BASE_: "/kaggle/input/dit-publay-finetuned/Base-RCNN-FPN.yaml"
MODEL:
  PIXEL_MEAN: [ 127.5, 127.5, 127.5 ]
  PIXEL_STD: [ 127.5, 127.5, 127.5 ]
  WEIGHTS: "https://layoutlm.blob.core.windows.net/dit/dit-pts/dit-base-224-p16-500k-62d53a.pth"
  VIT:
    NAME: "dit_base_patch16"
  ROI_HEADS:
    NAME: CascadeROIHeads
  ROI_BOX_HEAD:
    CLS_AGNOSTIC_BBOX_REG: True
  RPN:
    POST_NMS_TOPK_TRAIN: 2000
SOLVER:
  WARMUP_ITERS: 1000
  IMS_PER_BATCH: 16
  MAX_ITER: 60000
  CHECKPOINT_PERIOD: 2000
TEST:
  EVAL_PERIOD: 2000

Writing cascade_dit_base.yaml


In [7]:
torch.cuda.empty_cache()
gc.collect()

76

In [8]:
def rebuild_model():
    """Construct the network described by the global `inf_cfg` and load its weights.

    Returns:
        The model produced by `build_model(inf_cfg)` after the checkpoint at
        `inf_cfg.MODEL.WEIGHTS` has been loaded into it.
    """
    network = build_model(inf_cfg)
    checkpointer = DetectionCheckpointer(network)
    # The value returned by load() is not needed here.
    checkpointer.load(inf_cfg.MODEL.WEIGHTS)
    return network

In [9]:
# Inference configuration: detectron2 defaults, extended with the DiT/ViT keys,
# then overridden by the YAML written earlier and by the values below.
inf_cfg = get_cfg()
# Must run before merge_from_file: the YAML sets MODEL.VIT.NAME, a key that is
# presumably only known to the config after this call — TODO confirm.
add_vit_config(inf_cfg)
inf_cfg.merge_from_file("/kaggle/working/cascade_dit_base.yaml")
# NOTE(review): SOLVER.IMS_PER_BATCH and ROI_HEADS.BATCH_SIZE_PER_IMAGE look like
# training-time settings; this notebook only runs inference — confirm they matter.
inf_cfg.SOLVER.IMS_PER_BATCH = 64
inf_cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128
# Four classes, matching the printed list: paragraph, text_box, image, table.
inf_cfg.MODEL.ROI_HEADS.NUM_CLASSES = 4
# Kept below every value in ACCEPTANCE_THRESHOLDS so the per-class cut in
# get_masks() is the one that decides which instances are kept.
inf_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.25
inf_cfg.MODEL.DEVICE = "cuda"
inf_cfg.DATALOADER.NUM_WORKERS = 2  # lower this if CUDA overflow occurs
# Replaces the WEIGHTS URL from the YAML with a local checkpoint file.
inf_cfg.MODEL.WEIGHTS = str("/kaggle/input/dit-publay-finetuned/dit-pub-50000.pth")
inf_cfg.OUTPUT_DIR = str(OUTPUT_DIR)
print("creating cfg.OUTPUT_DIR -> ", inf_cfg.OUTPUT_DIR)
OUTPUT_DIR.mkdir(exist_ok=True)
# rebuild_model() reads the global `inf_cfg`, so it has to be fully configured first.
model = rebuild_model()
# Switch to evaluation mode before running predictions.
model.eval()

creating cfg.OUTPUT_DIR ->  output
[32m[08/05 01:26:25 d2.checkpoint.detection_checkpoint]: [0m[DetectionCheckpointer] Loading from /kaggle/input/dit-publay-finetuned/dit-pub-50000.pth ...


GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): VIT_Backbone(
      (backbone): BEiT(
        (patch_embed): PatchEmbed(
          (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
        )
        (pos_drop): Dropout(p=0.0, inplace=False)
        (blocks): ModuleList(
          (0): Block(
            (norm1

In [46]:
def binarize(image, threshold=128):
    """Convert an image to a black-and-white, 3-channel image.

    Args:
        image: Array in channels-last layout. Inputs whose third axis has 3 or
            4 entries are converted to grayscale first; any other input is
            thresholded as-is.
        threshold: Cut-off passed to ``cv2.threshold``; values above it become
            255 and all others 0. Defaults to 128, the value that was
            previously hard-coded.

    Returns:
        The thresholded image expanded back to three channels.
    """
    # Convert image to grayscale if it has more than one channel
    if len(image.shape) > 2 and image.shape[2] in [3, 4]:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Binarize the grayscale image
    _, binary_image = cv2.threshold(image, threshold, 255, cv2.THRESH_BINARY)

    # Back to a 3-channel image so consumers that expect colour input still work.
    binary_image = cv2.cvtColor(binary_image, cv2.COLOR_GRAY2RGB)

    return binary_image


In [11]:
class BinarizeDatasetMapper(DatasetMapper):
    """DatasetMapper that replaces each mapped image with its binarized version."""

    def __init__(self, cfg, is_train=True):
        """Forward `cfg` and `is_train` unchanged to DatasetMapper."""
        super().__init__(cfg, is_train)

    def __call__(self, dataset_dict):
        """Run the parent mapping, then binarize the resulting ``"image"`` tensor.

        Returns:
            The dict produced by the parent mapper, with ``"image"`` swapped
            for the binarized tensor on the same device as the original.
        """
        mapped = super().__call__(dataset_dict)
        source = mapped["image"]
        # binarize() inspects axis 2 for the channel count, so the tensor is
        # moved to channels-last first (assumes the parent yields CHW — TODO confirm).
        channels_last = source.permute(1, 2, 0).cpu().numpy()
        channels_first = binarize(channels_last).transpose(2, 0, 1)
        mapped["image"] = torch.tensor(channels_first).to(source.device)
        return mapped

In [47]:
def collate_fn(batch):
    """Binarize the ``"image"`` tensor of every sample in a batch.

    Args:
        batch: List of sample dicts, each holding an ``"image"`` tensor.
            Assumes the same channels-first layout that BinarizeDatasetMapper
            handles — TODO confirm against the loader in use.

    Returns:
        The same list, with each sample's ``"image"`` replaced by its own
        binarized tensor. Images are not stacked, so samples of different
        sizes are fine.
    """
    for sample in batch:
        source = sample["image"]
        # binarize() reads the channel count from axis 2, so convert to
        # channels-last before the call and back afterwards.
        channels_last = source.permute(1, 2, 0).cpu().numpy()
        channels_first = binarize(channels_last).transpose(2, 0, 1)
        sample["image"] = torch.tensor(channels_first).to(source.device)
    return batch


In [50]:
# Number of images per inference batch; also used by the submission loop to
# decide how often results are flushed to disk.
BATCH = 2  # lower this if CUDA overflow occurs
# mapper = BinarizeDatasetMapper(inf_cfg, is_train=False)
# NOTE(review): no custom mapper is passed, so test images are not binarized
# here, while TRAIN_IMG_DIR points at a binarized set — confirm this is intended.
test_loader = build_detection_test_loader(inf_cfg, DATA_REGISTER_TEST,batch_size=BATCH)

  0%|          | 0/13000 [00:00<?, ?it/s]

[32m[08/05 02:00:12 d2.data.dataset_mapper]: [0m[DatasetMapper] Augmentations used in inference: [ResizeShortestEdge(short_edge_length=(800, 800), max_size=1333, sample_style='choice')]
[32m[08/05 02:00:12 d2.data.common]: [0mSerializing the dataset using: <class 'detectron2.data.common._TorchSerializedList'>
[32m[08/05 02:00:12 d2.data.common]: [0mSerializing 13000 elements to byte tensors and concatenating them all ...
[32m[08/05 02:00:12 d2.data.common]: [0mSerialized dataset takes 2.07 MiB


In [41]:
# ACCEPTANCE_THRESHOLD = 0.6

In [43]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
torch.cuda.empty_cache()
gc.collect()

0

In [30]:
def rle_encode(mask):
    """Run-length encode a mask as a space-separated ``start length`` string.

    The mask is read in column-major order (``mask.T`` flattened) and start
    positions are 1-indexed. A mask without any value change produces an
    empty string.

    Args:
        mask: Array-like mask; presumably a 2-D binary array — confirm with callers.

    Returns:
        A string such as ``"1 3 10 2"``.
    """
    flat = mask.T.flatten()
    touches_edge = bool(flat[0] or flat[-1])
    if touches_edge:
        # Frame the data with zeros so a run touching either end still yields
        # both an opening and a closing transition.
        framed = np.zeros([len(flat) + 2], dtype=flat.dtype)
        framed[1:-1] = flat
        flat = framed
    # +2 turns a transition index into a 1-indexed position; the zero frame
    # shifts everything by one, hence +1 in that case.
    shift = 1 if touches_edge else 2
    runs = np.where(flat[1:] != flat[:-1])[0] + shift
    # Every second entry is an end position; convert it to a run length.
    runs[1::2] -= runs[:-1:2]
    return ' '.join(map(str, runs))

In [31]:
# Minimum prediction score per class name; get_masks() keeps an instance only
# when its score is >= the value listed for the class being encoded.
ACCEPTANCE_THRESHOLDS = {
    "paragraph": 0.5,
    "text_box": 0.3,
    "image": 0.5,
    "table": 0.55,
}

# @retry_if_cuda_oom
# def get_masks(prediction):
#     # get masks for each category
#     pred_masks = (prediction.pred_masks != 0)
#     pred_classes = prediction.pred_classes

#     rles = []
#     for cat in range(len(thing_classes_test)):
#         pred_mask = pred_masks[pred_classes == cat]
#         pred_mask = torch.any(pred_mask, dim=0)
        
#         threshold = ACCEPTANCE_THRESHOLDS[thing_classes[cat]]
#         take = prediction.scores >= threshold
#         pred_mask = pred_mask & take
        
#         rles.append(rle_encode(pred_mask.short().to("cpu").numpy()))

#     return rles

def get_masks(prediction):
    """Encode one merged mask per class for a single image's predictions.

    Args:
        prediction: Object exposing ``scores``, ``pred_classes`` and
            ``pred_masks`` with one entry per detected instance (presumably a
            detectron2 ``Instances`` — confirm with the caller).

    Returns:
        A list with one RLE string per entry of the global `thing_classes`,
        in the same order.
    """
    default_threshold = 0.4  # used for classes missing from ACCEPTANCE_THRESHOLDS

    scores = prediction.scores
    pred_classes = prediction.pred_classes

    rles = []
    for cat, class_name in enumerate(thing_classes):
        # Report the fallback only when it is really used, not whenever a
        # configured threshold happens to equal 0.4.
        if class_name not in ACCEPTANCE_THRESHOLDS:
            print("thresh : 0.4")
        threshold = ACCEPTANCE_THRESHOLDS.get(class_name, default_threshold)

        # Instances of this class that are confident enough.
        keep = (scores >= threshold) & (pred_classes == cat)

        # Union of the surviving instance masks.
        pred_mask = torch.any(prediction.pred_masks[keep] != 0, dim=0)
        rles.append(rle_encode(pred_mask.short().to("cpu").numpy()))

    return rles


# def get_masks(prediction):
#     # get masks for each category
#     take = prediction.scores >= ACCEPTANCE_THRESHOLD
#     pred_masks = (prediction.pred_masks[take] != 0)
#     pred_classes = prediction.pred_classes[take]
  
#     rles = []
#     for cat in range(len(thing_classes)):
#         pred_mask = pred_masks[pred_classes == cat]
        
#         # pred_mask = retry_if_cuda_oom(torch.any)(pred_mask, dim=0)
#         pred_mask = torch.any(pred_mask, dim=0)
#         rles.append(rle_encode(pred_mask.short().to("cpu").numpy()))

#     return rles

In [32]:
def run_inference(data):
    """Run the global `model` on one batch and format the submission rows.

    Args:
        data: Batch from the test loader; indexable, and each entry provides
            an ``"image_id"``.

    Returns:
        A list of ``"<image_id>_<class index>,<rle>\\n"`` strings, one per
        image and class. Empty when the model returns no outputs.
    """
    results = []
    with torch.no_grad():
        outputs = model(data)
        if torch.cuda.is_available():
            torch.cuda.synchronize()

        for idx, output in enumerate(outputs):
            rles = get_masks(output["instances"])

            results.extend(
                f"{data[idx]['image_id']}_{cat},{rles[cat]}\n"
                for cat in range(len(thing_classes))
            )

    # No explicit `del`: the locals are released on return, and deleting the
    # loop variable raised UnboundLocalError whenever `outputs` was empty.
    return results

In [51]:
# Stream predictions to disk in chunks so memory stays bounded over the test set.
# The context manager closes (and therefore flushes) the file even when the
# loop is interrupted, so rows already written are not left in an open handle.
FLUSH_EVERY_N_BATCHES = max(1, 500 // BATCH)  # never 0, even if BATCH > 500

with open("submission.csv", "w") as submission_file:
    submission_file.write("Id,Predicted\n")

    results: list[str] = []

    for i, data in enumerate(tqdm(test_loader)):
        res = run_inference(data)
        results.extend(res)

        if i % FLUSH_EVERY_N_BATCHES == 0:
            print(f"Inference on batch {i}/{len(test_loader)} done")
            submission_file.writelines(results)
            results = []

    # Rows accumulated since the last periodic flush.
    submission_file.writelines(results)

  0%|          | 0/6500 [00:00<?, ?it/s]

Inference on batch 0/6500 done


KeyboardInterrupt: 

In [None]:
# `exists` is a method: without the call the bound method object is always
# truthy, so the link was shown even when no submission file had been written.
if Path("submission.csv").exists():
    display(FileLink("submission.csv"))

In [None]:
!rm -r detectron2/