## Setup & Installation

In [1]:
%cd ~/../kaggle/working
!git clone https://github.com/facebookresearch/paco.git

/kaggle/working
fatal: destination path 'paco' already exists and is not an empty directory.


In [2]:
%cd paco
!ls ./

/kaggle/working/paco
CODE_OF_CONDUCT.md  LICENSE	   README.md	     state.db
configs		    notebooks	   requirements.txt  tools
CONTRIBUTING.md     paco	   scripts	     tranform.py
docs		    paco.egg-info  setup.py	     transform.py


In [3]:
!pip install -r requirements.txt
!pip install -e .

Collecting detectron2@ git+https://github.com/facebookresearch/detectron2.git@0703e08a5f589f7503a3fbfce41309c80204eec8 (from -r requirements.txt (line 1))
  Cloning https://github.com/facebookresearch/detectron2.git (to revision 0703e08a5f589f7503a3fbfce41309c80204eec8) to /tmp/pip-install-v9xvm7ib/detectron2_3c50867b1b47419cb5fc367d6d28fe58
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /tmp/pip-install-v9xvm7ib/detectron2_3c50867b1b47419cb5fc367d6d28fe58
  Running command git rev-parse -q --verify 'sha^0703e08a5f589f7503a3fbfce41309c80204eec8'
  Running command git fetch -q https://github.com/facebookresearch/detectron2.git 0703e08a5f589f7503a3fbfce41309c80204eec8
  Running command git checkout -q 0703e08a5f589f7503a3fbfce41309c80204eec8
  Resolved https://github.com/facebookresearch/detectron2.git to commit 0703e08a5f589f7503a3fbfce41309c80204eec8
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ego4d (from -r

In [4]:
### All imports
import os
import paco
import json
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from collections import defaultdict
from IPython.display import Markdown
from detectron2.data import MetadataCatalog
from detectron2.utils.visualizer import Visualizer

### Since an older version of `transform.py` exists on Kaggle, it is overwritten below so that it works properly

In [6]:
%cd ~/../usr/local/lib/python3.11/dist-packages/detectron2/data/transforms/

/usr/local/lib/python3.11/dist-packages/detectron2/data/transforms


In [7]:
%%writefile transform.py
# %load transform.py
# NOTE: the whole content of this cell is written to ./transform.py, replacing
# the file that already exists there (the cell output reports
# "Overwriting transform.py").
# Copyright (c) Facebook, Inc. and its affiliates.

"""
See "Data Augmentation" tutorial for an overview of the system:
https://detectron2.readthedocs.io/tutorials/augmentation.html
"""

import numpy as np
import torch
import torch.nn.functional as F
from fvcore.transforms.transform import (
    CropTransform,
    HFlipTransform,
    NoOpTransform,
    Transform,
    TransformList,
)
from PIL import Image

try:
    import cv2  # noqa
except ImportError:
    # OpenCV is an optional dependency at the moment
    pass

# Names exported by `from ... import *`: the five transform classes defined in
# this file. The rotated-box helper functions defined further down are not
# listed here.
__all__ = [
    "ExtentTransform",
    "ResizeTransform",
    "RotationTransform",
    "ColorTransform",
    "PILColorTransform",
]


class ExtentTransform(Transform):
    """
    Extracts a subregion from the source image and scales it to the output size.

    The fill color is used to map pixels from the source rect that fall outside
    the source image.

    See: https://pillow.readthedocs.io/en/latest/PIL.html#PIL.ImageTransform.ExtentTransform
    """

    def __init__(self, src_rect, output_size, interp=Image.BILINEAR, fill=0):
        """
        Args:
            src_rect (x0, y0, x1, y1): src coordinates
            output_size (h, w): dst image size
            interp: PIL interpolation methods
            fill: Fill color used when src_rect extends outside image
        """
        super().__init__()
        # Every argument is stored as an attribute of the same name
        # (self.src_rect, self.output_size, self.interp, self.fill).
        self._set_attributes(locals())

    def apply_image(self, img, interp=None):
        """
        Cut ``src_rect`` out of ``img`` and rescale it to ``output_size``.

        Args:
            img (ndarray): image array; an array with a trailing channel axis of
                length 1 is handled as a single-channel ("L") PIL image and the
                channel axis is restored on the result.
            interp: PIL interpolation method overriding ``self.interp``.
                ``None`` means "use ``self.interp``".

        Returns:
            ndarray: the transformed image.
        """
        h, w = self.output_size
        single_channel = len(img.shape) > 2 and img.shape[2] == 1
        if single_channel:
            pil_image = Image.fromarray(img[:, :, 0], mode="L")
        else:
            pil_image = Image.fromarray(img)
        # Compare against None explicitly: Image.NEAREST has the integer value 0,
        # so a truthiness test would silently discard it and fall back to
        # self.interp (this is what apply_segmentation relies on).
        resample = interp if interp is not None else self.interp
        pil_image = pil_image.transform(
            size=(w, h),
            method=Image.EXTENT,
            data=self.src_rect,
            resample=resample,
            fill=self.fill,
        )
        ret = np.asarray(pil_image)
        if single_channel:
            ret = np.expand_dims(ret, -1)
        return ret

    def apply_coords(self, coords):
        """
        Map points from source-image coordinates to output-image coordinates.

        Args:
            coords (ndarray): array whose columns 0 and 1 hold x and y.

        Returns:
            ndarray: a float32 copy of ``coords`` with x and y transformed.
        """
        # Transform image center from source coordinates into output coordinates
        # and then map the new origin to the corner of the output image.
        h, w = self.output_size
        x0, y0, x1, y1 = self.src_rect
        new_coords = coords.astype(np.float32)
        new_coords[:, 0] -= 0.5 * (x0 + x1)
        new_coords[:, 1] -= 0.5 * (y0 + y1)
        new_coords[:, 0] *= w / (x1 - x0)
        new_coords[:, 1] *= h / (y1 - y0)
        new_coords[:, 0] += 0.5 * w
        new_coords[:, 1] += 0.5 * h
        return new_coords

    def apply_segmentation(self, segmentation):
        """Transform a segmentation map like an image, using nearest-neighbour sampling."""
        segmentation = self.apply_image(segmentation, interp=Image.NEAREST)
        return segmentation


class ResizeTransform(Transform):
    """
    Resize the image to a target size.

    uint8 images are resized with PIL; images of any other dtype are resized
    with ``torch.nn.functional.interpolate``. Coordinates are rescaled by the
    ratio of the new size to the original size along each axis.
    """

    def __init__(self, h, w, new_h, new_w, interp=None):
        """
        Args:
            h, w (int): original image size
            new_h, new_w (int): new image size
            interp: PIL interpolation methods, defaults to bilinear.
        """
        # TODO decide on PIL vs opencv
        super().__init__()
        if interp is None:
            interp = Image.BILINEAR
        # All locals (h, w, new_h, new_w, interp) are handed to _set_attributes;
        # the methods below read them back as self.h, self.w, self.new_h, ...
        self._set_attributes(locals())

    def apply_image(self, img, interp=None):
        """
        Resize ``img`` from (h, w) to (new_h, new_w).

        Args:
            img (ndarray): array whose first two dimensions are (self.h, self.w)
                and which has at most 4 dimensions (both are asserted below).
            interp: PIL interpolation method overriding ``self.interp``;
                ``None`` means "use ``self.interp``".

        Returns:
            ndarray: the resized array.
        """
        assert img.shape[:2] == (self.h, self.w)
        assert len(img.shape) <= 4
        interp_method = interp if interp is not None else self.interp

        if img.dtype == np.uint8:
            # A trailing channel axis of length 1 is dropped for PIL ("L" mode)
            # and restored on the result further down.
            if len(img.shape) > 2 and img.shape[2] == 1:
                pil_image = Image.fromarray(img[:, :, 0], mode="L")
            else:
                pil_image = Image.fromarray(img)
            pil_image = pil_image.resize((self.new_w, self.new_h), interp_method)
            ret = np.asarray(pil_image)
            if len(img.shape) > 2 and img.shape[2] == 1:
                ret = np.expand_dims(ret, -1)
        else:
            # PIL only supports uint8
            # Arrays with negative strides are copied into a contiguous buffer
            # first (presumably because torch.from_numpy cannot wrap them).
            if any(x < 0 for x in img.strides):
                img = np.ascontiguousarray(img)
            img = torch.from_numpy(img)
            shape = list(img.shape)
            # Insert singleton dimensions after (h, w) until the shape has 4 entries.
            shape_4d = shape[:2] + [1] * (4 - len(shape)) + shape[2:]
            img = img.view(shape_4d).permute(2, 3, 0, 1)  # hw(c) -> nchw
            # Only these three PIL filters have a torch counterpart here; any
            # other interp_method raises KeyError on the lookup below.
            _PIL_RESIZE_TO_INTERPOLATE_MODE = {
                Image.NEAREST: "nearest",
                Image.BILINEAR: "bilinear",
                Image.BICUBIC: "bicubic",
            }
            mode = _PIL_RESIZE_TO_INTERPOLATE_MODE[interp_method]
            # align_corners is passed as None for "nearest" and False otherwise.
            align_corners = None if mode == "nearest" else False
            img = F.interpolate(
                img, (self.new_h, self.new_w), mode=mode, align_corners=align_corners
            )
            # Restore the caller's original rank, with the new height and width.
            shape[:2] = (self.new_h, self.new_w)
            ret = img.permute(2, 3, 0, 1).view(shape).numpy()  # nchw -> hw(c)

        return ret

    def apply_coords(self, coords):
        """
        Scale x (column 0) by new_w / w and y (column 1) by new_h / h.

        The input array is modified in place and also returned.
        """
        coords[:, 0] = coords[:, 0] * (self.new_w * 1.0 / self.w)
        coords[:, 1] = coords[:, 1] * (self.new_h * 1.0 / self.h)
        return coords

    def apply_segmentation(self, segmentation):
        """Resize a segmentation map like an image, using nearest-neighbour sampling."""
        segmentation = self.apply_image(segmentation, interp=Image.NEAREST)
        return segmentation

    def inverse(self):
        """Return the transform that resizes from (new_h, new_w) back to (h, w)."""
        return ResizeTransform(self.new_h, self.new_w, self.h, self.w, self.interp)


class RotationTransform(Transform):
    """
    This method returns a copy of this image, rotated the given
    number of degrees counter clockwise around its center.
    """

    def __init__(self, h, w, angle, expand=True, center=None, interp=None):
        """
        Args:
            h, w (int): original image size
            angle (float): degrees for rotation
            expand (bool): choose if the image should be resized to fit the whole
                rotated image (default), or simply cropped
            center (tuple (width, height)): coordinates of the rotation center
                if left to None, the center will be fit to the center of each image
                center has no effect if expand=True because it only affects shifting
            interp: cv2 interpolation method, default cv2.INTER_LINEAR
        """
        super().__init__()
        image_center = np.array((w / 2, h / 2))
        if center is None:
            center = image_center
        if interp is None:
            interp = cv2.INTER_LINEAR
        abs_cos, abs_sin = (abs(np.cos(np.deg2rad(angle))), abs(np.sin(np.deg2rad(angle))))
        if expand:
            # find the new width and height bounds
            bound_w, bound_h = np.rint(
                [h * abs_sin + w * abs_cos, h * abs_cos + w * abs_sin]
            ).astype(int)
        else:
            bound_w, bound_h = w, h

        # All locals computed above (including image_center, abs_cos, abs_sin,
        # bound_w, bound_h) are handed to _set_attributes; the methods below read
        # them back as self.image_center, self.bound_w, ...
        self._set_attributes(locals())
        # Two matrices are kept: one for coordinates and one, built with a -0.5
        # offset, for images.
        self.rm_coords = self.create_rotation_matrix()
        # Needed because of this problem https://github.com/opencv/opencv/issues/11784
        self.rm_image = self.create_rotation_matrix(offset=-0.5)

    def apply_image(self, img, interp=None):
        """
        img should be a numpy array, formatted as Height * Width * Nchannels

        Empty input, or an angle that is a multiple of 360, returns ``img``
        unchanged. ``interp`` overrides ``self.interp`` when it is not None.
        """
        if len(img) == 0 or self.angle % 360 == 0:
            return img
        assert img.shape[:2] == (self.h, self.w)
        interp = interp if interp is not None else self.interp
        return cv2.warpAffine(img, self.rm_image, (self.bound_w, self.bound_h), flags=interp)

    def apply_coords(self, coords):
        """
        coords should be a N * 2 array-like, containing N couples of (x, y) points

        The input is converted to a float array first; empty input, or an angle
        that is a multiple of 360, returns that converted array as is.
        """
        coords = np.asarray(coords, dtype=float)
        if len(coords) == 0 or self.angle % 360 == 0:
            return coords
        # cv2.transform is given an (N, 1, 2) array; the added axis is removed again.
        return cv2.transform(coords[:, np.newaxis, :], self.rm_coords)[:, 0, :]

    def apply_segmentation(self, segmentation):
        """Rotate a segmentation map like an image, using cv2.INTER_NEAREST."""
        segmentation = self.apply_image(segmentation, interp=cv2.INTER_NEAREST)
        return segmentation

    def create_rotation_matrix(self, offset=0):
        """
        Build the affine matrix for this rotation.

        Args:
            offset: value added to both coordinates of the rotation center (and,
                when ``expand`` is set, to the image center and the new center).

        Returns:
            The matrix from ``cv2.getRotationMatrix2D`` (scale 1); when
            ``expand`` is set, its translation column is shifted so that the
            image center lands on the center of the (bound_w, bound_h) output.
        """
        center = (self.center[0] + offset, self.center[1] + offset)
        rm = cv2.getRotationMatrix2D(tuple(center), self.angle, 1)
        if self.expand:
            # Find the coordinates of the center of rotation in the new image
            # The only point for which we know the future coordinates is the center of the image
            rot_im_center = cv2.transform(self.image_center[None, None, :] + offset, rm)[0, 0, :]
            new_center = np.array([self.bound_w / 2, self.bound_h / 2]) + offset - rot_im_center
            # shift the rotation center to the new coordinates
            rm[:, 2] += new_center
        return rm

    def inverse(self):
        """
        The inverse is to rotate it back with expand, and crop to get the original shape.

        Raises:
            NotImplementedError: if this transform was created with expand=False.
        """
        if not self.expand:  # Not possible to inverse if a part of the image is lost
            raise NotImplementedError()
        # Rotate the expanded (bound_h, bound_w) image by the opposite angle ...
        rotation = RotationTransform(
            self.bound_h, self.bound_w, -self.angle, True, None, self.interp
        )
        # ... then crop a (w, h) window whose offsets are half of the size difference.
        crop = CropTransform(
            (rotation.bound_w - self.w) // 2, (rotation.bound_h - self.h) // 2, self.w, self.h
        )
        return TransformList([rotation, crop])


class ColorTransform(Transform):
    """
    Adapter that turns any photometric operation into a transform.

    Only pixel values are touched: coordinates and segmentation maps pass
    through unchanged, so annotations such as bounding boxes stay valid.
    """

    def __init__(self, op):
        """
        Args:
            op (Callable): function applied to the image; it receives an
                ndarray and must return an ndarray.

        Raises:
            ValueError: if ``op`` is not callable.
        """
        if not callable(op):
            raise ValueError("op parameter should be callable")
        super().__init__()
        # Stores ``op`` as ``self.op``.
        self._set_attributes(locals())

    # --- geometry is left untouched -------------------------------------

    def apply_coords(self, coords):
        return coords

    def apply_segmentation(self, segmentation):
        return segmentation

    # --- only the image itself is modified -------------------------------

    def apply_image(self, img):
        return self.op(img)

    def inverse(self):
        return NoOpTransform()


class PILColorTransform(ColorTransform):
    """
    ColorTransform variant for operations that work on PIL images.

    Like its parent, it changes the color space only and never the
    coordinate space of the image.
    """

    def __init__(self, op):
        """
        Args:
            op (Callable): function that takes a PIL Image and returns the
                transformed PIL Image. For possible operations see:
                - https://pillow.readthedocs.io/en/stable/

        Raises:
            ValueError: if ``op`` is not callable (raised by the parent
                constructor with the same message).
        """
        super().__init__(op)

    def apply_image(self, img):
        # ndarray -> PIL image -> op -> ndarray
        as_pil = Image.fromarray(img)
        transformed = super().apply_image(as_pil)
        return np.asarray(transformed)


def HFlip_rotated_box(transform, rotated_boxes):
    """
    Mirror rotated boxes horizontally, in place.

    Args:
        transform: object exposing ``width``, the width of the flipped image.
        rotated_boxes (ndarray): Nx5 floating point array of
            (x_center, y_center, width, height, angle_degrees) format
            in absolute coordinates.

    Returns:
        ndarray: the same ``rotated_boxes`` array, with x_center reflected
        about the image width and the angle negated.
    """
    x_center = rotated_boxes[:, 0]
    angle = rotated_boxes[:, 4]
    # The two columns are independent, so the update order does not matter.
    rotated_boxes[:, 4] = np.negative(angle)
    rotated_boxes[:, 0] = transform.width - x_center
    return rotated_boxes


def Resize_rotated_box(transform, rotated_boxes):
    """
    Rescale rotated boxes, in place, to follow an image resize. The formulas are
    approximations; see :meth:`RotatedBoxes.scale` for their derivation.

    Args:
        transform: object exposing ``w``, ``h`` (original size) and
            ``new_w``, ``new_h`` (target size).
        rotated_boxes (ndarray): Nx5 floating point array of
            (x_center, y_center, width, height, angle_degrees) format
            in absolute coordinates.

    Returns:
        ndarray: the same ``rotated_boxes`` array after the update.
    """
    x_scale = transform.new_w * 1.0 / transform.w
    y_scale = transform.new_h * 1.0 / transform.h

    # Box centers scale independently along each axis.
    rotated_boxes[:, 0] *= x_scale
    rotated_boxes[:, 1] *= y_scale

    # Everything below is derived from the *original* angle, so compute all
    # factors before writing any of the remaining columns.
    angle_rad = rotated_boxes[:, 4] * np.pi / 180.0
    cos_a = np.cos(angle_rad)
    sin_a = np.sin(angle_rad)
    width_gain = np.sqrt(np.square(x_scale * cos_a) + np.square(y_scale * sin_a))
    height_gain = np.sqrt(np.square(x_scale * sin_a) + np.square(y_scale * cos_a))
    new_angle = np.arctan2(x_scale * sin_a, y_scale * cos_a) * 180 / np.pi

    rotated_boxes[:, 2] *= width_gain
    rotated_boxes[:, 3] *= height_gain
    rotated_boxes[:, 4] = new_angle

    return rotated_boxes


# Register the "rotated_box" handlers on the flip, resize and no-op transforms
# (same three registrations, in the same order).
for _transform_type, _rotated_box_handler in (
    (HFlipTransform, HFlip_rotated_box),
    (ResizeTransform, Resize_rotated_box),
    # not necessary any more with latest fvcore
    (NoOpTransform, lambda t, x: x),
):
    _transform_type.register_type("rotated_box", _rotated_box_handler)
del _transform_type, _rotated_box_handler


Overwriting transform.py


In [8]:
%cd ~/../kaggle/working/paco
os.listdir()

/kaggle/working/paco


['tools',
 'scripts',
 'transform.py',
 'README.md',
 'state.db',
 'LICENSE',
 '.pre-commit-config.yaml',
 'configs',
 '.flake8',
 'CONTRIBUTING.md',
 '.eggs',
 'paco',
 'tranform.py',
 '.git',
 '.gitignore',
 'requirements.txt',
 'docs',
 'paco.egg-info',
 'setup.py',
 'notebooks',
 'CODE_OF_CONDUCT.md']

## Load Dataset for training

In [9]:
##### Change the paths accordingly!
# Both paths are relative to the current working directory (the PACO checkout).
dataset_file_name = "../../input/paco-annotations/paco_lvis_v1/paco_lvis_v1_val.json"
image_root_dir = "../../input/coco-2017-dataset/coco2017"
# Load dataset
# JSON is read as UTF-8 explicitly instead of relying on the locale default.
with open(dataset_file_name, encoding="utf-8") as f:
    dataset = json.load(f)

In [None]:
def get_masks(im_fn, anns, cat_id_to_name, mask_type="part"):
    """
    Decode the segmentation masks of one image into RGB mask arrays.

    Args:
        im_fn: image file name. Kept for interface compatibility only; the
            image is no longer opened, because its pixels were never used.
        anns: iterable of ``(ann, part_anns)`` pairs, where ``ann`` is an
            object annotation and ``part_anns`` is a list of part
            annotations. Every annotation must have a "segmentation" entry.
        cat_id_to_name: mapping from category id to name. Kept for interface
            compatibility only; the label list built from it was never
            returned.
        mask_type: "part" collects the part annotations' segmentations; any
            other value collects the object annotations' segmentations.

    Returns:
        list of ndarray: one ``(H, W, 3)`` uint8 array per decoded mask, with
        255 inside the mask and 0 elsewhere. Annotations with an empty
        segmentation and decoded masks with fewer than 2 dimensions are
        skipped.
    """
    # NOTE(review): `mask_util` is not imported in any visible cell of this
    # notebook; presumably it is `pycocotools.mask` -- confirm and import it in
    # the imports cell. Its `decode` is assumed to accept the stored
    # "segmentation" value directly.

    # Collect every non-empty segmentation of the requested kind.
    segmentations = []
    for ann, part_anns in anns:
        candidates = part_anns if mask_type == "part" else [ann]
        for candidate in candidates:
            segmentation = candidate["segmentation"]
            # Truthiness also skips None / {} instead of handing them to the decoder.
            if segmentation:
                segmentations.append(segmentation)

    ans_masks = []
    for segmentation in segmentations:
        decoded = mask_util.decode(segmentation)
        # Validate before converting: calling .astype on None would raise.
        if decoded is None or decoded.ndim <= 1:
            continue
        binary_mask = decoded.astype(bool)
        # Replicate the binary mask on three channels and scale it to 0/255.
        rgb_mask = np.stack([binary_mask] * 3, axis=-1).astype(np.uint8)
        rgb_mask *= 255
        ans_masks.append(rgb_mask)
    return ans_masks
    
   

In [None]:
%cd ~/../kaggle/working/paco
os.listdir()

# Mask Export Loop (generates the training masks)

In [29]:
vis_offset = 0                        # Offset into the list of images to process
vis_num_im = 29598                    # Number of images to process
vis_num_cats = set(range(15))        # Include only images that have the number of categories in this set, set to None to disable
vis_num_boxes = set(range(15))       # Include only images that have the number of boxes in this set, set to None to disable
vis_num_parts = set(range(2, 15))    # Include only images that have the number of parts in this set, set to None to disable
vis_mask_type = "part"                # Mask type, one of "part" or "obj"

# NOTE(review): im_id_to_anns, im_id_to_cats, im_id_to_mean_box_area,
# image_id_to_image_file_name and cat_id_to_name are not defined in any visible
# cell of this notebook; they must exist in the kernel before this cell runs.

# Start with all images.
im_ids = im_id_to_anns.keys()
# Select images satisfying a limit on the number of categories.
if vis_num_cats is not None:
    im_ids = [im_id for im_id in im_ids if len(im_id_to_cats[im_id]) in vis_num_cats]
# Further select a subset of images satisfying a limit on the number of boxes.
if vis_num_boxes is not None:
    im_ids = [im_id for im_id in im_ids if len(im_id_to_anns[im_id]) in vis_num_boxes]
# Narrow down further by limiting the number of part annotations.
# Counting the parts per (ann, part_anns) pair avoids concatenating lists and
# also works for an image without annotations (0 parts) instead of raising
# IndexError.
if vis_num_parts is not None:
    im_ids = [
        im_id
        for im_id in im_ids
        if sum(len(part_anns) for _, part_anns in im_id_to_anns[im_id]) in vis_num_parts
    ]
# Sort by box area.
im_ids = set(im_ids)
im_ids = [im_id for im_id, mean_box_area in sorted(im_id_to_mean_box_area.items(), key=lambda x: x[1], reverse=True) if im_id in im_ids]

print("Number of images to process:", len(im_ids))

# Make sure the output directory exists before writing into it.
out_dir = "../reupdated-paco-masks/train"
os.makedirs(out_dir, exist_ok=True)

count = 0        # images processed so far
indx = 0         # running index over all masks written; part of the file name
num_failed = 0   # masks that could not be written
first_error = None
## MAIN EXPORT LOOP
for im_id in im_ids[vis_offset:vis_offset+vis_num_im]:
    im_fn = image_id_to_image_file_name[im_id]
    anns = im_id_to_anns[im_id]

    im2 = get_masks(im_fn, anns, cat_id_to_name, vis_mask_type)
    for mask in im2:
        try:
            # get_masks returns NumPy arrays; they must be wrapped in a PIL image
            # before they can be saved (ndarray has no .save method).
            # NOTE(review): JPEG is lossy, so the stored masks are not guaranteed
            # to contain only 0 and 255; consider PNG if exact values matter.
            Image.fromarray(mask).save(f"{out_dir}/{im_id}_{indx}.jpg")
        except (OSError, ValueError, TypeError) as err:
            # Best effort: skip masks that cannot be written, but keep track of
            # them instead of hiding every error behind a bare `except`.
            num_failed += 1
            if first_error is None:
                first_error = f"image {im_id}, mask index {indx}: {err!r}"
            continue
        indx += 1
    if count % 500 == 0:
        print(count)
    count += 1

print(f"Wrote {indx} masks to {out_dir}")
if num_failed:
    print(f"Skipped {num_failed} masks that could not be saved; first error: {first_error}")

Number of images to visualize: 29598
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000
23500
24000
24500
25000
25500
26000
26500
27000
27500
28000
28500
29000
29500


## Downloading all the processed masks

In [None]:
%cd ..
!mkdir paco-masks-val
%cd paco-masks-val
!mkdir val
%cd ../../paco
!ls

In [30]:
%cd ..
os.listdir()

/kaggle/working


['state.db',
 'reupdated-paco-masks',
 'paco-masks-fixed',
 'fole.zip',
 '.virtual_documents',
 'fele.zip',
 'updated-paco-masks',
 'file.zip',
 'paco',
 'paco-masks']

In [21]:
!ls ./

file.zip  paco	      paco-masks-fixed	    state.db
fole.zip  paco-masks  reupdated-paco-masks  updated-paco-masks


In [None]:
!zip -r fele.zip /kaggle/working/reupdated-paco-masks

updating: kaggle/working/reupdated-paco-masks/ (stored 0%)
  adding: kaggle/working/reupdated-paco-masks/train/ (stored 0%)
  adding: kaggle/working/reupdated-paco-masks/train/516668_27348.jpg (deflated 77%)
  adding: kaggle/working/reupdated-paco-masks/train/450383_119082.jpg (deflated 89%)
  adding: kaggle/working/reupdated-paco-masks/train/52611_26895.jpg (deflated 70%)
  adding: kaggle/working/reupdated-paco-masks/train/49371_24711.jpg (deflated 78%)
  adding: kaggle/working/reupdated-paco-masks/train/360271_45198.jpg (deflated 89%)
  adding: kaggle/working/reupdated-paco-masks/train/516625_60841.jpg (deflated 71%)
  adding: kaggle/working/reupdated-paco-masks/train/281188_368.jpg (deflated 59%)
  adding: kaggle/working/reupdated-paco-masks/train/357526_105183.jpg (deflated 69%)
  adding: kaggle/working/reupdated-paco-masks/train/167028_155802.jpg (deflated 86%)
  adding: kaggle/working/reupdated-paco-masks/train/521428_94419.jpg (deflated 64%)
  adding: kaggle/working/reupdated-pa

In [32]:
# Render a clickable download link for the archive (path relative to the
# current working directory).
from IPython.display import FileLink

FileLink("fele.zip")

In [None]:
!ls ./kaggle/working