In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from PIL import Image
import json
from matplotlib import pyplot as plt
import os
from composite import *
from tqdm import tqdm
import cv2
import albumentations as A
import pandas as pd
import random

import shutil
from sahi.utils.file import load_json, save_json
from sahi.utils.coco import Coco, CocoCategory, CocoImage, CocoAnnotation, merge_from_list
from sahi.utils.cv import get_coco_segmentation_from_bool_mask, get_bbox_from_bool_mask
from sklearn.model_selection import GroupKFold

## Load the image directory and the annotations from which additional synthetic defects will be generated

In [3]:
# Location of the raw PS5 images and of the JSON file holding their defect annotations.
image_path = "./Datasets/ps5_dataset/images/"
annotation_file = "./Datasets/ps5_dataset/ps5_annotations.json"

# Parse the whole annotation file into a dict keyed by image file name.
with open(annotation_file) as fh:
    annotations = json.loads(fh.read())

In [4]:
"""
The structure of the json annotation file is as follows:
{
    image_name: [
        {
            bbox: [x, y, w, h],  # bounding box coordinate
            seg1: [[x1, y1, x2, y2, ...], [x1, y1, x2, y2, ...], ...],  # polygon annotation from annotator 1, can have multiple polygons
            seg2: [[x1, y1, x2, y2, ...], [x1, y1, x2, y2, ...], ...],  # polygon annotation from annotator 2, can have multiple polygons
            category: a string representing defect category
        },
        ...
    ]  # list of defect annotations
}

Note: either or both of seg1 and seg2 annotation could be empty for some defects that are hard to identify from image alone, we can choose to ignore these defects for now
"""
# The bare string literal above only documents the annotation schema; it has no runtime effect.
# print() is the last statement of the cell, so the cell emits a single blank line
# (presumably to keep the string from being echoed as the cell result - confirm intent).
print()




### 1. Training / validation split
Only the training set is augmented; every experiment is evaluated on the same held-out validation set.

In [5]:
# Build one row per annotated defect. The polygons of both annotators are
# rasterised into a single mask (their union), from which the COCO bounding box
# and segmentation are derived.
cat_id = {'Collision': 0, 'Dirty': 1, 'Scratch': 2}
rows = []
defect_id = 0  # running index, unique across all images
images = sorted(annotations.keys())
for image_id, image_name in enumerate(images):
    # Only the dimensions are needed, so the pixel data is never decoded; the
    # context manager closes the file handle after each image.
    with Image.open(os.path.join(image_path, image_name)) as image:
        width, height = image.size
    defects = annotations[image_name]
    for defect in defects:
        mask = np.zeros((height, width), dtype=np.uint8)
        # Draw annotator 1 and annotator 2 into the same mask.
        for seg_key in ('seg1', 'seg2'):
            # cv2.fillPoly needs int32 point arrays of shape (N, 2); reshape also
            # accepts the flat [x1, y1, x2, y2, ...] layout. Empty polygons are
            # dropped because they cannot be drawn.
            polygons = [
                np.asarray(poly, dtype=np.int32).reshape(-1, 2)
                for poly in defect[seg_key]
                if len(poly) > 0
            ]
            if polygons:
                # NOTE(review): LINE_AA writes intermediate values on the edges; any
                # non-zero pixel appears to be treated as foreground by the sahi
                # helpers below - confirm this is intended.
                mask = cv2.fillPoly(mask, polygons, 255, lineType=cv2.LINE_AA)
        rows.append({
            'defect_id': defect_id,
            'image_id': image_id,
            'image_name': image_name,
            'category': defect['category'],
            'category_id': cat_id[defect['category']],
            'bbox': get_bbox_from_bool_mask(mask),
            'seg': get_coco_segmentation_from_bool_mask(mask),
        })
        # Previously never incremented, which left every row with defect_id == 0.
        defect_id += 1
df = pd.DataFrame(rows)

In [6]:
# Group-wise train/validation split. GroupKFold keeps all defects of one image
# in the same fold (groups=image_id), so no image ends up in both sets. It does
# NOT stratify: category_id is passed as y but is not used to balance the folds.
# Only the first of the 5 folds is used, i.e. roughly an 80/20 split.
gkf = GroupKFold(n_splits=5)
fold_iter = gkf.split(df.defect_id, df.category_id, groups=df.image_id)
train, valid = next(fold_iter)
print(f"{len(train)} {len(valid)}")

# Translate row positions back into the distinct image file names of each set.
train_imgs = df['image_name'].iloc[train].unique()
valid_imgs = df['image_name'].iloc[valid].unique()
print(train_imgs)
print(valid_imgs)

# Sanity check: the two sets must not share an image.
assert set(train_imgs).isdisjoint(valid_imgs)

440 110
['300000000010_3_1_TA07_2_20220601171213802_03.jpg'
 '300000000010_3_2_TA07_2_20220602103535889_03.jpg'
 '300000000010_3_5_TA07_2_20220602171036647_03.jpg'
 '300000000010_3_8_TA07_2_20220605153523678_09.jpg'
 '300000000010_3_9_TA07_2_20220605155520020_01.jpg'
 '300000000011_3_1_TA07_2_20220601171323197_06.jpg'
 '300000000011_3_3_TA07_2_20220602110348434_06.jpg'
 '300000000011_3_5_TA07_2_20220602171111989_06.jpg'
 '300000000011_3_7_TA07_2_20220605150424328_06.jpg'
 '300000000011_3_8_TA07_2_20220605153537096_06.jpg'
 '300000000011_3_9_TA07_2_20220605155533358_06.jpg'
 '300000000012_3_1_TA07_2_20220601171358055_06.jpg'
 '300000000012_3_2_TA07_2_20220602103627282_09.jpg'
 '300000000012_3_6_TA07_2_20220605143753833_06.jpg'
 '300000000013_3_3_TA07_2_20220602110450822_03.jpg'
 '300000000013_3_4_TA07_2_20220602155807636_06.jpg'
 '300000000013_3_5_TA07_2_20220602171240943_02.jpg'
 '300000000013_3_6_TA07_2_20220605143901549_03.jpg'
 '300000000013_3_7_TA07_2_20220605150638599_02.jpg'
 '30

### 1.1  Save the raw training and validation sets as a baseline
The save path is ```'./Raw_dataset/ps5_seg_coco/'```

In [None]:
# Export the raw train/valid split as COCO datasets with SAHI, then cut both
# splits into overlapping 512x512 tiles.
from sahi.slicing import slice_coco

data_path = './Raw_dataset/ps5_seg_coco/'
slice_path = './Raw_dataset/ps5_sliced/'
splits = ('train', 'valid')

# shutil.copy does not create missing parent directories, so the per-split
# folders have to exist before the first image is copied into them.
for split in splits:
    os.makedirs(os.path.join(data_path, split), exist_ok=True)

coco_train = Coco()
coco_valid = Coco()
for category, i in cat_id.items():
    coco_train.add_category(CocoCategory(id=i, name=category))
    coco_valid.add_category(CocoCategory(id=i, name=category))

# Constant-time membership tests instead of scanning the name arrays per image.
train_names = set(train_imgs)
valid_names = set(valid_imgs)
# Group the defect rows once instead of filtering the whole frame for every image.
defects_by_image = {name: group for name, group in df.groupby('image_name', sort=False)}
no_defects = df.iloc[0:0]

for image_name in images:
    # Only the dimensions are needed; the context manager closes the file handle.
    with Image.open(os.path.join(image_path, image_name)) as image:
        width, height = image.size
    coco_image = CocoImage(file_name=image_name, height=height, width=width)

    for _, defect in defects_by_image.get(image_name, no_defects).iterrows():
        # Skip defects without a usable bounding box (presumably those whose
        # polygons were empty for both annotators - confirm against the data).
        if defect.bbox:
            coco_image.add_annotation(
                CocoAnnotation(
                    bbox=defect.bbox,
                    segmentation=defect.seg,
                    category_id=defect.category_id,
                    category_name=defect.category
                )
            )

    # Images that belong to neither set (e.g. images without any defect row)
    # are not exported.
    if image_name in train_names:
        shutil.copy(os.path.join(image_path, image_name), os.path.join(data_path, 'train', image_name))
        coco_train.add_image(coco_image)
    elif image_name in valid_names:
        shutil.copy(os.path.join(image_path, image_name), os.path.join(data_path, 'valid', image_name))
        coco_valid.add_image(coco_image)

print(coco_train.stats["num_annotations_per_category"])
print(coco_valid.stats["num_annotations_per_category"])
save_json(coco_train.json, os.path.join(data_path, 'train', '_annotations.coco.json'))
save_json(coco_valid.json, os.path.join(data_path, 'valid', '_annotations.coco.json'))

# Create the sliced datasets with SAHI; both splits use identical settings.
sliced_coco_dicts = {}
for split in splits:
    sliced_coco_dicts[split], _ = slice_coco(
        coco_annotation_file_path=os.path.join(data_path, split, '_annotations.coco.json'),
        image_dir=os.path.join(data_path, split),
        output_coco_annotation_file_name=None,
        ignore_negative_samples=False,
        output_dir=os.path.join(slice_path, split),
        slice_height=512,
        slice_width=512,
        overlap_height_ratio=0.2,
        overlap_width_ratio=0.2,
        min_area_ratio=0.1,
        verbose=False
    )
    save_json(sliced_coco_dicts[split], os.path.join(slice_path, split, '_annotations.coco.json'))

# Keep the original variable names available for any cell that reads them.
train_sliced_coco_dict = sliced_coco_dicts['train']
valid_sliced_coco_dict = sliced_coco_dicts['valid']

print("Done!")

### 2. Synthetic defect generation
We will augment the TRAINING set with the following steps

In [None]:
# Root folder that receives every artefact of the augmentation pipeline.
work_dir = './Augment_result/ps5_dataset/'

### 2.1. Extract the defects from the training set and save them in ```output_defect_source_dir``` (```work_dir/training_defect_sources/```)
Each target defect is moved to the middle of the image to facilitate image augmentation in the next step.
1. Images are saved in ```work_dir/training_defect_sources/```
2. The defect annotation file is saved as ```work_dir/training_defect_sources/training_defect_library_annotations.json```

In [None]:
# Destination folder for the defect sources taken from the training images.
output_defect_source_dir = f"{work_dir}training_defect_sources/"

defect_source_prep(
    image_path,
    train_imgs,
    annotations,
    output_defect_source_dir,
)

### 2.2. Augment defects from "output_defect_source_dir" and save them in "augmented_defect_dir"
The defects are augmented using operations from the ```albumentations``` package.
1. Images are saved in ```work_dir/augmented_defect_library/```
2. The defect annotation file is saved as ```work_dir/augmented_defect_library/augmented_training_defect_library_annotations.json```

In [None]:
# Input: the defect sources and their annotation file from the previous step.
defects_path = output_defect_source_dir
defect_annotation_file = f"{output_defect_source_dir}training_defect_library_annotations.json"

# Output: folder that receives the augmented defect library.
augmented_defect_dir = f"{work_dir}augmented_defect_library/"

augment(
    defects_path,
    defect_annotation_file,
    augmented_defect_dir,
)

### 2.3. Generate the synthetic defect dataset (training set) and save it in "output_path"
Sample backgrounds from ```input_background_images_list``` and defects from ```defects_path```, then seamlessly merge them together.
1. Images are saved in ```work_dir/augmented_training_set/```
2. The annotation file is saved as ```work_dir/augmented_training_set/generated_training_images_annotation.json```

In [None]:
# Augmented defect library and its annotation file.
defects_path = augmented_defect_dir
defect_annotation_file = f"{defects_path}augmented_training_defect_library_annotations.json"

# Folder that receives the generated training set.
output_path = f"{work_dir}augmented_training_set/"

# Backgrounds are the training images; their existing annotations are passed
# along as well (use [] when the backgrounds are clean).
input_background_images_list = train_imgs
input_image_annotation = annotations

generate_new_dataset(
    image_path,
    input_background_images_list,
    input_image_annotation,
    defects_path,
    defect_annotation_file,
    output_path,
)

### 2.4 Convert 'generated_training_images_annotation.json' to COCO format

In [8]:
# Locate the generated (synthetic) training set and its annotation file.
work_dir = './Augment_result/ps5_dataset/'
output_path = f"{work_dir}augmented_training_set/"
defect_annotation_file = f"{output_path}generated_training_images_annotation.json"

# NOTE: image_path, annotation_file and annotations are rebound here; from this
# cell on they refer to the synthetic dataset, not the raw one loaded earlier.
image_path = output_path
annotation_file = defect_annotation_file
with open(annotation_file) as fh:
    annotations = json.loads(fh.read())

In [9]:
# Build one row per defect of the synthetic dataset. Only 'seg1' is rasterised;
# 'seg2' is deliberately ignored here (presumably the generated annotations
# carry their polygons in seg1 only - confirm against the generator output).
cat_id = {'Collision': 0, 'Dirty': 1, 'Scratch': 2}
rows = []
defect_id = 0  # running index, unique across all images
images = sorted(annotations.keys())
for image_id, image_name in enumerate(images):
    # Only the dimensions are needed, so the pixel data is never decoded; the
    # context manager closes the file handle after each image.
    with Image.open(os.path.join(image_path, image_name)) as image:
        width, height = image.size
    defects = annotations[image_name]
    for defect in defects:
        mask = np.zeros((height, width), dtype=np.uint8)
        # cv2.fillPoly needs int32 point arrays of shape (N, 2); reshape also
        # accepts the flat [x1, y1, x2, y2, ...] layout. Empty polygons are
        # dropped because they cannot be drawn.
        seg1 = [
            np.asarray(poly, dtype=np.int32).reshape(-1, 2)
            for poly in defect['seg1']
            if len(poly) > 0
        ]
        if seg1:
            mask = cv2.fillPoly(mask, seg1, 255, lineType=cv2.LINE_AA)
        rows.append({
            'defect_id': defect_id,
            'image_id': image_id,
            'image_name': image_name,
            'category': defect['category'],
            'category_id': cat_id[defect['category']],
            'bbox': get_bbox_from_bool_mask(mask),
            'seg': get_coco_segmentation_from_bool_mask(mask),
        })
        # Previously never incremented, which left every row with defect_id == 0.
        defect_id += 1
df = pd.DataFrame(rows)

In [10]:
# Every synthetic image belongs to the training split; there is no synthetic
# validation data.
train_imgs = sorted(annotations.keys())
# The COCO export cell tests membership in `valid_imgs`, so that is the name
# that has to be reset. Otherwise the validation names of the raw-data split
# linger as stale state.
valid_imgs = []
# Original (misspelled) name kept as an alias for backward compatibility.
val_imges = valid_imgs

In [None]:
# # Generate Coco Style Annotation with Sahi
# data_path = work_dir + 'ps5_seg_coco/'
# coco_train = Coco()
# coco_valid = Coco()

# for category, i in cat_id.items():
#     coco_train.add_category(CocoCategory(id=i, name=category))
#     coco_valid.add_category(CocoCategory(id=i, name=category))

# for image_id, image_name in enumerate(images):
#     image = Image.open(os.path.join(image_path, image_name)).convert('RGB')
#     coco_image = CocoImage(file_name=image_name, height=image.height, width=image.width)

#     defects = df[df.image_name == image_name]
#     for _, defect in defects.iterrows():
#         # print(defect.bbox, defect.seg, defect.category_id, defect.category)
#         if defect.bbox:
#             coco_image.add_annotation(
#                 CocoAnnotation(
#                     bbox=defect.bbox,
#                     segmentation=defect.seg,
#                     category_id=defect.category_id,
#                     category_name=defect.category
#                 )
#             )
#     if image_name in train_imgs:
#         shutil.copy(os.path.join(image_path, image_name), os.path.join(data_path, 'train', image_name))
#         coco_train.add_image(coco_image)
# print(coco_train.stats["num_annotations_per_category"])
# save_json(coco_train.json, os.path.join(data_path, 'train', '_annotations.coco.json'))


In [12]:
# Export the synthetic dataset as COCO annotations with SAHI. With
# binarize=True every defect is mapped onto the single category 'NG' (id 0);
# otherwise the original categories from cat_id are kept.
binarize = True
data_path = work_dir + 'ps5_seg_coco/'

# shutil.copy does not create missing parent directories, so the per-split
# folders have to exist before the first image is copied into them.
for split in ('train', 'valid'):
    os.makedirs(os.path.join(data_path, split), exist_ok=True)

coco_train = Coco()
coco_valid = Coco()

if binarize:
    coco_train.add_category(CocoCategory(id=0, name='NG'))
    coco_valid.add_category(CocoCategory(id=0, name='NG'))
else:
    for category, i in cat_id.items():
        coco_train.add_category(CocoCategory(id=i, name=category))
        coco_valid.add_category(CocoCategory(id=i, name=category))

# Constant-time membership tests instead of scanning the name lists per image.
train_names = set(train_imgs)
valid_names = set(valid_imgs)
# Group the defect rows once instead of filtering the whole frame for every image.
defects_by_image = {name: group for name, group in df.groupby('image_name', sort=False)}
no_defects = df.iloc[0:0]

for image_name in images:
    # Only the dimensions are needed; the context manager closes the file handle.
    with Image.open(os.path.join(image_path, image_name)) as image:
        width, height = image.size
    coco_image = CocoImage(file_name=image_name, height=height, width=width)

    for _, defect in defects_by_image.get(image_name, no_defects).iterrows():
        # Skip defects without a usable bounding box (presumably those whose
        # mask stayed empty - confirm against the data).
        if defect.bbox:
            coco_image.add_annotation(
                CocoAnnotation(
                    bbox=defect.bbox,
                    segmentation=defect.seg,
                    category_id=0 if binarize else defect.category_id,
                    category_name='NG' if binarize else defect.category
                )
            )

    if image_name in train_names:
        shutil.copy(os.path.join(image_path, image_name), os.path.join(data_path, 'train', image_name))
        coco_train.add_image(coco_image)
    elif image_name in valid_names:
        shutil.copy(os.path.join(image_path, image_name), os.path.join(data_path, 'valid', image_name))
        coco_valid.add_image(coco_image)

print(coco_train.stats["num_annotations_per_category"])
print(coco_valid.stats["num_annotations_per_category"])
save_json(coco_train.json, os.path.join(data_path, 'train', '_annotations.coco.json'))
save_json(coco_valid.json, os.path.join(data_path, 'valid', '_annotations.coco.json'))

{'NG': 99840}
{'NG': 0}


In [None]:
# # Create Sliced Datasets with Sahi
# from sahi.slicing import slice_coco
# from sahi.utils.file import load_json, save_json


# data_path = work_dir + 'ps5_seg_coco/'
# slice_path = work_dir + 'ps5_sliced/'
# train_sliced_coco_dict, _ = slice_coco(
#     coco_annotation_file_path=os.path.join(data_path, 'train', '_annotations.coco.json'),
#     image_dir=os.path.join(data_path, 'train'),
#     output_coco_annotation_file_name=None,
#     ignore_negative_samples=False,
#     output_dir=os.path.join(slice_path, 'train'),
#     slice_height=512,
#     slice_width=512,
#     overlap_height_ratio=0.2,
#     overlap_width_ratio=0.2,
#     min_area_ratio=0.1,
#     verbose=False
# )
# save_json(train_sliced_coco_dict, os.path.join(slice_path, 'train', '_annotations.coco.json'))


# print("Done!")