# labelme2coco

In [1]:
import json
from glob import glob

In [2]:
# Glob pattern matching the per-image annotation text files of the training set.
ann_path = './data/train/*.txt'
# glob() yields matches in arbitrary, filesystem-dependent order. Sort them so
# anything derived from a file's position in this list is reproducible.
ann_files = sorted(glob(ann_path))

In [3]:
def labelmetxt2coco(anns, img_height=1040, img_width=1920):
    """Convert per-image annotation text files into a single COCO-style dict.

    Every non-blank line of an annotation file is split on whitespace. Field 0
    is the class index (parsed via ``int(float(...))``); fields 1-2 are read as
    point 1 (x, y) and fields 5-6 as point 3 (x, y). Fields 3-4 and 7-8 are
    ignored. The box is stored as ``[pt1x, pt1y, pt3x - pt1x, pt3y - pt1y]``.
    NOTE(review): this presumes point 1 / point 3 are the top-left /
    bottom-right corners; confirm against the labelling tool's output.

    Args:
        anns: Iterable of annotation file paths. The position of a path in
            this iterable becomes the image id.
        img_height: Height recorded for every image (the images are not opened).
        img_width: Width recorded for every image.

    Returns:
        dict with the keys ``"categories"``, ``"images"`` and ``"annotations"``.

    Raises:
        OSError: If an annotation file cannot be opened.
        ValueError, IndexError: If a non-blank line is malformed.
    """
    # Numeric ids would work too; car-model names are used so that the classes
    # can be identified by eye.
    classes = [
        "chevrolet_malibu_sedan_2012_2016",
        "chevrolet_malibu_sedan_2017_2019",
        "chevrolet_spark_hatchback_2016_2021",
        "chevrolet_trailblazer_suv_2021_",
        "chevrolet_trax_suv_2017_2019",
        "genesis_g80_sedan_2016_2020",
        "genesis_g80_sedan_2021_",
        "genesis_gv80_suv_2020_",
        "hyundai_avante_sedan_2011_2015",
        "hyundai_avante_sedan_2020_",
        "hyundai_grandeur_sedan_2011_2016",
        "hyundai_grandstarex_van_2018_2020",
        "hyundai_ioniq_hatchback_2016_2019",
        "hyundai_sonata_sedan_2004_2009",
        "hyundai_sonata_sedan_2010_2014",
        "hyundai_sonata_sedan_2019_2020",
        "kia_carnival_van_2015_2020",
        "kia_carnival_van_2021_",
        "kia_k5_sedan_2010_2015",
        "kia_k5_sedan_2020_",
        "kia_k7_sedan_2016_2020",
        "kia_mohave_suv_2020_",
        "kia_morning_hatchback_2004_2010",
        "kia_morning_hatchback_2011_2016",
        "kia_ray_hatchback_2012_2017",
        "kia_sorrento_suv_2015_2019",
        "kia_sorrento_suv_2020_",
        "kia_soul_suv_2014_2018",
        "kia_sportage_suv_2016_2020",
        "kia_stonic_suv_2017_2019",
        "renault_sm3_sedan_2015_2018",
        "renault_xm3_suv_2020_",
        "ssangyong_korando_suv_2019_2020",
        "ssangyong_tivoli_suv_2016_2020",
    ]
    cars = {
        "categories": [
            {"id": i, "name": cat, "supercategory": "none"}
            for i, cat in enumerate(classes)
        ],
        "images": [],
        "annotations": [],
    }

    cnt_ann = 0  # running annotation id, unique across all files
    for i, ann in enumerate(anns):
        # Derive the image path from the annotation path: drop the first
        # "data" directory component and swap only the trailing ".txt"
        # extension. Plain substring replacement would also rewrite names that
        # merely contain "txt" or start with "data".
        img_path = ann.replace("/data/", "/", 1)
        if img_path.endswith(".txt"):
            img_path = img_path[:-len(".txt")] + ".png"
        cars["images"].append({"id": i,
                               "height": img_height,
                               "width": img_width,
                               "file_name": img_path})

        # The context manager closes the file even if a line fails to parse.
        with open(ann, "r") as f_ann:
            for line in f_ann:
                data = line.split()
                if not data:
                    # Blank line (e.g. trailing newline): nothing to convert.
                    continue
                cat = int(float(data[0]))
                pt1x = int(data[1])
                pt1y = int(data[2])
                # data[3], data[4] (point 2) and data[7], data[8] (point 4) are unused.
                pt3x = int(data[5])
                pt3y = int(data[6])
                width = pt3x - pt1x
                height = pt3y - pt1y
                cars["annotations"].append({"id": cnt_ann,
                                            "image_id": i,
                                            "category_id": cat,
                                            "bbox": [pt1x, pt1y, width, height],
                                            "area": width * height,
                                            "segmentation": [],
                                            "iscrowd": 0})
                cnt_ann += 1

    return cars

In [4]:
# Run the conversion BEFORE opening the output file: mode "w" truncates on
# open, so converting inside the `with` would wipe a previously written file
# whenever the conversion raises.
coco_train = labelmetxt2coco(ann_files)
with open('./data/annotations/train_json', "w") as f:
    json.dump(coco_train, f, ensure_ascii=False, indent=4)

# Split data

In [8]:
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

In [9]:
# annotation = {path to the train.json dataset file}
annotation = './data/annotations/train_json'

with open(annotation) as f:
    data = json.load(f)

# One (image_id, category_id) pair per annotation record.
var = [(ann['image_id'], ann['category_id']) for ann in data['annotations']]
# Placeholder feature matrix of ones, one row per annotation.
X = np.ones((len(data['annotations']), 1))
y = np.array([category_id for _, category_id in var])    # labels to stratify on
groups = np.array([image_id for image_id, _ in var])     # grouping key: the image id

cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=777)

# Show, for each fold, the image ids and category ids on both sides of the split.
for train_idx, val_idx in cv.split(X, y, groups):
    for tag, fold_idx in (("TRAIN:", train_idx), (" TEST:", val_idx)):
        print(tag, groups[fold_idx])  # image_id
        print(" ", y[fold_idx])       # category_id

TRAIN: [   0    0    2 ... 6480 6480 6480]
  [25 25 22 ... 11 21  9]
 TEST: [   1    1    1 ... 6479 6479 6479]
  [ 0  2 25 ... 13 32 13]
TRAIN: [   0    0    1 ... 6480 6480 6480]
  [25 25  0 ... 11 21  9]
 TEST: [   4    4    4 ... 6477 6477 6477]
  [18  8  1 ... 29  3 18]
TRAIN: [   1    1    1 ... 6480 6480 6480]
  [ 0  2 25 ... 11 21  9]
 TEST: [   0    0    2 ... 6478 6478 6478]
  [25 25 22 ... 31  9 20]
TRAIN: [   0    0    1 ... 6479 6479 6479]
  [25 25  0 ... 13 32 13]
 TEST: [   5    5    7 ... 6480 6480 6480]
  [14 33 11 ... 11 21  9]
TRAIN: [   0    0    1 ... 6480 6480 6480]
  [25 25  0 ... 11 21  9]
 TEST: [   6    6    6 ... 6476 6476 6476]
  [17  5 17 ... 11  3 29]


In [10]:
from collections import Counter
import pandas as pd

In [11]:
def get_distribution(y, n_classes=None):
    """Return the share of each class label in ``y`` as percentage strings.

    Args:
        y: 1-D sequence/array of non-negative integer class labels.
        n_classes: Number of classes to report. Defaults to ``max(y) + 1``.
            Pass it explicitly when ``y`` is a subset that may lack the
            highest labels, so that every result has the same length.

    Returns:
        A list of ``n_classes`` strings formatted like ``'2.94%'``; entry ``i``
        is the fraction of elements of ``y`` equal to ``i``. An empty ``y``
        gives ``[]`` (or all ``'0.00%'`` when ``n_classes`` is given) instead
        of raising.
    """
    y_distr = Counter(y)
    y_vals_sum = sum(y_distr.values())

    if n_classes is None:
        # np.max raises on an empty array, so guard the empty case.
        n_classes = np.max(y) + 1 if y_vals_sum else 0
    if y_vals_sum == 0:
        # Avoid dividing by zero: no samples means every share is zero.
        return [f'{0:.2%}' for _ in range(n_classes)]

    return [f'{y_distr[i]/y_vals_sum:.2%}' for i in range(n_classes)]

In [12]:
# Baseline row: the class distribution over all annotations, before any fold split.
index = ['training set']
distrs = [get_distribution(y)]

In [13]:
# Append one train row and one val row of class shares for every fold.
for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    train_gr, val_gr = groups[train_idx], groups[val_idx]
    train_y, val_y = y[train_idx], y[val_idx]

    # No image id may appear on both sides of a fold.
    assert set(train_gr).isdisjoint(val_gr)

    fold_no = fold_ind + 1
    distrs.extend([get_distribution(train_y), get_distribution(val_y)])
    index.extend([f'train - fold{fold_no}', f'val - fold{fold_no}'])

In [14]:
# Label the table columns with category names instead of numeric class ids.
categories = [cat['name'] for cat in data['categories']]
n_cols = np.max(y) + 1
column_names = [categories[k] for k in range(n_cols)]
pd.DataFrame(distrs, index=index, columns=column_names)

Unnamed: 0,chevrolet_malibu_sedan_2012_2016,chevrolet_malibu_sedan_2017_2019,chevrolet_spark_hatchback_2016_2021,chevrolet_trailblazer_suv_2021_,chevrolet_trax_suv_2017_2019,genesis_g80_sedan_2016_2020,genesis_g80_sedan_2021_,genesis_gv80_suv_2020_,hyundai_avante_sedan_2011_2015,hyundai_avante_sedan_2020_,...,kia_ray_hatchback_2012_2017,kia_sorrento_suv_2015_2019,kia_sorrento_suv_2020_,kia_soul_suv_2014_2018,kia_sportage_suv_2016_2020,kia_stonic_suv_2017_2019,renault_sm3_sedan_2015_2018,renault_xm3_suv_2020_,ssangyong_korando_suv_2019_2020,ssangyong_tivoli_suv_2016_2020
training set,2.94%,2.94%,2.94%,2.94%,2.94%,2.94%,2.94%,2.94%,2.94%,2.94%,...,2.94%,2.94%,2.94%,2.94%,2.94%,2.94%,2.94%,2.94%,2.94%,2.94%
train - fold1,2.97%,2.83%,2.94%,2.88%,2.92%,3.06%,2.92%,2.91%,2.92%,2.69%,...,2.88%,2.88%,2.94%,2.92%,2.95%,2.94%,2.99%,2.89%,3.11%,2.86%
val - fold1,2.84%,3.37%,2.96%,3.17%,3.02%,2.46%,3.02%,3.08%,3.02%,3.93%,...,3.17%,3.17%,2.93%,3.02%,2.90%,2.93%,2.76%,3.14%,2.29%,3.26%
train - fold2,2.97%,2.91%,2.97%,2.90%,3.06%,3.00%,2.94%,2.86%,2.93%,2.94%,...,2.93%,2.91%,2.97%,2.92%,3.00%,2.90%,2.91%,2.89%,2.91%,2.99%
val - fold2,2.81%,3.05%,2.84%,3.10%,2.46%,2.69%,2.93%,3.25%,2.99%,2.93%,...,2.99%,3.05%,2.84%,3.02%,2.69%,3.10%,3.07%,3.13%,3.05%,2.75%
train - fold3,2.92%,2.96%,2.94%,3.00%,2.92%,2.91%,2.83%,3.02%,2.92%,3.07%,...,2.98%,2.89%,2.86%,2.94%,2.97%,2.97%,2.97%,3.02%,2.85%,2.99%
val - fold3,3.02%,2.87%,2.93%,2.70%,3.02%,3.05%,3.41%,2.64%,3.02%,2.43%,...,2.79%,3.14%,3.29%,2.93%,2.81%,2.81%,2.84%,2.64%,3.32%,2.73%
train - fold4,2.93%,2.90%,2.87%,2.98%,2.98%,2.84%,3.03%,3.01%,2.89%,2.93%,...,2.99%,2.98%,2.94%,2.98%,2.91%,3.02%,2.90%,2.94%,3.09%,2.88%
val - fold4,2.97%,3.12%,3.24%,2.77%,2.80%,3.33%,2.59%,2.65%,3.15%,3.00%,...,2.74%,2.80%,2.94%,2.80%,3.06%,2.62%,3.12%,2.94%,2.36%,3.18%
train - fold5,2.91%,3.10%,2.99%,2.93%,2.82%,2.88%,2.99%,2.91%,3.04%,3.07%,...,2.92%,3.04%,3.00%,2.94%,2.87%,2.87%,2.95%,2.96%,2.75%,2.98%


In [15]:
# output_filename = {dataset path}/K-fold
# Path prefix for the per-fold split files; the split cell appends
# "_train<k>.json" / "_val<k>.json" to it when writing.
output_filename = "./data/K-fold"

In [16]:
# Write one COCO-style train file and one val file per fold.
for idx, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    # `groups` holds one image id per ANNOTATION, so an id repeats once for
    # every box in that image. Collapse to sets so each image is written
    # exactly once per split; this also makes membership tests O(1) instead
    # of a linear NumPy scan per annotation.
    train_image_ids = set(groups[train_idx].tolist())
    val_image_ids = set(groups[val_idx].tolist())

    # Select by the image's own "id" rather than by list position. Images with
    # no annotation are in no group and therefore end up in neither split.
    train_images = [img.copy() for img in data["images"] if img["id"] in train_image_ids]
    val_images = [img.copy() for img in data["images"] if img["id"] in val_image_ids]

    train_annotations, val_annotations = [], []
    # Loop variable is `ann`, not `annotation`, so the module-level
    # `annotation` path variable is not overwritten by this cell.
    for ann in data["annotations"]:
        if ann["image_id"] in val_image_ids:
            val_annotations.append(ann.copy())
        else:
            train_annotations.append(ann.copy())

    # Metadata sections shared verbatim by both splits.
    shared_sections = {
        "info": data.get("info", {}),
        "licenses": data.get("licenses", []),
        "categories": data["categories"],
    }
    train_split = {"images": train_images, "annotations": train_annotations, **shared_sections}
    val_split = {"images": val_images, "annotations": val_annotations, **shared_sections}

    # Reset on every fold: after the loop this holds only the last fold's paths.
    output_files = []
    for split_type, split in zip(["train", "val"], [train_split, val_split]):
        output_files.append(output_filename + f"_{split_type}{idx+1}.json")
        with open(output_files[-1], "w") as f:
            json.dump(split, f, indent=2)