In [1]:
import os
from glob import glob
import pandas as pd
from functools import reduce
import json
from shutil import move
import yaml

In [2]:
def extract_json(filename):
    json_file_path = filename
    parser = []
    
    with open(json_file_path, "r") as file:
        data = json.load(file)

    for file in data:
        filename = os.path.join(file["uuid"] + ".png")

        imgwidth = file["width"]
        imgheight = file["height"]
        
        for box in file["boundingBoxes"]:
            name = box["concept"]
    
            boxx = box["x"]
            boxy = box["y"]
            boxwidth = box["width"]
            boxheight = box["height"]
            
            parser.append([filename, imgwidth, imgheight, name, boxwidth, boxheight, boxx, boxy])

    return parser

In [66]:
df = pd.DataFrame(extract_json("data.json"), columns = ["filename", "imgwidth", "imgheight", "name", "boxwidth", "boxheight", "boxx", "boxy"])

In [67]:
df.info()
df["name"].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20626 entries, 0 to 20625
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   filename   20626 non-null  object
 1   imgwidth   20626 non-null  int64 
 2   imgheight  20626 non-null  int64 
 3   name       20626 non-null  object
 4   boxwidth   20626 non-null  int64 
 5   boxheight  20626 non-null  int64 
 6   boxx       20626 non-null  int64 
 7   boxy       20626 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 1.3+ MB


name
Ophiuroidea                    10983
Crinoidea                        710
Strongylocentrotus fragilis      471
Holothuroidea                    412
Scotoplanes                      406
                               ...  
Apristurus kampae                  1
Iosactis                           1
eggcase                            1
Thenea                             1
Brisingidae                        1
Name: count, Length: 442, dtype: int64

In [68]:
columns = ["filename", "imgwidth", "imgheight", "name", "boxwidth", "boxheight", "boxx", "boxy"]

df["center_x"] = ((df["boxwidth"] / 2 + df["boxx"]) / df["imgwidth"])
df["center_y"] = ((df["boxheight"] / 2 + df["boxy"]) / df["imgheight"])

df["w"] = df["boxwidth"] / df["imgwidth"]
df["h"] = df["boxheight"] / df["imgheight"]
df.head()

Unnamed: 0,filename,imgwidth,imgheight,name,boxwidth,boxheight,boxx,boxy,center_x,center_y,w,h
0,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Actiniaria,76,83,638,400,0.945455,0.908436,0.106294,0.170782
1,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Sebastolobus,95,53,522,108,0.796503,0.276749,0.132867,0.109053
2,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Ophiuroidea,37,42,421,116,0.614685,0.281893,0.051748,0.08642
3,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Psolus squamatus,113,85,391,378,0.625874,0.865226,0.158042,0.174897
4,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Actiniaria,67,67,645,132,0.948951,0.340535,0.093706,0.13786


In [69]:
unique_names = df["name"].unique()

labels = dict()
for num in range(len(unique_names)):
    labels[unique_names[num]] = num

df["id"] = df["name"].map(labels)

In [70]:
images = df['filename'].unique()
len(images)

2687

In [71]:
img_df = pd.DataFrame(images, columns=['filename'])
img_df.head()

Unnamed: 0,filename
0,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png
1,cc3640a1-beb6-454b-9f86-02942a7c4006.png
2,2f3758f6-2fc1-4c18-b8af-02ccb66f8c36.png
3,02d00747-6b43-4bd4-8095-031c6fec4eb0.png
4,a1011047-2d88-4703-b11b-0334b1cc95e3.png


In [72]:
img_train = tuple(img_df.sample(frac=0.8)['filename'])
#shuffling data and picking 80% of images

In [73]:
#picking all images that were not in the 80% train set (the other 20%)
img_test = tuple(img_df[~img_df['filename'].isin(img_train)]['filename'])
len(img_train), len(img_test) #length of each set

(2150, 537)

In [74]:
#making dataframes
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [75]:
train_df.head()
test_df.head()

Unnamed: 0,filename,imgwidth,imgheight,name,boxwidth,boxheight,boxx,boxy,center_x,center_y,w,h,id
110,03179563-b13c-4669-a1c0-06d8dc1da6d4.png,1920,1079,Ophiuroidea,180,162,38,23,0.066667,0.096386,0.09375,0.150139,2
111,03179563-b13c-4669-a1c0-06d8dc1da6d4.png,1920,1079,Ophiuroidea,188,264,1555,152,0.858854,0.263207,0.097917,0.244671,2
112,03179563-b13c-4669-a1c0-06d8dc1da6d4.png,1920,1079,Hyalonema (Corynonema) populiferum,677,828,607,61,0.492448,0.440222,0.352604,0.767377,17
113,03179563-b13c-4669-a1c0-06d8dc1da6d4.png,1920,1079,Holothuroidea,361,393,209,679,0.202865,0.811399,0.188021,0.364226,18
114,03179563-b13c-4669-a1c0-06d8dc1da6d4.png,1920,1079,Ophiuroidea,190,198,20,320,0.059896,0.388323,0.098958,0.183503,2


In [76]:
os.makedirs("images/images/train", exist_ok=True)
os.makedirs("images/images/val", exist_ok=True)
os.makedirs("images/labels/train", exist_ok=True)
os.makedirs("images/labels/val", exist_ok=True)

In [77]:
columns = ["filename", "id", "center_x", "center_y", "w", "h"]
groupby_obj_train = train_df[columns].groupby("filename")
groupby_obj_test = test_df[columns].groupby("filename")

In [78]:
def save_data(filename, images_folder_path, labels_folder_path, group_obj):
    source = os.path.join("images", filename)
    destination = os.path.join(images_folder_path, filename)
    move(source, destination)

    text_filename = os.path.join(labels_folder_path,os.path.splitext(filename)[0] + ".txt")
    group_obj.get_group(filename).set_index("filename").to_csv(text_filename,sep=' ',index=False,header=False) 

In [79]:
filename_series_train = pd.Series(groupby_obj_train.groups.keys())
filename_series_train.apply(save_data,args=("images/images/train","images/labels/train",groupby_obj_train))

0       None
1       None
2       None
3       None
4       None
        ... 
2145    None
2146    None
2147    None
2148    None
2149    None
Length: 2150, dtype: object

In [80]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=("images/images/val","images/labels/val",groupby_obj_test))

0      None
1      None
2      None
3      None
4      None
       ... 
532    None
533    None
534    None
535    None
536    None
Length: 537, dtype: object

In [81]:
data = {
    'train': 'images/images/train',
    'val': 'images/images/test',
    'nc': len(labels), 
    'names': list(labels.keys()) 
}

with open('data.yaml', 'w') as file:
    yaml.dump(data, file, default_flow_style=False, sort_keys=False)
