In [58]:
import os
from glob import glob
import pandas as pd
from functools import reduce
import json
from shutil import move
import yaml

In [61]:
!python3 cleanup.py

Updated JSON file saved to data.json
No entries were deleted.
Extra JSON entries (not having corresponding image files): 0
Extra image files (not referenced in JSON): 0
Filtered data saved to JSON file.
Cleanup complete.
Total number of json files 0
Number of files: 5183
Total size in bytes: 7084580378


In [41]:
def extract_json(filename):
    json_file_path = filename
    parser = []
    
    with open(json_file_path, "r") as file:
        data = json.load(file)

    for file in data:
        filename = os.path.join(file["uuid"] + ".png")

        imgwidth = file["width"]
        imgheight = file["height"]
        
        for box in file["boundingBoxes"]:
            name = box["concept"]
    
            boxx = box["x"]
            boxy = box["y"]
            boxwidth = box["width"]
            boxheight = box["height"]
            
            parser.append([filename, imgwidth, imgheight, name, boxwidth, boxheight, boxx, boxy])

    return parser

In [42]:
df = pd.DataFrame(extract_json("data.json"), columns = ["filename", "imgwidth", "imgheight", "name", "boxwidth", "boxheight", "boxx", "boxy"])

In [43]:
df.info()
df["name"].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19966 entries, 0 to 19965
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   filename   19966 non-null  object
 1   imgwidth   19966 non-null  int64 
 2   imgheight  19966 non-null  int64 
 3   name       19966 non-null  object
 4   boxwidth   19966 non-null  int64 
 5   boxheight  19966 non-null  int64 
 6   boxx       19966 non-null  int64 
 7   boxy       19966 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 1.2+ MB


name
Ophiuroidea                    10629
Crinoidea                        693
Strongylocentrotus fragilis      439
Holothuroidea                    401
Scotoplanes                      395
                               ...  
Bathybembix                        1
Cataetyx                           1
Scaphopoda                         1
Albatrossia pectoralis             1
Brisingidae                        1
Name: count, Length: 431, dtype: int64

In [44]:
columns = ["filename", "imgwidth", "imgheight", "name", "boxwidth", "boxheight", "boxx", "boxy"]

df["center_x"] = ((df["boxwidth"] / 2 + df["boxx"]) / df["imgwidth"])
df["center_y"] = ((df["boxheight"] / 2 + df["boxy"]) / df["imgheight"])

df["w"] = df["boxwidth"] / df["imgwidth"]
df["h"] = df["boxheight"] / df["imgheight"]
df.head()

Unnamed: 0,filename,imgwidth,imgheight,name,boxwidth,boxheight,boxx,boxy,center_x,center_y,w,h
0,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Actiniaria,76,83,638,400,0.945455,0.908436,0.106294,0.170782
1,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Sebastolobus,95,53,522,108,0.796503,0.276749,0.132867,0.109053
2,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Ophiuroidea,37,42,421,116,0.614685,0.281893,0.051748,0.08642
3,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Psolus squamatus,113,85,391,378,0.625874,0.865226,0.158042,0.174897
4,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Actiniaria,67,67,645,132,0.948951,0.340535,0.093706,0.13786


In [45]:
unique_names = df["name"].unique()

labels = dict()
for num in range(len(unique_names)):
    labels[unique_names[num]] = num

df["id"] = df["name"].map(labels)

In [46]:
images = df['filename'].unique()
len(images)

2591

In [47]:
img_df = pd.DataFrame(images, columns=['filename'])
img_df.head()

Unnamed: 0,filename
0,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png
1,cc3640a1-beb6-454b-9f86-02942a7c4006.png
2,2f3758f6-2fc1-4c18-b8af-02ccb66f8c36.png
3,02d00747-6b43-4bd4-8095-031c6fec4eb0.png
4,a1011047-2d88-4703-b11b-0334b1cc95e3.png


In [48]:
img_train = tuple(img_df.sample(frac=0.8)['filename'])
#shuffling data and picking 80% of images

In [49]:
#picking all images that were not in the 80% train set (the other 20%)
img_test = tuple(img_df[~img_df['filename'].isin(img_train)]['filename'])
len(img_train), len(img_test) #length of each set

(2073, 518)

In [50]:
#making dataframes
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [51]:
train_df.head()
test_df.head()

Unnamed: 0,filename,imgwidth,imgheight,name,boxwidth,boxheight,boxx,boxy,center_x,center_y,w,h,id
83,624bc9f2-9979-4df8-8b86-03f557c6fda4.png,1600,900,Ophiuroidea,400,787,665,35,0.540625,0.476111,0.25,0.874444,2
107,153d5239-f2d6-4b6b-bdc8-066ca000ccd8.png,714,486,Sebastolobus,39,46,294,209,0.439076,0.477366,0.054622,0.09465,1
108,153d5239-f2d6-4b6b-bdc8-066ca000ccd8.png,714,486,Ophiuroidea,52,36,359,250,0.539216,0.55144,0.072829,0.074074,2
109,153d5239-f2d6-4b6b-bdc8-066ca000ccd8.png,714,486,Pannychia moseleyi,41,25,450,335,0.658964,0.715021,0.057423,0.05144,16
145,05d21c38-47fb-4dfd-80d7-0a2c8076a3d2.png,704,483,Bathyraja abyssicola,611,236,2,158,0.43679,0.571429,0.867898,0.488613,29


In [52]:
os.makedirs("images/images/train", exist_ok=True)
os.makedirs("images/images/val", exist_ok=True)
os.makedirs("images/labels/train", exist_ok=True)
os.makedirs("images/labels/val", exist_ok=True)

In [53]:
columns = ["filename", "id", "center_x", "center_y", "w", "h"]
groupby_obj_train = train_df[columns].groupby("filename")
groupby_obj_test = test_df[columns].groupby("filename")

In [54]:
def save_data(filename, images_folder_path, labels_folder_path, group_obj):
    source = os.path.join("images", filename)
    destination = os.path.join(images_folder_path, filename)
    move(source, destination)

    text_filename = os.path.join(labels_folder_path,os.path.splitext(filename)[0] + ".txt")
    group_obj.get_group(filename).set_index("filename").to_csv(text_filename,sep=' ',index=False,header=False) 

In [55]:
filename_series_train = pd.Series(groupby_obj_train.groups.keys())
filename_series_train.apply(save_data,args=("images/images/train","images/labels/train",groupby_obj_train))

0       None
1       None
2       None
3       None
4       None
        ... 
2068    None
2069    None
2070    None
2071    None
2072    None
Length: 2073, dtype: object

In [56]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=("images/images/val","images/labels/val",groupby_obj_test))

0      None
1      None
2      None
3      None
4      None
       ... 
513    None
514    None
515    None
516    None
517    None
Length: 518, dtype: object

In [57]:
data = {
    'train': 'images/images/train',
    'val': 'images/images/test',
    'nc': len(labels), 
    'names': list(labels.keys()) 
}

with open('data.yaml', 'w') as file:
    yaml.dump(data, file, default_flow_style=False, sort_keys=False)