In [82]:
import os
from glob import glob
import pandas as pd
from functools import reduce
import json
from shutil import move
import yaml

In [57]:
def extract_json(filename):
    json_file_path = filename
    parser = []
    
    with open(json_file_path, "r") as file:
        data = json.load(file)

    for file in data:
        filename = os.path.join(file["uuid"] + ".png")

        imgwidth = file["width"]
        imgheight = file["height"]
        
        for box in file["boundingBoxes"]:
            name = box["concept"]
    
            boxx = box["x"]
            boxy = box["y"]
            boxwidth = box["width"]
            boxheight = box["height"]
            
            parser.append([filename, imgwidth, imgheight, name, boxwidth, boxheight, boxx, boxy])

    return parser

In [58]:
df = pd.DataFrame(extract_json("data.json"), columns = ["filename", "imgwidth", "imgheight", "name", "boxwidth", "boxheight", "boxx", "boxy"])

In [59]:
df.info()
df["name"].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20626 entries, 0 to 20625
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   filename   20626 non-null  object
 1   imgwidth   20626 non-null  int64 
 2   imgheight  20626 non-null  int64 
 3   name       20626 non-null  object
 4   boxwidth   20626 non-null  int64 
 5   boxheight  20626 non-null  int64 
 6   boxx       20626 non-null  int64 
 7   boxy       20626 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 1.3+ MB


name
Ophiuroidea                    10983
Crinoidea                        710
Strongylocentrotus fragilis      471
Holothuroidea                    412
Scotoplanes                      406
                               ...  
Apristurus kampae                  1
Iosactis                           1
eggcase                            1
Thenea                             1
Brisingidae                        1
Name: count, Length: 442, dtype: int64

In [60]:
columns = ["filename", "imgwidth", "imgheight", "name", "boxwidth", "boxheight", "boxx", "boxy"]

df["center_x"] = ((df["boxwidth"] / 2 + df["boxx"]) / df["imgwidth"])
df["center_y"] = ((df["boxheight"] / 2 + df["boxy"]) / df["imgheight"])

df["w"] = df["boxwidth"] / df["imgwidth"]
df["h"] = df["boxheight"] / df["imgheight"]
df.head()

Unnamed: 0,filename,imgwidth,imgheight,name,boxwidth,boxheight,boxx,boxy,center_x,center_y,w,h
0,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Actiniaria,76,83,638,400,0.945455,0.908436,0.106294,0.170782
1,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Sebastolobus,95,53,522,108,0.796503,0.276749,0.132867,0.109053
2,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Ophiuroidea,37,42,421,116,0.614685,0.281893,0.051748,0.08642
3,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Psolus squamatus,113,85,391,378,0.625874,0.865226,0.158042,0.174897
4,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Actiniaria,67,67,645,132,0.948951,0.340535,0.093706,0.13786


In [61]:
unique_names = df["name"].unique()

labels = dict()
for num in range(len(unique_names)):
    labels[unique_names[num]] = num

df["id"] = df["name"].map(labels)

In [62]:
images = df['filename'].unique()
len(images)

2687

In [63]:
img_df = pd.DataFrame(images, columns=['filename'])
img_df.head()

Unnamed: 0,filename
0,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png
1,cc3640a1-beb6-454b-9f86-02942a7c4006.png
2,2f3758f6-2fc1-4c18-b8af-02ccb66f8c36.png
3,02d00747-6b43-4bd4-8095-031c6fec4eb0.png
4,a1011047-2d88-4703-b11b-0334b1cc95e3.png


In [64]:
img_train = tuple(img_df.sample(frac=0.8)['filename'])
#shuffling data and picking 80% of images

In [65]:
#picking all images that were not in the 80% train set (the other 20%)
img_test = tuple(img_df[~img_df['filename'].isin(img_train)]['filename'])
len(img_train), len(img_test) #length of each set

(2150, 537)

In [66]:
#making dataframes
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [67]:
train_df.head()
test_df.head()

Unnamed: 0,filename,imgwidth,imgheight,name,boxwidth,boxheight,boxx,boxy,center_x,center_y,w,h,id
0,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Actiniaria,76,83,638,400,0.945455,0.908436,0.106294,0.170782,0
1,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Sebastolobus,95,53,522,108,0.796503,0.276749,0.132867,0.109053,1
2,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Ophiuroidea,37,42,421,116,0.614685,0.281893,0.051748,0.08642,2
3,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Psolus squamatus,113,85,391,378,0.625874,0.865226,0.158042,0.174897,3
4,a5bd67a6-9a0f-45b3-85a7-017d5f77fc66.png,715,486,Actiniaria,67,67,645,132,0.948951,0.340535,0.093706,0.13786,0


In [70]:
os.makedirs("images/train", exist_ok=True)
os.makedirs("images/test", exist_ok=True)

In [71]:
columns = ["filename", "id", "center_x", "center_y", "w", "h"]
groupby_obj_train = train_df[columns].groupby("filename")
groupby_obj_test = test_df[columns].groupby("filename")

In [83]:
def save_data(filename, folder_path, group_obj):
    source = os.path.join("images", filename)
    destination = os.path.join(folder_path, filename)
    move(source, destination)

    text_filename = os.path.join(folder_path,os.path.splitext(filename)[0] + ".txt")
    group_obj.get_group(filename).set_index("filename").to_csv(text_filename,sep=' ',index=False,header=False) 

In [78]:
filename_series_train = pd.Series(groupby_obj_train.groups.keys())
filename_series_train.apply(save_data,args=("images/train",groupby_obj_train))

0       None
1       None
2       None
3       None
4       None
        ... 
2145    None
2146    None
2147    None
2148    None
2149    None
Length: 2150, dtype: object

In [106]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=("images/test",groupby_obj_test))

FileNotFoundError: [Errno 2] No such file or directory: 'images/0075e1a9-e8f1-4e69-8d2b-d27f87e0e0b3.png'

In [109]:
data = {
    'train': 'images/train',
    'val': 'images/test',
    'nc': len(labels), 
    'names': list(labels.keys()) 
}

with open('data.yaml', 'w') as file:
    yaml.dump(data, file, default_flow_style=False, sort_keys=False)
