In [1]:
import numpy as np 
import json 
import os
import shutil
import ast
from tqdm import tqdm
import pandas as pd
from sklearn import model_selection

In [2]:
def conv_bbox(box_dict):
    
    xs = np.array(list(set([i['x'] for i in box_dict])))
    ys = np.array(list(set([i['y'] for i in box_dict])))
    
    x_min = xs.min()
    x_max = xs.max()
    y_min = ys.min()
    y_max = ys.max()
    
    return y_min, x_min, y_max, x_max

def convert(data, data_type):
    df = data.groupby('image_name')['bbox'].apply(list).reset_index(name='bboxes')
    df['classes'] = data.groupby('image_name')['class'].apply(list).reset_index(drop=True)
    df.to_csv(data_type + '.csv', index=False)
    print(data_type)
    print(df.shape)
    print(df.head())

In [3]:
DATA_PATH = os.getcwd()
print(DATA_PATH)
print(os.listdir(DATA_PATH))
source = os.path.join(DATA_PATH,'image_patches')
destination_1 = 'train' #Train images(only tank containing images)
destination_2 = 'test' #Test images

if not os.path.isdir(destination_1):
    os.mkdir(destination_1)
if not os.path.isdir(destination_2):
    os.mkdir(destination_2)
label_to_num = {'dontcare': 0, 'Tank': 1, 'Tank Cluster': 2, 'Floating Head Tank': 3}

/home/okta/side-project/TECHBROS/Code/AI-DATA/OTHER/stockpile-detection/archive/oil-tanks/try
['digits-jobs', 'tank_data', 'data', 'image_patches', 'data_converter.py', 'data_converter.ipynb', 'labels.json']


In [4]:
annotations = []

json_labels = json.load(open(os.path.join(DATA_PATH,'labels.json')))
for i in tqdm(range(len(json_labels))):
    file = json_labels[i]['file_name']
    if(file.startswith('01')):
        shutil.copy(source+'/'+file,destination_2)
    elif(json_labels[i]['label']!='Skip'):  
        shutil.copy(source+'/'+file,destination_1)
        for label in json_labels[i]['label'].keys():
            for box in json_labels[i]['label'][label]:
                y_min, x_min, y_max, x_max = conv_bbox(box['geometry'])
                width = x_max - x_min
                height = y_max - y_min
                annotations.append((file.split('.')[0] ,label_to_num[label], label, [x_min, y_min, width, height]))

annotations = pd.DataFrame(annotations, columns=['image_name', 'class', 'class_name', 'bbox'])

100%|██████████| 10000/10000 [00:11<00:00, 862.93it/s]


In [5]:
df_train, df_valid= model_selection.train_test_split(
    annotations, 
    test_size=0.1, 
    random_state=42, 
    shuffle=True, 
    stratify = annotations['class']
)

df_train = convert(df_train, 'train')
df_valid = convert(df_valid, 'validation')

train
(1720, 3)
  image_name                                             bboxes  \
0     02_0_7           [[123, 470, 72, 41], [426, 490, 75, 22]]   
1     02_0_8  [[1, 491, 26, 19], [58, 476, 50, 35], [62, 291...   
2     02_1_5           [[393, 389, 42, 39], [375, 440, 31, 28]]   
3     02_1_6  [[28, 488, 18, 17], [128, 456, 17, 17], [158, ...   
4     02_1_7  [[304, 299, 22, 22], [320, 26, 41, 39], [339, ...   

                          classes  
0                          [1, 1]  
1              [1, 1, 1, 1, 1, 1]  
2                          [1, 1]  
3     [1, 1, 1, 3, 1, 1, 1, 1, 1]  
4  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]  
validation
(565, 3)
  image_name                                    bboxes classes
0     02_0_7                      [[321, 501, 38, 11]]     [1]
1     02_1_7                       [[262, 84, 32, 31]]     [1]
2     02_2_0                      [[286, 439, 61, 61]]     [1]
3     02_2_3  [[441, 417, 70, 95], [196, 289, 91, 92]]  [3, 3]
4     02_2_6                  

In [6]:
tank_data = 'tank_data'
if not os.path.isdir(tank_data):
    os.mkdir(tank_data)

tank_data_dir = DATA_PATH + '/tank_data'
os.chdir(tank_data_dir)
labels_dir = 'labels'
images_dir = 'images'

if not os.path.isdir(images_dir):
    os.mkdir(images_dir)
if not os.path.isdir(labels_dir):
    os.mkdir(labels_dir)

tank_data_images = tank_data_dir + '/images'
os.chdir(tank_data_images)
tank_data_images_train = 'train'
tank_data_images_validation = 'validation'

if not os.path.isdir(tank_data_images_train):
    os.mkdir(tank_data_images_train)
if not os.path.isdir(tank_data_images_validation):
    os.mkdir(tank_data_images_validation)

tank_data_labels = tank_data_dir + '/labels'
os.chdir(tank_data_labels)
tank_data_labels_train = 'train'
tank_data_labels_validation = 'validation'

if not os.path.isdir(tank_data_labels_train):
    os.mkdir(tank_data_labels_train)
if not os.path.isdir(tank_data_labels_validation):
    os.mkdir(tank_data_labels_validation)

os.chdir(DATA_PATH)

In [7]:
INPUT_PATH = DATA_PATH
OUTPUT_PATH = INPUT_PATH+'/tank_data'
def process_data(data, data_type='train'):
    for _, row in tqdm(data.iterrows(), total = len(data)):
        image_name = row['image_name']
        bounding_boxes = row['bboxes']
        classes = row['classes']
        yolo_data = []
        for bbox, Class in zip(bounding_boxes, classes):
            x = bbox[0]
            y = bbox[1]
            w = bbox[2]
            h = bbox[3]
            x_center = x + w / 2
            y_center = y + h / 2
            
            x_center /= 512
            y_center /= 512
            w /= 512
            h /= 512
            yolo_data.append([Class, x_center, y_center, w, h])
        yoy_data = np.array(yolo_data)
        np.savetxt(
            os.path.join(OUTPUT_PATH, f"labels/{data_type}/{image_name}.txt"),
            yolo_data,
            fmt = ["%d", "%f", "%f", "%f", "%f"]
        )
        shutil.copyfile(
            os.path.join(INPUT_PATH, f"train/{image_name}.jpg"),
            os.path.join(OUTPUT_PATH, f"images/{data_type}/{image_name}.jpg")
        )

In [8]:
df_train = pd.read_csv('train.csv')
df_train.bboxes = df_train.bboxes.apply(ast.literal_eval)
df_train.classes = df_train.classes.apply(ast.literal_eval)

df_valid = pd.read_csv('validation.csv')
df_valid.bboxes = df_valid.bboxes.apply(ast.literal_eval)
df_valid.classes = df_valid.classes.apply(ast.literal_eval)

process_data(df_train, data_type='train')
process_data(df_valid, data_type='validation')

100%|██████████| 1720/1720 [00:02<00:00, 828.78it/s] 
100%|██████████| 565/565 [00:00<00:00, 1774.41it/s]


In [9]:
tank_data_images_train_path = tank_data_images + '/train'
tank_data_images_val_path = tank_data_images + '/validation'

tank_data_labels_train_path = tank_data_labels + '/train'
tank_data_labels_val_path = tank_data_labels + '/validation'

path, dirs, files = next(os.walk(tank_data_images_train_path))
print('[INFO] Number images for training:', len(files))
path, dirs, files = next(os.walk(tank_data_labels_train_path))
print('[INFO] Number labels for training:', len(files))

path, dirs, files = next(os.walk(tank_data_images_val_path))
print('[INFO] Number images for validation:', len(files))
path, dirs, files = next(os.walk(tank_data_labels_val_path))
print('[INFO] Number labels for validation:', len(files))


[INFO] Number images for training: 1720
[INFO] Number labels for training: 1720
[INFO] Number images for validation: 565
[INFO] Number labels for validation: 565
