# Data conversion for training the model

In [1]:
import os
import shutil
import pandas as pd
import PIL.Image
from tqdm import tqdm
import concurrent.futures as futures

In [3]:
# Root directory of the challenge data; every other path is derived from it.
# All three paths keep a trailing slash because later cells build file names
# by plain string concatenation.
DATA_PATH = "/mnt/stg/inclusive-images-challenge/"
# Source images are read from here (as train_<x>/<ImageID>.jpg).
RAW_PATH = DATA_PATH + 'raw/'
# Output root: cropped images and the generated label CSV are written below it.
TGT_PATH = DATA_PATH + 'train/'

In [4]:
# Create the output root up front; exist_ok=True makes re-running this cell a no-op.
os.makedirs(TGT_PATH, exist_ok=True)

In [5]:
# Read the three CSV inputs that live directly under DATA_PATH.
# DATA_PATH ends with a slash, so os.path.join yields exactly DATA_PATH + name.
print("Loading labels data frame...")
df_label_names = pd.read_csv(os.path.join(DATA_PATH, 'class-descriptions.csv'))
# The `label_code` column of this frame defines which labels are kept later on.
df_trainable_labels = pd.read_csv(os.path.join(DATA_PATH, 'classes-trainable.csv'))
print("Loading bounding box data...")
# One row per bounding box; later cells read the ImageID, LabelName,
# XMin, YMin, XMax and YMax columns.
df_bboxes = pd.read_csv(os.path.join(DATA_PATH, 'train_bounding_boxes.csv'))

Loading labels data frame...
Loading bounding box data...


In [6]:
# Set of trainable label codes, used for O(1) membership checks per bounding box.
labels_set = set(df_trainable_labels['label_code'].tolist())

In [8]:
# Directory that receives the cropped images. TGT_PATH already ends in
# 'train/', so crops end up under '<DATA_PATH>train/train/train_<x>/'.
TRAIN_PATH = f'{TGT_PATH}train/'
# One sub-directory per leading character of the crop id; the ids produced
# here start with the source ImageID, whose first character selects the bucket.
SUFFIXES = tuple('0123456789abcdef')

# exist_ok=True replaces the racy "check, then create" pattern and matches how
# the output root is created; it still raises if a path exists as a plain file.
os.makedirs(TRAIN_PATH, exist_ok=True)
for s in SUFFIXES:
    p = f'{TRAIN_PATH}train_{s}/'
    os.makedirs(p, exist_ok=True)

In [9]:
class Job:
    """Collects the bounding-box rows of one source image.

    Every distinct (rounded, clamped) bounding box of the image is assigned a
    crop id of the form ``<img_id>_<NNNNN>``; every accepted row contributes a
    ``(crop id, label)`` pair. The actual cropping is delegated to ``do_job``.
    """

    def __init__(self, img_id):
        """Start an empty job for the source image ``img_id``."""
        self.img_id = img_id
        # (XMin, YMin, XMax, YMax) tuple -> crop id; identical boxes share one crop.
        self.sub_map = {}
        # Running counter used to number the crops of this image.
        self.sub_idx = 0
        # (crop id, label) pairs, in the order the rows were added.
        self.images_labels = []

    def add_row(self, row):
        """Try to add one bounding-box row to this job.

        ``row`` must support ``row[key]`` for 'ImageID', 'LabelName', 'XMin',
        'YMin', 'XMax' and 'YMax'.

        Returns False when the row belongs to a different image (nothing is
        recorded), True otherwise. Rows whose label is not in the module-level
        ``labels_set`` are accepted but silently dropped.
        """
        if self.img_id != row['ImageID']:
            return False
        label = row['LabelName']
        if label not in labels_set:
            return True
        # Round to 5 decimals and clamp into [0, 1] so that near-identical
        # boxes collapse onto the same dictionary key.
        bbox = tuple(
            min(1.0, max(0, round(row[t], 5)))
            for t in ('XMin', 'YMin', 'XMax', 'YMax')
        )
        tgt_img_id = self.sub_map.get(bbox)
        if tgt_img_id is None:
            tgt_img_id = f'{self.img_id}_{self.sub_idx:05}'
            self.sub_idx += 1
            self.sub_map[bbox] = tgt_img_id
        self.images_labels.append((tgt_img_id, label))
        return True

    def add_labels(self, df):
        """Return ``df`` extended with one row per collected (crop, label) pair.

        The 'ImageID' value is ``train_<first char of crop id>/<crop id>``.
        ``df`` itself is not modified; when nothing was collected it is
        returned as is. The result has a fresh 0..n-1 index.
        """
        if not self.images_labels:
            return df
        # Build all new rows at once and concatenate a single time:
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # frame for every appended row.
        new_rows = pd.DataFrame({
            'ImageID': [f'train_{img[0]}/{img}' for img, _ in self.images_labels],
            'LabelName': [label for _, label in self.images_labels],
        })
        return pd.concat([df, new_rows], ignore_index=True)

    def something_to_submit(self):
        """Return True when at least one (crop, label) pair was collected."""
        return len(self.images_labels) > 0

    def submit(self, executor):
        """Schedule the cropping of this image on ``executor``; return the future."""
        return executor.submit(do_job, self.img_id, self.sub_map)
        
def _crop_target_path(tgt_img_id):
    """Return the output file name of the crop ``tgt_img_id``."""
    return f'{TRAIN_PATH}train_{tgt_img_id[0]}/{tgt_img_id}.jpg'


def do_job(img_id, sub_map):
    """Cut the bounding boxes of one source image into separate JPEG files.

    img_id:  id of the source image, read from
             ``<RAW_PATH>train_<first char>/<img_id>.jpg``.
    sub_map: mapping of (XMin, YMin, XMax, YMax) tuples, in relative
             coordinates, to the crop id that names the output file.

    Crops whose output file already exists are skipped, which makes the job
    resumable; the source image is only opened when at least one crop is
    missing. A missing source image is skipped silently. Returns None.
    """
    # Check once which crops still have to be produced.
    pending = []
    for bbox, tgt_img_id in sub_map.items():
        tgt_fname = _crop_target_path(tgt_img_id)
        if not os.path.exists(tgt_fname):
            pending.append((bbox, tgt_fname))
    if not pending:
        return

    fname = f'{RAW_PATH}train_{img_id[0]}/{img_id}.jpg'
    if not os.path.exists(fname):
        return
    # The context manager guarantees the source file handle is released even
    # when cropping or saving raises.
    with PIL.Image.open(fname) as img:
        w, h = img.size
        for bbox, tgt_fname in pending:
            # Scale relative coordinates to pixels, truncating to integers.
            crop = [int(w * bbox[0]), int(h * bbox[1]),
                    int(w * bbox[2]), int(h * bbox[3])]
            img.crop(crop).save(tgt_fname)

In [46]:
# Number of worker threads in the cropping thread pool.
NUM_JOBS = 6
# Back-pressure threshold: once this many futures are outstanding, the driver
# loop pauses and waits for some of them to finish.
MAX_CONCURRENT_JOBS = 10000
# Upper bound, in seconds, for each of those back-pressure waits.
WAIT_SECONDS = 30
# Rows to convert. Uncomment the slice below for a quick trial run on a subset.
#df = df_bboxes[:30000]
df = df_bboxes
# Images whose ImageID starts with one of these characters are skipped.
# The commented-out variants are earlier settings, kept for reference.
#ignore_groups = set(['0', '1'])
#ignore_groups = {'0', '1', '2', '3'} # 4 and 7 is fully unpacked, so, start it
# NOTE(review): the remark about groups 4 and 7 above is unclear — confirm or remove.
ignore_groups = set()

In [47]:
print("Converting %d bounding boxes" % len(df))

LABEL_COLUMNS = ['ImageID', 'LabelName']
# How many per-image label frames are buffered before they are merged.
LABEL_CHUNK_SIZE = 1000

tgt_df = pd.DataFrame(columns=LABEL_COLUMNS)
# Template handed to Job.add_labels, which returns a new frame and leaves its
# argument untouched.
empty_labels = pd.DataFrame(columns=LABEL_COLUMNS)
label_frames = []   # small per-image frames, not merged yet
label_chunks = []   # merged frames of up to LABEL_CHUNK_SIZE images each
fs = []
failed_jobs = 0


def _merge_label_frames():
    """Merge the buffered per-image frames into one chunk."""
    if label_frames:
        label_chunks.append(pd.concat(label_frames, ignore_index=True))
        label_frames.clear()


def _collect_labels():
    """Return all labels gathered so far as a single DataFrame."""
    _merge_label_frames()
    if not label_chunks:
        return empty_labels.copy()
    return pd.concat(label_chunks, ignore_index=True)


def _count_failed(done):
    """Return how many of the finished futures in ``done`` raised."""
    return sum(1 for f in done if f.exception() is not None)


def _flush_job(cur_job, executor):
    """Submit the crops of ``cur_job`` and buffer its labels, if it has any."""
    if cur_job is None or not cur_job.something_to_submit():
        return
    fs.append(cur_job.submit(executor))
    # Growing one big DataFrame per image copies it every time (quadratic);
    # buffer small frames instead and merge them in chunks.
    label_frames.append(cur_job.add_labels(empty_labels))
    if len(label_frames) >= LABEL_CHUNK_SIZE:
        _merge_label_frames()


with futures.ThreadPoolExecutor(max_workers=NUM_JOBS) as executor:
    job = None
    try:
        # A new Job starts whenever the ImageID changes, so this assumes the
        # rows of one image are contiguous in df — TODO confirm for the input.
        for _, row in tqdm(df.iterrows(), total=len(df)):
            img_id = row['ImageID']
            if img_id[0] in ignore_groups:
                continue
            if job is None:
                job = Job(img_id)
            if not job.add_row(row):
                # The row belongs to the next image: hand off the finished job.
                _flush_job(job, executor)
                job = Job(img_id)
                job.add_row(row)
                if len(fs) >= MAX_CONCURRENT_JOBS:
                    # Back-pressure: give the workers time to drain the queue.
                    done_fs, fs = futures.wait(fs, timeout=WAIT_SECONDS)
                    fs = list(fs)
                    failed_jobs += _count_failed(done_fs)
                    print("Collected %d completed jobs, cur ImageID=%s" % (len(done_fs), img_id))
        # job stays None when df is empty or every row was filtered out;
        # _flush_job handles that case.
        _flush_job(job, executor)
        tgt_df = _collect_labels()
        tgt_df.to_csv(f'{TGT_PATH}train_proc.csv', index=False)
        print("Waiting for %d jobs to be completed" % len(fs))
        done_fs, _ = futures.wait(fs)
        failed_jobs += _count_failed(done_fs)
    except KeyboardInterrupt:
        print("Interrupt pressed, waiting for %d jobs to be completed gracefully" % len(fs))
        done_fs, _ = futures.wait(fs)
        failed_jobs += _count_failed(done_fs)
        # Keep the labels gathered so far available in memory; the CSV is
        # deliberately not written for a partial run.
        tgt_df = _collect_labels()

# Worker exceptions are stored on the futures and would otherwise go unnoticed.
if failed_jobs:
    print("%d jobs raised an exception; their crops may be missing" % failed_jobs)


  0%|          | 0/30000 [00:00<?, ?it/s][A
  0%|          | 72/30000 [00:00<00:42, 706.05it/s][A

Converting 30000 bounding boxes



  0%|          | 123/30000 [00:00<00:50, 596.77it/s][A
  1%|          | 182/30000 [00:00<00:52, 563.66it/s][A
  1%|          | 272/30000 [00:00<00:46, 633.89it/s][A
  1%|          | 351/30000 [00:00<00:45, 650.14it/s][A
  1%|▏         | 413/30000 [00:00<00:46, 635.23it/s][A
  2%|▏         | 478/30000 [00:00<00:47, 625.98it/s][A
  2%|▏         | 538/30000 [00:00<00:47, 624.45it/s][A
  2%|▏         | 626/30000 [00:00<00:46, 637.21it/s][A
  2%|▏         | 722/30000 [00:01<00:44, 660.12it/s][A
  3%|▎         | 803/30000 [00:01<00:43, 669.13it/s][A
  3%|▎         | 875/30000 [00:01<00:43, 663.94it/s][A
  3%|▎         | 947/30000 [00:01<00:43, 663.46it/s][A
  3%|▎         | 1035/30000 [00:01<00:43, 661.97it/s][A
  4%|▎         | 1108/30000 [00:01<00:43, 666.12it/s][A
  4%|▍         | 1182/30000 [00:01<00:43, 667.01it/s][A
  4%|▍         | 1264/30000 [00:01<00:43, 667.55it/s][A
  4%|▍         | 1333/30000 [00:02<00:43, 662.26it/s][A
  6%|▌         | 1705/30000 [00:02<00:43, 

Waiting for 1000 jobs to be completed



 37%|███▋      | 11207/30000 [00:28<00:47, 399.76it/s][A
 38%|███▊      | 11262/30000 [00:38<01:04, 291.98it/s][A
 38%|███▊      | 11333/30000 [00:38<01:03, 292.92it/s][A
 38%|███▊      | 11400/30000 [00:38<01:03, 293.73it/s][A
 38%|███▊      | 11469/30000 [00:38<01:02, 294.64it/s][A
 38%|███▊      | 11528/30000 [00:39<01:02, 295.36it/s][A
 39%|███▊      | 11601/30000 [00:39<01:02, 296.40it/s][A
 39%|███▉      | 11660/30000 [00:39<01:01, 297.12it/s][A
 39%|███▉      | 11719/30000 [00:39<01:01, 297.75it/s][A
 39%|███▉      | 11783/30000 [00:39<01:01, 298.52it/s][A
 39%|███▉      | 11839/30000 [00:39<01:00, 299.23it/s][A
 40%|███▉      | 11903/30000 [00:39<01:00, 300.07it/s][A
 40%|███▉      | 11961/30000 [00:39<00:59, 300.74it/s][A
 40%|████      | 12018/30000 [00:39<00:59, 301.39it/s][A
 40%|████      | 12108/30000 [00:39<00:59, 302.88it/s][A
 41%|████      | 12175/30000 [00:40<00:58, 303.56it/s][A
 41%|████      | 12243/30000 [00:40<00:58, 304.16it/s][A
 41%|████    

Waiting for 1000 jobs to be completed



 70%|███████   | 21061/30000 [01:08<00:28, 309.51it/s][A
 70%|███████   | 21116/30000 [01:13<00:31, 285.57it/s][A
 71%|███████   | 21181/30000 [01:14<00:30, 286.06it/s][A
 71%|███████   | 21255/30000 [01:14<00:30, 286.32it/s][A
 71%|███████   | 21317/30000 [01:14<00:30, 286.65it/s][A
 71%|███████   | 21374/30000 [01:14<00:30, 287.01it/s][A
 71%|███████▏  | 21444/30000 [01:14<00:29, 287.41it/s][A
 72%|███████▏  | 21513/30000 [01:14<00:29, 287.80it/s][A
 72%|███████▏  | 21574/30000 [01:14<00:29, 288.16it/s][A
 72%|███████▏  | 21621/30000 [01:14<00:29, 288.36it/s][A
 72%|███████▏  | 21680/30000 [01:15<00:28, 288.72it/s][A
 72%|███████▏  | 21740/30000 [01:15<00:28, 289.13it/s][A
 73%|███████▎  | 21791/30000 [01:15<00:28, 289.42it/s][A
 73%|███████▎  | 21842/30000 [01:15<00:28, 289.65it/s][A
 73%|███████▎  | 21905/30000 [01:15<00:27, 290.00it/s][A
 73%|███████▎  | 21966/30000 [01:15<00:27, 290.34it/s][A
 74%|███████▎  | 22108/30000 [01:15<00:27, 291.23it/s][A
 74%|███████▍

Waiting for 1000 jobs to be completed



 96%|█████████▋| 28934/30000 [01:38<00:03, 295.03it/s][A
 97%|█████████▋| 29003/30000 [01:45<00:03, 273.64it/s][A
 97%|█████████▋| 29057/30000 [01:46<00:03, 273.89it/s][A
 97%|█████████▋| 29111/30000 [01:46<00:03, 274.12it/s][A
 97%|█████████▋| 29162/30000 [01:46<00:03, 274.30it/s][A
 97%|█████████▋| 29224/30000 [01:46<00:02, 274.61it/s][A
 98%|█████████▊| 29287/30000 [01:46<00:02, 274.91it/s][A
 98%|█████████▊| 29343/30000 [01:46<00:02, 275.17it/s][A
 98%|█████████▊| 29404/30000 [01:46<00:02, 275.46it/s][A
 98%|█████████▊| 29456/30000 [01:46<00:01, 275.69it/s][A
 98%|█████████▊| 29507/30000 [01:46<00:01, 275.82it/s][A
 99%|█████████▊| 29579/30000 [01:47<00:01, 276.20it/s][A
 99%|█████████▉| 29642/30000 [01:47<00:01, 276.51it/s][A
 99%|█████████▉| 29697/30000 [01:47<00:01, 276.75it/s][A
 99%|█████████▉| 29750/30000 [01:47<00:00, 276.98it/s][A
 99%|█████████▉| 29803/30000 [01:47<00:00, 277.13it/s][A
100%|█████████▉| 29865/30000 [01:47<00:00, 277.36it/s][A
100%|████████

Waiting for 157 jobs to be completed
