In [None]:
# warning: current data partitioning with label overrides assumes one object per image

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from glob import glob
import random
import os
import json

from common import *

In [None]:
# data directories: images and labels live side by side under data_dir
data_dir = "../data/combined/resized"
image_dir, label_dir = (
    os.path.join(data_dir, subdir) for subdir in ("images", "labels")
)

# label overrides
# Blank spreadsheet cells are replaced by '' before the lookup table is built.
# Each row maps the stripped string form of its 'ID' cell to the lower-cased
# 'Tier1' cell; if an ID appears more than once, the last row wins.
label_overrides_df = pd.read_excel('../data/labels.xlsx').fillna('')
label_overrides = {
    str(image_id).strip(): tier1.lower()
    for image_id, tier1 in zip(label_overrides_df['ID'], label_overrides_df['Tier1'])
}

In [None]:
# show class distribution
# Sort first: glob gives no ordering guarantee, so a seeded shuffle is only
# repeatable when it starts from a fixed order.
image_paths = sorted(glob(os.path.join(image_dir, "*.jpg")))
shuffle_seed = 42 # fixed seed so a fresh "Restart & Run All" yields the same partitions
# A dedicated Random instance keeps the global `random` state untouched.
random.Random(shuffle_seed).shuffle(image_paths) # randomise
distribution(class_summary(image_paths, label_dir, label_overrides))

In [None]:
# create partitions
k = 5 # number of partitions

# Spread the images as evenly as possible: the first `remainder` partitions get
# one extra image, so sizes differ by at most one. A fixed size of
# `len // k + 1` over-allocates and can leave the last partition short or even
# empty (e.g. 10 images with k=5 -> 3, 3, 3, 1, 0).
base_size, remainder = divmod(len(image_paths), k)
n_k = base_size + (1 if remainder else 0) # size of the largest partition
parts = []
start = 0
for i in range(k):
    end = start + base_size + (1 if i < remainder else 0)
    p = image_paths[start:end] # contiguous slice of the image path list
    print(f"partition: {i+1}, start: {start}, end: {end}, count: {len(p)}")
    # summarise this partition's classes and show their distribution
    classes = class_summary(p, label_dir, label_overrides)
    distribution(classes)
    parts.append(classes)
    print("================================")
    start = end # next partition begins where this one stopped

In [None]:
# save partitions
exp_name = 't1'
# exist_ok=True replaces the check-then-create pattern (os.path.exists followed
# by os.mkdir), which can race with another process, and keeps the cell re-runnable.
os.makedirs(exp_name, exist_ok=True)

# Hand each partition to save_partition with a path of the form
# <exp_name>/<exp_name>-part-<i>.json, where i is the 0-based partition index.
for i, p in enumerate(parts):
    path = os.path.join(exp_name, f"{exp_name}-part-{i}.json")
    save_partition(path, p)

In [None]:
# save label map
# Class names in id order; the 'background' entry (id 0) is required.
class_names = ('background', 'rock', 'artefact')
label_map = {name: class_id for class_id, name in enumerate(class_names)}

# Serialise the class-name -> id mapping as JSON inside the experiment directory.
path = os.path.join(exp_name, f"{exp_name}-label-map.json")
with open(path, 'w') as label_map_file:
    label_map_file.write(json.dumps(label_map))