In [None]:
# warning: current data partitioning with label overrides assumes one object per image

In [None]:
# IPython autoreload extension: in mode 2 all imported modules are reloaded
# before each cell executes, so edits to local modules (e.g. `common`) are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from glob import glob
import random
import os
import json

from common import *

In [None]:
# Locations of the resized dataset: images and their label files live in
# sibling sub-directories of data_dir.
data_dir = "../data/combined/resized"
image_dir, label_dir = (os.path.join(data_dir, sub) for sub in ("images", "labels"))

# Label overrides: map each spreadsheet ID (as a stripped string) to its
# lower-cased Tier2 value. Empty cells are turned into '' before the lookup is
# built, so a missing Tier2 becomes an empty-string override.
label_overrides_df = pd.read_excel('../data/labels.xlsx').fillna('')
label_overrides = {
    str(raw_id).strip(): tier2.lower()
    for raw_id, tier2 in zip(label_overrides_df['ID'], label_overrides_df['Tier2'])
}

In [None]:
# Build the per-class summary of every .jpg in image_dir.
# glob() yields paths in arbitrary (filesystem-dependent) order; sorting gives
# class_summary a stable input ordering from run to run.
image_paths = sorted(glob(os.path.join(image_dir, "*.jpg")))
# NOTE(review): class_summary comes from `common`; later cells index cs by
# class name and read element [0] of each entry as the image path.
cs = class_summary(image_paths, label_dir, label_overrides)

In [None]:
# NOTE(review): distribution() is provided by `common` (star import); presumably
# it displays the number of images per class in cs — confirm against common.py.
distribution(cs)

In [None]:
# Base set: the image path (first element of each summary entry) of every core,
# followed by every tool. These images are included in full in every partition.
p_base = [entry[0] for label in ('core', 'tool') for entry in cs[label]]
print(len(p_base))

# Remaining classes are sub-sampled: sample_size images are drawn per class.
other_classes = ['flake', 'rock']
sample_size = 300

In [None]:
# create partitions
#
# Each "partition" is an independent draw: all base (core/tool) images plus a
# fresh random sample of `sample_size` images from each class in other_classes,
# shuffled and then split into train/test. Partitions therefore overlap (the
# base images appear in every one); they are not disjoint folds.
k = 5 # number of partitions
r_test = 0.2 # test ratio
random_seed = 42 # fixed seed so re-running this cell redraws the same partitions

# NOTE(review): n_k is computed but never used in this cell; kept in case
# another cell reads it.
n_k = (len(p_base) + len(other_classes) * sample_size) // k + 1

# Seed the global RNG once, before any sampling/shuffling. Results repeat only
# if cs lists its entries in the same order between runs.
random.seed(random_seed)

parts = []
for i in range(k):

    # generate class balanced list of images
    p_image_paths = list(p_base)
    for c in other_classes:
        # random.sample raises ValueError if the class has fewer than
        # sample_size images
        p_image_paths += random.sample([p[0] for p in cs[c]], sample_size)
    random.shuffle(p_image_paths)

    # the whole shuffled list is used; start/end are only reported below
    start = 0
    end = len(p_image_paths)
    p = p_image_paths[start:end]
    print(f"partition: {i+1}, start: {start}, end: {end}, count: {len(p)}")

    # train-test split: the last n_test items form the test set.
    # An explicit split index is used instead of p[:-n] / p[-n:], because with
    # n == 0 those slices give an empty train set and put everything in test.
    n_test = int(r_test * len(p))
    split = len(p) - n_test
    p_train = p[:split]
    p_test = p[split:]

    classes_train = class_summary(p_train, label_dir, label_overrides)
    classes_test = class_summary(p_test, label_dir, label_overrides)
    print("train:")
    print("-----------")
    distribution(classes_train)
    print("test:")
    print("-----------")
    distribution(classes_test)
    parts.append((classes_train, classes_test))
    print("================================")

In [None]:
# save partitions
exp_name = 't2b'
# Create the experiment directory if needed. makedirs(exist_ok=True) replaces
# the separate exists()/mkdir() pair: it has no check-then-create race and
# also creates missing parent directories if exp_name is ever a nested path.
os.makedirs(exp_name, exist_ok=True)

# One train file and one test file per partition, numbered from 0:
#   <exp_name>/<exp_name>-train-<i>.json and <exp_name>/<exp_name>-test-<i>.json
# NOTE(review): save_partition comes from `common`; it is given a target path
# and one class summary.
for i, (part_train, part_test) in enumerate(parts):
    path_train = os.path.join(exp_name, f"{exp_name}-train-{i}.json")
    path_test = os.path.join(exp_name, f"{exp_name}-test-{i}.json")
    save_partition(path_train, part_train)
    save_partition(path_test, part_test)

In [None]:
# save label map
# Class names in id order: the position in this tuple is the integer id written
# to the label map. 'background' must stay first (id 0) — it is required.
class_names = ('background', 'rock', 'flake', 'tool', 'core')
label_map = {name: class_id for class_id, name in enumerate(class_names)}

# Serialise the mapping as JSON inside the experiment directory.
path = os.path.join(exp_name, f"{exp_name}-label-map.json")
with open(path, 'w') as f:
    f.write(json.dumps(label_map))