In [None]:
# Enable IPython's autoreload extension; mode 2 is meant to pick up edits to
# imported modules (such as the local `common` helpers) without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
from glob import glob
import os
import json
import pandas as pd
import numpy as np
from common import *

In [None]:
# load benchmark reference paths
# Name of the benchmark experiment; it is both the top-level folder and the
# prefix of every partition folder.
exp_name = "t2b"
benchmark_dir = os.path.join(exp_name, 'benchmark')

# One slot per respondent key; the next cell fills each slot with the table
# read from <key>.xlsx.
responses = dict.fromkeys(('SM', 'RP'))

# answers.json of each of the five benchmark partitions, in partition order.
answers_paths = []
for part_idx in range(5):
    part_dir = os.path.join(benchmark_dir, f'{exp_name}-part-{part_idx}')
    answers_paths.append(os.path.join(part_dir, 'answers.json'))

In [None]:
# load responses
# Each respondent key in `responses` has a spreadsheet <benchmark_dir>/<key>.xlsx.
# It is read with the first row as header and the first column as index, so
# position [i, pi] is the response to item i of partition pi.
for r in responses:
    r_path = os.path.join(benchmark_dir, f'{r}.xlsx')
    responses[r] = pd.read_excel(r_path, header=0, index_col=0)

# iterate over partitions
data = []
for pi in range(5):
    part = os.path.join(benchmark_dir, f'{exp_name}-part-{pi}')
    answers_path = os.path.join(part, 'answers.json')

    # load answers
    with open(answers_path, 'r') as f:
        answers = json.load(f)

    # append responses: add one extra key per respondent to every answer record
    for r, response_table in responses.items():
        for i, answer in enumerate(answers):
            value = response_table.iloc[i, pi]
            # Blank spreadsheet cells come back as NaN (a float), which has no
            # .lower(). Leave non-strings untouched so the `.dropna()` applied
            # when the benchmark frame is built can discard those rows instead
            # of this loop crashing.
            answer[r] = value.lower() if isinstance(value, str) else value

    # append to master list
    data += answers

In [None]:
# Assemble the benchmark table from the collected answer records:
#   1. drop every row that has any missing value,
#   2. discard the `test_idx` column,
#   3. keep only the first row for each distinct `image_path`.
benchmark = (
    pd.DataFrame(data)
    .dropna()
    .drop(columns=['test_idx'])
    .drop_duplicates(subset="image_path")
)

# Per-label counts followed by the number of benchmark rows kept.
print(benchmark['label_override'].value_counts())
print('Total', len(benchmark.index))

In [None]:
# data directories
data_dir = "../data/combined/resized"
image_dir, label_dir = (os.path.join(data_dir, sub) for sub in ("images", "labels"))

# label overrides
# Missing cells become '' so every Tier1 value supports .lower().
label_overrides_df = pd.read_excel('../data/labels.xlsx').fillna('')

# Map the stripped string form of each ID to its lower-cased Tier1 label;
# if an ID appears more than once, the last row wins.
label_overrides = {
    str(record['ID']).strip(): record['Tier1'].lower()
    for _, record in label_overrides_df.iterrows()
}

In [None]:
# remove benchmark images from training set
training_image_prefix = '../data/combined/resized/images/'
image_paths = glob(os.path.join(image_dir, "*.jpg"))

# Images are matched on file name alone, so the directory part of either
# path list does not matter.
all_image_names = {os.path.basename(p) for p in image_paths}
benchmark_image_names = {os.path.basename(p) for p in benchmark['image_path'].values}

# Everything that is on disk but not in the benchmark, re-rooted under the
# training prefix.
training_image_paths = [
    os.path.join(training_image_prefix, name)
    for name in all_image_names - benchmark_image_names
]
print(len(training_image_paths))

In [None]:
# show class distribution
# Imported here explicitly: the notebook's import cell never imports `random`,
# so the shuffle previously worked only if `from common import *` supplied it.
import random

SHUFFLE_SEED = 42  # fixed so the partitions saved further down can be regenerated

# The path list was built from a set difference and therefore has no stable
# order; sort first so the seeded shuffle gives the same result on every run,
# including when this cell is re-run.
training_image_paths = sorted(training_image_paths)
random.Random(SHUFFLE_SEED).shuffle(training_image_paths)  # randomise
distribution(class_summary(training_image_paths, label_dir, label_overrides))

In [None]:
# create partitions
k = 1 # number of partitions
if k < 1:
    raise ValueError(f"number of partitions must be at least 1, got {k}")

n_total = len(training_image_paths)
parts = []
for i in range(k):
    # Integer boundaries i*n//k give contiguous, non-overlapping slices that
    # cover every path exactly once and differ in size by at most one.
    # The earlier fixed step of n//k + 1 could leave the last partitions
    # short or empty, and printed an `end` past the end of the list.
    start = i * n_total // k
    end = (i + 1) * n_total // k
    p = training_image_paths[start:end]
    print(f"partition: {i+1}, start: {start}, end: {end}, count: {len(p)}")
    # Summarise the classes in this slice, display the distribution, and keep
    # the summary so the next cell can save one file per partition.
    classes = class_summary(p, label_dir, label_overrides)
    distribution(classes)
    parts.append(classes)
    print("================================")

In [None]:
# save partitions
# NOTE: this re-binds `exp_name` (previously the benchmark name); the cells
# below use the new value for the output folder and as the file-name prefix.
exp_name = 't1-final'
# Create the output folder if needed; exist_ok avoids the separate
# exists-then-mkdir check and makes re-running the cell safe.
os.makedirs(exp_name, exist_ok=True)

# One JSON file per partition summary: <exp_name>/<exp_name>-part-<i>.json
for i, p in enumerate(parts):
    path = os.path.join(exp_name, f"{exp_name}-part-{i}.json")
    save_partition(path, p)

In [None]:
# save label map
# Class names in id order; 'background' has to stay first because id 0 is
# marked as required.
label_names = ('background', 'rock', 'artefact')
label_map = {name: class_id for class_id, name in enumerate(label_names)}

# Written next to the partitions as <exp_name>/<exp_name>-label-map.json
path = os.path.join(exp_name, f"{exp_name}-label-map.json")
with open(path, 'w') as f:
    f.write(json.dumps(label_map))

In [None]:
# save benchmark as test set
image_prefix = '../data/combined/resized/images/'
label_prefix = '../data/combined/resized/labels/'

# Keep only the file name of each benchmark path and re-root it under the
# resized image / label folders.
benchmark_image_paths = [
    os.path.join(image_prefix, os.path.basename(p))
    for p in benchmark['image_path'].values
]
benchmark_label_paths = [
    os.path.join(label_prefix, os.path.basename(p))
    for p in benchmark['label_path'].values
]

# drop other cols (the respondent columns) and set correct paths
test_set = benchmark.drop(columns=['SM', 'RP']).assign(
    image_path=benchmark_image_paths,
    label_path=benchmark_label_paths,
)

# override labels: every label that is not 'rock' becomes 'artefact'
mask = test_set['label_override'].ne('rock')
test_set.loc[mask, 'label_override'] = 'artefact'

In [None]:
# Show the label counts of the test set after the override step above.
test_set['label_override'].value_counts()

In [None]:
# Write the test set as a JSON array with one object per row to
# <exp_name>/<exp_name>-test.json
path = os.path.join(exp_name, f"{exp_name}-test.json")
records_json = test_set.to_json(orient='records')
with open(path, 'w') as f:
    f.write(records_json)