# Active Learning (Labeling Selection) using Entropy

In [None]:
# --- Run configuration -------------------------------------------------------
# Whether this round starts from the stage-1 model. When true, the round
# begins with an empty annotation set instead of reading the annotations file.
FROM_STAGE1 = True
# Number of new samples to add to the annotation set in this round.
NUM_ANNOTATE = 52

# Working-directory prefix. Paths are built by plain string concatenation
# (e.g. BASE_PATH + MODEL_FNAME), so the trailing slash is required.
BASE_PATH = '/content/'
# Name of the model archive (without .zip) and of the directory it unpacks to.
MODEL_FNAME = 'model_stage1'
# File holding the already-annotated sample file names, one per line.
ANNOTATIONS_FNAME = 'annotations.txt'

In [None]:
# One-time environment setup. The chain only runs while the `pip_installed`
# marker file is absent, and every step is joined with `&&`, so the marker is
# touched only if all installs, unzips, copies and the clone succeed.
# NOTE(review): reads from /content/drive/MyDrive/... - assumes Google Drive is
# already mounted; confirm before running on a fresh runtime.
![ ! -f "pip_installed" ] && \
pip install -q tensorflow-datasets==4.4.0 tensorflow-addons && \
unzip -qq /content/drive/MyDrive/TeamSemiSuperCV/Wing/xray_reborn.zip -d /root/tensorflow_datasets && \
unzip -qq /content/drive/MyDrive/TeamSemiSuperCV/Wing/XRay_.zip -d /content && \
unzip -qq /content/drive/MyDrive/TeamSemiSuperCV/Active_Learn/$MODEL_FNAME\.zip -d /content/$MODEL_FNAME && \
cp /content/drive/MyDrive/TeamSemiSuperCV/Active_Learn/preprocess.py /content && \
cp /content/drive/MyDrive/TeamSemiSuperCV/Active_Learn/Xray_Reborn.py /content && \
cp /content/drive/MyDrive/TeamSemiSuperCV/Active_Learn/valid.txt /content && \
cp /content/drive/MyDrive/TeamSemiSuperCV/Active_Learn/test.txt /content && \
cp /content/drive/MyDrive/TeamSemiSuperCV/Active_Learn/$ANNOTATIONS_FNAME /content && \
git clone --depth 1 https://github.com/TeamSemiSuperCV/semi-super.git /content/semi-super && \
touch pip_installed
# List the working directory so the unpacked/copied files can be checked.
!ls -F

In [None]:
import logging
import os

# Quieten the libraries imported in the following cells: set the TensorFlow
# C++ log-level variable and disable Python logging at WARNING and below.
# NOTE(review): presumably this must run before `import tensorflow` to take
# effect - keep this cell ahead of the import cell.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
logging.disable(logging.WARNING)

In [None]:
import json

import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import numpy as np

from importlib import reload
from pathlib import Path
from scipy.stats import entropy
from scipy.special import softmax

from preprocess import dict2dict, IMG_SIZE
import Xray_Reborn

In [None]:
# Resolve the directory that actually contains the SavedModel.
_model_root = Path(BASE_PATH + MODEL_FNAME)
if not (_model_root / 'saved_model.pb').exists():
  # The archive did not unpack to a SavedModel at its top level: fall back to
  # the first directory two levels down that has an `assets` sub-directory.
  _export_dirs = [p.parent for p in _model_root.glob('*/*/assets') if p.is_dir()]
  _model_root = _export_dirs[0]
model_path = str(_model_root)
model_path

## Labeling Selection

In [None]:
AUTO = tf.data.AUTOTUNE
ds = tfds.load('xray_reborn')

# Training split: per-example preprocessing via `dict2dict`, then batches of 64
# with prefetching.
ds_train = (
    ds['train']
    .map(dict2dict, num_parallel_calls=AUTO)
    .batch(64)
    .prefetch(AUTO)
)

In [None]:
# Load the SavedModel from the directory resolved into `model_path` above.
model = tf.saved_model.load(model_path)

In [None]:
# Output keys noted for this model:
#   'block_group1', 'block_group3', 'block_group2', 'initial_conv',
#   'final_avg_pool', 'block_group4', 'sup_head_input', 'proj_head_output',
#   'proj_head_input', 'initial_max_pool', 'logits_sup'
# Short key -> model output collected for every training sample.
_FEAT_OUTPUTS = {
    'shi': 'sup_head_input',
    'lsp': 'logits_sup',
    'phi': 'proj_head_input',
}
feats = {key: [] for key in _FEAT_OUTPUTS}
labels, fnames = [], []

# Run the model batch by batch, keeping outputs, labels and file names aligned.
for batch in ds_train:
    outputs = model(batch['image'], trainable=False)
    for key, output_name in _FEAT_OUTPUTS.items():
        feats[key].append(outputs[output_name].numpy())
    labels.append(batch['label'].numpy())
    fnames.append(batch['fname'].numpy())

# Stitch the per-batch arrays into one array per output.
feats = {key: np.concatenate(chunks) for key, chunks in feats.items()}
labels = np.concatenate(labels)
# File names come back as bytes; decode them to str.
fnames = [raw.decode('utf-8') for raw in np.concatenate(fnames)]
feats['shi'].shape, feats['lsp'].shape, feats['phi'].shape, labels.shape, len(fnames)

In [None]:
def get_entropy(feats, *, sample_labels=None, sample_fnames=None, top_k=None):
  """Rank samples by per-row entropy, highest first, for each entry of `feats`.

  Args:
    feats: dict mapping a short key to a 2-D array with one row per sample.
      Rows under the key 'lsp' are passed through softmax first; all other
      arrays are handed to `scipy.stats.entropy` unchanged (which normalizes
      each row to sum to 1).
    sample_labels: per-sample labels aligned with the rows. Defaults to the
      notebook-level `labels`.
    sample_fnames: per-sample file names aligned with the rows. Defaults to
      the notebook-level `fnames`.
    top_k: how many top-ranked samples to include in the printed label sum.
      Defaults to the notebook-level `NUM_ANNOTATE`.

  Returns:
    dict with the same keys as `feats`; each value is the list of
    whitespace-stripped file names ordered by decreasing entropy.

  Side effects:
    Prints one line per key: the sum of the labels among the `top_k`
    highest-entropy samples, then the max, min and mean entropy.
  """
  # Fall back to the notebook globals only when no override is supplied, so
  # existing `get_entropy(feats)` calls keep working unchanged.
  if sample_labels is None:
    sample_labels = labels
  if sample_fnames is None:
    sample_fnames = fnames
  if top_k is None:
    top_k = NUM_ANNOTATE

  ent_sorted = {}
  for key, values in feats.items():
    probs = softmax(values, axis=1) if key == 'lsp' else values
    ent = entropy(probs, axis=1)
    # Stable sort: samples with equal entropy keep their dataset order.
    ranked = sorted(zip(ent, sample_labels, sample_fnames),
                    key=lambda row: row[0], reverse=True)
    num_ones = sum(label for _, label, _ in ranked[:top_k])
    print(f'{key}: {num_ones}, {ent.max():.3f}, {ent.min():.3f}, {ent.mean():.3f}')
    ent_sorted[key] = [fname.strip() for _, _, fname in ranked]
  return ent_sorted

# For each collected output, file names ordered from highest to lowest entropy.
ent_sorted = get_entropy(feats)

In [None]:
# Build the set of file names that are already annotated.
if FROM_STAGE1:
  # Starting from the stage-1 model: nothing has been annotated yet.
  # Disabled alternative that seeds the set from the 'train_1pc' split:
  #   ds_train_1pc = ds['train_1pc']
  #   annotations = {d['fname'].numpy().decode('utf-8') for d in ds_train_1pc}
  annotations = set()
elif os.path.isfile(ANNOTATIONS_FNAME):
  print(ANNOTATIONS_FNAME)
  with open(ANNOTATIONS_FNAME) as f:
    # One file name per line; blank lines are ignored.
    annotations = {line.strip() for line in f if line.strip()}
else:
  # Previously this branch only printed the message and left `annotations`
  # unbound, so the next line failed with a misleading NameError (or silently
  # reused a stale value from an earlier run). Fail with the real cause.
  raise FileNotFoundError(f'{ANNOTATIONS_FNAME} not found!')

len(annotations)

In [None]:
# Pick the NUM_ANNOTATE highest-entropy samples and merge them with the
# existing annotations. If some of them were already annotated, keep walking
# down the ranking until NUM_ANNOTATE genuinely new samples have been added.
selected_feat = 'shi' if FROM_STAGE1 else 'lsp'
selected = ent_sorted[selected_feat]
target_annotations = len(annotations) + NUM_ANNOTATE
annotate_new = set(selected[:NUM_ANNOTATE]) | annotations

# Index of the next-ranked candidate not yet considered. The slice above
# covered indices [0, NUM_ANNOTATE), so the top-up starts AT NUM_ANNOTATE.
# (Incrementing before indexing skipped that candidate.)
annotate_thresh = NUM_ANNOTATE
# The bound check stops cleanly when the ranking is exhausted instead of
# raising IndexError; the printed length below then shows the shortfall.
while len(annotate_new) < target_annotations and annotate_thresh < len(selected):
  annotate_new.add(selected[annotate_thresh])
  annotate_thresh += 1
  print('!', end='')  # one mark per extra candidate considered
print()
len(annotate_new)

In [None]:
with open('annotations.txt', 'w') as f:
    for fname in annotate_new:
        print(fname, file=f)
!wc -l annotations.txt

## Generate New Dataset

In [None]:
# reload(Xray_Reborn)
# Remove the dataset directory previously generated under BASE_PATH so that the
# load below has to build it again.
# NOTE(review): presumably the rebuild is what picks up the annotations file
# written above - confirm against the Xray_Reborn builder.
!rm -rf {BASE_PATH + 'xray_reborn'}
ds = tfds.load('XrayReborn', data_dir=BASE_PATH)  # will re-generate TFDS dataset
# Sizes of the splits used by the fine-tuning stage.
len(ds['train_act']), len(ds['validation']), len(ds['test'])

## Stage 2 Fine-Tuning

In [None]:
# Capture the output lines of `wc -l`; the first whitespace-separated field of
# the first line is the number of lines in annotations.txt. The value is used
# to name the fine-tuned model directories below.
wc_annotations = !wc -l annotations.txt
len_annotations = int(wc_annotations[0].split()[0])
len_annotations

In [None]:
class simclrCommand():
  def __init__(self, params):
    self.params = params

  def compile_command(self):
    simclr_command = ['python3 /content/semi-super/run.py']
    for k,v in self.params.items():
      simclr_command.append(f'--{k}={v}')
    return (" ").join(simclr_command)

  def run_command(self):
    !{self.compile_command()}

In [None]:
# Flags passed to semi-super's run.py for stage-2 fine-tuning; each entry
# becomes one `--key=value` argument.
params = {
    # Dataset
    'dataset': "xray_reborn",

    # Training Logistics
    'train_mode': "finetune",
    "mode": "train_then_eval",
    'train_split': 'train_act',
    'eval_split': "validation",
    'checkpoint_epochs': 20,
    "save_only_last_ckpt": True,
    "eval_per_loop": False,
    'zero_init_logits_layer': False,
    "use_tpu": False,

    # Training Hyperparams
    'warmup_epochs': 0,
    "train_epochs": 60,
    'fine_tune_after_block': 3,
    "train_batch_size": 14,
    "learning_rate": 0.0005,
    "learning_rate_scaling": 'sqrt',
    'weight_decay': 0.001,
    "temperature": 0.1,

    # Architecture
    "image_size": 224,
    "resnet_depth": 50,
    "width_multiplier": 2,
    "sk_ratio":0.0625,

    # Augmentations
    "color_jitter_strength": 0.5,
    "use_blur": False,
    "area_range_min": 1.0,

    # Static
    # Same directory the dataset is generated into (tfds.load(...,
    # data_dir=BASE_PATH)); using the constant keeps the two from diverging
    # if BASE_PATH is ever changed.
    "data_dir": BASE_PATH,
    }

# When starting from the stage-1 model, the logits layer is zero-initialized.
if FROM_STAGE1:
  params['zero_init_logits_layer'] = True
slimsk2 = simclrCommand(params)

In [None]:
# 1st Fine-Tuning /w Validation Split Results
def FT1(run, rerun=False):
  model_ft_name = f'model_{len_annotations}-{run}'
  global model_ft_path
  model_ft_path = BASE_PATH + model_ft_name
  if not rerun:
    !rm -rf $model_ft_path
    assert not os.path.isdir(model_ft_path)

  slimsk2.params['mode'] = 'train_then_eval'
  slimsk2.params['checkpoint'] = model_path
  slimsk2.params['model_dir'] = model_ft_path
  slimsk2.run_command()

In [None]:
# 2nd (Follow-on) Fine-Tuning /w Validation Split Results
def FT2(run, rerun=False):
  model_ft_name = f'model_{len_annotations}-{run}'
  model_ft_path = BASE_PATH + model_ft_name
  model_ft_path_sm = str([p.parent for p in Path(model_ft_path).glob('*/*/assets') if p.is_dir()][0])
  global model_ft2_path
  model_ft2_path = model_ft_path + '+'
  if not rerun:
    !rm -rf $model_ft2_path
    assert not os.path.isdir(model_ft2_path)

  slimsk2.params['mode'] = 'train_then_eval'
  slimsk2.params['checkpoint'] = model_ft_path_sm
  slimsk2.params['model_dir'] = model_ft2_path
  slimsk2.run_command()

In [None]:
# Four fine-tuning runs, one per cell; each writes to its own
# model_<len_annotations>-<run> directory.
FT1(1) # Run #1

In [None]:
FT1(2) # Run #2

In [None]:
FT1(3) # Run #3

In [None]:
FT1(4) # Run #4

In [None]:
# Average Validation Accuracy based on 4 Runs
# NOTE(review): the four accuracies are hard-coded - presumably copied by hand
# from the outputs of the FT1 runs above; update them after re-running.
avg_eval_accuracy = np.mean([0.956107, 0.956107, 0.948473, 0.948473])
avg_eval_accuracy