# Sampling

In [None]:
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Returns points that minimizes the maximum distance of any point to a center.
Implements the k-Center-Greedy method in
Ozan Sener and Silvio Savarese.  A Geometric Approach to Active Learning for
Convolutional Neural Networks. https://arxiv.org/abs/1708.00489 2017
Distance metric defaults to l2 distance.  Features used to calculate distance
are either raw features or if a model has transform method then uses the output
of model.transform(X).
Can be extended to a robust k centers algorithm that ignores a certain number of
outlier datapoints.  The resulting centers are the solution to a mixed integer program.
"""

from sklearn.metrics import pairwise_distances
import abc
import numpy as np

class SamplingMethod(object):
  """Base class for batch sampling strategies.

  Subclasses are expected to override `select_batch_`; callers use the public
  `select_batch` wrapper, which forwards its keyword arguments unchanged.
  """

  # NOTE: this is the Python 2 way of declaring a metaclass.  Under Python 3
  # the attribute is inert, so the abstract methods below are not enforced.
  # It is left as-is so that existing code instantiating this class directly
  # keeps working.
  __metaclass__ = abc.ABCMeta

  @abc.abstractmethod
  def __init__(self, X, y, seed, **kwargs):
    """Stores the data, labels and seed on the instance.

    Args:
      X: feature array; `flatten_X` reads its `.shape`.
      y: labels (only stored here).
      seed: random seed (only stored here).
      **kwargs: accepted and ignored.
    """
    self.X = X
    self.y = y
    self.seed = seed

  def flatten_X(self):
    """Returns `self.X` as a 2-D array of shape (n_samples, n_features).

    Inputs with at most two dimensions are returned untouched (same object);
    higher-rank inputs have all trailing axes collapsed into one.
    """
    shape = self.X.shape
    flat_X = self.X
    if len(shape) > 2:
      # np.prod replaces the np.product alias, which was removed in NumPy 2.0.
      flat_X = np.reshape(self.X, (shape[0], np.prod(shape[1:])))
    return flat_X


  @abc.abstractmethod
  def select_batch_(self):
    """Strategy-specific selection hook; the base version returns None."""
    return

  def select_batch(self, **kwargs):
    """Public entry point: forwards all keyword arguments to `select_batch_`."""
    return self.select_batch_(**kwargs)

  def to_dict(self):
    """Returns a serialisable description of the sampler; None by default."""
    return None

class kCenterGreedy(SamplingMethod):
  """Greedy k-center selection.

  Repeatedly picks the point whose distance to its nearest already-chosen
  center is largest, and keeps a running per-point "distance to nearest
  center" column vector in `self.min_distances`.
  """

  def __init__(self, X, y, seed, metric='euclidean'):
    """Sets up the sampler.

    Args:
      X: data array; its first axis indexes observations.
      y: labels (stored, not used by this class).
      seed: seed for the random choice of the first center when no center
        exists yet.
      metric: metric name forwarded to sklearn's `pairwise_distances`.
    """
    self.X = X
    self.y = y
    # Kept so that the random initial pick in select_batch_ is reproducible.
    self.seed = seed
    self.flat_X = self.flatten_X()
    self.name = 'kcenter'
    self.features = self.flat_X
    self.metric = metric
    self.min_distances = None
    self.n_obs = self.X.shape[0]
    self.already_selected = []

  def update_distances(self, cluster_centers, only_new=True, reset_dist=False):
    """Update min distances given cluster centers.

    Args:
      cluster_centers: indices of cluster centers
      only_new: only calculate distance for newly selected points and update
        min_distances.
      reset_dist: whether to reset min_distances.
    """

    if reset_dist:
      self.min_distances = None
    if only_new:
      # Set lookup instead of scanning the list once per candidate.
      known = set(self.already_selected)
      cluster_centers = [d for d in cluster_centers if d not in known]
    # len() rather than truthiness so an index array is accepted as well.
    if len(cluster_centers) > 0:
      # Update min_distances for all examples given new cluster center.
      x = self.features[cluster_centers]
      dist = pairwise_distances(self.features, x, metric=self.metric)

      # Collapse to one column first: `dist` has one column per new center,
      # and taking np.minimum against the raw matrix would broadcast
      # min_distances from (n, 1) to (n, k) whenever k > 1.
      nearest = np.min(dist, axis=1).reshape(-1, 1)

      if self.min_distances is None:
        self.min_distances = nearest
      else:
        self.min_distances = np.minimum(self.min_distances, nearest)

  def select_batch_(self, model, already_selected, N, **kwargs):
    """
    Diversity promoting active learning method that greedily forms a batch
    to minimize the maximum distance to a cluster center among all unlabeled
    datapoints.

    Args:
      model: object exposing `transform(X)`; its output is used as the
        feature space.  A falsy value means `self.X` is used directly.
      already_selected: index of datapoints already selected
      N: batch size
    Returns:
      indices of points selected to minimize distance to cluster centers
    """

    try:
      # Assumes that the transform function takes in original data and not
      # flattened data.
      print('Getting transformed features...')
      if model:
        self.features = model.transform(self.X)
      else:
        self.features = self.X

      print('Calculating distances...')
      self.update_distances(already_selected, only_new=False, reset_dist=True)
    except Exception:
      # Best-effort fallback kept from the original, but no longer swallowing
      # KeyboardInterrupt / SystemExit.
      print('Using flat_X as features.')
      self.update_distances(already_selected, only_new=True, reset_dist=False)

    new_batch = []

    # NOTE: `tqdm` is not imported in this cell; the cell that imports it
    # (`from tqdm.notebook import tqdm`) must have been run before this call.
    for _ in tqdm(range(N)):
      if self.min_distances is None:
        # No center registered yet, so there are no distances to maximise:
        # initialize centers with a (seeded) randomly selected datapoint.
        # The former test `self.already_selected is None` could never be true
        # because that attribute starts out as an empty list.
        ind = np.random.RandomState(self.seed).choice(self.n_obs)
      else:
        ind = np.argmax(self.min_distances)
      # New examples should not be in already selected since those points
      # should have min_distance of zero to a cluster center.
      assert ind not in already_selected

      self.update_distances([ind], only_new=True, reset_dist=False)
      new_batch.append(ind)

    self.already_selected = already_selected
    return new_batch

# Coreset sampling

In [None]:
# Mount Drive
# Makes Google Drive available under /content/drive for the cells below.
# NOTE(review): the later cells use the relative path 'drive/MyDrive/...', so
# they presumably rely on the working directory being /content — confirm.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

In [None]:
import os
import pickle
import numpy as np
import torch
import torch.nn.functional as F
from tqdm.notebook import tqdm
from sklearn.random_projection import SparseRandomProjection


# Category sub-directories to process, in processing order.
_to_test = [
    'bottle',
    'cable',
    'capsule',
    'carpet',
    'grid',
    'hazelnut',
    'leather',
    'metalnut',
    'pill',
    'screw',
    'tile',
    'toothbrush',
    'transistor',
    'wood',
    'zipper',
]

# Root folder of the per-category embedding pickles (relative path).
embeddings_path = 'drive/MyDrive/data/efficientnet_embeddings_b7'

def reshape_embedding(embedding):
    """Split a 4-D embedding into a flat list of per-location vectors.

    For every index along axis 0 and every (axis 2, axis 3) position, the
    slice along axis 1 is appended to the result, iterating axis 0 slowest
    and axis 3 fastest.
    """
    dims = embedding.shape
    return [
        embedding[sample, :, row, col]
        for sample in range(dims[0])
        for row in range(dims[2])
        for col in range(dims[3])
    ]

def embedding_concat(x, y):
    """Concatenate two feature maps of different spatial size along channels.

    `x` is the larger map (B, C1, H1, W1) and `y` the smaller one
    (B, C2, H2, W2).  `x` is cut into non-overlapping s-by-s patches with
    s = H1 // H2, `y` is attached to every position inside its patch, and the
    patches are folded back, giving a (B, C1 + C2, H1, W1) tensor.

    The result keeps the dtype and device of the inputs instead of going
    through a freshly allocated default (CPU, float32) buffer.
    """
    # from https://github.com/xiahaifeng1995/PaDiM-Anomaly-Detection-Localization-master
    B, C1, H1, W1 = x.size()
    _, C2, H2, W2 = y.size()
    s = H1 // H2
    # (B, C1 * s * s, H2 * W2) -> (B, C1, s * s, H2, W2)
    x = F.unfold(x, kernel_size=s, dilation=1, stride=s)
    x = x.view(B, C1, -1, H2, W2)
    # Repeat y (as a view) for every in-patch position and join on channels.
    y = y.unsqueeze(2).expand(-1, -1, x.size(2), -1, -1)
    z = torch.cat((x, y), dim=1)
    z = z.reshape(B, -1, H2 * W2)
    z = F.fold(z, kernel_size=s, output_size=(H1, W1), stride=s)

    return z

# For every category and every layer combination: load the per-layer training
# embeddings, fuse them per sample, flatten to one vector per spatial
# location, pick a greedy k-center coreset and pickle it.  Combinations whose
# output file already exists are skipped.
for category in _to_test:

  print(category)

  layer_combis = [["3", "4"],
                  ["4", "5"],
                  ["3", "5"],
                  ["3", "4", "5"]]
  for layer_combi in layer_combis:

    combi_tag = '_'.join(layer_combi)
    print('\tLayer:', combi_tag, end=' ')
    out_dir = f'{embeddings_path}/{category}/layer_{combi_tag}'
    f_name = f'{out_dir}/embedding_train_projected_greedy.pickle'

    if os.path.isfile(f_name):
      print('done!')
      continue

    # NOTE: pickle.load can execute arbitrary code; these files are assumed to
    # be trusted artefacts written by an earlier step of this pipeline.
    embeddings_single = {}
    for layer in layer_combi:
      with open(f'{embeddings_path}/{category}/layer_{layer}/embedding_train_projected.pickle', 'rb') as f:
        embeddings_single[layer] = pickle.load(f)['embedding']

    embeddings_reshaped = []
    first_layer, other_layers = layer_combi[0], layer_combi[1:]
    for ix, first_embedding in enumerate(embeddings_single[first_layer]):
      # Fold the remaining layers in one at a time, so any number of layers
      # works (the original handled exactly two or three).
      embedding = torch.FloatTensor(first_embedding)
      for layer in other_layers:
        embedding = embedding_concat(embedding, torch.FloatTensor(embeddings_single[layer][ix]))
      print(embedding.size())
      embeddings_reshaped.extend(reshape_embedding(embedding.cpu().detach().numpy()))

    embeddings_train = np.array(embeddings_reshaped)
    print(embeddings_train.shape, end=' - ')
    # Random projection
    try:
      randomprojector = SparseRandomProjection(n_components='auto', eps=0.9) # 'auto' => Johnson-Lindenstrauss lemma
      randomprojector.fit(embeddings_train)
    except ValueError:
      # scikit-learn raises ValueError when the 'auto' target dimension is
      # larger than the number of input features; fall back to a projection
      # that keeps the original dimensionality.
      randomprojector = SparseRandomProjection(n_components=embeddings_train.shape[1])
      randomprojector.fit(embeddings_train)

    # Coreset Subsampling
    selector = kCenterGreedy(embeddings_train,0,0)
    selected_idx = selector.select_batch(model=randomprojector, already_selected=[], N=min(int(embeddings_train.shape[0]*0.5), 15000))
    embedding_coreset = embeddings_train[selected_idx]
    # makedirs also creates missing parents and tolerates an existing folder.
    os.makedirs(out_dir, exist_ok=True)
    with open(f_name, 'wb') as f:
      pickle.dump({'embedding': embedding_coreset}, f)
    print('done!')

# Single Patch

In [None]:
import os
import pickle
import numpy as np
from tqdm.notebook import tqdm
from sklearn.random_projection import SparseRandomProjection


# Category sub-directories to process, in processing order.
_to_test = [
    'bottle', 'cable', 'capsule',
    'carpet', 'grid', 'hazelnut',
    'leather', 'metalnut', 'pill',
    'screw', 'tile', 'toothbrush',
    'transistor', 'wood', 'zipper',
]

# Root folder of the per-category embedding pickles (relative path).
embeddings_path = 'drive/MyDrive/data/efficientnet_embeddings_b3'

def reshape_embedding(embedding):
    """Return the axis-1 vectors of a 4-D embedding as a flat list.

    One entry is produced per (axis 0, axis 2, axis 3) index triple, with
    axis 0 varying slowest and axis 3 fastest.
    """
    shape = embedding.shape
    vectors = []
    for sample in range(shape[0]):
        for row in range(shape[2]):
            # One vector per column of the current row.
            vectors.extend(embedding[sample, :, row, col] for col in range(shape[3]))
    return vectors

# For every category and every single layer 1..7: load the layer's training
# embeddings, flatten them to one vector per spatial location, pick a greedy
# k-center coreset and pickle it next to the input.  Layers whose output file
# already exists are skipped.
for category in _to_test:

  print(category)

  for layer in range(1,8):

    print('\tLayer:', layer, end=' ')
    f_name = f'{embeddings_path}/{category}/layer_{layer}/embedding_train_projected_greedy.pickle'

    if os.path.isfile(f_name):
      print('done!')
      continue

    # NOTE: pickle.load can execute arbitrary code; this file is assumed to be
    # a trusted artefact written by an earlier step of this pipeline.
    with open(f'{embeddings_path}/{category}/layer_{layer}/embedding_train_projected.pickle', 'rb') as f:
      embeddings_train_full = pickle.load(f)['embedding']

    embeddings_reshaped = []
    for sample_embedding in embeddings_train_full:
      embeddings_reshaped.extend(reshape_embedding(np.array(sample_embedding)))

    embeddings_train = np.array(embeddings_reshaped)
    print(embeddings_train.shape, end=' - ')
    # Random projection
    try:
      randomprojector = SparseRandomProjection(n_components='auto', eps=0.9) # 'auto' => Johnson-Lindenstrauss lemma
      randomprojector.fit(embeddings_train)
    except ValueError:
      # scikit-learn raises ValueError when the 'auto' target dimension is
      # larger than the number of input features; fall back to a projection
      # that keeps the original dimensionality.
      randomprojector = SparseRandomProjection(n_components=embeddings_train.shape[1])
      randomprojector.fit(embeddings_train)

    # Coreset Subsampling
    selector = kCenterGreedy(embeddings_train,0,0)
    selected_idx = selector.select_batch(model=randomprojector, already_selected=[], N=min(int(embeddings_train.shape[0]*0.5), 15000))
    embedding_coreset = embeddings_train[selected_idx]
    with open(f_name, 'wb') as f:
      pickle.dump({'embedding': embedding_coreset}, f)
    print('done!')