# Analysis
In this notebook we will implement some dataset analysis techniques to integrate into Suggester.

In [48]:
import modules
import importlib
import torch
import random
import numpy as np
import modules.models.Linear as Linear
from modules.ActiveLearning import Samplings
from tqdm import tqdm
from scipy.stats import entropy
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from modules import ActiveLearning
from modules import Suggester
from collections import defaultdict


# Reload the project package and its submodules so that the kernel uses the
# current on-disk version of the code without a restart.
importlib.reload(modules)
importlib.reload(modules.models.Linear)
importlib.reload(modules.ActiveLearning)
importlib.reload(modules.Suggester)


# Single seed value reused by the seeding and train/test-split cells below.
random_seed = 42
# Last expression of the cell: displays whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [13]:
class TrainConfusion:
    """Reports which pairs of classes a model mixes up most often."""

    def __init__(self, top_n=10):
        # Maximum number of (unordered) class pairs to report.
        self.top_n = top_n

    def get_most_frequently_confused_classes(self, model, X_train, y_train):
        """Find the class pairs with the largest number of mutual errors.

        Args:
            model: object exposing ``predict(X_train)``.
            X_train: samples handed to ``model.predict``.
            y_train: true labels, iterated in parallel with the predictions.

        Returns:
            dict mapping ``(true, predicted)`` to the tuple
            ``(times true was predicted as predicted, times the reverse happened)``.
            At most ``top_n`` entries, ordered by decreasing combined error
            count; a pair and its mirror image are reported only once.
        """
        predictions = model.predict(X_train)

        # Count every directed (true, predicted) mistake.
        directed_counts = {}
        for pair in zip(y_train, predictions):
            true_label, predicted_label = pair
            if true_label != predicted_label:
                directed_counts[pair] = directed_counts.get(pair, 0) + 1

        def combined_errors(item):
            # Errors in both directions between the two classes of this pair.
            pair, count = item
            return count + directed_counts.get(pair[::-1], 0)

        # A stable descending sort keeps first-seen order among ties.
        ranked = sorted(directed_counts.items(), key=combined_errors, reverse=True)

        top_pairs = {}
        for pair, count in ranked:
            if len(top_pairs) >= self.top_n:
                break
            mirrored = pair[::-1]
            if mirrored in top_pairs:
                # The opposite direction is already reported together with this one.
                continue
            top_pairs[pair] = (count, directed_counts.get(mirrored, 0))
        return top_pairs

    def pretty_print(self, model, X_train, y_train):
        """Print one human-readable line per most-confused class pair."""
        confused = self.get_most_frequently_confused_classes(model, X_train, y_train)
        for (first, second), (forward, backward) in confused.items():
            print(f"{first} was mistaken for {second} {forward} times and {second} for {first} {backward} times")
        

In [14]:
class FindZeroSamples:
    """Counts samples (rows) whose features are all numerically zero."""

    def __init__(self, epsilon=1e-7):
        # A feature counts as zero when its absolute value is below this threshold.
        self.epsilon = epsilon

    def get_zero_samples_count(self, X):
        """Print the indices of all-zero rows of ``X`` and return how many there are.

        Args:
            X: 2-D array-like; reduction is done along axis 1, so each row is
                one sample.

        Returns:
            Number of rows in which every entry satisfies ``|x| < epsilon``.
        """
        # np.asarray lets nested lists through as well as ndarrays; the mask is
        # computed once and shared by the printout and the count.
        zero_rows = np.all(np.abs(np.asarray(X)) < self.epsilon, axis=1)
        print(np.argwhere(zero_rows))
        return np.count_nonzero(zero_rows)

In [2]:
def import_dataset(vectorized_output_path="selfpost/vectorized.npy",
                   vectorized_labels_output_path="selfpost/vectorized_labels.npy"):
    """Load the vectorized dataset and integer-encode its labels.

    Args:
        vectorized_output_path: path of the ``.npy`` file with the feature vectors.
        vectorized_labels_output_path: path of the ``.npy`` file with the raw labels.

    Returns:
        Tuple ``(X, y, y_dict)`` where ``X`` is the loaded feature array, ``y``
        holds the labels encoded by ``LabelEncoder`` and ``y_dict`` maps each
        integer code back to its original label.
    """
    # SECURITY: allow_pickle=True unpickles arbitrary objects and can execute
    # code embedded in the file -- only load files from a trusted source.
    with open(vectorized_output_path, "rb") as vect_X, open(vectorized_labels_output_path, "rb") as vect_y:
        X = np.load(vect_X, allow_pickle=True)
        y = np.load(vect_y, allow_pickle=True)
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)
    # LabelEncoder takes no constructor parameters, so get_params() is always
    # an empty dict; the fitted code -> label mapping lives in classes_.
    return X, y, dict(enumerate(le.classes_))

X, y, y_dict = import_dataset()

In [20]:
# Seed every random number generator used here (PyTorch CPU and CUDA, NumPy,
# Python's `random`) with the shared notebook seed.
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed)
np.random.seed(random_seed)
# Deterministic-algorithm enforcement is explicitly left switched off.
torch.use_deterministic_algorithms(False)
random.seed(random_seed)
# test_fraction=0.99 presumably leaves about 1% of the samples in the training
# split -- TODO confirm against modules.Suggester.
sug = Suggester.Suggester(X,y, test_fraction=0.99)
# The second argument (100) is presumably the number of training iterations,
# judging by the 100-step progress bar printed below -- TODO confirm.
model = Linear.LinearModelTorch(Linear.LogReg(), 100)
model.train(sug.X_train, sug.y_train)

100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 50009.59it/s]


In [18]:
# Print up to 20 of the most frequently confused class pairs, measured on the
# training split the model was just fitted on.
tconf = TrainConfusion(20)
tconf.pretty_print(model, sug.X_train, sug.y_train)

In [80]:
# Dispute measure: how unbalanced the known class counts are among the n_nearest nearest points.
# Not sure how to fit this into the suggester, because the interface does not match the other
# active-learning methods. Perhaps I will rework the get_samples_for_labeling interface and
# pass the suggester itself into it.
class LabelDisputablePoints:
    """Selects test points whose labelled neighbours disagree the most.

    For every query point the labels of its nearest neighbours that belong to
    the training set are histogrammed; the entropy of that histogram is the
    point's "dispute" score.
    """

    def __init__(self, n_top=1000, n_nearest=100):
        # Maximum number of indices returned by get_samples_for_labeling.
        self.n_top = n_top
        # Number of neighbours requested from the index for every query point.
        self.n_nearest = n_nearest

    # TODO: come up with a more adequate implementation.
    def get_samples_for_labeling(self, sug, X_test, y_test):
        """Rank the rows of ``X_test`` by the label entropy of their neighbours.

        Args:
            sug: suggester-like object providing ``index`` (with a
                ``search(queries, k)`` method returning ``(distances, ids)``),
                ``y`` (labels addressed by those ids) and ``is_train_mask``
                (truthy where a sample's label is known).
            X_test: query points handed to ``sug.index.search``.
            y_test: unused; kept so the signature matches the other
                ``get_samples_for_labeling`` implementations.

        Returns:
            Array with at most ``n_top`` row positions into ``X_test``, ordered
            by decreasing neighbour-label entropy.
        """
        _, neighbour_ids = sug.index.search(X_test, self.n_nearest)
        labels = np.asarray(sug.y)
        # `np.bool` was removed in NumPy 1.24; the builtin `bool` is the replacement.
        is_train = np.asarray(sug.is_train_mask, dtype=bool)
        # One score per query row (not per dataset row), so unfilled padding
        # entries can never leak into the ranking.
        entropies = np.zeros(len(neighbour_ids), np.float32)
        for i, idx in enumerate(tqdm(neighbour_ids)):
            # Drop negative ids, which would otherwise index from the end of the arrays.
            idx = idx[idx >= 0]
            labelled = idx[is_train[idx]]
            if labelled.size:
                entropies[i] = entropy(np.bincount(labels[labelled]), axis=0)
        indices_to_return = np.argsort(entropies)[::-1]
        return indices_to_return[:self.n_top]

In [33]:
from faiss import IndexFlatL2
from modAL.models.base import BaseEstimator
from modAL.utils.data import modALinput
from functools import partial


def _nearest_neighbours_to_entropy(nearest_neighbours: np.ndarray, min_bins: int):
    bin_count = np.apply_along_axis(partial(np.bincount, minlength=min_bins), 1, nearest_neighbours)
    return entropy(bin_count, axis=1)


def classifier_train_confusion(classifier: BaseEstimator, X: modALinput,
                               index: IndexFlatL2, n_nearest: int = 100,
                               n_instances: int = 1, random_tie_break: bool = False,
                               **uncertainty_measure_kwargs):
    """Query strategy: pick pool samples whose nearest training labels disagree most.

    Args:
        classifier: learner exposing ``y_training``; the ids returned by
            ``index`` are used as positions into that array.
        X: pool samples to rank, passed unchanged to ``index.search``.
        index: nearest-neighbour index searched for each pool sample.
            NOTE(review): it must hold the training samples in the same order
            as ``classifier.y_training`` -- confirm this still holds once the
            learner has been taught new samples.
        n_nearest: number of neighbours looked up per pool sample.
        n_instances: number of pool indices to return.
        random_tie_break: accepted for interface compatibility; not used.
        **uncertainty_measure_kwargs: accepted for interface compatibility; not used.

    Returns:
        Array of ``n_instances`` row positions into ``X``, ordered by
        decreasing entropy of the neighbours' labels.
    """
    y_training = np.asarray(classifier.y_training)
    # Asking for more neighbours than the index stores makes faiss pad the
    # result with -1, which would silently pick up the last training label.
    n_nearest = min(n_nearest, index.ntotal)
    _, neighbour_ids = index.search(X, n_nearest)
    # Size the histograms by the largest label code rather than by the number
    # of distinct labels: codes missing from the training set would otherwise
    # make the per-row histograms differ in length.
    n_classes = int(y_training.max()) + 1
    entropies = _nearest_neighbours_to_entropy(y_training[neighbour_ids], n_classes)
    indices_to_return = np.argsort(entropies)[::-1]
    return indices_to_return[:n_instances]

In [None]:
# Work on a random subsample of the dataset, drawn without replacement.
# Note: np.random.choice raises ValueError if X has fewer than
# num_of_elements rows.
num_of_elements = 100000
subsample = np.random.choice(X.shape[0], num_of_elements, replace=False)
# build_index=True presumably makes Suggester build the nearest-neighbour
# index that is read later as `sug.index` -- TODO confirm.
sug = Suggester.Suggester(X[subsample],y[subsample], test_fraction=0.8, build_index=True)

In [None]:
# Rank the suggester's test points with the default settings
# (n_top=1000, n_nearest=100).
ldp = LabelDisputablePoints()
ldp.get_samples_for_labeling(sug, sug.X_test, sug.y_test)

In [23]:
# Sanity check on a tiny hand-made matrix: each row holds the labels of one
# point's neighbours, and the result is one entropy value per row.
arr = np.array([[1, 1, 1], [1, 2, 3], [1, 2, 2]])
_nearest_neighbours_to_entropy(arr, min_bins=4)

array([0.        , 1.09861229, 0.63651417])

In [34]:
import faiss
from modAL.models import ActiveLearner
from modAL.uncertainty import entropy_sampling
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from modules.models import Linear
# Pick up on-disk edits to the Linear model module.
importlib.reload(Linear)

# Use 1% of the data for training and 10% as the pool to query from.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, train_size=0.01, random_state=random_seed)
# L2 index sized by the feature dimension and filled with the training
# vectors, cast to float32 (the dtype faiss expects).
index = faiss.IndexFlatL2(X_test.shape[1])
index.add(X_train.astype(np.float32))

# Active learner that queries with the neighbour-label-entropy strategy.
learner = ActiveLearner(
    estimator=Linear.LinearModelTorch(Linear.LogReg(), 100),
    query_strategy=classifier_train_confusion,
    X_training=X_train, y_training=y_train
)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:11<00:00,  8.85it/s]


In [35]:
# Ask for 100 pool samples; `index` is passed as a keyword argument for the
# query strategy (classifier_train_confusion takes an `index` parameter).
query_idx, query_inst = learner.query(X_test.astype(np.float32), index=index, n_instances=100)

In [37]:
# Show the pool indices chosen by the query above.
print(query_idx)

[ 57438  47534  17361  56281  55553  50680  16802  52674  50242  70260
  53060  71813  60726  40455  70972  56816  50172  83366  67631  30613
  98789  97989  33731  36447  55315  16865 100338   4367   4992  17184
  63143  16705  64433  29503  37309  42989  68123  57645  16925  15182
  88186  86248  96934  48148  70632  89089  42561  94993  11871  62771
  90671  54461  71559  23168  38862  45270  86572  62658  47845  80249
  31005   5433  12292  27428  42143   6954   7029  42073  44592  16587
  60016  92934  11210  71787  21243  20364  28385  18567   1148  89075
  56296  17103    848  43754  61463   9523  30271  55540  57215  14670
  59491  68820  56912   4014  30474  96031  23807  10289  18179  56291]


In [None]:
# NOTE(review): ActiveLearningBase is not defined or imported under this name in
# the visible cells -- presumably it lives in modules.ActiveLearning; confirm.
class PseudoLabeling(ActiveLearningBase):
    """Proposes the model's most confident predictions as pseudo-labels."""

    def __init__(self, n_top=1000):
        super().__init__(n_top)
        # Maximum number of samples proposed per call.
        self.n_top = n_top

    def get_samples_for_labeling(self, model, X_test, y_test):
        """Select the samples the model is most confident about.

        Args:
            model: object exposing ``predict_proba(X_test)``.
            X_test: samples to score.
            y_test: unused.

        Returns:
            Tuple ``("labeling", indices, labels)``: row positions into
            ``X_test`` ordered by decreasing top-class probability (ties broken
            by the larger predicted class index first), and the predicted
            class index for each selected row.
        """
        probabilities = model.predict_proba(X_test)
        predicted_labels = np.argmax(probabilities, axis=1)
        confidence = np.max(probabilities, axis=1)
        # lexsort uses its last key as the primary one: ascending by confidence,
        # then by predicted label; reversing gives the descending order.
        order = np.lexsort((predicted_labels, confidence))[::-1]
        n_selected = min(self.n_top, confidence.shape[0])
        chosen = order[:n_selected]
        return "labeling", chosen, predicted_labels[chosen]

In [52]:
def classifier_pseudolabeling(classifier: BaseEstimator, X: modALinput,
                              n_nearest: int = 100,
                              n_instances: int = 1, random_tie_break: bool = False,
                              **uncertainty_measure_kwargs):
    """Query strategy: pick the pool samples with the lowest predictive entropy.

    Args:
        classifier: object exposing ``predict_proba(X)``.
        X: pool samples to rank.
        n_nearest: accepted for interface compatibility; not used.
        n_instances: number of pool indices to return.
        random_tie_break: accepted for interface compatibility; not used.
        **uncertainty_measure_kwargs: accepted for interface compatibility; not used.

    Returns:
        Array of ``n_instances`` row positions into ``X``, ordered by
        increasing entropy of the predicted class distribution.
    """
    class_probabilities = classifier.predict_proba(X)
    ascending_entropy = np.argsort(entropy(class_probabilities, axis=1))
    return ascending_entropy[:n_instances]

In [53]:
# New learner on the same training split, this time querying with the
# pseudo-labelling (lowest-entropy) strategy. Rebinds `learner`.
learner = ActiveLearner(
    estimator=Linear.LinearModelTorch(Linear.LogReg(), 100),
    query_strategy=classifier_pseudolabeling,
    X_training=X_train, y_training=y_train
)



100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:11<00:00,  8.38it/s]


In [54]:
# `index` is still passed here, but classifier_pseudolabeling has no `index`
# parameter, so it lands in **uncertainty_measure_kwargs and is not used.
query_idx, query_inst = learner.query(X_test.astype(np.float32), index=index, n_instances=100)
print(query_idx)

[ 27666   5341  55109  54160  38441  86666  53797  42529  24680  32840
  20845  75265  81582  23898  87102  81589  10828  32101  75743  50718
  17767  10730  92555  37968  56495  91285  25682   1525  26884  17762
  41728  78060  40913  89181  75707   6001  81740  60742   3640  88635
  68236  39517  86243  52512   2552  31827  14612   3879  22031  57727
  62354  42808  27645  88267  79922  35362  46768  32552  14260  74931
  11041  44516  87013  16093  38415  51698  32317  68217  80316  56618
  24924   3114  50981  63565 100303  51363  12105   8957  33177  44945
  98491  43848  78913  39102  49669  61121   6443  19048  24463  72725
  27218  44396  10446  21445  52798  72248  20801  13735  83742  96758]
