In [53]:
from data_reading import *
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit

# contains some functions for visualizing the results/progress
# from exercise_6_iris_utils import *

In [57]:
import numpy as np

np.shape(x_train), np.shape(y_train), np.shape(x_test), np.shape(y_test)

X = np.vstack((x_test,x_train))
y = np.hstack((y_test,y_train))
np.shape(X), np.shape(y)
X = X.reshape(70000,-1,1)
X

array([[[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       ...,

       [[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]], dtype=uint8)

In [61]:
#Helper function
def prepare_data(X, y, n_init, use_classes=None, use_features=None, seed=None):
    """Extract classes and features and split data in training, pool, and test
    sets.

    PARAMETERS
    ----------
    iris
        The iris data object.
    use_classes : list | None
        The classes to be used. List of class labels (0, 1, 2) or None for all.
    use_features : list | None
        The features to be used. List of feature indices (0, 1, 2, 3) or None for all.
    n_init : int
        Number of initial data points in the training set.
    seed : int
        Seed for reproducibility.
    """          

    # Extract classes and features
    if use_classes is not None:
        use_examples = np.isin(y, use_classes)
        X = X[use_examples]
        y = y[use_examples]
    if use_features is not None:
        X = X[:, use_features]

    n = len(X)
    assert n_init <= n

    # Split in train, pool, and test set
    # Use stratified split to make sure we sample all classes equally
    sss = StratifiedShuffleSplit(n_splits=1, train_size=n_init / n, random_state=seed)
    train, pool = next(sss.split(X, y))
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
    pool_, test = next(sss.split(X[pool], y[pool]))

    return dict(
        train = dict(
            X=X[train],
            y=y[train]
        ),
        pool = dict(
            X=X[pool[pool_]],
            y=y[pool[pool_]]
        ),
        test = dict(
            X=X[pool[test]],
            y=y[pool[test]]
        )
    )

In [59]:
def evaluate_uncertainty(prob, strategy):
    """Evaluate the desired uncertainty sampling strategy on predictive
    probabilities 'prob'.

    PARAMETERS
    ----------
    prob : ndarray 
        numpy array with predictive probabilities of shape 
        (n_points, n_classes)
    strategy : str
        One of 'least confident', 'margin', or 'entropy'.

    The function should return an array with uncertainties of shape
    (n_points, ) corresponding to the desired strategy.
    """
    # solution::start
    if strategy == 'least confident':
        res = 1 - prob.max(1)
    elif strategy == 'margin':
        ix = np.arange(len(prob))
        p2, p1 = prob.argsort(1)[:, -2:].T
        res = 1 - (prob[ix, p1] - prob[ix, p2])
    elif strategy == 'entropy':
        res = - np.sum(prob * np.log2(prob), axis=1)
    else:
        raise ValueError
    return res
    # solution::end

In [58]:
# Settings
n_init = 20             # Number of points to use for fitting the model
use_classes = [0,1,2,3,4,5,6,7,8,9]    # None (all three classes or a list of class labels to use, e.g., [0, 1])
use_features = range(0,784)  # The two features to use. Should be in the set (0, 1, 2, 3)
seed = 0

# Prepare the data (extract and split)
data = prepare_data(X, y, n_init, use_classes, use_features, seed=seed)

# Fit a logistic classifier
# (While testing this code I experienced some issues with the default
# solver (lbfgs) which is why I use different one here.)
model = LogisticRegression(penalty='l2', C=1e1, solver='liblinear', random_state=seed)
model = model.fit(data['train']['X'], data['train']['y'])

# For the sake of visualization, we are going to create a grid on
# which to evaluate probabilities and uncertainties
grid, imshow_kwargs = make_grid(data['train']['X'], data['pool']['X'])
pool = grid.transpose(1,2,0).reshape(-1, len(use_features))

# Evaluate uncertainties on the `pool` variable

# solution::start

# Compute the predictive probabilities
prob = model.predict_proba(pool)

# Compute the uncertainties
least_confident = evaluate_uncertainty(prob, 'least confident')
margin = evaluate_uncertainty(prob, 'margin')
entropy = evaluate_uncertainty(prob, 'entropy')

# solution::end

# Once calculated, plot the uncertainty metrics
plot_grid(prob, least_confident, margin, entropy, data, grid, imshow_kwargs)

ValueError: Found array with dim 3. Estimator expected <= 2.