# Introduction

### Brief Overview

Coming soon.

### References

* An article that contains a review of approaches to active learning: [Yang, 2017](https://arxiv.org/pdf/1702.08540.pdf);
* An article about the EG-Active algorithm: [Bouneffouf, 2014](https://arxiv.org/abs/1408.2196).

# General Preparations

### Import Statements

In [1]:
import math
from copy import copy

import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

from dsawl.active_learning.pool_based_sampling import EpsilonGreedyPickerFromPool

### Notebook-level Settings

In [2]:
# Fix the seed of NumPy's global random generator so that the samples
# drawn below with `np.random.uniform` are the same on every run.
np.random.seed(361)

In [3]:
# Switch matplotlib to seaborn's default plot styling.
sns.set()

### User-defined Settings

In [4]:
# Flag that guards plotting cells of this notebook: figures are
# created only if it is `True`.
# It is not a good practice to store binary files
# (like PNG images) in a Git repository, but for
# your local use you can set it to `True`.
draw_plots = False

# Dataset Generation

In [5]:
# Parameters of `np.random.uniform` calls that generate
# the pool and the hold-out sample.
dimensionality = 2  # Number of coordinates per generated point.
lower_bound = -2  # Lower limit of every generated coordinate.
upper_bound = 2  # Upper limit of every generated coordinate.
pool_size = 300  # Number of points in each generated sample.

In [6]:
# Hand-picked initial training set: nine points, three on each of
# three rays that start from the origin. With labels produced by
# `compute_target`, every class gets exactly three of these points.
X_train_initial = np.array(
    [[1, -1],
     [2, -2],
     [3, -3],
     [-1, -1],
     [-2, -2],
     [-3, -3],
     [0, 1],
     [0, 2],
     [0, 3]]
)

In [7]:
# Pool of candidate objects: `pool_size` points with coordinates
# drawn uniformly between `lower_bound` and `upper_bound`.
X_new = np.random.uniform(
    lower_bound, upper_bound, size=(pool_size, dimensionality)
)

In [8]:
# Hold-out sample for measuring accuracy; it is drawn independently
# of the pool, with the same size and the same coordinate limits.
X_hold_out = np.random.uniform(
    lower_bound, upper_bound, size=(pool_size, dimensionality)
)

In [9]:
def compute_target(X: np.ndarray) -> np.ndarray:
    """
    Compute class labels for a simple classification problem where
    the 2D plane is split into three regions by three rays that
    start from the origin, with an angle of 120 degrees between
    any two adjacent rays.

    The rays go at 30 and 150 degrees to the positive x-axis and
    along the negative y-axis. Label 1 is assigned to points that
    are strictly above the two slanted rays, label 2 is assigned to
    the remaining points with positive first coordinate, and label 3
    is assigned to the remaining points with non-positive first
    coordinate (so the origin gets label 3).

    :param X:
        coordinates of points from the plane, one point per row;
        only the first two columns are used
    :return:
        labels (1, 2, or 3) of regions where points are located,
        one label per row of `X`; an empty array if `X` has no rows
    """
    X = np.asarray(X)
    slope = math.tan(math.radians(30))
    abscissas, ordinates = X[:, 0], X[:, 1]

    is_right = abscissas > 0
    # Which slanted ray bounds the upper region depends on the sign
    # of the first coordinate, so both comparisons are evaluated
    # for all rows and the relevant one is selected per row.
    is_above_right_ray = ordinates - slope * abscissas > 0
    is_above_left_ray = ordinates + slope * abscissas > 0
    is_upper = np.where(is_right, is_above_right_ray, is_above_left_ray)

    y = np.where(is_upper, 1, np.where(is_right, 2, 3))
    return y

In [10]:
# Label the initial training set, the pool, and the hold-out sample
# with the same rule implemented in `compute_target`.
y_train_initial = compute_target(X_train_initial)
y_new = compute_target(X_new)
y_hold_out = compute_target(X_hold_out)

In [11]:
if draw_plots:
    fig = plt.figure(figsize=(15, 15))
    ax = fig.add_subplot(111)
    label_colors = ((1, 'b'), (2, 'r'), (3, 'g'))
    # Initial training objects are drawn first, as diamonds; after
    # that objects from the pool are drawn with the default marker.
    layers = (
        (X_train_initial, y_train_initial, {'marker': 'D'}),
        (X_new, y_new, {}),
    )
    for points, labels, style in layers:
        for label, color in label_colors:
            selected = points[labels == label, :]
            ax.scatter(selected[:, 0], selected[:, 1], c=color, **style)

# Illustrative Example

In [12]:
# Classifier that is passed to both compared strategies;
# `random_state` is set explicitly to make its fitting reproducible.
clf = RandomForestClassifier(n_estimators=20, random_state=361)

In [13]:
# Strategies are compared for every number of objects taken
# from the pool, from 1 to this value inclusively.
max_n_points_to_explore = 100

In [14]:
# Passed to `EpsilonGreedyPickerFromPool` as `exploration_probability`.
epsilon = 0.1
# Passed to `EpsilonGreedyPickerFromPool` as its first argument;
# presumably, a name of scoring function that ranks objects by
# margin (see documentation of `dsawl` to confirm).
scorer = 'margin'

In [15]:
def report_accuracy_of_benchmark(
        n_new_points: int,
        clf: BaseEstimator,
        X_train_initial: np.ndarray, y_train_inital: np.ndarray,
        X_new: np.ndarray, y_new: np.ndarray,
        X_hold_out: np.ndarray, y_hold_out: np.ndarray
        ) -> float:
    """
    Compute accuracy of approach where `n_new_points` objects
    are taken from a pool without active learning.

    The first `n_new_points` rows of the pool are taken, so this
    is equivalent to random picking only if the pool is stored
    in random order.

    :param n_new_points:
        number of objects from the pool to be added to the initial
        training set
    :param clf:
        classifier to be evaluated; it is fitted in place
    :param X_train_initial:
        features of objects from the initial training set
    :param y_train_inital:
        labels of objects from the initial training set (the name
        is misspelled, but it is kept, because renaming can break
        calls that pass this argument by keyword)
    :param X_new:
        features of objects from the pool
    :param y_new:
        labels of objects from the pool
    :param X_hold_out:
        features of objects for evaluation
    :param y_hold_out:
        labels of objects for evaluation
    :return:
        accuracy of fitted `clf` on the hold-out data
    """
    X_train = np.vstack((X_train_initial, X_new[:n_new_points, :]))
    # The argument must be used here; a name without the typo would
    # silently refer to a global variable instead of passed labels.
    y_train = np.hstack((y_train_inital, y_new[:n_new_points]))
    clf.fit(X_train, y_train)
    y_hold_out_hat = clf.predict(X_hold_out)
    return accuracy_score(y_hold_out, y_hold_out_hat)

In [16]:
def report_accuracy_of_epsilon_greedy_strategy(
        n_new_points: int,
        clf: BaseEstimator,
        epsilon: float,
        scorer: str,
        X_train_initial: np.ndarray, y_train_inital: np.ndarray,
        X_new: np.ndarray, y_new: np.ndarray,
        X_hold_out: np.ndarray, y_hold_out: np.ndarray
        ) -> float:
    """
    Evaluate epsilon-greedy active learning on hold-out data.

    Initially, `clf` is fitted to a copy of the initial training
    set. Then, `n_new_points` times, one object is chosen from
    the pool by `EpsilonGreedyPickerFromPool`, this object is moved
    from the pool to the training set, and `update_tools` method
    of the picker is called with the extended training set.
    Finally, accuracy of the classifier returned by `get_tools`
    method of the picker is measured.

    :param n_new_points:
        number of objects to be picked from the pool
    :param clf:
        classifier to be evaluated; it is fitted in place
    :param epsilon:
        value that is passed to the picker as
        `exploration_probability`
    :param scorer:
        value that is passed to the picker as its first argument
    :param X_train_initial:
        features of objects from the initial training set
    :param y_train_inital:
        labels of objects from the initial training set
    :param X_new:
        features of objects from the pool
    :param y_new:
        labels of objects from the pool
    :param X_hold_out:
        features of objects for evaluation
    :param y_hold_out:
        labels of objects for evaluation
    :return:
        accuracy on the hold-out data
    """
    X_labeled = copy(X_train_initial)
    y_labeled = copy(y_train_inital)
    X_pool, y_pool = X_new, y_new
    clf.fit(X_labeled, y_labeled)

    picker = EpsilonGreedyPickerFromPool(
        scorer, exploration_probability=epsilon
    )
    picker.set_tools(clf)
    for _ in range(n_new_points):
        picked = picker.pick_new_objects(X_pool, n_to_pick=1)
        X_labeled = np.vstack((X_labeled, X_pool[picked, :]))
        y_labeled = np.hstack((y_labeled, y_pool[picked]))
        picker.update_tools(X_labeled, y_labeled)
        # `np.delete` returns new arrays, so arrays that are passed
        # by a caller are not modified.
        X_pool = np.delete(X_pool, picked, axis=0)
        y_pool = np.delete(y_pool, picked)

    final_clf = picker.get_tools()
    predictions = final_clf.predict(X_hold_out)
    return accuracy_score(y_hold_out, predictions)

In [17]:
# Accuracy of the benchmark for every number of objects taken from
# the pool; the same `clf` object is refitted by each call.
benchmark_scores = [
    report_accuracy_of_benchmark(
        n, clf,
        X_train_initial, y_train_initial, X_new, y_new,
        X_hold_out, y_hold_out
    )
    for n in range(1, max_n_points_to_explore + 1)
]
# Sum of accuracies is shown as a single-number summary.
sum(benchmark_scores)

93.703333333333276

In [18]:
# Accuracy of the epsilon-greedy strategy for every number of
# objects picked from the pool; each call starts anew from
# the initial training set and the full pool.
epsilon_greedy_scores = [
    report_accuracy_of_epsilon_greedy_strategy(
        n, clf, epsilon, scorer,
        X_train_initial, y_train_initial, X_new, y_new,
        X_hold_out, y_hold_out
    )
    for n in range(1, max_n_points_to_explore + 1)
]
# Sum of accuracies is shown as a single-number summary.
sum(epsilon_greedy_scores)

96.406666666666737

In [19]:
if draw_plots:
    fig = plt.figure(figsize=(15, 15))
    ax = fig.add_subplot(111)
    # Position `i` on the horizontal axis corresponds to `i + 1`
    # objects taken from the pool.
    # The benchmark is drawn with the default line color.
    ax.plot(benchmark_scores)
    # The epsilon-greedy strategy is drawn in green.
    ax.plot(epsilon_greedy_scores, c='g')

# Conclusion

Coming soon.

More examples will be added as well.