In [10]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

from sklearn import datasets
from sklearn.semi_supervised import label_propagation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.cross_validation import train_test_split

%matplotlib inline

In [67]:
# Active learning with LabelSpreading on a 330-sample subset of the digits data.
# Start with the first 10 samples labelled; in every round, reveal the true
# labels of the unlabeled samples the model is MOST UNCERTAIN about
# (highest entropy of the transduced label distribution).
digits = datasets.load_digits()
rng = np.random.RandomState(0)
indices = np.arange(len(digits.data))
rng.shuffle(indices)

X = digits.data[indices[:330]]
y = digits.target[indices[:330]]

n_total_samples = len(y)
n_labeled_points = 10
n_queries_per_round = 5

unlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]

for i in range(5):
    # Hide the true label of every sample that has not been queried yet;
    # -1 marks a sample as unlabeled for the model.
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1

    # LabelSpreading minimizes a regularized loss (better for noisy data)
    lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
    lp_model.fit(X, y_train)

    predicted_labels = lp_model.transduction_[unlabeled_indices]
    true_labels = y[unlabeled_indices]

    cm = confusion_matrix(true_labels, predicted_labels,
                          labels=lp_model.classes_)

    print('Iteration %i %s' % (i, 70 * '_'))
    print("Label Spreading model: %d labeled & %d unlabeled (%d total)"
          % (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples))

    print(classification_report(true_labels, predicted_labels))

    print("Confusion matrix")
    print(cm)
    print("Accuracy: {0}".format(accuracy_score(true_labels, predicted_labels)))

    # compute the entropies of transduced label distributions
    pred_entropies = stats.distributions.entropy(
        lp_model.label_distributions_.T)

    # Rank every sample from highest to lowest entropy, drop the ones that
    # already carry a label, and keep the top few. Filtering BEFORE truncating
    # guarantees a full batch whenever enough unlabeled samples remain.
    ranked_by_uncertainty = np.argsort(pred_entropies)[::-1]
    is_unlabeled = np.zeros(n_total_samples, dtype=bool)
    is_unlabeled[unlabeled_indices] = True
    uncertainty_index = ranked_by_uncertainty[
        is_unlabeled[ranked_by_uncertainty]][:n_queries_per_round]

    # Move the queried samples to the labelled set. A boolean mask is used
    # instead of np.delete with a float index array, which NumPy rejects.
    is_unlabeled[uncertainty_index] = False
    unlabeled_indices = np.where(is_unlabeled)[0]

    # Count what was actually labelled so the printed totals stay correct.
    n_labeled_points += len(uncertainty_index)

# Final model: rebuild the training labels so that the labels queried in the
# last round are included (reusing the loop's y_train would simply repeat the
# previous fit), then evaluate on all samples.
y_train = np.copy(y)
y_train[unlabeled_indices] = -1
lp_model.fit(X, y_train)
predicts = lp_model.predict(X)
print(confusion_matrix(y, predicts, labels=lp_model.classes_))
print(classification_report(y, predicts))
print(accuracy_score(y, predicts))

Iteration 0 ______________________________________________________________________
Label Spreading model: 10 labeled & 320 unlabeled (330 total)
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        24
          1       0.49      0.90      0.63        29
          2       0.88      0.97      0.92        31
          3       0.00      0.00      0.00        28
          4       0.00      0.00      0.00        27
          5       0.89      0.49      0.63        35
          6       0.86      0.95      0.90        40
          7       0.75      0.92      0.83        36
          8       0.54      0.79      0.64        33
          9       0.41      0.86      0.56        37

avg / total       0.52      0.63      0.55       320

Confusion matrix
[[26  1  0  0  1  0  1]
 [ 1 30  0  0  0  0  0]
 [ 0  0 17  6  0  2 10]
 [ 2  0  0 38  0  0  0]
 [ 0  3  0  0 33  0  0]
 [ 7  0  0  0  0 26  0]
 [ 0  0  2  0  0  3 32]]
Accuracy: 0.63125
Iteration 1 

  'precision', 'predicted', average, warn_for)


In [69]:
# Self-training with LabelSpreading on a 330-sample subset of the digits data.
# Start with the first 50 samples labelled; in every round, take the unlabeled
# samples the model is MOST CERTAIN about (lowest entropy of the transduced
# label distribution) and adopt the model's own prediction as their label.
digits = datasets.load_digits()
rng = np.random.RandomState(0)
indices = np.arange(len(digits.data))
rng.shuffle(indices)

X = digits.data[indices[:330]]
y = digits.target[indices[:330]]

n_total_samples = len(y)
n_labeled_points = 50
n_pseudo_labels_per_round = 5

unlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]

# -1 marks a sample as unlabeled; y_train is updated in place across rounds
# so pseudo-labels accumulate.
y_train = np.copy(y)
y_train[unlabeled_indices] = -1

# Enough rounds to pseudo-label every unlabeled sample (ceil division);
# with 280 unlabeled samples and 5 per round this is 56.
n_rounds = -(-len(unlabeled_indices) // n_pseudo_labels_per_round)

for i in range(n_rounds):
    # LabelSpreading minimizes a regularized loss (better for noisy data)
    lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
    lp_model.fit(X, y_train)

    predicted_labels = lp_model.transduction_[unlabeled_indices]
    true_labels = y[unlabeled_indices]

    cm = confusion_matrix(true_labels, predicted_labels,
                          labels=lp_model.classes_)

    print('Iteration %i %s' % (i, 70 * '_'))
    print("Label Spreading model: %d labeled & %d unlabeled (%d total)"
          % (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples))

    print(classification_report(true_labels, predicted_labels))

    print("Confusion matrix")
    print(cm)
    print("Accuracy: {0}".format(accuracy_score(true_labels, predicted_labels)))

    # compute the entropies of transduced label distributions
    pred_entropies = stats.distributions.entropy(
        lp_model.label_distributions_.T)

    # all samples ordered from most certain (lowest entropy) to least certain
    uncertainty_index = np.argsort(pred_entropies)

    # keep only samples that are still unlabeled, then take the most certain few
    is_unlabeled = np.zeros(n_total_samples, dtype=bool)
    is_unlabeled[unlabeled_indices] = True
    confident_indices = uncertainty_index[
        is_unlabeled[uncertainty_index]][:n_pseudo_labels_per_round]

    if confident_indices.size:
        # predict() expects a 2-D (n_samples, n_features) array, so the chosen
        # rows are predicted as one batch rather than one 1-D sample at a time.
        y_train[confident_indices] = lp_model.predict(X[confident_indices])

    # Move the pseudo-labelled samples to the labelled set. A boolean mask is
    # used instead of np.delete with a float index array, which NumPy rejects.
    is_unlabeled[confident_indices] = False
    unlabeled_indices = np.where(is_unlabeled)[0]

    # Count what was actually labelled so the printed totals stay correct.
    n_labeled_points += len(confident_indices)

# Final model: fit on the accumulated (true + pseudo) labels and evaluate
# against the true labels of all samples.
lp_model.fit(X, y_train)
predicts = lp_model.predict(X)
print(confusion_matrix(y, predicts, labels=lp_model.classes_))
print(classification_report(y, predicts))
print(accuracy_score(y, predicts))

Iteration 0 ______________________________________________________________________
Label Spreading model: 50 labeled & 280 unlabeled (330 total)
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        21
          1       0.77      0.71      0.74        24
          2       0.96      0.93      0.95        28
          3       1.00      0.78      0.88        27
          4       0.92      0.96      0.94        23
          5       0.96      0.70      0.81        33
          6       0.97      0.97      0.97        33
          7       0.93      0.90      0.92        30
          8       0.60      0.89      0.72        27
          9       0.69      0.79      0.74        34

avg / total       0.88      0.86      0.86       280

Confusion matrix
[[21  0  0  0  0  0  0  0  0  0]
 [ 0 17  1  0  0  0  1  0  5  0]
 [ 0  0 26  0  0  0  0  0  2  0]
 [ 0  0  0 21  0  0  0  0  4  2]
 [ 0  1  0  0 22  0  0  0  0  0]
 [ 0  0  0  0  0 23  0  0  0 10]
 

  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)



Label Spreading model: 305 labeled & 25 unlabeled (330 total)
             precision    recall  f1-score   support

          1       0.00      0.00      0.00         1
          2       1.00      0.33      0.50         3
          4       1.00      0.50      0.67         2
          5       0.50      1.00      0.67         1
          6       1.00      0.50      0.67         2
          7       1.00      0.71      0.83         7
          8       0.38      0.60      0.46         5
          9       1.00      0.75      0.86         4

avg / total       0.81      0.60      0.66        25

Confusion matrix
[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0]
 [0 0 1 0 0 0 0 0 2 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0]
 [0 1 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 5 2 0]
 [0 2 0 0 0 0 0 0 3 0]
 [0 0 0 0 0 1 0 0 0 3]]
Accuracy: 0.6
Iteration 52 ______________________________________________________________________
Label Spreading model: 310 labeled & 20 unlabeled (330 tot

In [73]:
# Fully supervised baseline: fit LabelSpreading on a training split in which
# every sample keeps its true label, then score it on the held-out split.
X, y = digits.data, digits.target

# This test fraction holds out 330 of the 1797 digits (see the support total
# in the report printed below).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.183333333333,
    random_state=0,
)

lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
lp_model.fit(X_train, y_train)

true_labels = y_test
predicted_labels = lp_model.predict(X_test)

# rows/columns of the matrix follow the class order learned by the model
cm = confusion_matrix(true_labels, predicted_labels,
                      labels=lp_model.classes_)

print(classification_report(true_labels, predicted_labels))

print("Confusion matrix")
print(cm)
print(accuracy_score(true_labels, predicted_labels))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        24
          1       1.00      1.00      1.00        30
          2       1.00      0.97      0.98        33
          3       0.93      1.00      0.97        28
          4       1.00      1.00      1.00        27
          5       0.97      0.97      0.97        36
          6       1.00      1.00      1.00        42
          7       1.00      1.00      1.00        37
          8       1.00      1.00      1.00        35
          9       0.97      0.95      0.96        38

avg / total       0.99      0.99      0.99       330

Confusion matrix
[[24  0  0  0  0  0  0  0  0  0]
 [ 0 30  0  0  0  0  0  0  0  0]
 [ 0  0 32  1  0  0  0  0  0  0]
 [ 0  0  0 28  0  0  0  0  0  0]
 [ 0  0  0  0 27  0  0  0  0  0]
 [ 0  0  0  0  0 35  0  0  0  1]
 [ 0  0  0  0  0  0 42  0  0  0]
 [ 0  0  0  0  0  0  0 37  0  0]
 [ 0  0  0  0  0  0  0  0 35  0]
 [ 0  0  0  1  0  1  0  0  0 36]]
0.98787878787

In [15]:
# Scratch check: total number of samples in the digits dataset.
len(digits.data)

1797

In [19]:
# Scratch check: argsort orders ascending, so the trailing slice holds the
# positions of the largest values (here the values 3 and 4, at positions 2 and 0).
np.argsort([4,2,3])[-2:]

array([2, 0], dtype=int64)

In [49]:
# Scratch check on `uncertainty_index`, which is set by the label-spreading
# loops earlier in the notebook. Note that argsort here ranks the sample
# indices by their own numeric value, not by entropy.
print(uncertainty_index)
np.argsort(uncertainty_index)[-4:]

[205 167 252  11 122]


array([4, 1, 0, 2], dtype=int64)

In [48]:
# Scratch check: np.unique returns sorted values, so counts[0] belongs to the
# smallest value (-1, the unlabeled marker). Subtracting it from the list
# length (5) gives the number of labelled entries.
5 - np.unique([1,2,3,-1,2], return_counts=True)[1][0]

4