In [1]:
# Semi-supervised clustering.
# We only have labels for a few instances.
# Use those as the initial centroids.
# Apply labels to other instances transitively.

# Active learning:
# Get centroids, ask experts to label those, and repeat.

import ssl
import tensorflow
from tensorflow.keras.datasets import mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train.shape

(60000, 28, 28)

In [2]:
from sklearn.linear_model import LogisticRegression
# Logistic regression requires 2 dimensions: X and y.
num_pixels = 784
X_train1D = X_train.reshape(X_train.shape[0],num_pixels)
X_test1D = X_test.reshape(X_test.shape[0],num_pixels)
# Logistic regression fails to converge in reasonable num iterations.
# Scaling seems to help though pixel colors shouldn't need it.
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X_train1D)
X_train1D=scaler.transform(X_train1D)
X_test1D=scaler.transform(X_test1D)
# n_jobs doesn't use threads on my Mac
log = LogisticRegression(max_iter=1000)
log.fit(X_train1D[:1000],y_train[:1000])
# This takes 15 min on the 60,000 samples so we ran it on 1,000.

LogisticRegression(max_iter=1000)

In [3]:
score1=log.score(X_test1D,y_test)

In [4]:
# If the data weren't labeled, we could manually label few.
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
n_labels = 3
X_exemplars = X_train[:n_labels]
i=0
y_exemplars=np.array([5,0,4])  # manual
#for sample in X_exemplars:
#    plt.imshow(sample,cmap="binary")
#    print(i)
#    print(y_exemplars[i])
#    plt.show()
#    i += 1

In [5]:
# Establish a baseline: logistic regression trained on first 50
# with no boost from k-means.
n_labels = 50
X_exemplars = X_train1D[:n_labels]
y_exemplars = y_train[:n_labels]
# n_jobs doesn't use threads on my Mac
log = LogisticRegression(max_iter=1000)
log.fit(X_exemplars,y_exemplars)

LogisticRegression(max_iter=1000)

In [6]:
score2 = log.score(X_test1D,y_test)

In [7]:
# Next use 50 labeled instances as initial centroids for K-means.
from sklearn.cluster import KMeans
K = 50
km = KMeans(n_clusters=K,init=X_exemplars)
X_train_distances = km.fit_transform(X_train1D)
X_train_distances.shape

  return self.fit(X, sample_weight=sample_weight)._transform(X)


(60000, 50)

In [8]:
# Now we have 60K instances whose features are 50 distances to centroids.
# The minimum distance is 0 and we should have at least 50 of those.
centroid_index = np.argmin(X_train_distances, axis=0)
centroid_index

array([57689, 38110, 36238,  4585, 50557, 23171,  6690, 39871,  7252,
        5417, 31258, 42267, 54288, 46323,   638,  5569, 52986, 59803,
       18376, 29282,  2648, 11846, 35840, 19400,  9092, 50474, 22948,
       13239, 18448, 25392, 28298, 59352, 41442, 49565,  8368, 41321,
       12191, 35502,  6156, 40979, 19552, 48106, 51068,   423, 33444,
       24088, 51519, 25440, 28596,  4808])

In [9]:
X_centroids = X_train1D[centroid_index]
y_centroids = y_train[centroid_index]
log = LogisticRegression(max_iter=1000)
log.fit(X_centroids,y_centroids)

LogisticRegression(max_iter=1000)

In [12]:
score3 = log.score(X_test1D,y_test)

In [13]:
# Try again without setting the initial centroids.
km = KMeans(n_clusters=K)
X_distances = km.fit_transform(X_train1D)
centroid_index = np.argmin(X_distances, axis=0)
X_centroids = X_train1D[centroid_index]
y_centroids = y_train[centroid_index]

In [14]:
log = LogisticRegression(max_iter=1000)
log.fit(X_centroids,y_centroids)
score4 = log.score(X_test1D,y_test)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:    2.3s finished


In [16]:
# Summary
print("%f %s"%(score1,"LogReg trained on 1000."))
print("%f %s"%(score2,"LogReg trained on first 50."))
print("%f %s"%(score3,"LogReg trained on 50 centroids (guided)."))
print("%f %s"%(score4,"LogReg trained on 50 centroids (unguided)."))


0.853700 LogReg trained on 1000.
0.622300 LogReg trained on first 50.
0.701000 LogReg trained on 50 centroids (guided).
0.734300 LogReg trained on 50 centroids (unguided).


In [None]:
# Conclusion. 
# If we only had 50 labels, the situation is worse than 1000 labels.
# But, we can improve accuracy by preprocessing with K-means,
# replacing the 50 labeled instances with 50 centroids.
# Initializing K-means with the 50 labeled instances did not help.

In [47]:
# Next try label propagation.
# After clustering, label all the instances close to their centroids.
X_distances.shape

(60000, 50)

In [99]:
# Try several cutoffs. Choose one that generates a decent sized train set.
zero=0.000000001
threshold=10
distance=X_distances[0]
n_centroids=len(distance)
centroid_labels=[0]*n_centroids
for i in range(len(X_distances)):
    distance=X_distances[i]
    for j in range(len(distance)):
        if distance[j]<zero:
            centroid_labels[j]=y_train[i]
            break
central_indices=[]
central_values=[]
for i in range(len(X_distances)):
    distance=X_distances[i]
    for j in range(len(distance)):
        if distance[j]<threshold:     
            central_indices.append(i)
            central_values.append(centroid_labels[j])
            break
X_central = X_train1D[central_indices]
X_central.shape

(4562, 784)

In [100]:
y_central = np.array(central_values)
y_central.shape

(4562,)

In [101]:
log = LogisticRegression(max_iter=1000)
log.fit(X_central,y_central)
score5 = log.score(X_test1D,y_test)
score5

0.098