# Semi-supervised clustering
Suppose We only have labels for a few instances. Use those as the initial centroids.Apply labels to other instances transitively.

## Active learning:
Get centroids, ask experts to label those, and repeat. (The book mentions this but does not have an exercise for it.)

## MNIST
[Geron](https://github.com/ageron/handson-ml2/blob/master/09_unsupervised_learning.ipynb) 
applies this to a random "blob" dataset. We will try it on digits. Our results are never as good as his.

In [None]:
import ssl
import tensorflow
from tensorflow.keras.datasets import mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train.shape

In [2]:
from sklearn.linear_model import LogisticRegression
# Logistic regression requires 2 dimensions: X and y.
num_pixels = 784
X_train1D = X_train.reshape(X_train.shape[0],num_pixels)
X_test1D = X_test.reshape(X_test.shape[0],num_pixels)
# Logistic regression fails to converge in reasonable num iterations.
# Scaling seems to help though pixel colors shouldn't need it.
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X_train1D)
X_train1D=scaler.transform(X_train1D)
X_test1D=scaler.transform(X_test1D)
# n_jobs doesn't use threads on my Mac
# To avoid errors about non-convergence, we boost max_iter to 1000.
log = LogisticRegression(max_iter=1000)
log.fit(X_train1D[:1000],y_train[:1000])
# This takes 15 min on the 60,000 samples. Score = 0.9211
# It takes 5 min on 1000 samples. Score = 0.8537
score1=log.score(X_test1D,y_test)
score1

0.8537

In [3]:
# Establish a baseline: logistic regression trained on first 50
# with no boost from k-means.
n_labels = 50
X_exemplars = X_train1D[:n_labels]
y_exemplars = y_train[:n_labels]
# n_jobs doesn't use threads on my Mac
log = LogisticRegression(max_iter=1000)
log.fit(X_exemplars,y_exemplars)
score2 = log.score(X_test1D,y_test)
score2

0.6223

In [4]:
# This was my idea.
# Use first 50 instances as initial centroids for K-means.
import numpy as np
from sklearn.cluster import KMeans
K = n_labels
km = KMeans(n_clusters=K,init=X_exemplars)
km.fit(X_train1D)
X_distances = km.transform(X_train1D)
# Now we have 60K instances whose features are 50 distances to centroids.
# The minimum distance is 0 and we should have at least 50 of those.
centroid_index = np.argmin(X_distances, axis=0)
X_centroids = X_train1D[centroid_index]
y_centroids = y_train[centroid_index]
log = LogisticRegression(max_iter=1000)
log.fit(X_centroids,y_centroids)
score3 = log.score(X_test1D,y_test)
score3

  km.fit(X_train1D)


0.701

In [5]:
# Try again without setting the initial centroids.
# This runs for 5 min, a little less if we limit the initializations to 10.
km = KMeans(n_clusters=K,n_init=10)
km.fit(X_train1D)
X_distances = km.transform(X_train1D)

In [6]:
# Now we have 60K instances whose features are 50 distances to centroids.
# The minimum distance is 0 and we should have at least 50 of those.
centroid_index = np.argmin(X_distances, axis=0)
X_centroids = X_train1D[centroid_index]
y_centroids = y_train[centroid_index]
print("Using argmin, we train on %d instances"%(len(X_centroids)))
log = LogisticRegression(max_iter=1000)
log.fit(X_centroids,y_centroids)
score3 = log.score(X_test1D,y_test)
score3

Using argmin, we train on 50 instances


0.6843

In [12]:
# This should do the same as the above using loop instead of argmin.
BIG=10000
num_centroids=len(X_distances[0])
centroid_indices=[-1]*num_centroids
mins=[BIG]*num_centroids
i= -1
for distances in X_distances:
    i += 1
    for j in range(num_centroids):
        if distances[j]<mins[j]:
            mins[j]=distances[j]
            centroid_indices[j]=i
X_centroids = X_train1D[centroid_indices]
y_centroids = y_train[centroid_indices]
print("Using min distance per dimension, we train on %d instances"%(len(X_centroids)))
log = LogisticRegression(max_iter=1000)
log.fit(X_centroids,y_centroids)
score4 = log.score(X_test1D,y_test)
score4

Using min distance per dimension, we train on 50 instances


0.6843

In [13]:
# Build on the above.
# Add more instances in neighborhood of each centroid.
# As above, use each instance's true label regardless of cluster.
# To avoid re-inserting a centroid, reuse centroid_indices from above.
MAX_DISTANCE = 7 
i=0
neighbor_indices=[]
neighbor_centroid=[]
for distances in X_distances:
    for d in distances:
        if d<MAX_DISTANCE and i not in centroid_indices:
            neighbor_indices.append(i)
            neighbor_centroid.append(j)  # will need this later
            break
    i += 1
combined_indices=centroid_indices+neighbor_indices
X_neighbors = X_train1D[combined_indices]
y_neighbors = y_train[combined_indices]
print("At max distance %d, we train on %d instances"%(MAX_DISTANCE,len(X_neighbors)))
log = LogisticRegression(max_iter=1000)
log.fit(X_neighbors,y_neighbors)
neighbors5=len(X_neighbors) # save for later
score5 = log.score(X_test1D,y_test)
score5

At max distance 7, we train on 440 instances


0.6508

In [14]:
# Improve on above using label propagation.
# Reuse variables from above.
new_labels=[]
for i in centroid_indices:
    new_labels.append(y_train[i])
for j in neighbor_centroid:
    new_labels.append(y_train[j])
print("Train on %d instances and %d labels"%(len(X_neighbors),len(new_labels)))
log = LogisticRegression(max_iter=1000)
log.fit(X_neighbors,new_labels)
neighbors6=len(X_neighbors) # save for later
score6 = log.score(X_test1D,y_test)
score6

Train on 440 instances and 440 labels


0.5018

In [15]:
# Summary
print("%f %s"%(score1,"LogReg trained on 1000."))
print("%f %s"%(score2,"LogReg trained on first 50."))
print("%f %s"%(score3,"LogReg trained on 50 centroids (guided)."))
print("%f %s"%(score4,"LogReg trained on 50 centroids (unguided)."))
print("%f %d neighbors %s"%(score5,neighbors5,"LogReg, K-means, (original labels)."))
print("%f %d neighbors %s"%(score6,neighbors6,"LogReg, K-means, (labels propagation)."))

0.853700 LogReg trained on 1000.
0.622300 LogReg trained on first 50.
0.684300 LogReg trained on 50 centroids (guided).
0.684300 LogReg trained on 50 centroids (unguided).
0.650800 440 neighbors LogReg, K-means, (original labels).
0.501800 440 neighbors LogReg, K-means, (labels propagation).
