# Fashion MNIST

In this notebook, we compare t-SNE and UMAP on the Fashion MNIST dataset and try to understand how these two popular embedding methods differ in terms of the visual intermixing and the relative neighborhood changes of the image classes.

In [1]:
import numpy as np
import pyarrow as pa
import pandas as pd
from cev.widgets import Embedding, EmbeddingComparisonWidget

In [2]:
import requests
from io import BytesIO

# Download the precomputed Fashion MNIST embeddings (an Arrow IPC file).
r = requests.get(
    "https://storage.googleapis.com/flekschas/regl-scatterplot/fashion-mnist-embeddings.arrow",
    timeout=60,  # fail instead of hanging indefinitely if the server stops responding
)
# Surface HTTP errors (404, 5xx, ...) here; otherwise the error page would be
# handed to pyarrow and fail with a misleading parsing error.
r.raise_for_status()

# Parse the downloaded bytes in memory and convert the Arrow table to pandas.
df = pa.ipc.open_file(BytesIO(r.content)).read_all().to_pandas()

In [3]:
# One fixed color per Fashion MNIST class name. The key order is significant:
# the position of each key is used below as the integer class ID it stands for.
cmap = {
    "T-shirt/top": "#FFFF00",
    "Trouser": "#1CE6FF",
    "Pullover": "#FF34FF",
    "Dress": "#FF4A46",
    "Coat": "#008941",
    "Sandal": "#006FA6",
    "Shirt": "#A30059",
    "Sneaker": "#FFDBE5",
    "Bag": "#7A4900",
    "Ankle boot": "#0000A6",
}

# Translate the values of the "class" column into class names (value i maps to
# the i-th key of `cmap`) and store the result as a categorical series.
labels = df["class"].map(dict(enumerate(cmap))).astype("category")

# Point coordinates of the two embeddings, one row per image.
tsne = df[["tsneX", "tsneY"]].values
umap = df[["umapX", "umapY"]].values

## Using the Image Classes as the Labels

In this first experiment, we use the image classes that come with the Fashion MNIST dataset as the labels.

In [4]:
# Both embeddings share the same class labels; only the coordinates differ.
tsne_embedding = Embedding(labels=labels, coords=tsne)
umap_embedding = Embedding(labels=labels, coords=umap)

# Side-by-side comparison: t-SNE on the left, UMAP on the right.
tsne_vs_umap = EmbeddingComparisonWidget(
    tsne_embedding,
    umap_embedding,
    titles=["t-SNE", "UMAP"],
    metric="confusion",
    selection="synced",
    auto_zoom=True,
    row_height=320,
)

# Color both scatter plots with the shared class color map and show a legend.
for side in (tsne_vs_umap.left, tsne_vs_umap.right):
    side.categorical_scatter.color(map=cmap)
    side.categorical_scatter.legend(True)

# Last expression of the cell: displays the widget.
tsne_vs_umap

EmbeddingComparisonWidget(children=(VBox(children=(HBox(children=(WidthOptimizer(), Dropdown(description='Metr…

## Using HDBSCAN Clusters as the Labels

In the next experiment, we show how we can handle the case where we want to compare two embedding methods without any label information.

The idea is to cluster one embedding and use the cluster IDs as labels for comparing the two embeddings. We will do this in both directions:
1. Using clusters derived with HDBSCAN from the t-SNE embedding
2. Using clusters derived with HDBSCAN from the UMAP embedding

In [5]:
import hdbscan

In [6]:
# Cluster the t-SNE coordinates; the resulting cluster IDs replace the class labels.
tsne_clusters = hdbscan.HDBSCAN(
    cluster_selection_epsilon=0.015, min_cluster_size=15
).fit_predict(tsne)

# The labels are the cluster IDs rendered as strings, stored as a categorical.
tsne_cluster_labels = pd.Series(list(map(str, tsne_clusters))).astype("category")

# The same t-SNE-derived labels are attached to both coordinate sets. The
# `robust` mask flags the points that received a non-negative cluster ID.
tsne_based_tsne_embedding, tsne_based_umap_embedding = (
    Embedding(coords=coords, labels=tsne_cluster_labels, robust=tsne_clusters >= 0)
    for coords in (tsne, umap)
)

tsne_based_tsne_vs_umap = EmbeddingComparisonWidget(
    tsne_based_tsne_embedding,
    tsne_based_umap_embedding,
    titles=["t-SNE with t-SNE Clusters", "UMAP with t-SNE Clusters"],
    metric="confusion",
    selection="synced",
    auto_zoom=True,
    row_height=320,
)

# Last expression of the cell: displays the widget.
tsne_based_tsne_vs_umap

EmbeddingComparisonWidget(children=(VBox(children=(HBox(children=(WidthOptimizer(), Dropdown(description='Metr…

In [7]:
# Cluster the UMAP coordinates; the resulting cluster IDs replace the class labels.
umap_clusterer = hdbscan.HDBSCAN(cluster_selection_epsilon=0, min_cluster_size=10)
umap_clusters = umap_clusterer.fit_predict(umap)

# The labels are the cluster IDs rendered as strings, stored as a categorical.
umap_cluster_labels = pd.Series(list(map(str, umap_clusters))).astype("category")

# The same UMAP-derived labels are attached to both coordinate sets. The
# `robust` mask flags the points that received a non-negative cluster ID.
umap_based_tsne_embedding, umap_based_umap_embedding = (
    Embedding(coords=coords, labels=umap_cluster_labels, robust=umap_clusters >= 0)
    for coords in (tsne, umap)
)

umap_based_tsne_vs_umap = EmbeddingComparisonWidget(
    umap_based_tsne_embedding,
    umap_based_umap_embedding,
    titles=["t-SNE with UMAP Clusters", "UMAP with UMAP Clusters"],
    metric="confusion",
    selection="synced",
    auto_zoom=True,
    row_height=320,
)

# Last expression of the cell: displays the widget.
umap_based_tsne_vs_umap

EmbeddingComparisonWidget(children=(VBox(children=(HBox(children=(WidthOptimizer(), Dropdown(description='Metr…

## Class-Based Sub-Clusters

Finally, we combine both approaches by relying on the classes that come with Fashion MNIST but sub-clustering each class.

In [8]:
# Sub-cluster every image class separately in the t-SNE embedding and give each
# sub-cluster an integer ID that is unique across all classes. Every point
# starts at -1, so points that HDBSCAN leaves unclustered (label -1) keep it.
tsne_subclass_ids = np.full(len(df), -1, dtype=int)

id_offset = 0  # number of sub-cluster IDs handed out to previous classes
for class_id in df["class"].unique():
    indices = np.where(df["class"].values == class_id)[0]

    cluster_labels = hdbscan.HDBSCAN(
        min_cluster_size=20, cluster_selection_epsilon=0.05
    ).fit_predict(tsne[indices])

    # Shift the per-class labels (0..n-1) by the running offset only. Do not add
    # `class_id` on top: that is only collision-free when the classes are
    # visited in ascending order, and `unique()` yields them in order of
    # appearance, so two classes could end up sharing a sub-cluster ID.
    clustered = cluster_labels >= 0
    tsne_subclass_ids[indices[clustered]] = cluster_labels[clustered] + id_offset

    # max() is -1 when nothing was clustered, which leaves the offset unchanged.
    id_offset += np.max(cluster_labels) + 1

# The labels are the sub-cluster IDs rendered as strings, stored as a categorical.
tsne_subcluster_labels = pd.Series([str(x) for x in tsne_subclass_ids]).astype(
    "category"
)

# Attach the t-SNE-derived sub-cluster labels to both coordinate sets. The
# `robust` mask flags the points that belong to a sub-cluster (ID >= 0).
tsne_subcluster_based_tsne_embedding = Embedding(
    coords=tsne, labels=tsne_subcluster_labels, robust=tsne_subclass_ids >= 0
)
tsne_subcluster_based_umap_embedding = Embedding(
    coords=umap, labels=tsne_subcluster_labels, robust=tsne_subclass_ids >= 0
)

tsne_subcluster_based_tsne_vs_umap = EmbeddingComparisonWidget(
    tsne_subcluster_based_tsne_embedding,
    tsne_subcluster_based_umap_embedding,
    titles=["t-SNE with t-SNE Sub-Clusters", "UMAP with t-SNE Sub-Clusters"],
    metric="confusion",
    selection="synced",
    auto_zoom=True,
    row_height=320,
)

# Last expression of the cell: displays the widget.
tsne_subcluster_based_tsne_vs_umap

EmbeddingComparisonWidget(children=(VBox(children=(HBox(children=(WidthOptimizer(), Dropdown(description='Metr…

In [9]:
# Sub-cluster every image class separately in the UMAP embedding and give each
# sub-cluster an integer ID that is unique across all classes. Every point
# starts at -1, so points that HDBSCAN leaves unclustered (label -1) keep it.
umap_subclass_ids = np.full(len(df), -1, dtype=int)

id_offset = 0  # number of sub-cluster IDs handed out to previous classes
for class_id in df["class"].unique():
    indices = np.where(df["class"].values == class_id)[0]

    cluster_labels = hdbscan.HDBSCAN(
        min_cluster_size=20, cluster_selection_epsilon=0.05
    ).fit_predict(umap[indices])

    # Shift the per-class labels (0..n-1) by the running offset only. Do not add
    # `class_id` on top: that is only collision-free when the classes are
    # visited in ascending order, and `unique()` yields them in order of
    # appearance, so two classes could end up sharing a sub-cluster ID.
    clustered = cluster_labels >= 0
    umap_subclass_ids[indices[clustered]] = cluster_labels[clustered] + id_offset

    # max() is -1 when nothing was clustered, which leaves the offset unchanged.
    id_offset += np.max(cluster_labels) + 1

# The labels are the sub-cluster IDs rendered as strings, stored as a categorical.
umap_subcluster_labels = pd.Series([str(x) for x in umap_subclass_ids]).astype(
    "category"
)

# Attach the UMAP-derived sub-cluster labels to both coordinate sets. The
# `robust` mask flags the points that belong to a sub-cluster (ID >= 0).
umap_subcluster_based_tsne_embedding = Embedding(
    coords=tsne, labels=umap_subcluster_labels, robust=umap_subclass_ids >= 0
)
umap_subcluster_based_umap_embedding = Embedding(
    coords=umap, labels=umap_subcluster_labels, robust=umap_subclass_ids >= 0
)

umap_subcluster_based_tsne_vs_umap = EmbeddingComparisonWidget(
    umap_subcluster_based_tsne_embedding,
    umap_subcluster_based_umap_embedding,
    titles=["t-SNE with UMAP Sub-Clusters", "UMAP with UMAP Sub-Clusters"],
    metric="confusion",
    selection="synced",
    auto_zoom=True,
    row_height=320,
)

# Last expression of the cell: displays the widget.
umap_subcluster_based_tsne_vs_umap

EmbeddingComparisonWidget(children=(VBox(children=(HBox(children=(WidthOptimizer(), Dropdown(description='Metr…