<a href="https://colab.research.google.com/github/WuVi5054/interesting_things/blob/main/tSNE_CUDA_vs_Sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Daily Dose of Data Science

[This GPU Accelerated tSNE Can Be Upto 700x Faster Than Sklearn](https://www.blog.dailydoseofds.com/p/this-gpu-accelerated-tsne-can-be)

Author: Avi Chawla

**Before running this notebook:** open "Runtime" → "Change runtime type" and select the "T4 GPU" hardware accelerator, since tsnecuda needs a CUDA-capable GPU.

In [None]:
# Install the condacolab helper package with pip (-q keeps the pip log quiet).
!pip install -q condacolab

import condacolab
# Set up conda inside this runtime so the `!conda install` cell below can run.
# NOTE(review): condacolab.install() is expected to restart the kernel once it
# finishes — confirm, and run the remaining cells only after the restart.
condacolab.install()

In [None]:
!conda install tsnecuda -c conda-forge

Imports

In [None]:
import tsnecuda
# Smoke-test the installation before doing any real work; an exception here
# presumably means the package or the GPU runtime is not set up correctly.
tsnecuda.test() # this should work without any error

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_openml

# Apply seaborn's default plot theme to the matplotlib figures created below.
sns.set()

In [None]:
# Hex colour palette used for the scatter plots: each point's integer label is
# used as an index into this list.
colors = [
    '#fe7c73',
    '#2471A3',
    '#3498DB',
    '#27AE60',
    '#82E0AA',
    '#D35400',
    '#5D6D7E',
    '#E74C3C',
    '#21618C',
    '#B7950B',
    '#46C7C7',
    '#00B9FF',
]

Load dataset

In [None]:
# Load the MNIST dataset from OpenML.
# The dataset version is pinned so every run fetches the same data instead of
# whichever version OpenML currently marks as "active".
mnist = fetch_openml('mnist_784', version=1)

# Extract features and labels
X, y = mnist['data'], mnist['target']

# Convert to numpy arrays.
# np.asarray accepts pandas objects as well as plain ndarrays, so this works
# whether or not fetch_openml returned a DataFrame/Series.
X = np.asarray(X, dtype='float32')
# Labels are cast to integers so they can be used as list indices later on
# (presumably they arrive as digit strings — verify against the fetched data).
y = np.asarray(y).astype('int')

# Print the shape of the arrays
print("Shape of features (X):", X.shape)
print("Shape of labels (y):", y.shape)

Shape of features (X): (70000, 784)
Shape of labels (y): (70000,)


Sample rows

In [None]:
# Sampling configuration.
# A fixed seed makes the subsample (and every timing/plot that depends on it)
# identical across "Restart & Run All".
SEED = 42
N_SAMPLES = 8000

# Get the number of rows in X
num_rows = X.shape[0]

# Randomly sample row indices without replacement from a seeded generator.
# The size is clamped to num_rows so a smaller dataset does not raise.
rng = np.random.default_rng(SEED)
random_indices = rng.choice(num_rows, size=min(N_SAMPLES, num_rows), replace=False)

# Select the same rows from the features and the labels so they stay aligned
sampled_X = X[random_indices]
sampled_y = y[random_indices]

# tSNE CUDA

In [None]:
from tsnecuda import TSNE as TSNE_GPU

In [None]:
%%timeit

# Timing only: names assigned inside a %%timeit cell are not kept in the
# notebook namespace, so the embedding is computed again in a regular cell
# before it is plotted.
X_embedded_gpu = TSNE_GPU(n_components=2, learning_rate=20, perplexity=25).fit_transform(sampled_X)

In [None]:
# Build the GPU t-SNE estimator, then fit it on the sampled rows to obtain the
# 2-component embedding used for plotting.
gpu_tsne = TSNE_GPU(perplexity=25, learning_rate=20, n_components=2)
X_embedded_gpu = gpu_tsne.fit_transform(sampled_X)

In [None]:
# Scatter plot of the GPU embedding: first component on x, second on y, each
# point coloured by looking up its label in the `colors` palette.
fig, ax = plt.subplots()
ax.scatter(
    x=X_embedded_gpu[:, 0],
    y=X_embedded_gpu[:, 1],
    s=10,
    c=[colors[label] for label in sampled_y],
)
ax.set_title("tSNE GPU", weight="bold", fontsize=20)

# Write the figure to disk before showing it.
fig.savefig("./tsne_gpu.jpeg", bbox_inches="tight", dpi=300)
plt.show()

# tSNE Sklearn

In [None]:
from sklearn.manifold import TSNE as TSNE_CPU

In [None]:
%%timeit -n 1 -r 1

# Timing only, limited to a single loop and a single repeat (-n 1 -r 1).
# Names assigned inside a %%timeit cell are not kept in the notebook
# namespace, so the embedding is computed again in a regular cell before it
# is plotted.
X_embedded_cpu = TSNE_CPU(n_components=2, learning_rate=20, perplexity=25).fit_transform(sampled_X)

In [None]:
# Fit sklearn's t-SNE on the sampled rows to obtain the 2-component embedding
# used for plotting. random_state pins the estimator's random number
# generation so the embedding (and the saved figure) is reproducible.
X_embedded_cpu = TSNE_CPU(n_components=2, learning_rate=20, perplexity=25, random_state=42).fit_transform(sampled_X)

In [None]:
# Scatter plot of the sklearn embedding: first component on x, second on y,
# each point coloured by looking up its label in the `colors` palette.
fig, ax = plt.subplots()
ax.scatter(
    x=X_embedded_cpu[:, 0],
    y=X_embedded_cpu[:, 1],
    s=10,
    c=[colors[label] for label in sampled_y],
)
ax.set_title("tSNE Sklearn", weight="bold", fontsize=20)

# Write the figure to disk before showing it.
fig.savefig("./tsne_cpu.jpeg", bbox_inches="tight", dpi=300)
plt.show()