diff --git a/machine_learning/t_stochastic_neighbour_embedding.py b/machine_learning/t_stochastic_neighbour_embedding.py new file mode 100644 index 000000000000..d6f630149087 --- /dev/null +++ b/machine_learning/t_stochastic_neighbour_embedding.py @@ -0,0 +1,178 @@ +""" +t-distributed stochastic neighbor embedding (t-SNE) + +For more details, see: +https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding +""" + +import doctest + +import numpy as np +from numpy import ndarray +from sklearn.datasets import load_iris + + +def collect_dataset() -> tuple[ndarray, ndarray]: + """ + Load the Iris dataset and return features and labels. + + Returns: + tuple[ndarray, ndarray]: Feature matrix and target labels. + + >>> features, targets = collect_dataset() + >>> features.shape + (150, 4) + >>> targets.shape + (150,) + """ + iris_dataset = load_iris() + return np.array(iris_dataset.data), np.array(iris_dataset.target) + + +def compute_pairwise_affinities(data_matrix: ndarray, sigma: float = 1.0) -> ndarray: + """ + Compute high-dimensional affinities (P matrix) using a Gaussian kernel. + + Args: + data_matrix: Input data of shape (n_samples, n_features). + sigma: Gaussian kernel bandwidth. + + Returns: + ndarray: Symmetrized probability matrix. + + >>> x = np.array([[0.0, 0.0], [1.0, 0.0]]) + >>> probabilities = compute_pairwise_affinities(x) + >>> float(round(probabilities[0, 1], 3)) + 0.25 + """ + n_samples = data_matrix.shape[0] + squared_sum = np.sum(np.square(data_matrix), axis=1) + squared_distance = np.add( + np.add(-2 * np.dot(data_matrix, data_matrix.T), squared_sum).T, squared_sum + ) + + affinity_matrix = np.exp(-squared_distance / (2 * sigma**2)) + np.fill_diagonal(affinity_matrix, 0) + + affinity_matrix /= np.sum(affinity_matrix) + return (affinity_matrix + affinity_matrix.T) / (2 * n_samples) + + +def compute_low_dim_affinities(embedding_matrix: ndarray) -> tuple[ndarray, ndarray]: + """ + Compute low-dimensional affinities (Q matrix) using a Student-t distribution. + + Args: + embedding_matrix: Low-dimensional embedding of shape (n_samples, n_components). + + Returns: + tuple[ndarray, ndarray]: (Q probability matrix, numerator matrix). + + >>> y = np.array([[0.0, 0.0], [1.0, 0.0]]) + >>> q_matrix, numerators = compute_low_dim_affinities(y) + >>> q_matrix.shape + (2, 2) + """ + squared_sum = np.sum(np.square(embedding_matrix), axis=1) + numerator_matrix = 1 / ( + 1 + + np.add( + np.add(-2 * np.dot(embedding_matrix, embedding_matrix.T), squared_sum).T, + squared_sum, + ) + ) + np.fill_diagonal(numerator_matrix, 0) + + q_matrix = numerator_matrix / np.sum(numerator_matrix) + return q_matrix, numerator_matrix + + +def apply_tsne( + data_matrix: ndarray, + n_components: int = 2, + learning_rate: float = 200.0, + n_iter: int = 500, +) -> ndarray: + """ + Apply t-SNE for dimensionality reduction. + + Args: + data_matrix: Original dataset (features). + n_components: Target dimension (2D or 3D). + learning_rate: Step size for gradient descent. + n_iter: Number of iterations. + + Returns: + ndarray: Low-dimensional embedding of the data. + + >>> features, _ = collect_dataset() + >>> embedding = apply_tsne(features, n_components=2, n_iter=50) + >>> embedding.shape + (150, 2) + """ + if n_components < 1 or n_iter < 1: + raise ValueError("n_components and n_iter must be >= 1") + + n_samples = data_matrix.shape[0] + rng = np.random.default_rng() + embedding = rng.standard_normal((n_samples, n_components)) * 1e-4 + + high_dim_affinities = compute_pairwise_affinities(data_matrix) + high_dim_affinities = np.maximum(high_dim_affinities, 1e-12) + + embedding_increment = np.zeros_like(embedding) + momentum = 0.5 + + for iteration in range(n_iter): + low_dim_affinities, numerator_matrix = compute_low_dim_affinities(embedding) + low_dim_affinities = np.maximum(low_dim_affinities, 1e-12) + + affinity_diff = high_dim_affinities - low_dim_affinities + + gradient = 4 * ( + np.dot((affinity_diff * numerator_matrix), embedding) + - np.multiply( + np.sum(affinity_diff * numerator_matrix, axis=1)[:, np.newaxis], + embedding, + ) + ) + + embedding_increment = momentum * embedding_increment - learning_rate * gradient + embedding += embedding_increment + + if iteration == int(n_iter / 4): + momentum = 0.8 + + return embedding + + +def main() -> None: + """ + Run t-SNE on the Iris dataset and display the first 5 embeddings. + + >>> main() # doctest: +ELLIPSIS + t-SNE embedding (first 5 points): + [[... + """ + features, _labels = collect_dataset() + embedding = apply_tsne(features, n_components=2, n_iter=300) + + if not isinstance(embedding, np.ndarray): + raise TypeError("t-SNE embedding must be an ndarray") + + print("t-SNE embedding (first 5 points):") + print(embedding[:5]) + + # Optional visualization (Ruff/mypy compliant) + + # import matplotlib.pyplot as plt + # plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis") + # plt.title("t-SNE Visualization of the Iris Dataset") + # plt.xlabel("Dimension 1") + # plt.ylabel("Dimension 2") + # plt.show() + + +if __name__ == "__main__": + doctest.testmod() + main()