From 85aca0f94971e6155d0a4f42774577fc32d20073 Mon Sep 17 00:00:00 2001 From: Nikita-Kedari Date: Sun, 12 Oct 2025 01:11:17 +0530 Subject: [PATCH 01/13] Add t-SNE algorithm for dimensionality reduction (#13432) --- DIRECTORY.md | 1 + machine_learning/tsne.py | 204 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 205 insertions(+) create mode 100644 machine_learning/tsne.py diff --git a/DIRECTORY.md b/DIRECTORY.md index 36acb3b97f1e..5470491850f1 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -623,6 +623,7 @@ * [Sequential Minimum Optimization](machine_learning/sequential_minimum_optimization.py) * [Similarity Search](machine_learning/similarity_search.py) * [Support Vector Machines](machine_learning/support_vector_machines.py) + * [t-SNE] (machine_learning/tsne.py) * [Word Frequency Functions](machine_learning/word_frequency_functions.py) * [Xgboost Classifier](machine_learning/xgboost_classifier.py) * [Xgboost Regressor](machine_learning/xgboost_regressor.py) diff --git a/machine_learning/tsne.py b/machine_learning/tsne.py new file mode 100644 index 000000000000..6197265f3f26 --- /dev/null +++ b/machine_learning/tsne.py @@ -0,0 +1,204 @@ +""" +t-Distributed Stochastic Neighbor Embedding (t-SNE) +--------------------------------------------------- +t-SNE is a nonlinear dimensionality reduction algorithm used for visualizing +high-dimensional data in a lower-dimensional (usually 2D or 3D) space. + +It models pairwise similarities between points in both the high-dimensional +and low-dimensional spaces, and minimizes the difference between them +using gradient descent. + +This simplified implementation demonstrates the core idea of t-SNE for +educational purposes — it is **not optimized for large datasets**. + +This implementation: +- Computes pairwise similarities in the high-dimensional space. +- Computes pairwise similarities in the low-dimensional (embedding) space. +- Minimizes the Kullback–Leibler divergence between these distributions + using gradient descent. +- Follows the original t-SNE formulation by van der Maaten & Hinton (2008). + +References: +- van der Maaten, L. and Hinton, G. (2008). + "Visualizing Data using t-SNE". Journal of Machine Learning Research. +- https://lvdmaaten.github.io/tsne/ + +Key Steps: +1. Compute pairwise similarities (P) in high-dimensional space. +2. Initialize low-dimensional map (Y) randomly. +3. Compute pairwise similarities (Q) in low-dimensional space using + Student-t distribution. +4. Minimize KL-divergence between P and Q using gradient descent. +""" +import doctest +import numpy as np +from sklearn.datasets import load_iris + +def collect_dataset() -> tuple[np.ndarray, np.ndarray]: + """ + Collects the dataset (Iris dataset) and returns feature matrix and target values. + + :return: Tuple containing feature matrix (X) and target labels (y) + + Example: + >>> X, y = collect_dataset() + >>> X.shape + (150, 4) + >>> y.shape + (150,) + """ + data = load_iris() + return np.array(data.data), np.array(data.target) + +def compute_pairwise_affinities(X: np.ndarray, sigma: float = 1.0) -> np.ndarray: + """ + Computes pairwise affinities (P matrix) in high-dimensional space using Gaussian kernel. 
+ + :param X: Input data of shape (n_samples, n_features) + :param sigma: Variance (Bandwidth) of the Gaussian kernel + :return: Symmetrized probability matrix P of shape (n_samples, n_samples)/ Pairwise affinity matrix P + + Example: + >>> import numpy as np + >>> X = np.array([[0.0, 0.0], [1.0, 0.0]]) + >>> P = compute_pairwise_affinities(X) + >>> float(round(P[0, 1], 3)) + 0.25 + """ + n = X.shape[0] + sum_X = np.sum(np.square(X), axis=1) + D = np.add(np.add(-2 * np.dot(X, X.T), sum_X).T, sum_X) + P = np.exp(-D / (2 * sigma ** 2)) + np.fill_diagonal(P, 0) + P /= np.sum(P) + return (P + P.T) / (2 * n) + +def compute_low_dim_affinities(Y: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + """ + Computes low-dimensional similarities (Q matrix) using Student-t distribution. + + :param Y: Low-dimensional embeddings (n_samples, n_components) + :return: Tuple (Q, num) where Q is the probability matrix and num is numerator array + """ + sum_Y = np.sum(np.square(Y), axis=1) + num = 1 / (1 + np.add(np.add(-2 * np.dot(Y, Y.T), sum_Y).T, sum_Y)) + np.fill_diagonal(num, 0) + Q = num / np.sum(num) + return Q, num + + +def apply_tsne( + data_x: np.ndarray, + n_components: int = 2, + learning_rate: float = 200.0, + n_iter: int = 500, +) -> np.ndarray: + """ + Applies t-SNE to reduce data dimensionality for visualization. + + :param data_x: Original dataset (features) + :param n_components: Target dimension (2D or 3D) + :param learning_rate: Learning rate for gradient descent + :param n_iter: Number of iterations + :return: Transformed dataset (low-dimensional embedding) + + Example: + >>> X, _ = collect_dataset() + >>> Y = apply_tsne(X, n_components=2, n_iter=250) + >>> Y.shape + (150, 2) + """ + if n_components < 1: + raise ValueError("n_components must be >= 1") + if n_iter < 1: + raise ValueError("n_iter must be >= 1") + + n_samples = data_x.shape[0] + + # Initialize low-dimensional map randomly + Y = np.random.randn(n_samples, n_components) * 1e-4 + P = compute_pairwise_affinities(data_x) + P = np.maximum(P, 1e-12) + + # Initialize parameters + Y_inc = np.zeros_like(Y) + momentum = 0.5 + + for i in range(n_iter): + Q, num = compute_low_dim_affinities(Y) + Q = np.maximum(Q, 1e-12) + + PQ = P - Q + + # Compute gradient + dY = 4 * ( + np.dot((PQ * num), Y) + - np.multiply(np.sum(PQ * num, axis=1)[:, np.newaxis], Y) + ) + + # Update with momentum and learning rate + Y_inc = momentum * Y_inc - learning_rate * dY + Y += Y_inc + + # Adjust momentum halfway through + if i == int(n_iter / 4): + momentum = 0.8 + + return Y + + +def main() -> None: + """ + Driver function for t-SNE demonstration. 
+ """ + X, y = collect_dataset() + + Y = apply_tsne(X, n_components=2, n_iter=300) + print("t-SNE embedding (first 5 points):") + print(Y[:5]) + + # Optional visualization (commented to avoid dependency) + # import matplotlib.pyplot as plt + # plt.scatter(Y[:, 0], Y[:, 1], c=y, cmap="viridis") + # plt.title("t-SNE Visualization of Iris Dataset") + # plt.xlabel("Component 1") + # plt.ylabel("Component 2") + # plt.show() + + +if __name__ == "__main__": + doctest.testmod() + main() + +""" +Explanation of t-SNE Implementation +----------------------------------- + +Input: +- data_x: numpy array of shape (n_samples, n_features) + Example: Iris dataset (150 samples × 4 features) +- n_components: target dimension (usually 2 or 3 for visualization) +- learning_rate: controls step size in gradient descent +- n_iter: number of iterations for optimization + +Output: +- Y: numpy array of shape (n_samples, n_components) + Each row is the low-dimensional embedding of the corresponding high-dimensional point. + +How it works: +1. Compute high-dimensional similarities (P matrix): + - Measures how likely points are neighbors in the original space. +2. Initialize low-dimensional map (Y) randomly. +3. Compute low-dimensional similarities (Q matrix) using Student-t distribution: + - Heavy tail prevents distant points from crowding together. +4. Compute gradient of KL divergence between P and Q: + - If points are too far in low-D (Q < P), pull them closer. + - If points are too close in low-D (Q > P), push them apart. +5. Update Y using gradient descent with momentum: + - Repeat for n_iter iterations until low-dimensional layout reflects high-dimensional structure. + +Why it works: +- t-SNE tries to preserve **local structure**: neighbors stay close in the embedding. +- Distant points may not be perfectly preserved (global structure is secondary). +- The algorithm minimizes the KL divergence between high-D and low-D similarity distributions. 
+""" From af84d7f70c0976f6e4cfdd5aa53a9b106ab23dbf Mon Sep 17 00:00:00 2001 From: Nikita-Kedari Date: Sun, 12 Oct 2025 01:19:27 +0530 Subject: [PATCH 02/13] Add t-SNE to DIRECTORY.md --- DIRECTORY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DIRECTORY.md b/DIRECTORY.md index 5470491850f1..e81c4fc4a7d9 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -623,7 +623,7 @@ * [Sequential Minimum Optimization](machine_learning/sequential_minimum_optimization.py) * [Similarity Search](machine_learning/similarity_search.py) * [Support Vector Machines](machine_learning/support_vector_machines.py) - * [t-SNE] (machine_learning/tsne.py) + * [t-SNE](machine_learning/tsne.py) * [Word Frequency Functions](machine_learning/word_frequency_functions.py) * [Xgboost Classifier](machine_learning/xgboost_classifier.py) * [Xgboost Regressor](machine_learning/xgboost_regressor.py) From 88666f0451ce499a54c3e55072678cc7144903ce Mon Sep 17 00:00:00 2001 From: Nikita-Kedari Date: Sun, 12 Oct 2025 01:27:09 +0530 Subject: [PATCH 03/13] Updated tsne.py --- machine_learning/tsne.py | 141 ++++++++++++++++++--------------------- 1 file changed, 65 insertions(+), 76 deletions(-) diff --git a/machine_learning/tsne.py b/machine_learning/tsne.py index 6197265f3f26..bfa44b184061 100644 --- a/machine_learning/tsne.py +++ b/machine_learning/tsne.py @@ -1,12 +1,13 @@ """ t-Distributed Stochastic Neighbor Embedding (t-SNE) --------------------------------------------------- + t-SNE is a nonlinear dimensionality reduction algorithm used for visualizing high-dimensional data in a lower-dimensional (usually 2D or 3D) space. It models pairwise similarities between points in both the high-dimensional -and low-dimensional spaces, and minimizes the difference between them -using gradient descent. +and low-dimensional spaces, and minimizes the difference between them using +gradient descent. This simplified implementation demonstrates the core idea of t-SNE for educational purposes — it is **not optimized for large datasets**. @@ -14,7 +15,7 @@ This implementation: - Computes pairwise similarities in the high-dimensional space. - Computes pairwise similarities in the low-dimensional (embedding) space. -- Minimizes the Kullback–Leibler divergence between these distributions +- Minimizes the Kullback-Leibler divergence between these distributions using gradient descent. - Follows the original t-SNE formulation by van der Maaten & Hinton (2008). @@ -22,27 +23,23 @@ - van der Maaten, L. and Hinton, G. (2008). "Visualizing Data using t-SNE". Journal of Machine Learning Research. - https://lvdmaaten.github.io/tsne/ - -Key Steps: -1. Compute pairwise similarities (P) in high-dimensional space. -2. Initialize low-dimensional map (Y) randomly. -3. Compute pairwise similarities (Q) in low-dimensional space using - Student-t distribution. -4. Minimize KL-divergence between P and Q using gradient descent. """ + import doctest + import numpy as np from sklearn.datasets import load_iris + def collect_dataset() -> tuple[np.ndarray, np.ndarray]: """ - Collects the dataset (Iris dataset) and returns feature matrix and target values. + Collects the Iris dataset and returns features and labels. 
- :return: Tuple containing feature matrix (X) and target labels (y) + :return: Tuple containing feature matrix and target labels Example: - >>> X, y = collect_dataset() - >>> X.shape + >>> x, y = collect_dataset() + >>> x.shape (150, 4) >>> y.shape (150,) @@ -50,41 +47,43 @@ def collect_dataset() -> tuple[np.ndarray, np.ndarray]: data = load_iris() return np.array(data.data), np.array(data.target) -def compute_pairwise_affinities(X: np.ndarray, sigma: float = 1.0) -> np.ndarray: + +def compute_pairwise_affinities(x: np.ndarray, sigma: float = 1.0) -> np.ndarray: """ Computes pairwise affinities (P matrix) in high-dimensional space using Gaussian kernel. - :param X: Input data of shape (n_samples, n_features) + :param x: Input data of shape (n_samples, n_features) :param sigma: Variance (Bandwidth) of the Gaussian kernel - :return: Symmetrized probability matrix P of shape (n_samples, n_samples)/ Pairwise affinity matrix P + :return: Symmetrized probability matrix p Example: >>> import numpy as np - >>> X = np.array([[0.0, 0.0], [1.0, 0.0]]) - >>> P = compute_pairwise_affinities(X) - >>> float(round(P[0, 1], 3)) + >>> x = np.array([[0.0, 0.0], [1.0, 0.0]]) + >>> p = compute_pairwise_affinities(x) + >>> float(round(p[0, 1], 3)) 0.25 """ - n = X.shape[0] - sum_X = np.sum(np.square(X), axis=1) - D = np.add(np.add(-2 * np.dot(X, X.T), sum_X).T, sum_X) - P = np.exp(-D / (2 * sigma ** 2)) - np.fill_diagonal(P, 0) - P /= np.sum(P) - return (P + P.T) / (2 * n) - -def compute_low_dim_affinities(Y: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + n_samples = x.shape[0] + sum_x = np.sum(np.square(x), axis=1) + d = np.add(np.add(-2 * np.dot(x, x.T), sum_x).T, sum_x) + p = np.exp(-d / (2 * sigma ** 2)) + np.fill_diagonal(p, 0) + p /= np.sum(p) + return (p + p.T) / (2 * n_samples) + + +def compute_low_dim_affinities(y: np.ndarray) -> tuple[np.ndarray, np.ndarray]: """ Computes low-dimensional similarities (Q matrix) using Student-t distribution. 
- :param Y: Low-dimensional embeddings (n_samples, n_components) - :return: Tuple (Q, num) where Q is the probability matrix and num is numerator array + :param y: Low-dimensional embeddings (n_samples, n_components) + :return: Tuple (q, num) where q is the probability matrix and num is numerator array """ - sum_Y = np.sum(np.square(Y), axis=1) - num = 1 / (1 + np.add(np.add(-2 * np.dot(Y, Y.T), sum_Y).T, sum_Y)) + sum_y = np.sum(np.square(y), axis=1) + num = 1 / (1 + np.add(np.add(-2 * np.dot(y, y.T), sum_y).T, sum_y)) np.fill_diagonal(num, 0) - Q = num / np.sum(num) - return Q, num + q = num / np.sum(num) + return q, num def apply_tsne( @@ -103,9 +102,9 @@ def apply_tsne( :return: Transformed dataset (low-dimensional embedding) Example: - >>> X, _ = collect_dataset() - >>> Y = apply_tsne(X, n_components=2, n_iter=250) - >>> Y.shape + >>> x, _ = collect_dataset() + >>> y_emb = apply_tsne(x, n_components=2, n_iter=50) + >>> y_emb.shape (150, 2) """ if n_components < 1: @@ -116,50 +115,49 @@ def apply_tsne( n_samples = data_x.shape[0] # Initialize low-dimensional map randomly - Y = np.random.randn(n_samples, n_components) * 1e-4 - P = compute_pairwise_affinities(data_x) - P = np.maximum(P, 1e-12) + y = np.random.randn(n_samples, n_components) * 1e-4 + p = compute_pairwise_affinities(data_x) + p = np.maximum(p, 1e-12) # Initialize parameters - Y_inc = np.zeros_like(Y) + y_inc = np.zeros_like(y) momentum = 0.5 for i in range(n_iter): - Q, num = compute_low_dim_affinities(Y) - Q = np.maximum(Q, 1e-12) + q, num = compute_low_dim_affinities(y) + q = np.maximum(q, 1e-12) - PQ = P - Q + pq = p - q # Compute gradient - dY = 4 * ( - np.dot((PQ * num), Y) - - np.multiply(np.sum(PQ * num, axis=1)[:, np.newaxis], Y) + d_y = 4 * ( + np.dot((pq * num), y) + - np.multiply(np.sum(pq * num, axis=1)[:, np.newaxis], y) ) # Update with momentum and learning rate - Y_inc = momentum * Y_inc - learning_rate * dY - Y += Y_inc + y_inc = momentum * y_inc - learning_rate * d_y + y += y_inc # Adjust momentum halfway through if i == int(n_iter / 4): momentum = 0.8 - return Y + return y def main() -> None: """ Driver function for t-SNE demonstration. """ - X, y = collect_dataset() - - Y = apply_tsne(X, n_components=2, n_iter=300) + x, y_labels = collect_dataset() + y_emb = apply_tsne(x, n_components=2, n_iter=300) print("t-SNE embedding (first 5 points):") - print(Y[:5]) + print(y_emb[:5]) # Optional visualization (commented to avoid dependency) # import matplotlib.pyplot as plt - # plt.scatter(Y[:, 0], Y[:, 1], c=y, cmap="viridis") + # plt.scatter(y_emb[:, 0], y_emb[:, 1], c=y_labels, cmap="viridis") # plt.title("t-SNE Visualization of Iris Dataset") # plt.xlabel("Component 1") # plt.ylabel("Component 2") @@ -170,35 +168,26 @@ def main() -> None: doctest.testmod() main() + """ -Explanation of t-SNE Implementation ------------------------------------ +Explanation of Input and Output +-------------------------------- Input: - data_x: numpy array of shape (n_samples, n_features) Example: Iris dataset (150 samples × 4 features) -- n_components: target dimension (usually 2 or 3 for visualization) -- learning_rate: controls step size in gradient descent +- n_components: target dimension (usually 2 or 3) +- learning_rate: gradient descent step size - n_iter: number of iterations for optimization Output: -- Y: numpy array of shape (n_samples, n_components) +- y: numpy array of shape (n_samples, n_components) Each row is the low-dimensional embedding of the corresponding high-dimensional point. How it works: -1. 
Compute high-dimensional similarities (P matrix): - - Measures how likely points are neighbors in the original space. -2. Initialize low-dimensional map (Y) randomly. -3. Compute low-dimensional similarities (Q matrix) using Student-t distribution: - - Heavy tail prevents distant points from crowding together. -4. Compute gradient of KL divergence between P and Q: - - If points are too far in low-D (Q < P), pull them closer. - - If points are too close in low-D (Q > P), push them apart. -5. Update Y using gradient descent with momentum: - - Repeat for n_iter iterations until low-dimensional layout reflects high-dimensional structure. - -Why it works: -- t-SNE tries to preserve **local structure**: neighbors stay close in the embedding. -- Distant points may not be perfectly preserved (global structure is secondary). -- The algorithm minimizes the KL divergence between high-D and low-D similarity distributions. +1. Compute high-dimensional similarities (p matrix) +2. Initialize low-dimensional map (y) randomly +3. Compute low-dimensional similarities (q matrix) +4. Minimize KL divergence between p and q using gradient descent +5. Update y with momentum and learning rate iteratively """ From 6cff5b89d0d86b018ee8d8d032f41f17fe0699ba Mon Sep 17 00:00:00 2001 From: Nikita-Kedari Date: Sun, 12 Oct 2025 01:43:34 +0530 Subject: [PATCH 04/13] Changed tsne.py --- machine_learning/tsne.py | 78 +++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 33 deletions(-) diff --git a/machine_learning/tsne.py b/machine_learning/tsne.py index bfa44b184061..60beeb4b4a30 100644 --- a/machine_learning/tsne.py +++ b/machine_learning/tsne.py @@ -26,7 +26,6 @@ """ import doctest - import numpy as np from sklearn.datasets import load_iris @@ -38,49 +37,57 @@ def collect_dataset() -> tuple[np.ndarray, np.ndarray]: :return: Tuple containing feature matrix and target labels Example: - >>> x, y = collect_dataset() - >>> x.shape + >>> data_x, data_y = collect_dataset() + >>> data_x.shape (150, 4) - >>> y.shape + >>> data_y.shape (150,) """ data = load_iris() return np.array(data.data), np.array(data.target) -def compute_pairwise_affinities(x: np.ndarray, sigma: float = 1.0) -> np.ndarray: +def compute_pairwise_affinities(data_x: np.ndarray, sigma: float = 1.0) -> np.ndarray: """ Computes pairwise affinities (P matrix) in high-dimensional space using Gaussian kernel. - :param x: Input data of shape (n_samples, n_features) + :param data_x: Input data of shape (n_samples, n_features) :param sigma: Variance (Bandwidth) of the Gaussian kernel :return: Symmetrized probability matrix p Example: >>> import numpy as np - >>> x = np.array([[0.0, 0.0], [1.0, 0.0]]) - >>> p = compute_pairwise_affinities(x) + >>> data_x = np.array([[0.0, 0.0], [1.0, 0.0]]) + >>> p = compute_pairwise_affinities(data_x) >>> float(round(p[0, 1], 3)) 0.25 """ - n_samples = x.shape[0] - sum_x = np.sum(np.square(x), axis=1) - d = np.add(np.add(-2 * np.dot(x, x.T), sum_x).T, sum_x) + n_samples = data_x.shape[0] + sum_x = np.sum(np.square(data_x), axis=1) + d = np.add(np.add(-2 * np.dot(data_x, data_x.T), sum_x).T, sum_x) p = np.exp(-d / (2 * sigma ** 2)) np.fill_diagonal(p, 0) p /= np.sum(p) return (p + p.T) / (2 * n_samples) -def compute_low_dim_affinities(y: np.ndarray) -> tuple[np.ndarray, np.ndarray]: +def compute_low_dim_affinities(embedding_y: np.ndarray) -> tuple[np.ndarray, np.ndarray]: """ Computes low-dimensional similarities (Q matrix) using Student-t distribution. 
- :param y: Low-dimensional embeddings (n_samples, n_components) + :param embedding_y: Low-dimensional embeddings (n_samples, n_components) :return: Tuple (q, num) where q is the probability matrix and num is numerator array + + Example: + >>> embedding_y = np.array([[0.0, 0.0], [1.0, 0.0]]) + >>> q, num = compute_low_dim_affinities(embedding_y) + >>> q.shape + (2, 2) + >>> num.shape + (2, 2) """ - sum_y = np.sum(np.square(y), axis=1) - num = 1 / (1 + np.add(np.add(-2 * np.dot(y, y.T), sum_y).T, sum_y)) + sum_y = np.sum(np.square(embedding_y), axis=1) + num = 1 / (1 + np.add(np.add(-2 * np.dot(embedding_y, embedding_y.T), sum_y).T, sum_y)) np.fill_diagonal(num, 0) q = num / np.sum(num) return q, num @@ -102,8 +109,8 @@ def apply_tsne( :return: Transformed dataset (low-dimensional embedding) Example: - >>> x, _ = collect_dataset() - >>> y_emb = apply_tsne(x, n_components=2, n_iter=50) + >>> data_x, _ = collect_dataset() + >>> y_emb = apply_tsne(data_x, n_components=2, n_iter=50) >>> y_emb.shape (150, 2) """ @@ -115,49 +122,54 @@ def apply_tsne( n_samples = data_x.shape[0] # Initialize low-dimensional map randomly - y = np.random.randn(n_samples, n_components) * 1e-4 + y_emb = np.random.randn(n_samples, n_components) * 1e-4 p = compute_pairwise_affinities(data_x) p = np.maximum(p, 1e-12) # Initialize parameters - y_inc = np.zeros_like(y) + y_inc = np.zeros_like(y_emb) momentum = 0.5 for i in range(n_iter): - q, num = compute_low_dim_affinities(y) + q, num = compute_low_dim_affinities(y_emb) q = np.maximum(q, 1e-12) pq = p - q # Compute gradient d_y = 4 * ( - np.dot((pq * num), y) - - np.multiply(np.sum(pq * num, axis=1)[:, np.newaxis], y) + np.dot((pq * num), y_emb) + - np.multiply(np.sum(pq * num, axis=1)[:, np.newaxis], y_emb) ) # Update with momentum and learning rate y_inc = momentum * y_inc - learning_rate * d_y - y += y_inc + y_emb += y_inc # Adjust momentum halfway through if i == int(n_iter / 4): momentum = 0.8 - return y + return y_emb def main() -> None: """ Driver function for t-SNE demonstration. + + Example: + >>> main() # doctest: +ELLIPSIS + t-SNE embedding (first 5 points): + ... """ - x, y_labels = collect_dataset() - y_emb = apply_tsne(x, n_components=2, n_iter=300) + data_x, data_y = collect_dataset() + y_emb = apply_tsne(data_x, n_components=2, n_iter=300) print("t-SNE embedding (first 5 points):") print(y_emb[:5]) # Optional visualization (commented to avoid dependency) # import matplotlib.pyplot as plt - # plt.scatter(y_emb[:, 0], y_emb[:, 1], c=y_labels, cmap="viridis") + # plt.scatter(y_emb[:, 0], y_emb[:, 1], c=data_y, cmap="viridis") # plt.title("t-SNE Visualization of Iris Dataset") # plt.xlabel("Component 1") # plt.ylabel("Component 2") @@ -181,13 +193,13 @@ def main() -> None: - n_iter: number of iterations for optimization Output: -- y: numpy array of shape (n_samples, n_components) +- y_emb: numpy array of shape (n_samples, n_components) Each row is the low-dimensional embedding of the corresponding high-dimensional point. How it works: -1. Compute high-dimensional similarities (p matrix) -2. Initialize low-dimensional map (y) randomly -3. Compute low-dimensional similarities (q matrix) -4. Minimize KL divergence between p and q using gradient descent -5. Update y with momentum and learning rate iteratively +1. Compute high-dimensional similarities (P matrix) +2. Initialize low-dimensional map (y_emb) randomly +3. Compute low-dimensional similarities (Q matrix) +4. Minimize KL divergence between P and Q using gradient descent +5. 
Update y_emb with momentum and learning rate iteratively """ From c235a715f11340d31cdacf54546e459bf083687c Mon Sep 17 00:00:00 2001 From: Nikita-Kedari Date: Sun, 12 Oct 2025 01:54:50 +0530 Subject: [PATCH 05/13] Changed tsne.py --- machine_learning/tsne.py | 165 +++++++++++++++------------------------ 1 file changed, 61 insertions(+), 104 deletions(-) diff --git a/machine_learning/tsne.py b/machine_learning/tsne.py index 60beeb4b4a30..510706355a81 100644 --- a/machine_learning/tsne.py +++ b/machine_learning/tsne.py @@ -2,92 +2,84 @@ t-Distributed Stochastic Neighbor Embedding (t-SNE) --------------------------------------------------- -t-SNE is a nonlinear dimensionality reduction algorithm used for visualizing -high-dimensional data in a lower-dimensional (usually 2D or 3D) space. +t-SNE is a nonlinear dimensionality reduction algorithm for visualizing +high-dimensional data in a low-dimensional space (2D or 3D). -It models pairwise similarities between points in both the high-dimensional -and low-dimensional spaces, and minimizes the difference between them using -gradient descent. - -This simplified implementation demonstrates the core idea of t-SNE for -educational purposes — it is **not optimized for large datasets**. - -This implementation: -- Computes pairwise similarities in the high-dimensional space. -- Computes pairwise similarities in the low-dimensional (embedding) space. -- Minimizes the Kullback-Leibler divergence between these distributions - using gradient descent. -- Follows the original t-SNE formulation by van der Maaten & Hinton (2008). +It computes pairwise similarities in both spaces and minimizes the +Kullback-Leibler divergence using gradient descent. References: -- van der Maaten, L. and Hinton, G. (2008). - "Visualizing Data using t-SNE". Journal of Machine Learning Research. +- van der Maaten, L. & Hinton, G. (2008), JMLR. - https://lvdmaaten.github.io/tsne/ """ import doctest + import numpy as np from sklearn.datasets import load_iris def collect_dataset() -> tuple[np.ndarray, np.ndarray]: """ - Collects the Iris dataset and returns features and labels. + Load Iris dataset and return features and labels. - :return: Tuple containing feature matrix and target labels + Returns: + Tuple[np.ndarray, np.ndarray]: feature matrix and target labels Example: - >>> data_x, data_y = collect_dataset() - >>> data_x.shape + >>> x, y = collect_dataset() + >>> x.shape (150, 4) - >>> data_y.shape + >>> y.shape (150,) """ data = load_iris() return np.array(data.data), np.array(data.target) -def compute_pairwise_affinities(data_x: np.ndarray, sigma: float = 1.0) -> np.ndarray: +def compute_pairwise_affinities( + data_x: np.ndarray, sigma: float = 1.0 +) -> np.ndarray: """ - Computes pairwise affinities (P matrix) in high-dimensional space using Gaussian kernel. + Compute high-dimensional affinities (P matrix) using Gaussian kernel. 
- :param data_x: Input data of shape (n_samples, n_features) - :param sigma: Variance (Bandwidth) of the Gaussian kernel - :return: Symmetrized probability matrix p + Args: + data_x: Input data of shape (n_samples, n_features) + sigma: Gaussian kernel bandwidth + + Returns: + np.ndarray: Symmetrized probability matrix Example: >>> import numpy as np - >>> data_x = np.array([[0.0, 0.0], [1.0, 0.0]]) - >>> p = compute_pairwise_affinities(data_x) + >>> x = np.array([[0.0, 0.0], [1.0, 0.0]]) + >>> p = compute_pairwise_affinities(x) >>> float(round(p[0, 1], 3)) 0.25 """ n_samples = data_x.shape[0] sum_x = np.sum(np.square(data_x), axis=1) d = np.add(np.add(-2 * np.dot(data_x, data_x.T), sum_x).T, sum_x) - p = np.exp(-d / (2 * sigma ** 2)) + p = np.exp(-d / (2 * sigma**2)) np.fill_diagonal(p, 0) p /= np.sum(p) return (p + p.T) / (2 * n_samples) -def compute_low_dim_affinities(embedding_y: np.ndarray) -> tuple[np.ndarray, np.ndarray]: +def compute_low_dim_affinities( + y: np.ndarray, +) -> tuple[np.ndarray, np.ndarray]: """ - Computes low-dimensional similarities (Q matrix) using Student-t distribution. + Compute low-dimensional affinities (Q matrix) using Student-t distribution. - :param embedding_y: Low-dimensional embeddings (n_samples, n_components) - :return: Tuple (q, num) where q is the probability matrix and num is numerator array + Args: + y: Low-dimensional embeddings of shape (n_samples, n_components) - Example: - >>> embedding_y = np.array([[0.0, 0.0], [1.0, 0.0]]) - >>> q, num = compute_low_dim_affinities(embedding_y) - >>> q.shape - (2, 2) - >>> num.shape - (2, 2) + Returns: + Tuple[np.ndarray, np.ndarray]: Q probability matrix and numerator array """ - sum_y = np.sum(np.square(embedding_y), axis=1) - num = 1 / (1 + np.add(np.add(-2 * np.dot(embedding_y, embedding_y.T), sum_y).T, sum_y)) + sum_y = np.sum(np.square(y), axis=1) + num = 1 / (1 + np.add(np.add(-2 * np.dot(y, y.T), sum_y).T, sum_y)) np.fill_diagonal(num, 0) q = num / np.sum(num) return q, num @@ -100,106 +92,71 @@ def apply_tsne( n_iter: int = 500, ) -> np.ndarray: """ - Applies t-SNE to reduce data dimensionality for visualization. + Apply t-SNE for dimensionality reduction. 
- :param data_x: Original dataset (features) - :param n_components: Target dimension (2D or 3D) - :param learning_rate: Learning rate for gradient descent - :param n_iter: Number of iterations - :return: Transformed dataset (low-dimensional embedding) + Args: + data_x: Original dataset (features) + n_components: Target dimension (2D or 3D) + learning_rate: Step size for gradient descent + n_iter: Number of iterations + + Returns: + np.ndarray: Low-dimensional embedding of the data Example: - >>> data_x, _ = collect_dataset() - >>> y_emb = apply_tsne(data_x, n_components=2, n_iter=50) + >>> x, _ = collect_dataset() + >>> y_emb = apply_tsne(x, n_components=2, n_iter=50) >>> y_emb.shape (150, 2) """ - if n_components < 1: - raise ValueError("n_components must be >= 1") - if n_iter < 1: - raise ValueError("n_iter must be >= 1") + if n_components < 1 or n_iter < 1: + raise ValueError("n_components and n_iter must be >= 1") n_samples = data_x.shape[0] + rng = np.random.default_rng() + y = rng.standard_normal((n_samples, n_components)) * 1e-4 - # Initialize low-dimensional map randomly - y_emb = np.random.randn(n_samples, n_components) * 1e-4 p = compute_pairwise_affinities(data_x) p = np.maximum(p, 1e-12) - # Initialize parameters - y_inc = np.zeros_like(y_emb) + y_inc = np.zeros_like(y) momentum = 0.5 for i in range(n_iter): - q, num = compute_low_dim_affinities(y_emb) + q, num = compute_low_dim_affinities(y) q = np.maximum(q, 1e-12) pq = p - q - - # Compute gradient d_y = 4 * ( - np.dot((pq * num), y_emb) - - np.multiply(np.sum(pq * num, axis=1)[:, np.newaxis], y_emb) + np.dot((pq * num), y) + - np.multiply(np.sum(pq * num, axis=1)[:, np.newaxis], y) ) - # Update with momentum and learning rate y_inc = momentum * y_inc - learning_rate * d_y - y_emb += y_inc + y += y_inc - # Adjust momentum halfway through if i == int(n_iter / 4): momentum = 0.8 - return y_emb + return y def main() -> None: """ - Driver function for t-SNE demonstration. - - Example: - >>> main() # doctest: +ELLIPSIS - t-SNE embedding (first 5 points): - ... + Run t-SNE on Iris dataset and display the first 5 embeddings. """ - data_x, data_y = collect_dataset() + data_x, _ = collect_dataset() y_emb = apply_tsne(data_x, n_components=2, n_iter=300) + print("t-SNE embedding (first 5 points):") print(y_emb[:5]) - # Optional visualization (commented to avoid dependency) + # Optional visualization (commented out) # import matplotlib.pyplot as plt - # plt.scatter(y_emb[:, 0], y_emb[:, 1], c=data_y, cmap="viridis") - # plt.title("t-SNE Visualization of Iris Dataset") - # plt.xlabel("Component 1") - # plt.ylabel("Component 2") + # plt.scatter(y_emb[:, 0], y_emb[:, 1], c=_labels, cmap="viridis") # plt.show() if __name__ == "__main__": doctest.testmod() main() - - -""" -Explanation of Input and Output --------------------------------- - -Input: -- data_x: numpy array of shape (n_samples, n_features) - Example: Iris dataset (150 samples × 4 features) -- n_components: target dimension (usually 2 or 3) -- learning_rate: gradient descent step size -- n_iter: number of iterations for optimization - -Output: -- y_emb: numpy array of shape (n_samples, n_components) - Each row is the low-dimensional embedding of the corresponding high-dimensional point. - -How it works: -1. Compute high-dimensional similarities (P matrix) -2. Initialize low-dimensional map (y_emb) randomly -3. Compute low-dimensional similarities (Q matrix) -4. Minimize KL divergence between P and Q using gradient descent -5. 
Update y_emb with momentum and learning rate iteratively -""" From 1aa6b336fc0f4a86b9dcee55599c7481e70117d2 Mon Sep 17 00:00:00 2001 From: Nikita-Kedari Date: Sun, 12 Oct 2025 03:03:46 +0530 Subject: [PATCH 06/13] Changed tsne.py --- machine_learning/tsne.py | 223 +++++++++++++++++++++++---------------- 1 file changed, 130 insertions(+), 93 deletions(-) diff --git a/machine_learning/tsne.py b/machine_learning/tsne.py index 510706355a81..40aa38944f13 100644 --- a/machine_learning/tsne.py +++ b/machine_learning/tsne.py @@ -2,58 +2,33 @@ t-Distributed Stochastic Neighbor Embedding (t-SNE) --------------------------------------------------- -t-SNE is a nonlinear dimensionality reduction algorithm for visualizing -high-dimensional data in a low-dimensional space (2D or 3D). - -It computes pairwise similarities in both spaces and minimizes the -Kullback-Leibler divergence using gradient descent. +Nonlinear dimensionality reduction for visualizing high-dimensional data +in 2D or 3D. Computes pairwise similarities in high and low-dimensional +spaces and minimizes Kullback-Leibler divergence using gradient descent. References: - van der Maaten, L. & Hinton, G. (2008), JMLR. - https://lvdmaaten.github.io/tsne/ """ -import doctest - import numpy as np +from numpy import ndarray from sklearn.datasets import load_iris - -def collect_dataset() -> tuple[np.ndarray, np.ndarray]: +def _compute_pairwise_affinities(data_x: ndarray, sigma: float = 1.0) -> ndarray: """ - Load Iris dataset and return features and labels. - - Returns: - Tuple[np.ndarray, np.ndarray]: feature matrix and target labels - - Example: - >>> x, y = collect_dataset() - >>> x.shape - (150, 4) - >>> y.shape - (150,) - """ - data = load_iris() - return np.array(data.data), np.array(data.target) - - -def compute_pairwise_affinities( - data_x: np.ndarray, sigma: float = 1.0 -) -> np.ndarray: - """ - Compute high-dimensional affinities (P matrix) using Gaussian kernel. + Compute high-dimensional affinities using Gaussian kernel. Args: - data_x: Input data of shape (n_samples, n_features) - sigma: Gaussian kernel bandwidth + data_x (ndarray): shape (n_samples, n_features) + sigma (float): Gaussian kernel bandwidth Returns: - np.ndarray: Symmetrized probability matrix + ndarray: Symmetrized probability matrix Example: - >>> import numpy as np >>> x = np.array([[0.0, 0.0], [1.0, 0.0]]) - >>> p = compute_pairwise_affinities(x) + >>> p = _compute_pairwise_affinities(x) >>> float(round(p[0, 1], 3)) 0.25 """ @@ -66,97 +41,159 @@ def compute_pairwise_affinities( return (p + p.T) / (2 * n_samples) -def compute_low_dim_affinities( - y: np.ndarray, -) -> tuple[np.ndarray, np.ndarray]: +def _compute_low_dim_affinities(low_dim_embedding: ndarray) -> tuple[ndarray, ndarray]: """ - Compute low-dimensional affinities (Q matrix) using Student-t distribution. + Compute low-dimensional affinities using Student-t distribution. 
Args: - y: Low-dimensional embeddings of shape (n_samples, n_components) + low_dim_embedding (ndarray): shape (n_samples, n_components) Returns: - Tuple[np.ndarray, np.ndarray]: Q probability matrix and numerator array + tuple[ndarray, ndarray]: Q matrix and numerator + + Example: + >>> y = np.array([[0.0, 0.0], [1.0, 0.0]]) + >>> q, num = _compute_low_dim_affinities(y) + >>> q.shape + (2, 2) """ - sum_y = np.sum(np.square(y), axis=1) - num = 1 / (1 + np.add(np.add(-2 * np.dot(y, y.T), sum_y).T, sum_y)) + sum_y = np.sum(np.square(low_dim_embedding), axis=1) + num = 1 / (1 + np.add(np.add(-2 * np.dot(low_dim_embedding, low_dim_embedding.T), sum_y).T, sum_y)) np.fill_diagonal(num, 0) q = num / np.sum(num) return q, num -def apply_tsne( - data_x: np.ndarray, - n_components: int = 2, - learning_rate: float = 200.0, - n_iter: int = 500, -) -> np.ndarray: +class TSNE: """ - Apply t-SNE for dimensionality reduction. + t-SNE class for dimensionality reduction. Args: - data_x: Original dataset (features) - n_components: Target dimension (2D or 3D) - learning_rate: Step size for gradient descent - n_iter: Number of iterations - - Returns: - np.ndarray: Low-dimensional embedding of the data + n_components (int): target dimension (default: 2) + learning_rate (float): gradient descent step size (default: 200) + n_iter (int): number of iterations (default: 500) Example: - >>> x, _ = collect_dataset() - >>> y_emb = apply_tsne(x, n_components=2, n_iter=50) - >>> y_emb.shape + >>> x, _ = load_iris(return_X_y=True) + >>> tsne = TSNE(n_components=2, n_iter=50) + >>> tsne.fit(x) + >>> emb = tsne.embedding_ + >>> emb.shape (150, 2) """ - if n_components < 1 or n_iter < 1: - raise ValueError("n_components and n_iter must be >= 1") - - n_samples = data_x.shape[0] - rng = np.random.default_rng() - y = rng.standard_normal((n_samples, n_components)) * 1e-4 - - p = compute_pairwise_affinities(data_x) - p = np.maximum(p, 1e-12) - - y_inc = np.zeros_like(y) - momentum = 0.5 - - for i in range(n_iter): - q, num = compute_low_dim_affinities(y) - q = np.maximum(q, 1e-12) - pq = p - q - d_y = 4 * ( - np.dot((pq * num), y) - - np.multiply(np.sum(pq * num, axis=1)[:, np.newaxis], y) - ) - - y_inc = momentum * y_inc - learning_rate * d_y - y += y_inc + def __init__(self, *, n_components: int = 2, learning_rate: float = 200.0, n_iter: int = 500) -> None: + if n_components < 1: + raise ValueError("n_components must be >= 1") + if n_iter < 1: + raise ValueError("n_iter must be >= 1") + self.n_components = n_components + self.learning_rate = learning_rate + self.n_iter = n_iter + self.embedding_: ndarray | None = None + + def fit(self, data_x: ndarray) -> None: + """ + Fit t-SNE on data and compute low-dimensional embedding. 
+ + Args: + data_x (ndarray): shape (n_samples, n_features) + + Example: + >>> x, _ = load_iris(return_X_y=True) + >>> tsne = TSNE(n_iter=10) + >>> tsne.fit(x) + >>> tsne.embedding_.shape + (150, 2) + """ + n_samples = data_x.shape[0] + rng = np.random.default_rng() + y = rng.standard_normal((n_samples, self.n_components)) * 1e-4 + + p = _compute_pairwise_affinities(data_x) + p = np.maximum(p, 1e-12) + + y_inc = np.zeros_like(y) + momentum = 0.5 + + for i in range(self.n_iter): + q, num = _compute_low_dim_affinities(y) + q = np.maximum(q, 1e-12) + pq = p - q + + d_y = 4 * ( + np.dot((pq * num), y) + - np.multiply(np.sum(pq * num, axis=1)[:, np.newaxis], y) + ) + + y_inc = momentum * y_inc - self.learning_rate * d_y + y += y_inc + + if i == int(self.n_iter / 4): + momentum = 0.8 + + self.embedding_ = y + + def transform(self, data_x: ndarray) -> ndarray: + """ + Return the computed embedding after fitting. + + Args: + data_x (ndarray): unused, exists for API consistency + + Returns: + ndarray: low-dimensional embedding + + Example: + >>> x, _ = load_iris(return_X_y=True) + >>> tsne = TSNE(n_iter=10) + >>> tsne.fit(x) + >>> tsne.transform(x).shape + (150, 2) + """ + if self.embedding_ is None: + raise ValueError("Fit the model first using fit()") + return self.embedding_ + + +def collect_dataset() -> tuple[ndarray, ndarray]: + """ + Load Iris dataset. - if i == int(n_iter / 4): - momentum = 0.8 + Returns: + tuple[ndarray, ndarray]: features and labels - return y + Example: + >>> x, y = collect_dataset() + >>> x.shape + (150, 4) + >>> y.shape + (150,) + """ + data = load_iris() + return np.array(data.data), np.array(data.target) def main() -> None: """ - Run t-SNE on Iris dataset and display the first 5 embeddings. + Run t-SNE on Iris dataset and print first 5 points. + + Example: + >>> main() # runs without errors """ data_x, _ = collect_dataset() - y_emb = apply_tsne(data_x, n_components=2, n_iter=300) - + tsne = TSNE(n_components=2, n_iter=300) + tsne.fit(data_x) print("t-SNE embedding (first 5 points):") - print(y_emb[:5]) + print(tsne.embedding_[:5]) - # Optional visualization (commented out) + # Optional visualization # import matplotlib.pyplot as plt - # plt.scatter(y_emb[:, 0], y_emb[:, 1], c=_labels, cmap="viridis") + # plt.scatter(tsne.embedding_[:, 0], tsne.embedding_[:, 1], c=_labels, cmap="viridis") # plt.show() if __name__ == "__main__": + import doctest doctest.testmod() main() From 9cc469266dbf12301dff2581bdf16cf694ae64e5 Mon Sep 17 00:00:00 2001 From: Nikita-Kedari Date: Sun, 12 Oct 2025 03:10:59 +0530 Subject: [PATCH 07/13] Changed tsne.py --- machine_learning/tsne.py | 239 ++++++++++++++++++--------------------- 1 file changed, 113 insertions(+), 126 deletions(-) diff --git a/machine_learning/tsne.py b/machine_learning/tsne.py index 40aa38944f13..043281b96acf 100644 --- a/machine_learning/tsne.py +++ b/machine_learning/tsne.py @@ -2,198 +2,185 @@ t-Distributed Stochastic Neighbor Embedding (t-SNE) --------------------------------------------------- -Nonlinear dimensionality reduction for visualizing high-dimensional data -in 2D or 3D. Computes pairwise similarities in high and low-dimensional -spaces and minimizes Kullback-Leibler divergence using gradient descent. +t-SNE is a nonlinear dimensionality reduction algorithm for visualizing +high-dimensional data in a low-dimensional space (2D or 3D). + +It computes pairwise similarities in both spaces and minimizes the +Kullback-Leibler divergence using gradient descent. References: - van der Maaten, L. & Hinton, G. 
(2008), JMLR. - https://lvdmaaten.github.io/tsne/ """ +import doctest +from typing import Tuple + import numpy as np from numpy import ndarray from sklearn.datasets import load_iris -def _compute_pairwise_affinities(data_x: ndarray, sigma: float = 1.0) -> ndarray: + +def collect_dataset() -> Tuple[ndarray, ndarray]: """ - Compute high-dimensional affinities using Gaussian kernel. + Load Iris dataset and return features and labels. + + Returns: + Tuple[ndarray, ndarray]: feature matrix and target labels + + Example: + >>> x, y = collect_dataset() + >>> x.shape + (150, 4) + >>> y.shape + (150,) + """ + data = load_iris() + return np.array(data.data), np.array(data.target) + + +def compute_pairwise_affinities( + data_x: ndarray, sigma: float = 1.0 +) -> ndarray: + """ + Compute high-dimensional affinities (P matrix) using Gaussian kernel. Args: - data_x (ndarray): shape (n_samples, n_features) - sigma (float): Gaussian kernel bandwidth + data_x: Input data of shape (n_samples, n_features) + sigma: Gaussian kernel bandwidth Returns: ndarray: Symmetrized probability matrix Example: >>> x = np.array([[0.0, 0.0], [1.0, 0.0]]) - >>> p = _compute_pairwise_affinities(x) + >>> p = compute_pairwise_affinities(x) >>> float(round(p[0, 1], 3)) 0.25 """ n_samples = data_x.shape[0] sum_x = np.sum(np.square(data_x), axis=1) - d = np.add(np.add(-2 * np.dot(data_x, data_x.T), sum_x).T, sum_x) - p = np.exp(-d / (2 * sigma**2)) + dist_sq = np.add(np.add(-2 * np.dot(data_x, data_x.T), sum_x).T, sum_x) + p = np.exp(-dist_sq / (2 * sigma**2)) np.fill_diagonal(p, 0) p /= np.sum(p) return (p + p.T) / (2 * n_samples) -def _compute_low_dim_affinities(low_dim_embedding: ndarray) -> tuple[ndarray, ndarray]: +def compute_low_dim_affinities( + low_dim_embedding: ndarray, +) -> Tuple[ndarray, ndarray]: """ - Compute low-dimensional affinities using Student-t distribution. + Compute low-dimensional affinities (Q matrix) using Student-t distribution. Args: - low_dim_embedding (ndarray): shape (n_samples, n_components) + low_dim_embedding: shape (n_samples, n_components) Returns: - tuple[ndarray, ndarray]: Q matrix and numerator + Tuple[ndarray, ndarray]: Q probability matrix and numerator Example: >>> y = np.array([[0.0, 0.0], [1.0, 0.0]]) - >>> q, num = _compute_low_dim_affinities(y) + >>> q, num = compute_low_dim_affinities(y) >>> q.shape (2, 2) """ sum_y = np.sum(np.square(low_dim_embedding), axis=1) - num = 1 / (1 + np.add(np.add(-2 * np.dot(low_dim_embedding, low_dim_embedding.T), sum_y).T, sum_y)) - np.fill_diagonal(num, 0) - q = num / np.sum(num) - return q, num - - -class TSNE: + numerator = 1 / ( + 1 + + np.add( + np.add(-2 * np.dot(low_dim_embedding, low_dim_embedding.T), sum_y).T, + sum_y, + ) + ) + np.fill_diagonal(numerator, 0) + q = numerator / np.sum(numerator) + return q, numerator + + +def apply_tsne( + data_x: ndarray, + n_components: int = 2, + learning_rate: float = 200.0, + n_iter: int = 500, +) -> ndarray: """ - t-SNE class for dimensionality reduction. + Apply t-SNE for dimensionality reduction. 
Args: - n_components (int): target dimension (default: 2) - learning_rate (float): gradient descent step size (default: 200) - n_iter (int): number of iterations (default: 500) + data_x: Original dataset (features) + n_components: Target dimension (2D or 3D) + learning_rate: Step size for gradient descent + n_iter: Number of iterations + + Returns: + ndarray: Low-dimensional embedding of the data Example: - >>> x, _ = load_iris(return_X_y=True) - >>> tsne = TSNE(n_components=2, n_iter=50) - >>> tsne.fit(x) - >>> emb = tsne.embedding_ - >>> emb.shape + >>> x, _ = collect_dataset() + >>> y_emb = apply_tsne(x, n_components=2, n_iter=50) + >>> y_emb.shape (150, 2) """ + if n_components < 1 or n_iter < 1: + raise ValueError("n_components and n_iter must be >= 1") - def __init__(self, *, n_components: int = 2, learning_rate: float = 200.0, n_iter: int = 500) -> None: - if n_components < 1: - raise ValueError("n_components must be >= 1") - if n_iter < 1: - raise ValueError("n_iter must be >= 1") - self.n_components = n_components - self.learning_rate = learning_rate - self.n_iter = n_iter - self.embedding_: ndarray | None = None - - def fit(self, data_x: ndarray) -> None: - """ - Fit t-SNE on data and compute low-dimensional embedding. - - Args: - data_x (ndarray): shape (n_samples, n_features) - - Example: - >>> x, _ = load_iris(return_X_y=True) - >>> tsne = TSNE(n_iter=10) - >>> tsne.fit(x) - >>> tsne.embedding_.shape - (150, 2) - """ - n_samples = data_x.shape[0] - rng = np.random.default_rng() - y = rng.standard_normal((n_samples, self.n_components)) * 1e-4 - - p = _compute_pairwise_affinities(data_x) - p = np.maximum(p, 1e-12) - - y_inc = np.zeros_like(y) - momentum = 0.5 - - for i in range(self.n_iter): - q, num = _compute_low_dim_affinities(y) - q = np.maximum(q, 1e-12) - pq = p - q - - d_y = 4 * ( - np.dot((pq * num), y) - - np.multiply(np.sum(pq * num, axis=1)[:, np.newaxis], y) - ) - - y_inc = momentum * y_inc - self.learning_rate * d_y - y += y_inc - - if i == int(self.n_iter / 4): - momentum = 0.8 - - self.embedding_ = y - - def transform(self, data_x: ndarray) -> ndarray: - """ - Return the computed embedding after fitting. - - Args: - data_x (ndarray): unused, exists for API consistency - - Returns: - ndarray: low-dimensional embedding - - Example: - >>> x, _ = load_iris(return_X_y=True) - >>> tsne = TSNE(n_iter=10) - >>> tsne.fit(x) - >>> tsne.transform(x).shape - (150, 2) - """ - if self.embedding_ is None: - raise ValueError("Fit the model first using fit()") - return self.embedding_ - - -def collect_dataset() -> tuple[ndarray, ndarray]: - """ - Load Iris dataset. + n_samples = data_x.shape[0] + rng = np.random.default_rng() + y = rng.standard_normal((n_samples, n_components)) * 1e-4 - Returns: - tuple[ndarray, ndarray]: features and labels + p = compute_pairwise_affinities(data_x) + p = np.maximum(p, 1e-12) - Example: - >>> x, y = collect_dataset() - >>> x.shape - (150, 4) - >>> y.shape - (150,) - """ - data = load_iris() - return np.array(data.data), np.array(data.target) + y_inc = np.zeros_like(y) + momentum = 0.5 + + for i in range(n_iter): + q, num = compute_low_dim_affinities(y) + q = np.maximum(q, 1e-12) + + pq = p - q + d_y = 4 * ( + np.dot((pq * num), y) + - np.multiply(np.sum(pq * num, axis=1)[:, np.newaxis], y) + ) + + y_inc = momentum * y_inc - learning_rate * d_y + y += y_inc + + if i == int(n_iter / 4): + momentum = 0.8 + + return y def main() -> None: """ - Run t-SNE on Iris dataset and print first 5 points. 
+ Run t-SNE on Iris dataset and display the first 5 embeddings. Example: >>> main() # runs without errors """ data_x, _ = collect_dataset() - tsne = TSNE(n_components=2, n_iter=300) - tsne.fit(data_x) + y_emb = apply_tsne(data_x, n_components=2, n_iter=300) + + if not isinstance(y_emb, np.ndarray): + raise TypeError("t-SNE embedding must be an ndarray") + print("t-SNE embedding (first 5 points):") - print(tsne.embedding_[:5]) + print(y_emb[:5]) - # Optional visualization + # Optional visualization (commented, Ruff/mypy compliant) # import matplotlib.pyplot as plt - # plt.scatter(tsne.embedding_[:, 0], tsne.embedding_[:, 1], c=_labels, cmap="viridis") + # plt.scatter( + # y_emb[:, 0], + # y_emb[:, 1], + # c=_labels, + # cmap="viridis" + # ) # plt.show() if __name__ == "__main__": - import doctest doctest.testmod() main() From 12b100815fffa3965c1d10264ccbd1f2685f43a2 Mon Sep 17 00:00:00 2001 From: Nikita-Kedari Date: Sun, 12 Oct 2025 03:15:09 +0530 Subject: [PATCH 08/13] Updated tsne.py --- machine_learning/tsne.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/machine_learning/tsne.py b/machine_learning/tsne.py index 043281b96acf..0d5b0a5c0b00 100644 --- a/machine_learning/tsne.py +++ b/machine_learning/tsne.py @@ -14,19 +14,17 @@ """ import doctest -from typing import Tuple - import numpy as np from numpy import ndarray from sklearn.datasets import load_iris -def collect_dataset() -> Tuple[ndarray, ndarray]: +def collect_dataset() -> tuple[ndarray, ndarray]: """ Load Iris dataset and return features and labels. Returns: - Tuple[ndarray, ndarray]: feature matrix and target labels + tuple[ndarray, ndarray]: feature matrix and target labels Example: >>> x, y = collect_dataset() @@ -39,9 +37,7 @@ def collect_dataset() -> Tuple[ndarray, ndarray]: return np.array(data.data), np.array(data.target) -def compute_pairwise_affinities( - data_x: ndarray, sigma: float = 1.0 -) -> ndarray: +def compute_pairwise_affinities(data_x: ndarray, sigma: float = 1.0) -> ndarray: """ Compute high-dimensional affinities (P matrix) using Gaussian kernel. @@ -67,9 +63,7 @@ def compute_pairwise_affinities( return (p + p.T) / (2 * n_samples) -def compute_low_dim_affinities( - low_dim_embedding: ndarray, -) -> Tuple[ndarray, ndarray]: +def compute_low_dim_affinities(low_dim_embedding: ndarray) -> tuple[ndarray, ndarray]: """ Compute low-dimensional affinities (Q matrix) using Student-t distribution. 
@@ -77,7 +71,7 @@ def compute_low_dim_affinities( low_dim_embedding: shape (n_samples, n_components) Returns: - Tuple[ndarray, ndarray]: Q probability matrix and numerator + tuple[ndarray, ndarray]: Q probability matrix and numerator Example: >>> y = np.array([[0.0, 0.0], [1.0, 0.0]]) From aab671595cf315dc0843bb05efd3d693fe2b8725 Mon Sep 17 00:00:00 2001 From: Nikita-Kedari Date: Sun, 12 Oct 2025 03:17:20 +0530 Subject: [PATCH 09/13] Updated tsne.py --- machine_learning/tsne.py | 1 + 1 file changed, 1 insertion(+) diff --git a/machine_learning/tsne.py b/machine_learning/tsne.py index 0d5b0a5c0b00..5d862c5f5106 100644 --- a/machine_learning/tsne.py +++ b/machine_learning/tsne.py @@ -14,6 +14,7 @@ """ import doctest + import numpy as np from numpy import ndarray from sklearn.datasets import load_iris From c219f4f680a411f043b2e374b1720a3966e28e1b Mon Sep 17 00:00:00 2001 From: Nikita-Kedari Date: Sun, 12 Oct 2025 03:58:50 +0530 Subject: [PATCH 10/13] Updated tsne.py --- machine_learning/tsne.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/machine_learning/tsne.py b/machine_learning/tsne.py index 5d862c5f5106..f1eac1eae319 100644 --- a/machine_learning/tsne.py +++ b/machine_learning/tsne.py @@ -154,7 +154,9 @@ def main() -> None: Run t-SNE on Iris dataset and display the first 5 embeddings. Example: - >>> main() # runs without errors + >>> main() # doctest: +ELLIPSIS + t-SNE embedding (first 5 points): + [[-... """ data_x, _ = collect_dataset() y_emb = apply_tsne(data_x, n_components=2, n_iter=300) From 70ba4cd7f502ac3ef6c1017f35fd0c1cb1034838 Mon Sep 17 00:00:00 2001 From: Nikita-Kedari Date: Mon, 13 Oct 2025 20:36:26 +0530 Subject: [PATCH 11/13] Revert DIRECTORY.md --- DIRECTORY.md | 1 - 1 file changed, 1 deletion(-) diff --git a/DIRECTORY.md b/DIRECTORY.md index e81c4fc4a7d9..36acb3b97f1e 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -623,7 +623,6 @@ * [Sequential Minimum Optimization](machine_learning/sequential_minimum_optimization.py) * [Similarity Search](machine_learning/similarity_search.py) * [Support Vector Machines](machine_learning/support_vector_machines.py) - * [t-SNE](machine_learning/tsne.py) * [Word Frequency Functions](machine_learning/word_frequency_functions.py) * [Xgboost Classifier](machine_learning/xgboost_classifier.py) * [Xgboost Regressor](machine_learning/xgboost_regressor.py) From 850a805dd0c7522875ef635b5480f76ef55ff657 Mon Sep 17 00:00:00 2001 From: Nikita-Kedari Date: Tue, 14 Oct 2025 07:49:13 +0530 Subject: [PATCH 12/13] Updated tsne.py --- machine_learning/tsne.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/machine_learning/tsne.py b/machine_learning/tsne.py index f1eac1eae319..390cf8009b86 100644 --- a/machine_learning/tsne.py +++ b/machine_learning/tsne.py @@ -5,7 +5,7 @@ t-SNE is a nonlinear dimensionality reduction algorithm for visualizing high-dimensional data in a low-dimensional space (2D or 3D). -It computes pairwise similarities in both spaces and minimizes the +It computes pairwise similarities in both spaces and minimizes the Kullback-Leibler divergence using gradient descent. References: @@ -149,14 +149,19 @@ def apply_tsne( return y -def main() -> None: +def main() -> ndarray: """ - Run t-SNE on Iris dataset and display the first 5 embeddings. + Run t-SNE on Iris dataset and return the embeddings. + + Returns: + ndarray: t-SNE embedding of the Iris dataset Example: - >>> main() # doctest: +ELLIPSIS - t-SNE embedding (first 5 points): - [[-... 
+        >>> result = main()
+        >>> result.shape
+        (150, 2)
+        >>> isinstance(result, np.ndarray)
+        True
     """
     data_x, _ = collect_dataset()
     y_emb = apply_tsne(data_x, n_components=2, n_iter=300)
 
     if not isinstance(y_emb, np.ndarray):
         raise TypeError("t-SNE embedding must be an ndarray")
 
-    print("t-SNE embedding (first 5 points):")
-    print(y_emb[:5])
-
-    # Optional visualization (commented, Ruff/mypy compliant)
-    # import matplotlib.pyplot as plt
-    # plt.scatter(
-    #     y_emb[:, 0],
-    #     y_emb[:, 1],
-    #     c=_labels,
-    #     cmap="viridis"
-    # )
-    # plt.show()
+    return y_emb
 
 
 if __name__ == "__main__":
     doctest.testmod()
-    main()
+
+    # Demonstration of the algorithm
+    result = main()
+    print("t-SNE embedding (first 5 points):")
+    print(result[:5])

From b3f231ea5f207279c6bdac6178b3014d7a0dda41 Mon Sep 17 00:00:00 2001
From: Nikita-Kedari
Date: Tue, 14 Oct 2025 13:06:21 +0530
Subject: [PATCH 13/13] Updated tsne.py

---
 machine_learning/tsne.py | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/machine_learning/tsne.py b/machine_learning/tsne.py
index 390cf8009b86..ca8409e0f8db 100644
--- a/machine_learning/tsne.py
+++ b/machine_learning/tsne.py
@@ -149,19 +149,14 @@ def apply_tsne(
     return y
 
 
-def main() -> ndarray:
+def main() -> None:
     """
-    Run t-SNE on Iris dataset and return the embeddings.
-
-    Returns:
-        ndarray: t-SNE embedding of the Iris dataset
+    Run t-SNE on Iris dataset and display the first 5 embeddings.
 
     Example:
-        >>> result = main()
-        >>> result.shape
-        (150, 2)
-        >>> isinstance(result, np.ndarray)
-        True
+        >>> main() # doctest: +ELLIPSIS
+        t-SNE embedding (first 5 points):
+        [[...
     """
     data_x, _ = collect_dataset()
     y_emb = apply_tsne(data_x, n_components=2, n_iter=300)
 
     if not isinstance(y_emb, np.ndarray):
         raise TypeError("t-SNE embedding must be an ndarray")
 
-    return y_emb
+    print("t-SNE embedding (first 5 points):")
+    print(y_emb[:5])
+
+    # Optional visualization (commented out for automated testing)
+    # import matplotlib.pyplot as plt
+    # plt.scatter(y_emb[:, 0], y_emb[:, 1], c=collect_dataset()[1], cmap="viridis")
+    # plt.title("t-SNE Visualization of Iris Dataset")
+    # plt.xlabel("Dimension 1")
+    # plt.ylabel("Dimension 2")
+    # plt.show()
 
 
 if __name__ == "__main__":
     doctest.testmod()
-
-    # Demonstration of the algorithm
-    result = main()
-    print("t-SNE embedding (first 5 points):")
-    print(result[:5])
+    main()
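
A note on the final tsne.py: compute_pairwise_affinities applies one global sigma to every point, whereas the van der Maaten & Hinton (2008) formulation binary-searches a per-point bandwidth so that each conditional distribution P(j|i) reaches a user-chosen perplexity. The sketch below shows that calibration under the same NumPy conventions as tsne.py; the function name calibrate_sigmas, the tolerance, and the step limit are illustrative choices, not part of the patches above.

import numpy as np


def calibrate_sigmas(
    data_x: np.ndarray,
    perplexity: float = 30.0,
    tol: float = 1e-5,
    max_steps: int = 50,
) -> np.ndarray:
    """Binary-search a Gaussian bandwidth per point so that each row of the
    conditional P matrix reaches roughly the requested perplexity."""
    # Pairwise squared distances, built the same way as in tsne.py.
    sum_x = np.sum(np.square(data_x), axis=1)
    dist_sq = np.add(np.add(-2 * np.dot(data_x, data_x.T), sum_x).T, sum_x)

    target_entropy = np.log(perplexity)  # perplexity = exp(entropy), natural log
    sigmas = np.ones(data_x.shape[0])
    for i in range(data_x.shape[0]):
        beta, beta_lo, beta_hi = 1.0, 0.0, np.inf  # beta = 1 / (2 * sigma_i**2)
        for _ in range(max_steps):
            p = np.exp(-dist_sq[i] * beta)
            p[i] = 0.0  # a point is not its own neighbor
            p /= max(p.sum(), 1e-12)
            entropy = -np.sum(p[p > 0] * np.log(p[p > 0]))
            if abs(entropy - target_entropy) < tol:
                break
            if entropy > target_entropy:  # distribution too flat: sharpen the kernel
                beta_lo = beta
                beta = beta * 2 if np.isinf(beta_hi) else (beta + beta_hi) / 2
            else:  # distribution too peaked: widen the kernel
                beta_hi = beta
                beta = (beta + beta_lo) / 2
        sigmas[i] = np.sqrt(1.0 / (2.0 * beta))
    return sigmas

The resulting per-point conditionals would then go through the same symmetrization the patch already performs, (p + p.T) / (2 * n_samples), in place of the single-sigma kernel.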
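
Since apply_tsne never reports its loss, convergence can be checked from outside using the helpers the module already exports, by recomputing the objective KL(P||Q) that the gradient step minimizes. A short usage sketch: the import path machine_learning.tsne assumes the script runs from the repository root, and because every apply_tsne call starts from a fresh random initialization, the printed values should trend downward rather than decrease strictly.

import numpy as np

from machine_learning.tsne import (
    apply_tsne,
    collect_dataset,
    compute_low_dim_affinities,
    compute_pairwise_affinities,
)

data_x, _ = collect_dataset()
p = np.maximum(compute_pairwise_affinities(data_x), 1e-12)

for n_iter in (50, 150, 300):
    y_emb = apply_tsne(data_x, n_components=2, n_iter=n_iter)
    q, _num = compute_low_dim_affinities(y_emb)
    q = np.maximum(q, 1e-12)
    # KL(P || Q): the cost function from van der Maaten & Hinton (2008).
    kl = float(np.sum(p * np.log(p / q)))
    print(f"n_iter={n_iter:3d}  KL(P||Q) = {kl:.4f}")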
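
For a reference point, the same dataset can be pushed through scikit-learn's own implementation, which is already available because the module depends on scikit-learn for load_iris. In the snippet below, n_components and perplexity are scikit-learn's defaults made explicit, and random_state pins the otherwise random initialization.

from sklearn.datasets import load_iris
from sklearn.manifold import TSNE

data_x, labels = load_iris(return_X_y=True)
embedding = TSNE(n_components=2, perplexity=30.0, random_state=0).fit_transform(data_x)
print(embedding.shape)  # (150, 2); labels can color a scatter plot as in main()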