# Dimensionality Reduction Tests
Tests the functionality of the `dim_reduction_tool` module.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_openml, make_swiss_roll, load_digits
from dim_reduction_tool import dim_reduction_tool

# Notebook-wide plot styling. The seaborn theme is applied first and the
# default figure size is set afterwards, in the same order as before.
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

## Datasets

In [None]:
# Digits dataset: 64 features per sample, with the digit class as the label.
digits = load_digits()
X_digits, y_digits = digits.data, digits.target
print(f"Digits dataset shape: {X_digits.shape}")

# Swiss roll: 3D points lying on a 2D manifold. y_swiss is used further down
# to colour the points by their position on the roll.
X_swiss, y_swiss = make_swiss_roll(n_samples=1000, noise=0.1, random_state=42)
print(f"Swiss roll dataset shape: {X_swiss.shape}")

# Convert to the format expected by dim_reduction_tool: a list of documents.
# In our plan every input document also has an "embedding" field holding its
# feature vector as a plain Python list.
digits_data = [
    {'id': idx, 'embedding': row.tolist()}
    for idx, row in enumerate(X_digits)
]
swiss_data = [
    {'id': idx, 'embedding': row.tolist()}
    for idx, row in enumerate(X_swiss)
]

# 3D view of the raw swiss roll. Transposing turns the three coordinate
# columns into three rows that unpack as the x, y and z arguments.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(*X_swiss.T, c=y_swiss, cmap='viridis')
ax.set_title('Swiss Roll Dataset')
plt.tight_layout()
plt.show()

## PCA Reduction

PCA is best suited to linear dimensionality reduction.

In [None]:
# Arguments shared by both PCA runs: read each document's "embedding" vector
# and reduce it to 2 components.
pca_params = {"feature_key": "embedding", "algorithm": "pca", "n_components": 2}
# Marker styling shared by both scatter plots in this cell.
point_style = {"cmap": "viridis", "alpha": 0.8}

# --- Digits ---
pca_digits = dim_reduction_tool(digits_data, **pca_params)
# Assumes the tool returns one row of 2 coordinates per input document, in
# input order, so rows line up with y_digits -- TODO confirm.
pca_digits_array = np.array(pca_digits)

plt.figure(figsize=(10, 8))
plt.scatter(
    pca_digits_array[:, 0],
    pca_digits_array[:, 1],
    c=y_digits,
    **point_style,
)
plt.colorbar(label='Digit')
plt.title('PCA Reduction of Digits Dataset (64D → 2D)')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.tight_layout()
plt.show()

# --- Swiss roll ---
pca_swiss = dim_reduction_tool(swiss_data, **pca_params)
pca_swiss_array = np.array(pca_swiss)

plt.figure(figsize=(10, 8))
plt.scatter(
    pca_swiss_array[:, 0],
    pca_swiss_array[:, 1],
    c=y_swiss,
    **point_style,
)
plt.colorbar(label='Position on Roll')
plt.title('PCA Reduction of Swiss Roll Dataset (3D → 2D)')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.tight_layout()
plt.show()

## t-SNE

- best for preserving local structure and visualization.

In [None]:
# Arguments shared by both t-SNE runs.
# NOTE(review): no seed is passed here, so the layout may differ between runs
# unless dim_reduction_tool fixes one internally -- confirm.
tsne_params = {
    "feature_key": "embedding",
    "algorithm": "tsne",
    "n_components": 2,
    "perplexity": 30,
    "learning_rate": 200,
}
# Marker styling shared by both scatter plots in this cell.
point_style = {"cmap": "viridis", "alpha": 0.8}

# --- Digits ---
tsne_digits = dim_reduction_tool(digits_data, **tsne_params)
# Assumes one row of 2 coordinates per input document, in input order, so
# rows line up with y_digits -- TODO confirm.
tsne_digits_array = np.array(tsne_digits)

plt.figure(figsize=(10, 8))
plt.scatter(
    tsne_digits_array[:, 0],
    tsne_digits_array[:, 1],
    c=y_digits,
    **point_style,
)
plt.colorbar(label='Digit')
plt.title('t-SNE Reduction of Digits Dataset (64D → 2D)')
plt.tight_layout()
plt.show()

# --- Swiss roll ---
tsne_swiss = dim_reduction_tool(swiss_data, **tsne_params)
tsne_swiss_array = np.array(tsne_swiss)

plt.figure(figsize=(10, 8))
plt.scatter(
    tsne_swiss_array[:, 0],
    tsne_swiss_array[:, 1],
    c=y_swiss,
    **point_style,
)
plt.colorbar(label='Position on Roll')
plt.title('t-SNE Reduction of Swiss Roll Dataset (3D → 2D)')
plt.tight_layout()
plt.show()

## UMAP

- Often preserves more of the global structure than t-SNE and is typically faster to compute.

In [None]:
# Arguments shared by both UMAP runs.
umap_params = {
    "feature_key": "embedding",
    "algorithm": "umap",
    "n_components": 2,
    "n_neighbors": 15,
    "min_dist": 0.1,
}
# Marker styling shared by both scatter plots in this cell.
point_style = {"cmap": "viridis", "alpha": 0.8}

# --- Digits ---
umap_digits = dim_reduction_tool(digits_data, **umap_params)
# Assumes one row of 2 coordinates per input document, in input order, so
# rows line up with y_digits -- TODO confirm.
umap_digits_array = np.array(umap_digits)

plt.figure(figsize=(10, 8))
plt.scatter(
    umap_digits_array[:, 0],
    umap_digits_array[:, 1],
    c=y_digits,
    **point_style,
)
plt.colorbar(label='Digit')
plt.title('UMAP Reduction of Digits Dataset (64D → 2D)')
plt.tight_layout()
plt.show()

# --- Swiss roll ---
umap_swiss = dim_reduction_tool(swiss_data, **umap_params)
umap_swiss_array = np.array(umap_swiss)

plt.figure(figsize=(10, 8))
plt.scatter(
    umap_swiss_array[:, 0],
    umap_swiss_array[:, 1],
    c=y_swiss,
    **point_style,
)
plt.colorbar(label='Position on Roll')
plt.title('UMAP Reduction of Swiss Roll Dataset (3D → 2D)')
plt.tight_layout()
plt.show()