# Fit UMAP To Data + Save Manifold at Low Dimension

In [None]:
import torch
import numpy as np
from pathlib import Path
import umap.umap_ as umap

import joblib  # for saving the UMAP model

# --- CONFIG ---
# Input: directory of reduced embedding batches.
# Output: directory that receives the fitted UMAP model and projected batches.
reduced_dir = Path("../Results/EmbeddingDataReduced")
umap_dir = Path("../Results/EmbeddingDataUMAP")
umap_dir.mkdir(exist_ok=True, parents=True)  # no-op when it already exists

# UMAP parameters (tunable by user). They are gathered into one mapping so
# the constructor call below receives exactly what is configured here.
n_neighbors, min_dist = 11, 0.1
n_components = 10  # output dimensionality; change to 3 if you want 3D embeddings
metric, random_state = "euclidean", 42
umap_params = {
    "n_neighbors": n_neighbors,
    "min_dist": min_dist,
    "n_components": n_components,
    "metric": metric,
    "random_state": random_state,
}

# --- COLLECT REDUCED BATCH FILES ---
# sorted() makes the processing order deterministic. Paths compare as text,
# so e.g. batch_10.pt sorts before batch_2.pt.
batch_files = sorted(reduced_dir.glob("batch_*.pt"))
print(f"Found {len(batch_files)} reduced batch files")

# --- INIT UMAP ---
# Unfitted estimator; verbose=True makes UMAP report progress while fitting.
umap_model = umap.UMAP(verbose=True, **umap_params)

# --- FIT UMAP ON ALL DATA ---
# Fail early with a readable message: without this guard an empty file list
# only surfaces later as np.vstack's "need at least one array" ValueError.
if not batch_files:
    raise FileNotFoundError(
        f"No batch_*.pt files found in {reduced_dir}; nothing to fit UMAP on."
    )

print("Fitting UMAP model...")
# Load all batches into memory for fit (UMAP does not support partial_fit),
# then stack them row-wise into a single 2-D array.
all_data = np.vstack(
    [torch.load(batch_path, weights_only=True).numpy() for batch_path in batch_files]
)
print(f"Total data for UMAP fit: {all_data.shape}")

umap_model.fit(all_data)
print("UMAP model trained.")

# --- SAVE UMAP MODEL ---
# Persist the fitted estimator next to the projected batches so it can be
# reloaded later with joblib.load.
joblib.dump(umap_model, umap_dir / "umap_model.pkl")
print("Saved UMAP model.")

# --- TRANSFORM & SAVE EACH BATCH ---
# Every input batch is projected with the fitted model and written to
# umap_dir under the same file name as its source batch.
for src_path in batch_files:
    print(f"Transforming {src_path.name} ...")
    projected = umap_model.transform(
        torch.load(src_path, weights_only=True).numpy()
    )

    out_file = umap_dir / src_path.name
    # Stored as float32 tensors regardless of the dtype UMAP returned.
    torch.save(torch.from_numpy(projected).to(torch.float32), out_file)
    print(f"Saved UMAP batch to {out_file}")


### Test UMAP data was saved properly

In [None]:
import torch
from pathlib import Path

# --- CONFIG ---
# NOTE(review): despite its name, reduced_dir points at the UMAP *output*
# directory in this cell.
reduced_dir = Path("../Results/EmbeddingDataUMAP")

# --- CHECK ---
# Inspect only the first saved batch: type, shape, dtype, a few rows and the
# value range are enough to spot an obviously broken save.
reduced_files = sorted(reduced_dir.glob("batch_*.pt"))
if not reduced_files:
    print("No reduced batch files found in the directory.")
else:
    first_file = reduced_files[0]
    print(f"Loading {first_file.name} ...")
    reduced_tensor = torch.load(first_file, weights_only=True)

    print(f"Type: {type(reduced_tensor)}")
    print(f"Shape: {reduced_tensor.shape}")
    print(f"Dtype: {reduced_tensor.dtype}")
    print(f"First 3 rows:\n{reduced_tensor[:3]}")
    print(f"Min/max values: {reduced_tensor.min().item():.6f} / {reduced_tensor.max().item():.6f}")

# Run HDBSCAN To Detect Clusters

In [None]:
import torch
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers 3D projection
import hdbscan
from collections import Counter

# --- CONFIG ---
umap_dir = Path("../Results/EmbeddingDataUMAP")


def _load_batch(path):
    """Announce and load one saved batch, returning it as a NumPy array."""
    print(f"Loading {path.name} ...")
    return torch.load(path, weights_only=True).numpy()


# --- LOAD ALL BATCHES ---
# Batches are read in sorted file order and stacked row-wise, giving an
# array of shape (total_points, n_dim).
batch_files = sorted(umap_dir.glob("batch_*.pt"))
all_embeds = np.vstack([_load_batch(path) for path in batch_files])
print("Final shape:", all_embeds.shape)

# --- RUN HDBSCAN ---
hdbscan_params = {
    "min_cluster_size": 30,  # smaller clusters allowed
    "min_samples": 10,       # fewer points needed to avoid noise
}
clusterer = hdbscan.HDBSCAN(**hdbscan_params)
cluster_labels = clusterer.fit_predict(all_embeds)

# Label -> number of points; the rest of the notebook treats -1 as noise.
counts = Counter(cluster_labels)


# --- SUMMARY STATISTICS ---
# Summarises the HDBSCAN result held in `cluster_labels` / `counts`.
# Label -1 is treated as noise and excluded from all cluster-size statistics.
total_points = len(cluster_labels)
noise_points = counts.get(-1, 0)

# Exclude noise for cluster size stats
cluster_sizes = [count for cid, count in counts.items() if cid != -1]
num_clusters = len(cluster_sizes)
mean_size = np.mean(cluster_sizes) if cluster_sizes else 0
std_size = np.std(cluster_sizes) if cluster_sizes else 0
# Guard against an empty input so the percentage never divides by zero.
noise_pct = (noise_points / total_points) * 100 if total_points else 0.0


print(f"Total points: {total_points}")
print(f"Noise points: {noise_points}")
print(f"Noise percentage: {noise_pct:.2f}%")
print(f"Clusters found (excluding noise): {num_clusters}")
print(f"Mean cluster size: {mean_size:.2f}")
print(f"Std cluster size: {std_size:.2f}")
# max()/min() raise ValueError on an empty list, which happens when every
# point is labelled noise. Report that case explicitly instead of crashing.
if cluster_sizes:
    print(f"Max cluster size: {max(cluster_sizes)}")
    print(f"Min cluster size: {min(cluster_sizes)}")
else:
    print("Max cluster size: n/a (no clusters found)")
    print("Min cluster size: n/a (no clusters found)")



In [None]:
# --- Data Prep ---
# Collect the (label, size) pairs once, dropping the noise label (-1), and
# derive the two parallel lists from them.
non_noise_items = [(cid, size) for cid, size in counts.items() if cid != -1]
cluster_sizes = [size for _, size in non_noise_items]
cluster_ids = [cid for cid, _ in non_noise_items]

# ------------------------------------------------------------------
# 1. Histogram of cluster sizes
# ------------------------------------------------------------------
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(cluster_sizes, bins=30, edgecolor="black")
ax.set_xlabel("Cluster size")
ax.set_ylabel("Number of clusters")
ax.set_title("Histogram of Cluster Sizes")
plt.show()


In [None]:
# ------------------------------------------------------------------
# 2. PDF (density estimate) of cluster sizes
# ------------------------------------------------------------------
# density=True normalises the histogram so the bars integrate to 1; the
# curve is drawn as a step function through the bin midpoints.
counts_hist, bins = np.histogram(cluster_sizes, bins=30, density=True)
bin_centers = (bins[:-1] + bins[1:]) / 2

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(bin_centers, counts_hist, drawstyle="steps-mid")
ax.set_xlabel("Cluster size")
ax.set_ylabel("Probability density")
ax.set_title("Approximate PDF of Cluster Sizes")
ax.grid(True)
plt.show()

In [None]:

# ------------------------------------------------------------------
# 3. Randomized 2D Bubble Chart (biggest clusters first)
# ------------------------------------------------------------------
# Sort clusters by size (largest first)
sorted_sizes = sorted(cluster_sizes, reverse=True)
n_bubbles = len(sorted_sizes)

# Random positions for each cluster; the fixed seed keeps the layout
# reproducible. x is drawn before y, so the draw order must not change.
rng = np.random.default_rng(seed=42)
x = rng.uniform(0, 100, n_bubbles)
y = rng.uniform(0, 100, n_bubbles)

# Marker size scaled by cluster size. Note that matplotlib's `s` is the
# marker *area* in points^2, not a radius.
scale_factor = 0.5  # adjust this for bigger/smaller bubbles
bubble_sizes = scale_factor * np.asarray(sorted_sizes)

fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(x, y, s=bubble_sizes, alpha=0.5, c="tab:blue", edgecolors="black")

ax.set_xlabel("Random X")
ax.set_ylabel("Random Y")
ax.set_title("Random Bubble Plot of Cluster Sizes (largest clusters drawn bigger)")
ax.axis("equal")
plt.show()

# t-SNE to 3D for Visualization

In [None]:
import torch
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401, registers 3D projection
from sklearn.manifold import TSNE
import hdbscan
from collections import Counter



# --- RUN TSNE TO 3D ---
# Projects `all_embeds` (produced by the HDBSCAN cell) down to three
# dimensions purely for plotting; clustering itself is not redone here.
# NOTE(review): the last cell of the notebook overwrites `all_embeds` with a
# standardised copy - re-run the HDBSCAN cell before re-running this one.
tsne_params = {
    "n_components": 3,
    "random_state": 42,   # fixed seed for a repeatable layout
    "perplexity": 30,
    "init": "pca",
    "learning_rate": "auto",
}

print("Running t-SNE to 3D...")
tsne = TSNE(**tsne_params)
embeds_3d = tsne.fit_transform(all_embeds)
print("t-SNE shape:", embeds_3d.shape)


In [None]:
# --- 3D PLOT WITH CLUSTER COLORS ---
# Shows every point of the t-SNE projection: HDBSCAN noise (label -1) in
# gray, clustered points coloured by their cluster label.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection="3d")

# Noise and clusters are drawn as two separate scatters. Feeding a sentinel
# such as -999 through the colormap would (a) give noise a regular colormap
# colour rather than gray and (b) stretch the colour normalisation so far
# that the real cluster labels collapse onto nearly the same colour.
noise_mask = cluster_labels == -1
cluster_mask = ~noise_mask

if noise_mask.any():
    ax.scatter(
        embeds_3d[noise_mask, 0],
        embeds_3d[noise_mask, 1],
        embeds_3d[noise_mask, 2],
        c="gray",
        s=2,
        alpha=0.7,
        label="noise",
    )

if cluster_mask.any():
    # Colour normalisation now spans only the real cluster labels.
    scatter = ax.scatter(
        embeds_3d[cluster_mask, 0],
        embeds_3d[cluster_mask, 1],
        embeds_3d[cluster_mask, 2],
        c=cluster_labels[cluster_mask],
        cmap="tab20",
        s=2,
        alpha=0.7,
    )

ax.set_xlabel("t-SNE-1")
ax.set_ylabel("t-SNE-2")
ax.set_zlabel("t-SNE-3")
ax.set_title("t-SNE Projection Colored by HDBSCAN Clusters")
plt.show()


In [None]:
# --- FILTER NON-NOISE POINTS ---
# Keep only the points HDBSCAN assigned to a cluster (label != -1).
mask = cluster_labels != -1
embeds_3d_non_noise, labels_non_noise = embeds_3d[mask], cluster_labels[mask]

# --- 3D PLOT WITH CLUSTER COLORS (NON-NOISE ONLY) ---
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection="3d")

# Split the (n_points, 3) array into its three coordinate columns.
xs, ys, zs = embeds_3d_non_noise.T
scatter = ax.scatter(xs, ys, zs, c=labels_non_noise, cmap="tab20", s=2, alpha=0.7)

for set_label, text in (
    (ax.set_xlabel, "t-SNE-1"),
    (ax.set_ylabel, "t-SNE-2"),
    (ax.set_zlabel, "t-SNE-3"),
):
    set_label(text)
ax.set_title("t-SNE Projection (Non-Noise HDBSCAN Clusters)")
plt.show()


In [None]:
# --- CHOOSE TOP-K CLUSTERS ---
# Plots only the k largest HDBSCAN clusters of the t-SNE projection.
k = 40  # change this as needed

# Count cluster sizes (noise label -1 excluded).
# The per-label counts get their own name on purpose: binding them to
# `counts` would overwrite the Counter built by the HDBSCAN cell and break
# any re-run of the summary/histogram cells, which call `counts.items()`.
unique_labels, label_counts = np.unique(
    cluster_labels[cluster_labels != -1], return_counts=True
)
# Largest cluster first; sorted() is stable, so ties keep label order.
sorted_clusters = [
    lab
    for lab, _ in sorted(
        zip(unique_labels, label_counts), key=lambda pair: pair[1], reverse=True
    )
]

# Keep only top-k (fewer if HDBSCAN found fewer than k clusters)
top_k_clusters = set(sorted_clusters[:k])

mask = np.isin(cluster_labels, list(top_k_clusters))
embeds_3d_topk = embeds_3d[mask]
labels_topk = cluster_labels[mask]

# --- 3D PLOT WITH CLUSTER COLORS (TOP-K ONLY) ---
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection="3d")

# NOTE: tab20 has 20 distinct colours, so with k > 20 some clusters
# necessarily share a colour.
scatter = ax.scatter(
    embeds_3d_topk[:, 0],
    embeds_3d_topk[:, 1],
    embeds_3d_topk[:, 2],
    c=labels_topk,
    cmap="tab20",
    s=2,
    alpha=0.7
)

ax.set_xlabel("t-SNE-1")
ax.set_ylabel("t-SNE-2")
ax.set_zlabel("t-SNE-3")
ax.set_title(f"t-SNE Projection (Top {k} Largest HDBSCAN Clusters)")
plt.show()


In [None]:
import torch
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers 3D projection

# --- CONFIG ---
umap_dir = Path("../Results/EmbeddingDataUMAP")

# --- LOAD ALL BATCHES ---
batch_files = sorted(umap_dir.glob("batch_*.pt"))
all_embeds = []
for batch_path in batch_files:
    print(f"Loading {batch_path.name} ...")
    all_embeds.append(torch.load(batch_path, weights_only=True).numpy())
all_embeds = np.vstack(all_embeds)

# Center and scale each axis (zero mean, unit standard deviation per column).
# NOTE(review): this rebinds the global `all_embeds` to the standardised
# copy, and a column with zero standard deviation would divide by zero.
axis_mean = all_embeds.mean(axis=0)
axis_std = all_embeds.std(axis=0)
all_embeds = (all_embeds - axis_mean) / axis_std

# --- 3D SCATTER PLOT ---
# Only the first three embedding components are plotted.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection="3d")

x, y, z = all_embeds[:, :3].T
ax.scatter(x, y, z, s=1, alpha=0.5)

ax.set_xlabel("UMAP-1")
ax.set_ylabel("UMAP-2")
ax.set_zlabel("UMAP-3")
ax.set_title("3D UMAP Embeddings")

plt.tight_layout()
plt.show()
