In [None]:
import torch
import numpy as np
from pathlib import Path
import umap.umap_ as umap

import joblib  # for saving the UMAP model

# --- CONFIG ---
# Batches are read from reduced_dir; the fitted model and the per-batch
# UMAP outputs are written to umap_dir (created here if it is missing).
reduced_dir = Path("../Results/EmbeddingDataReduced")
umap_dir = Path("../Results/EmbeddingDataUMAP")
umap_dir.mkdir(parents=True, exist_ok=True)

# UMAP parameters (tunable by user)
n_neighbors = 15
min_dist = 0.1
n_components = 3   # 3D output (the 3D scatter cell indexes three axes); use 2 for a 2D embedding
metric = "euclidean"
random_state = 42  # fixed seed for reproducibility

# --- COLLECT REDUCED BATCH FILES ---
batch_files = sorted(reduced_dir.glob("batch_*.pt"))
print(f"Found {len(batch_files)} reduced batch files")
if not batch_files:
    # Stop here with an actionable message; without this the cell only fails
    # later, when np.vstack is handed an empty list.
    raise FileNotFoundError(
        f"No batch_*.pt files found in {reduced_dir.resolve()}"
    )

# --- INIT UMAP ---
umap_model = umap.UMAP(
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    n_components=n_components,
    metric=metric,
    random_state=random_state,
    verbose=True,
)

# --- FIT UMAP ON ALL DATA ---
print("Fitting UMAP model...")
# UMAP does not support partial_fit, so every batch is loaded into memory.
# map_location="cpu" and detach() keep .numpy() working for tensors that were
# saved from a GPU or that still track gradients.
# Assumes each batch is a 2-D (rows, features) tensor -- TODO confirm.
batch_arrays = [
    torch.load(f, weights_only=True, map_location="cpu").detach().numpy()
    for f in batch_files
]
row_counts = [arr.shape[0] for arr in batch_arrays]
all_data = np.vstack(batch_arrays)
del batch_arrays  # the stacked copy is all the fit needs
print(f"Total data for UMAP fit: {all_data.shape}")

umap_model.fit(all_data)
print("UMAP model trained.")

# --- SAVE UMAP MODEL ---
joblib.dump(umap_model, umap_dir / "umap_model.pkl")
print("Saved UMAP model.")

# --- SAVE EACH BATCH'S EMBEDDING ---
# Every batch was part of the fit, so its coordinates are already stored in
# umap_model.embedding_, row-aligned with all_data. Slicing them out avoids
# re-loading each file and calling transform(), which is intended for unseen
# data and only approximates the fitted positions of training points.
embedding = umap_model.embedding_
if embedding.shape[0] != sum(row_counts):
    raise RuntimeError(
        f"Embedding has {embedding.shape[0]} rows but the batches hold {sum(row_counts)}"
    )

split_points = np.cumsum(row_counts)[:-1]
for f, reduced in zip(batch_files, np.split(embedding, split_points)):
    print(f"Transforming {f.name} ...")
    # torch.tensor copies, so each saved file owns only its own rows.
    reduced_tensor = torch.tensor(reduced, dtype=torch.float32)

    out_file = umap_dir / f.name
    torch.save(reduced_tensor, out_file)
    print(f"Saved UMAP batch to {out_file}")


In [None]:
import torch
from pathlib import Path

# --- CONFIG ---
# Named umap_dir (matching the other cells) so that running this check does
# not rebind reduced_dir, which the fitting cell uses for its *input* path.
umap_dir = Path("../Results/EmbeddingDataUMAP")

# --- CHECK ---
# Inspect only the first saved batch: type, shape, dtype, a few rows, value range.
umap_files = sorted(umap_dir.glob("batch_*.pt"))
if umap_files:
    first_file = umap_files[0]
    print(f"Loading {first_file.name} ...")
    # Cell-local name so the fitting cell's reduced_tensor is left untouched.
    sample_tensor = torch.load(first_file, weights_only=True)

    print(f"Type: {type(sample_tensor)}")
    print(f"Shape: {sample_tensor.shape}")
    print(f"Dtype: {sample_tensor.dtype}")
    print(f"First 3 rows:\n{sample_tensor[:3]}")
    print(f"Min/max values: {sample_tensor.min().item():.6f} / {sample_tensor.max().item():.6f}")
else:
    print("No reduced batch files found in the directory.")

In [None]:
import torch
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers 3D projection

# --- CONFIG ---
umap_dir = Path("../Results/EmbeddingDataUMAP")

# --- LOAD ALL BATCHES ---
batch_files = sorted(umap_dir.glob("batch_*.pt"))
if not batch_files:
    # np.vstack on an empty list raises an unhelpful error; fail clearly instead.
    raise FileNotFoundError(f"No batch_*.pt files found in {umap_dir.resolve()}")

batch_arrays = []
for f in batch_files:
    print(f"Loading {f.name} ...")
    batch_tensor = torch.load(f, weights_only=True)
    batch_arrays.append(batch_tensor.numpy())

raw_embeds = np.vstack(batch_arrays)
if raw_embeds.shape[1] < 3:
    # The 3D scatter plot reads columns 0, 1 and 2.
    raise ValueError(
        f"Expected at least 3 embedding dimensions for the 3D plot, got {raw_embeds.shape[1]}"
    )

# Center and scale each axis. A constant axis has zero std; dividing by 1
# instead leaves it at 0 after centering rather than producing NaN/inf.
axis_std = raw_embeds.std(axis=0)
axis_std = np.where(axis_std > 0, axis_std, 1.0)
all_embeds = (raw_embeds - raw_embeds.mean(axis=0)) / axis_std

# --- 3D SCATTER PLOT ---
# One small, semi-transparent marker per embedded point, drawn on 3D axes.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(1, 1, 1, projection="3d")

# The first three embedding columns become the plot coordinates.
x, y, z = (all_embeds[:, axis] for axis in range(3))

ax.scatter(x, y, z, s=1, alpha=0.5)

ax.set(
    xlabel="UMAP-1",
    ylabel="UMAP-2",
    zlabel="UMAP-3",
    title="3D UMAP Embeddings",
)

fig.tight_layout()
plt.show()
