In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import umap
import joblib
from tqdm import tqdm

2025-10-27 01:50:09.291349: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-27 01:50:09.790784: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-27 01:50:11.544580: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [8]:
# === Paths ===
# Path.home() resolves the home directory on every platform, whereas
# os.environ["HOME"] raises KeyError wherever HOME is unset (e.g. Windows).
HOME = Path.home()

# The checkout location can be overridden with the REEF_ZMSC_ROOT environment
# variable; when it is unset the original per-user default location is used.
REPO_ROOT = Path(
    os.environ.get("REEF_ZMSC_ROOT", HOME / "Uni-stuff/semester-2/applied_Ml/reef_zmsc")
)

# Input: directory searched recursively for fused "features.parquet" files.
FUSED_BASE = REPO_ROOT / "data/features/embeds_fused_pilot/PAPCA"
# Output: directory receiving the reduced embeddings and the run summary.
OUTPUT_BASE = REPO_ROOT / "data/features/embeds_reduced_pilot"
# Output: directory receiving the fitted scaler / PCA / UMAP models.
MODELS_DIR = REPO_ROOT / "models/dimensionality_reduction"

# === Config ===
PCA_VARIANCE = 0.95      # passed to PCA as n_components (fraction of variance to keep)
UMAP_N_NEIGHBORS = 15    # n_neighbors used for both the 2D and the 3D UMAP
UMAP_MIN_DIST = 0.1      # min_dist used for both the 2D and the 3D UMAP
RANDOM_STATE = 42        # seed shared by PCA and UMAP


In [9]:
def load_all_fused_features():
    """Load every fused feature table under FUSED_BASE into one DataFrame.

    Recursively collects each ``features.parquet`` below ``FUSED_BASE``, reads
    them in sorted path order and stacks them with a fresh 0..n-1 index.
    Sorting matters because directory traversal order is filesystem-dependent;
    without it the row order of every downstream output could change from one
    run or machine to the next.

    Returns:
        pandas.DataFrame: the rows of all files, concatenated.

    Raises:
        FileNotFoundError: if no ``features.parquet`` exists under
            ``FUSED_BASE``. Previously this surfaced as pandas' opaque
            "No objects to concatenate" ValueError.
    """
    print("Loading fused features...")

    parquet_files = sorted(FUSED_BASE.rglob("features.parquet"))
    if not parquet_files:
        raise FileNotFoundError(
            f"No features.parquet files found under {FUSED_BASE}"
        )

    combined = pd.concat(
        [pd.read_parquet(parquet_file) for parquet_file in parquet_files],
        ignore_index=True,
    )
    print(f"Loaded {len(combined):,} clips with shape {combined.shape}")

    return combined

In [10]:
def separate_metadata_and_features(df):
    """Split a fused table into a metadata frame and a feature frame.

    The metadata columns are whichever of 'logger', 'date', 'start_s' and
    'clip_idx' are present in ``df`` (kept in that order); every remaining
    column, in ``df``'s own column order, is treated as a feature.

    Returns:
        tuple: ``(metadata, features, feature_cols)`` where the first two are
        independent copies of the corresponding column subsets and
        ``feature_cols`` is the list of feature column names.
    """
    known_meta = ('logger', 'date', 'start_s', 'clip_idx')

    # Keep only the metadata names this particular table actually carries.
    present_meta = list(filter(lambda name: name in df.columns, known_meta))
    feature_names = [name for name in df.columns if name not in present_meta]

    print(f"\nMetadata columns: {len(present_meta)}")
    print(f"Feature columns: {len(feature_names)}")

    # .copy() so later edits to the parts never write through to df.
    return df[present_meta].copy(), df[feature_names].copy(), feature_names

In [12]:
def apply_pca(features_scaled, variance_threshold=0.95):
    """Fit PCA on the scaled features and project them onto its components.

    ``variance_threshold`` is handed to ``PCA`` as ``n_components``; the
    number of components actually kept is read back from the fitted model.

    Returns:
        tuple: ``(features_pca, pca, n_components)`` - the projected data,
        the fitted PCA object and the number of retained components.
    """
    print(f"\n[2/3] Applying PCA (variance threshold: {variance_threshold})...")

    input_dims = features_scaled.shape[1]

    pca = PCA(n_components=variance_threshold, random_state=RANDOM_STATE)
    features_pca = pca.fit_transform(features_scaled)

    # Both figures below are only available once the model has been fitted.
    n_components = pca.n_components_
    variance_pct = pca.explained_variance_ratio_.sum() * 100

    print(f"Reduced {input_dims}D to {n_components}D")
    print(f"Explained variance: {variance_pct:.2f}%")

    return features_pca, pca, n_components


In [13]:
def apply_umap(features_pca, n_components=2):
    """Embed the PCA-reduced features into ``n_components`` dimensions with UMAP.

    Neighbourhood size, minimum distance and seed come from the module-level
    UMAP_N_NEIGHBORS, UMAP_MIN_DIST and RANDOM_STATE constants.

    Returns:
        tuple: ``(embeddings, reducer)`` - the embedded coordinates and the
        fitted UMAP model.
    """
    source_dims = features_pca.shape[1]
    print(f"\n[3/3] Applying UMAP ({source_dims}D to {n_components}D)...")

    umap_settings = dict(
        n_components=n_components,
        n_neighbors=UMAP_N_NEIGHBORS,
        min_dist=UMAP_MIN_DIST,
        random_state=RANDOM_STATE,
        verbose=False,
    )
    reducer = umap.UMAP(**umap_settings)

    embeddings = reducer.fit_transform(features_pca)
    print(f"Generated {n_components}D embeddings")

    return embeddings, reducer

In [14]:
def save_results(metadata, features_scaled, features_pca, embeddings_2d, embeddings_3d, 
                 scaler, pca, umap_2d, umap_3d, feature_cols):
    """Persist the fitted models, the reduced embeddings and a run summary.

    Models are written with joblib into MODELS_DIR; the PCA, UMAP-2D and
    UMAP-3D coordinates are each appended to a copy of ``metadata`` and
    written as parquet into OUTPUT_BASE, alongside ``reduction_summary.txt``.
    ``features_scaled`` is accepted for call compatibility but is not written
    anywhere.
    """
    print("\nSaving results...")

    for directory in (OUTPUT_BASE, MODELS_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    # Fitted models, keyed by the file name they are stored under.
    fitted_models = {
        "scaler.pkl": scaler,
        "pca.pkl": pca,
        "umap_2d.pkl": umap_2d,
        "umap_3d.pkl": umap_3d,
    }
    for filename, model in fitted_models.items():
        joblib.dump(model, MODELS_DIR / filename)

    # PCA coordinates: one pca_<i> column per retained component.
    n_pca_dims = features_pca.shape[1]
    pca_columns = {f'pca_{i}': features_pca[:, i] for i in range(n_pca_dims)}
    pca_table = metadata.assign(**pca_columns)
    pca_table.to_parquet(OUTPUT_BASE / "embeddings_pca.parquet", index=False)

    # UMAP 2D coordinates.
    umap2d_table = metadata.assign(
        umap_x=embeddings_2d[:, 0],
        umap_y=embeddings_2d[:, 1],
    )
    umap2d_table.to_parquet(OUTPUT_BASE / "embeddings_umap_2d.parquet", index=False)

    # UMAP 3D coordinates.
    umap3d_table = metadata.assign(
        umap_x=embeddings_3d[:, 0],
        umap_y=embeddings_3d[:, 1],
        umap_z=embeddings_3d[:, 2],
    )
    umap3d_table.to_parquet(OUTPUT_BASE / "embeddings_umap_3d.parquet", index=False)

    # Key/value summary of the run, one "key,value" row per entry.
    summary = {
        'n_clips': len(metadata),
        'original_dims': len(feature_cols),
        'pca_dims': n_pca_dims,
        'pca_variance_kept': float(pca.explained_variance_ratio_.sum()),
        'umap_2d_neighbors': UMAP_N_NEIGHBORS,
        'umap_min_dist': UMAP_MIN_DIST,
        'random_state': RANDOM_STATE
    }
    summary_path = OUTPUT_BASE / "reduction_summary.txt"
    pd.Series(summary).to_csv(summary_path, header=False)

    print(f"Saved to: {OUTPUT_BASE.relative_to(REPO_ROOT)}")
    print(f"Models saved to: {MODELS_DIR.relative_to(REPO_ROOT)}")

In [15]:
def apply_standardization(features):
    """Standardize the features with a freshly fitted StandardScaler.

    NOTE(review): the cell that originally defined this helper (In [11]) is
    absent from the notebook, so main() failed with NameError on a fresh
    kernel. This version is reconstructed from the recorded run output
    ("[1/3] Applying standardization...", "Standardized 17 features",
    "Mean: ..., Std: ...") - confirm it matches the lost original.

    Returns:
        tuple: ``(features_scaled, scaler)`` - the standardized array and the
        fitted scaler.
    """
    print("\n[1/3] Applying standardization...")

    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    print(f"Standardized {features_scaled.shape[1]} features")
    print(f"Mean: {features_scaled.mean():.6f}, Std: {features_scaled.std():.6f}")

    return features_scaled, scaler


def main():
    """Run the full pipeline: load, split, standardize, PCA, UMAP 2D/3D, save.

    Prints a banner, runs each stage in order, hands every artefact to
    save_results() and finishes with a short report of the dimensions and the
    output file names.
    """
    print("=" * 70)
    print("DIMENSIONALITY REDUCTION: PCA + UMAP")
    print("=" * 70)

    df = load_all_fused_features()
    metadata, features, feature_cols = separate_metadata_and_features(df)

    features_scaled, scaler = apply_standardization(features)
    features_pca, pca, n_components = apply_pca(features_scaled, PCA_VARIANCE)

    # Both embeddings are fitted on the same PCA-reduced features.
    embeddings_2d, umap_2d = apply_umap(features_pca, n_components=2)
    embeddings_3d, umap_3d = apply_umap(features_pca, n_components=3)

    save_results(
        metadata, features_scaled, features_pca,
        embeddings_2d, embeddings_3d,
        scaler, pca, umap_2d, umap_3d, feature_cols
    )

    print("\n" + "=" * 70)
    print("COMPLETE")
    print("=" * 70)
    print(f"\nOriginal: {len(feature_cols)}D")
    print(f"PCA: {n_components}D ({pca.explained_variance_ratio_.sum()*100:.1f}% variance)")
    # Plain strings below: no placeholders, so no f-prefix is needed.
    print("UMAP: 2D and 3D embeddings")
    print("\nOutput files:")
    print("  - embeddings_pca.parquet")
    print("  - embeddings_umap_2d.parquet")
    print("  - embeddings_umap_3d.parquet")


if __name__ == "__main__":
    main()

DIMENSIONALITY REDUCTION: PCA + UMAP
Loading fused features...
Loaded 55,920 clips with shape (55920, 20)

Metadata columns: 3
Feature columns: 17

[1/3] Applying standardization...
Standardized 17 features
Mean: -0.000000, Std: 1.000000

[2/3] Applying PCA (variance threshold: 0.95)...
Reduced 17D to 8D
Explained variance: 95.15%

[3/3] Applying UMAP (8D to 2D)...


  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise 

Generated 2D embeddings

[3/3] Applying UMAP (8D to 3D)...


  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(


Generated 3D embeddings

Saving results...
Saved to: data/features/embeds_reduced_pilot
Models saved to: models/dimensionality_reduction

COMPLETE

Original: 17D
PCA: 8D (95.1% variance)
UMAP: 2D and 3D embeddings

Output files:
  - embeddings_pca.parquet
  - embeddings_umap_2d.parquet
  - embeddings_umap_3d.parquet
