In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import umap
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

2025-10-27 07:42:56.738074: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-27 07:42:56.773619: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-27 07:42:58.321566: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# Set modern aesthetic style
# Global plotting defaults that apply to every figure created below.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# matplotlib release — confirm against the environment's pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# Eight-colour "husl" palette; the plotting functions index into it as COLORS[i].
COLORS = sns.color_palette("husl", 8)

In [3]:
# === Configuration ===
# All project paths are derived from the current user's home directory.
HOME = Path.home()
REPO_ROOT = HOME / "Uni-stuff/semester-2/applied_Ml/reef_zmsc"
# Input: directory that is searched recursively for fused `features.parquet` files.
FUSED_BASE = REPO_ROOT / "data/features/embeds_fused_50k/PAPCA"
# Output: directory passed to the plotting/saving functions as `output_dir`.
OUTPUT_BASE = REPO_ROOT / "data/features/embeds_preprocessed_50k"

In [4]:
# PCA variance to keep
# Passed to apply_pca() as `variance_threshold` (fraction of variance, not a component count).
PCA_VARIANCE = 0.95  # Keep 95% of variance
# UMAP settings for visualization
# These values equal the default arguments of apply_umap_for_viz().
UMAP_N_NEIGHBORS = 15
UMAP_MIN_DIST = 0.1


In [5]:
def load_all_fused_features():
    """Load every fused feature file under ``FUSED_BASE`` into one DataFrame.

    All files named ``features.parquet`` below ``FUSED_BASE`` are collected
    recursively and read in sorted path order, so the row order of the result
    (and therefore every order-sensitive downstream step) is reproducible
    regardless of the order in which the filesystem lists the files.

    Returns:
        pandas.DataFrame: the rows of all files stacked vertically with a
        fresh 0..N-1 index.

    Raises:
        FileNotFoundError: if no ``features.parquet`` file exists under
            ``FUSED_BASE``.
    """
    print("📂 Loading fused features...")

    # sorted() makes the concatenation order deterministic; rglob() alone
    # yields files in arbitrary filesystem order.
    parquet_files = sorted(FUSED_BASE.rglob("features.parquet"))
    if not parquet_files:
        # Fail early with an actionable message instead of letting pd.concat
        # complain about an empty list.
        raise FileNotFoundError(
            f"No 'features.parquet' files found under {FUSED_BASE}"
        )

    all_dfs = [
        pd.read_parquet(parquet_file)
        for parquet_file in tqdm(parquet_files, desc="Loading files")
    ]

    combined = pd.concat(all_dfs, ignore_index=True)
    print(f"   ✓ Loaded {len(combined):,} clips")
    print(f"   ✓ Shape: {combined.shape}")

    return combined

In [6]:
def separate_metadata_features(df):
    """Split a fused frame into its metadata columns and its feature columns.

    The five fixed metadata columns ('filepath', 'start_s', 'end_s', 'logger',
    'date') form the metadata frame; every other column is treated as a
    feature. Feature columns are further classified by name: those starting
    with 'yamnet_' versus all the rest.

    Returns:
        tuple: (metadata copy, features copy, list of 'yamnet_' column names,
        list of the remaining feature column names).
    """
    metadata_cols = ['filepath', 'start_s', 'end_s', 'logger', 'date']

    # Anything that is not metadata is a feature, in original column order.
    feature_cols = []
    for column in df.columns:
        if column not in metadata_cols:
            feature_cols.append(column)

    metadata = df.loc[:, metadata_cols].copy()
    features = df.loc[:, feature_cols].copy()

    print(f"\n📊 Feature breakdown:")

    # Partition the feature names by prefix in a single pass.
    yamnet_cols, eco_cols = [], []
    for column in feature_cols:
        target = yamnet_cols if column.startswith('yamnet_') else eco_cols
        target.append(column)

    print(f"   YAMNet features: {len(yamnet_cols)}")
    print(f"   Ecoacoustic features: {len(eco_cols)}")
    print(f"   Total features: {len(feature_cols)}")

    return metadata, features, yamnet_cols, eco_cols

In [7]:
def plot_feature_distributions(features, yamnet_cols, eco_cols, output_dir):
    """Save a 2x2 overview figure of the raw (unscaled) feature values.

    Top row: histograms of the pooled, NaN-free values of the first 100
    YAMNet columns (left) and of all ecoacoustic columns (right).
    Bottom row: box plots of the per-column mean and standard deviation for
    each of the two column groups.

    The figure is written to ``output_dir / '01_feature_distributions_raw.png'``
    and then closed; nothing is returned.
    """
    print("\n📊 Creating feature distribution plots...")

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('Raw Feature Distributions (Before Standardization)', fontsize=16, fontweight='bold')
    (ax_yam_hist, ax_eco_hist), (ax_yam_box, ax_eco_box) = axes

    def _pooled_histogram(ax, columns, colour, title):
        # Flatten all selected columns into one 1-D sample and drop NaNs.
        pooled = features[columns].values.flatten()
        pooled = pooled[~np.isnan(pooled)]
        ax.hist(pooled, bins=100, alpha=0.7, color=colour, edgecolor='black')
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_xlabel('Feature Value')
        ax.set_ylabel('Frequency')
        ax.grid(True, alpha=0.3)

    def _summary_boxplot(ax, columns, colour, title):
        # One row per column after the transpose; box the means and the stds.
        per_column = features[columns].describe().T[['mean', 'std', 'min', 'max']]
        ax.boxplot([per_column['mean'], per_column['std']],
                   labels=['Mean', 'Std Dev'],
                   patch_artist=True,
                   boxprops=dict(facecolor=colour, alpha=0.7),
                   medianprops=dict(color='red', linewidth=2))
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_ylabel('Value')
        ax.grid(True, alpha=0.3)

    # Only the first 100 YAMNet columns are pooled for the histogram.
    _pooled_histogram(ax_yam_hist, yamnet_cols[:100], COLORS[0], 'YAMNet Features Distribution')
    _pooled_histogram(ax_eco_hist, eco_cols, COLORS[1], 'Ecoacoustic Features Distribution')
    _summary_boxplot(ax_yam_box, yamnet_cols, COLORS[2], 'YAMNet Feature Statistics')
    _summary_boxplot(ax_eco_box, eco_cols, COLORS[3], 'Ecoacoustic Feature Statistics')

    plt.tight_layout()
    plt.savefig(output_dir / '01_feature_distributions_raw.png', dpi=300, bbox_inches='tight')
    plt.close()

    print(f"   ✓ Saved feature distribution plots")



def standardize_features(features):
    """Scale every feature column with a freshly fitted ``StandardScaler``.

    Returns:
        tuple: (DataFrame of scaled values carrying the input's column names
        and index, the fitted scaler).
    """
    print("\n🔧 Standardizing features...")

    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(features)

    # Re-attach the labels that the scaler's plain array output dropped.
    features_scaled_df = pd.DataFrame(
        data=scaled_values,
        index=features.index,
        columns=features.columns,
    )

    # Global mean/std over the whole scaled matrix, printed as a sanity check.
    overall_mean = scaled_values.mean()
    overall_std = scaled_values.std()
    print(f"   ✓ Features standardized")
    print(f"   Mean: {overall_mean:.2e}")
    print(f"   Std: {overall_std:.2f}")

    return features_scaled_df, scaler



def plot_standardized_comparison(features, features_scaled, yamnet_cols, eco_cols, output_dir):
    """Save a 2x2 figure contrasting raw and standardized values.

    Uses a sample of up to ten columns (the first five YAMNet and the first
    five ecoacoustic ones). Top row: per-column box plots before/after.
    Bottom row: histograms of the pooled sample values before/after.

    The figure is written to ``output_dir / '02_standardization_comparison.png'``
    and then closed; nothing is returned.
    """
    print("\n📊 Creating standardization comparison plots...")

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('Feature Standardization: Before vs After', fontsize=16, fontweight='bold')

    # Sample some features for visualization
    sample_features = yamnet_cols[:5] + eco_cols[:5]
    tick_labels = range(1, len(sample_features) + 1)

    before_data = features[sample_features].values
    after_data = features_scaled[sample_features].values

    # (axis, data, colour, title, y-label) for the two box-plot panels.
    box_panels = (
        (axes[0, 0], before_data, COLORS[0], 'Before Standardization (Sample Features)', 'Value'),
        (axes[0, 1], after_data, COLORS[1], 'After Standardization (Sample Features)', 'Standardized Value'),
    )
    for ax, data, colour, title, y_label in box_panels:
        ax.boxplot(data, labels=tick_labels,
                   patch_artist=True,
                   boxprops=dict(facecolor=colour, alpha=0.6))
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_xlabel('Feature Index')
        ax.set_ylabel(y_label)
        ax.grid(True, alpha=0.3, axis='y')

    # (axis, data, colour, title, x-label) for the two histogram panels.
    hist_panels = (
        (axes[1, 0], before_data, COLORS[2], 'Value Distribution - Before', 'Feature Value'),
        (axes[1, 1], after_data, COLORS[3], 'Value Distribution - After', 'Standardized Value'),
    )
    for ax, data, colour, title, x_label in hist_panels:
        ax.hist(data.flatten(), bins=100, alpha=0.7,
                color=colour, edgecolor='black')
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_xlabel(x_label)
        ax.set_ylabel('Frequency')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(output_dir / '02_standardization_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()

    print(f"   ✓ Saved standardization comparison")



def apply_pca(features_scaled, variance_threshold=0.95):
    """Reduce the standardized features with PCA.

    Args:
        features_scaled: DataFrame of standardized features; its index is
            reused for the result.
        variance_threshold: forwarded to ``PCA(n_components=...)``. With a
            float between 0 and 1 scikit-learn keeps as many components as
            are needed to explain that fraction of the variance.

    Returns:
        tuple: (DataFrame with columns ``pca_0 .. pca_{k-1}``, the fitted PCA
        object, array of per-component explained-variance ratios).
    """
    print(f"\n🔬 Applying PCA (keeping {variance_threshold*100}% variance)...")

    pca = PCA(n_components=variance_threshold, random_state=42)
    features_pca = pca.fit_transform(features_scaled)

    n_components = pca.n_components_
    explained_var = pca.explained_variance_ratio_
    cumsum_var = np.cumsum(explained_var)

    print(f"   ✓ Reduced from {features_scaled.shape[1]} to {n_components} dimensions")
    print(f"   ✓ Explained variance: {cumsum_var[-1]*100:.2f}%")
    # PCA may keep fewer than 10 components; report on as many as exist
    # instead of indexing past the end of the cumulative array.
    n_head = min(10, n_components)
    print(f"   First {n_head} components explain: {cumsum_var[n_head - 1]*100:.2f}%")

    # Create DataFrame
    pca_cols = [f'pca_{i}' for i in range(n_components)]
    features_pca_df = pd.DataFrame(
        features_pca,
        columns=pca_cols,
        index=features_scaled.index
    )

    return features_pca_df, pca, explained_var



def plot_pca_analysis(explained_var, output_dir):
    """Save a five-panel figure describing the PCA variance structure.

    Panels: (1) variance ratio of the first components as bars, (2) cumulative
    variance with 95%/99% reference lines, (3) log-scale scree plot,
    (4) number of components needed to reach several variance thresholds,
    (5) percentage explained by the top components.

    Args:
        explained_var: 1-D array of per-component explained-variance ratios
            for the retained components.
        output_dir: directory that receives
            ``03_pca_comprehensive_analysis.png``.

    Thresholds that the retained components never reach (for example 99% when
    PCA was fitted to keep 95%) are drawn as an empty bar labelled
    "not reached" rather than being reported as needing one component.
    """
    print("\n📊 Creating PCA analysis plots...")

    cumsum_var = np.cumsum(explained_var)

    fig = plt.figure(figsize=(18, 12))
    gs = fig.add_gridspec(3, 2, hspace=0.3, wspace=0.3)

    # 1. Individual variance (bar plot)
    ax1 = fig.add_subplot(gs[0, 0])
    n_show = min(50, len(explained_var))
    ax1.bar(range(1, n_show + 1), explained_var[:n_show],
            alpha=0.7, color=COLORS[0], edgecolor='black', linewidth=0.5)
    ax1.set_xlabel('Principal Component', fontsize=12, fontweight='bold')
    ax1.set_ylabel('Variance Explained', fontsize=12, fontweight='bold')
    # The title states how many components are actually drawn.
    ax1.set_title(f'Variance Explained by Each Component (First {n_show})', fontsize=14, fontweight='bold')
    ax1.grid(True, alpha=0.3)

    # 2. Cumulative variance
    ax2 = fig.add_subplot(gs[0, 1])
    ax2.plot(range(1, len(cumsum_var) + 1), cumsum_var,
            marker='o', markersize=4, linewidth=2, color=COLORS[1])
    ax2.axhline(y=0.95, color='red', linestyle='--', linewidth=2, label='95% threshold')
    ax2.axhline(y=0.99, color='orange', linestyle='--', linewidth=2, label='99% threshold')
    ax2.set_xlabel('Number of Components', fontsize=12, fontweight='bold')
    ax2.set_ylabel('Cumulative Variance Explained', fontsize=12, fontweight='bold')
    ax2.set_title('Cumulative Variance Explained', fontsize=14, fontweight='bold')
    ax2.legend(fontsize=10)
    ax2.grid(True, alpha=0.3)

    # 3. Scree plot (log scale)
    ax3 = fig.add_subplot(gs[1, 0])
    ax3.plot(range(1, len(explained_var) + 1), explained_var,
            marker='o', markersize=3, linewidth=1.5, color=COLORS[2])
    ax3.set_xlabel('Principal Component', fontsize=12, fontweight='bold')
    ax3.set_ylabel('Variance Explained (log scale)', fontsize=12, fontweight='bold')
    ax3.set_title('Scree Plot', fontsize=14, fontweight='bold')
    ax3.set_yscale('log')
    ax3.grid(True, alpha=0.3, which='both')
    ax3.set_xlim(0, min(200, len(explained_var)))

    # 4. Components needed for different thresholds
    ax4 = fig.add_subplot(gs[1, 1])
    thresholds = [0.80, 0.85, 0.90, 0.95, 0.99]
    n_components_needed = []
    bar_labels = []
    for t in thresholds:
        reached = cumsum_var >= t
        if reached.any():
            # argmax on a boolean array gives the first True position.
            needed = int(np.argmax(reached)) + 1
            n_components_needed.append(needed)
            bar_labels.append(str(needed))
        else:
            # argmax would return 0 here and wrongly suggest "1 component".
            n_components_needed.append(0)
            bar_labels.append(f'not reached (max {cumsum_var[-1]*100:.1f}%)')

    ax4.barh(range(len(thresholds)), n_components_needed,
             alpha=0.7, color=COLORS[3], edgecolor='black', linewidth=1.5)
    ax4.set_yticks(range(len(thresholds)))
    ax4.set_yticklabels([f'{int(t*100)}%' for t in thresholds])
    ax4.set_xlabel('Number of Components', fontsize=12, fontweight='bold')
    ax4.set_ylabel('Variance Threshold', fontsize=12, fontweight='bold')
    ax4.set_title('Components Needed for Different Variance Thresholds', fontsize=14, fontweight='bold')
    ax4.grid(True, alpha=0.3, axis='x')

    # Add value labels on bars
    for i, (val, label) in enumerate(zip(n_components_needed, bar_labels)):
        ax4.text(val + 2, i, label, va='center', fontweight='bold')

    # 5. Top components distribution
    ax5 = fig.add_subplot(gs[2, :])
    top_n = min(20, len(explained_var))
    x = range(1, top_n + 1)
    y = explained_var[:top_n] * 100

    bars = ax5.bar(x, y, alpha=0.7, edgecolor='black', linewidth=0.5)
    # Color bars by importance (relative to the largest bar)
    y_max = max(y)
    for bar, height in zip(bars, y):
        bar.set_color(plt.cm.viridis(height / y_max))

    ax5.set_xlabel('Principal Component', fontsize=12, fontweight='bold')
    ax5.set_ylabel('Variance Explained (%)', fontsize=12, fontweight='bold')
    ax5.set_title(f'Top {top_n} Principal Components', fontsize=14, fontweight='bold')
    ax5.grid(True, alpha=0.3, axis='y')

    # Add percentage labels on top of bars
    for xi, yi in zip(x, y):
        ax5.text(xi, yi + 0.1, f'{yi:.1f}%', ha='center', va='bottom',
                fontsize=8, fontweight='bold')

    plt.suptitle('PCA Dimensionality Reduction Analysis',
                fontsize=18, fontweight='bold', y=0.995)

    plt.savefig(output_dir / '03_pca_comprehensive_analysis.png', dpi=300, bbox_inches='tight')
    plt.close()

    print(f"   ✓ Saved PCA analysis plots")




def plot_pca_feature_space(features_pca, metadata, output_dir):
    """Save 2-D scatter plots of the leading principal components.

    Four panels are drawn in a 2x2 grid — PC1 vs PC2, PC2 vs PC3, PC1 vs PC3
    and PC3 vs PC4 — with points coloured by the ``logger`` column of
    ``metadata``. If fewer components were retained than a panel needs, that
    panel shows a placeholder message instead of raising an IndexError.

    Args:
        features_pca: DataFrame whose columns are the principal components in
            order (column 0 is PC1).
        metadata: DataFrame with a ``logger`` column, row-aligned with
            ``features_pca``.
        output_dir: directory that receives ``04_pca_feature_space.png``.
    """
    print("\n📊 Creating PCA feature space plots...")

    fig, axes = plt.subplots(2, 2, figsize=(16, 14))
    fig.suptitle('PCA Feature Space Visualization', fontsize=16, fontweight='bold')

    # Get logger info for coloring
    loggers = metadata['logger'].values
    unique_loggers = np.unique(loggers)
    logger_colors = {logger: COLORS[i % len(COLORS)] for i, logger in enumerate(unique_loggers)}
    # The row mask of each logger is the same for every panel; build it once.
    logger_masks = {logger: loggers == logger for logger in unique_loggers}

    # Zero-based component indices per panel, in row-major panel order.
    component_pairs = [(0, 1), (1, 2), (0, 2), (2, 3)]
    n_available = features_pca.shape[1]

    for ax, (x_idx, y_idx) in zip(axes.flat, component_pairs):
        x_name, y_name = f'PC{x_idx + 1}', f'PC{y_idx + 1}'

        if max(x_idx, y_idx) >= n_available:
            # Not enough retained components for this pair.
            ax.set_title(f'{x_name} vs {y_name} (not available)', fontsize=14, fontweight='bold')
            ax.text(0.5, 0.5, f'Only {n_available} component(s) retained',
                    ha='center', va='center', transform=ax.transAxes)
            ax.set_xticks([])
            ax.set_yticks([])
            continue

        for logger in unique_loggers:
            mask = logger_masks[logger]
            ax.scatter(features_pca.iloc[mask, x_idx], features_pca.iloc[mask, y_idx],
                       alpha=0.3, s=1, c=[logger_colors[logger]], label=f'Logger {logger}')
        ax.set_xlabel(x_name, fontsize=12, fontweight='bold')
        ax.set_ylabel(y_name, fontsize=12, fontweight='bold')
        ax.set_title(f'{x_name} vs {y_name} (colored by Logger)', fontsize=14, fontweight='bold')
        ax.legend(markerscale=10)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(output_dir / '04_pca_feature_space.png', dpi=300, bbox_inches='tight')
    plt.close()

    print(f"   ✓ Saved PCA feature space plots")



def apply_umap_for_viz(features_pca, n_neighbors=15, min_dist=0.1):
    """Project the PCA features to two dimensions with UMAP.

    The reducer is created with a fixed ``random_state`` of 42 and two output
    components.

    Returns:
        tuple: (DataFrame with columns ``umap_x`` and ``umap_y`` sharing the
        index of ``features_pca``, the fitted UMAP reducer).
    """
    print(f"\n🗺️ Applying UMAP for visualization...")
    print(f"   n_neighbors={n_neighbors}, min_dist={min_dist}")

    umap_settings = dict(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=2,
        random_state=42,
        verbose=False,
    )
    reducer = umap.UMAP(**umap_settings)

    coordinates = reducer.fit_transform(features_pca)

    # Keep the caller's index so rows can be matched back to their metadata.
    umap_df = pd.DataFrame(
        data=coordinates,
        index=features_pca.index,
        columns=['umap_x', 'umap_y'],
    )

    print(f"   ✓ 2D embedding created")

    return umap_df, reducer

In [12]:
def plot_umap_visualizations(umap_df, metadata, output_dir):
    """Save a five-panel figure of the 2-D UMAP embedding.

    Panels: (1) points coloured by logger, (2) hexbin density, (3) points
    coloured by recording date, (4) a random subsample coloured by a Gaussian
    KDE density estimate, (5) all loggers overlaid using the COLORS palette.

    Args:
        umap_df: DataFrame with ``umap_x`` / ``umap_y`` columns.
        metadata: DataFrame with ``logger`` and ``date`` columns (dates are
            parsed with the ``%Y%m%d`` format); it must share its index with
            ``umap_df`` because logger masks are applied via ``.loc``.
        output_dir: directory that receives ``05_umap_comprehensive.png``.

    The KDE subsample is drawn from a generator seeded with 42, so the figure
    is identical on every run.
    """
    # Local import: scipy is only needed for the KDE panel.
    from scipy.stats import gaussian_kde

    print("\n📊 Creating UMAP visualization plots...")

    fig = plt.figure(figsize=(20, 12))
    gs = fig.add_gridspec(2, 3, hspace=0.3, wspace=0.3)

    # 1. Colored by logger
    ax1 = fig.add_subplot(gs[0, 0])
    for logger in metadata['logger'].unique():
        mask = metadata['logger'] == logger
        ax1.scatter(umap_df.loc[mask, 'umap_x'], umap_df.loc[mask, 'umap_y'],
                   alpha=0.4, s=2, label=f'Logger {logger}')
    ax1.set_xlabel('UMAP 1', fontsize=12, fontweight='bold')
    ax1.set_ylabel('UMAP 2', fontsize=12, fontweight='bold')
    ax1.set_title('UMAP: Colored by Logger', fontsize=14, fontweight='bold')
    ax1.legend(markerscale=10)
    ax1.grid(True, alpha=0.3)

    # 2. Density plot
    ax2 = fig.add_subplot(gs[0, 1])
    h = ax2.hexbin(umap_df['umap_x'], umap_df['umap_y'],
                   gridsize=50, cmap='viridis', mincnt=1)
    ax2.set_xlabel('UMAP 1', fontsize=12, fontweight='bold')
    ax2.set_ylabel('UMAP 2', fontsize=12, fontweight='bold')
    ax2.set_title('UMAP: Density Heatmap', fontsize=14, fontweight='bold')
    plt.colorbar(h, ax=ax2, label='Point Density')

    # 3. Colored by date (convert to numeric)
    ax3 = fig.add_subplot(gs[0, 2])
    # Seconds since the Unix epoch, computed by timedelta division so the
    # result does not depend on the platform's default int width or on the
    # datetime resolution pandas chose when parsing.
    parsed_dates = pd.to_datetime(metadata['date'], format='%Y%m%d')
    dates_numeric = (parsed_dates - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)
    scatter = ax3.scatter(umap_df['umap_x'], umap_df['umap_y'],
                         c=dates_numeric, alpha=0.4, s=2, cmap='coolwarm')
    ax3.set_xlabel('UMAP 1', fontsize=12, fontweight='bold')
    ax3.set_ylabel('UMAP 2', fontsize=12, fontweight='bold')
    ax3.set_title('UMAP: Colored by Date (Temporal)', fontsize=14, fontweight='bold')
    plt.colorbar(scatter, ax=ax3, label='Date')
    ax3.grid(True, alpha=0.3)

    # 4. KDE-coloured subsample
    ax4 = fig.add_subplot(gs[1, 0])

    # Evaluating the KDE on every point is too slow, so use at most 5000.
    n_sample = min(5000, len(umap_df))
    # Seeded generator keeps the sampled points (and the figure) reproducible.
    rng = np.random.default_rng(42)
    sample_idx = rng.choice(len(umap_df), size=n_sample, replace=False)

    umap_sample = umap_df.iloc[sample_idx]
    xy_sample = np.vstack([umap_sample['umap_x'], umap_sample['umap_y']])
    z = gaussian_kde(xy_sample)(xy_sample)

    # Plot only the sampled points
    ax4.scatter(umap_sample['umap_x'], umap_sample['umap_y'],
               c=z, s=2, alpha=0.5, cmap='plasma')
    ax4.set_xlabel('UMAP 1', fontsize=12, fontweight='bold')
    ax4.set_ylabel('UMAP 2', fontsize=12, fontweight='bold')
    # Report the real sample size; it is below 5000 for small datasets.
    ax4.set_title(f'UMAP: Density Contours ({n_sample:,}-point sample)', fontsize=14, fontweight='bold')
    ax4.grid(True, alpha=0.3)

    # 5. All loggers overlaid, in sorted order with palette colours
    ax5 = fig.add_subplot(gs[1, 1:])
    loggers = sorted(metadata['logger'].unique())

    for i, logger in enumerate(loggers):
        mask = metadata['logger'] == logger
        ax5.scatter(umap_df.loc[mask, 'umap_x'], umap_df.loc[mask, 'umap_y'],
                   alpha=0.5, s=3, label=f'Logger {logger}',
                   color=COLORS[i % len(COLORS)])

    ax5.set_xlabel('UMAP 1', fontsize=12, fontweight='bold')
    ax5.set_ylabel('UMAP 2', fontsize=12, fontweight='bold')
    ax5.set_title('UMAP: All Loggers Overlaid', fontsize=14, fontweight='bold')
    ax5.legend(markerscale=5, loc='best')
    ax5.grid(True, alpha=0.3)

    plt.suptitle('UMAP 2D Embedding: Multiple Perspectives',
                fontsize=18, fontweight='bold', y=0.995)

    plt.savefig(output_dir / '05_umap_comprehensive.png', dpi=300, bbox_inches='tight')
    plt.close()

    print(f"   ✓ Saved comprehensive UMAP visualizations")

def plot_logger_date_distribution(metadata, output_dir):
    """Save a 2x2 figure showing how clips are spread over loggers and dates.

    Panels: (1) clip count per logger, (2) clip count per parsed date as a
    time series, (3) heatmap of clips per logger per date (at most about 30
    evenly spaced date columns), (4) histogram of clips-per-date with the
    median marked.

    Args:
        metadata: DataFrame with ``logger`` and ``date`` columns (dates are
            parsed with the ``%Y%m%d`` format). It is only read; no column is
            added to or removed from it.
        output_dir: directory that receives ``06_data_distribution.png``.
    """
    print("\n📊 Creating data distribution plots...")

    fig, axes = plt.subplots(2, 2, figsize=(18, 12))
    fig.suptitle('Data Distribution Analysis', fontsize=16, fontweight='bold')

    # 1. Clips per logger
    logger_counts = metadata['logger'].value_counts().sort_index()
    axes[0, 0].bar(range(len(logger_counts)), logger_counts.values,
                   color=COLORS[:len(logger_counts)], alpha=0.7, edgecolor='black', linewidth=1.5)
    axes[0, 0].set_xticks(range(len(logger_counts)))
    axes[0, 0].set_xticklabels([f'Logger {l}' for l in logger_counts.index], fontweight='bold')
    axes[0, 0].set_ylabel('Number of Clips', fontsize=12, fontweight='bold')
    axes[0, 0].set_title('Clips per Logger', fontsize=14, fontweight='bold')
    axes[0, 0].grid(True, alpha=0.3, axis='y')

    # Add value labels
    for i, v in enumerate(logger_counts.values):
        axes[0, 0].text(i, v + 200, f'{v:,}', ha='center', va='bottom', fontweight='bold')

    # 2. Clips per date (time series)
    # Group by a standalone parsed Series instead of writing a temporary
    # column into the caller's DataFrame (which would leak if anything below
    # raised before the cleanup).
    date_dt = pd.to_datetime(metadata['date'], format='%Y%m%d')
    date_counts = metadata.groupby(date_dt).size()

    axes[0, 1].plot(date_counts.index, date_counts.values, linewidth=2, color=COLORS[2])
    axes[0, 1].fill_between(date_counts.index, date_counts.values, alpha=0.3, color=COLORS[2])
    axes[0, 1].set_xlabel('Date', fontsize=12, fontweight='bold')
    axes[0, 1].set_ylabel('Number of Clips', fontsize=12, fontweight='bold')
    axes[0, 1].set_title('Clips Over Time', fontsize=14, fontweight='bold')
    axes[0, 1].grid(True, alpha=0.3)
    axes[0, 1].tick_params(axis='x', rotation=45)

    # 3. Clips per logger per date (heatmap)
    pivot_data = metadata.groupby(['logger', 'date']).size().reset_index(name='count')
    pivot_table = pivot_data.pivot(index='logger', columns='date', values='count').fillna(0)

    # Sample dates for readability (show every Nth date)
    n_dates_show = min(30, pivot_table.shape[1])
    step = max(1, pivot_table.shape[1] // n_dates_show)
    pivot_sample = pivot_table.iloc[:, ::step]

    sns.heatmap(pivot_sample, cmap='YlOrRd', cbar_kws={'label': 'Clip Count'},
               ax=axes[1, 0], linewidths=0.5, linecolor='gray')
    axes[1, 0].set_xlabel('Date', fontsize=12, fontweight='bold')
    axes[1, 0].set_ylabel('Logger', fontsize=12, fontweight='bold')
    axes[1, 0].set_title('Clips per Logger per Date (Sampled Dates)', fontsize=14, fontweight='bold')
    axes[1, 0].tick_params(axis='x', rotation=90, labelsize=8)

    # 4. Distribution of clips per date
    clips_per_date = metadata.groupby('date').size()
    median_clips = clips_per_date.median()
    axes[1, 1].hist(clips_per_date.values, bins=50, alpha=0.7, color=COLORS[4], edgecolor='black')
    axes[1, 1].set_xlabel('Clips per Date', fontsize=12, fontweight='bold')
    axes[1, 1].set_ylabel('Frequency (Number of Dates)', fontsize=12, fontweight='bold')
    axes[1, 1].set_title('Distribution of Clips per Date', fontsize=14, fontweight='bold')
    axes[1, 1].grid(True, alpha=0.3, axis='y')
    axes[1, 1].axvline(median_clips, color='red', linestyle='--',
                      linewidth=2, label=f'Median: {median_clips:.0f}')
    axes[1, 1].legend()

    plt.tight_layout()
    plt.savefig(output_dir / '06_data_distribution.png', dpi=300, bbox_inches='tight')
    plt.close()

    print(f"   ✓ Saved data distribution plots")

def save_preprocessed_data(metadata, features_scaled, features_pca, umap_df, output_dir):
    """Write the preprocessed features to two parquet files in ``output_dir``.

    * ``preprocessed_features_full.parquet``: metadata, standardized
      features, PCA components and UMAP coordinates side by side.
    * ``preprocessed_features_pca.parquet``: metadata, PCA components and the
      ``umap_x`` / ``umap_y`` coordinates only.

    All inputs are combined positionally (their indexes are reset first), so
    they must have the same number of rows in the same order.

    Args:
        metadata, features_scaled, features_pca, umap_df: row-aligned
            DataFrames.
        output_dir: target directory; created if missing. It does not have to
            live inside ``REPO_ROOT``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n💾 Saving preprocessed data...")

    def _display_path(path):
        # Prefer a repo-relative path in the log, but never fail the save just
        # because output_dir lies outside REPO_ROOT (relative_to raises then).
        try:
            return path.relative_to(REPO_ROOT)
        except ValueError:
            return path

    # Reset each index once so the column-wise concat aligns by position.
    meta_part = metadata.reset_index(drop=True)
    scaled_part = features_scaled.reset_index(drop=True)
    pca_part = features_pca.reset_index(drop=True)
    umap_part = umap_df.reset_index(drop=True)

    # Combine everything
    final_df = pd.concat([meta_part, scaled_part, pca_part, umap_part], axis=1)

    # Save full dataset
    output_file = output_dir / "preprocessed_features_full.parquet"
    final_df.to_parquet(output_file, index=False, compression='snappy')
    print(f"   ✓ Full dataset: {_display_path(output_file)}")

    # Save PCA-only version (most useful for clustering). The PCA columns are
    # taken from features_pca itself rather than matched by a 'pca_' name
    # prefix, so an unrelated feature with that prefix cannot slip in.
    pca_df = pd.concat(
        [meta_part, pca_part, umap_part[['umap_x', 'umap_y']]],
        axis=1,
    )

    output_file_pca = output_dir / "preprocessed_features_pca.parquet"
    pca_df.to_parquet(output_file_pca, index=False, compression='snappy')
    print(f"   ✓ PCA version: {_display_path(output_file_pca)}")

    print(f"\n   Saved {len(final_df):,} clips")
    print(f"   Full shape: {final_df.shape}")
    print(f"   PCA shape: {pca_df.shape}")

def create_summary_report(metadata, features, features_pca, explained_var, output_dir):
    """Write ``preprocessing_summary.txt`` into ``output_dir``.

    The plain-text report contains, in order: a dataset overview (clip count,
    number of loggers and dates, date range), the feature dimensions before
    and after PCA, a PCA variance analysis (cumulative variance at fixed
    component counts and the components needed for fixed thresholds), the
    list of generated files and suggested next steps.

    Args:
        metadata: DataFrame with ``logger`` and ``date`` columns.
        features: DataFrame of the original features (column names are used
            to count 'yamnet_' versus other features).
        features_pca: DataFrame of PCA components.
        explained_var: per-component explained-variance ratios.
        output_dir: existing directory that receives the report.
    """
    print("\n📝 Creating summary report...")

    report_path = output_dir / "preprocessing_summary.txt"

    # Section separators reused throughout the report.
    heavy_rule = "=" * 70 + "\n"
    light_rule = "-" * 70 + "\n"

    with open(report_path, 'w') as f:
        # ---- Title and dataset overview ----
        dates = metadata['date']
        f.writelines([
            heavy_rule,
            "FEATURE PREPROCESSING SUMMARY REPORT\n",
            heavy_rule + "\n",
            "DATASET OVERVIEW\n",
            light_rule,
            f"Total clips: {len(metadata):,}\n",
            f"Unique loggers: {metadata['logger'].nunique()}\n",
            f"Unique dates: {dates.nunique()}\n",
            f"Date range: {dates.min()} to {dates.max()}\n\n",
        ])

        # ---- Feature dimensions ----
        n_original = features.shape[1]
        n_reduced = features_pca.shape[1]
        n_yamnet = sum(1 for name in features.columns if name.startswith('yamnet_'))
        n_eco = len(features.columns) - n_yamnet  # everything without the prefix
        f.writelines([
            "FEATURE DIMENSIONS\n",
            light_rule,
            f"Original features: {n_original}\n",
            f"  - YAMNet embeddings: {n_yamnet}\n",
            f"  - Ecoacoustic features: {n_eco}\n",
            f"PCA-reduced features: {n_reduced}\n",
            f"Dimensionality reduction: {(1 - n_reduced/n_original)*100:.1f}%\n\n",
        ])

        # ---- PCA analysis ----
        cumsum_var = np.cumsum(explained_var)
        n_components = len(explained_var)
        f.writelines([
            "PCA ANALYSIS\n",
            light_rule,
            f"Total components: {n_components}\n",
            f"Variance explained by PCA: {cumsum_var[-1]*100:.2f}%\n\n",
            "Variance explained by top components:\n",
        ])

        # Only component counts that actually exist are reported.
        for n in (5, 10, 20, 30, 50, 100):
            if n > n_components:
                continue
            f.write(f"  - Top {n:3d} components: {cumsum_var[n-1]*100:.2f}%\n")
        f.write("\n")

        f.write("Components needed for variance thresholds:\n")
        for thresh in (0.80, 0.85, 0.90, 0.95, 0.99):
            # argmax yields the first position meeting the threshold, or 0 if
            # none does; the check below tells the two cases apart.
            first_hit = np.argmax(cumsum_var >= thresh)
            if cumsum_var[first_hit] >= thresh:
                line = f"  - {int(thresh*100):2d}%: {first_hit + 1} components\n"
            else:
                line = f"  - {int(thresh*100):2d}%: Not achievable (max: {cumsum_var[-1]*100:.2f}%)\n"
            f.write(line)

        # ---- Generated files ----
        f.writelines([
            "\n" + heavy_rule,
            "FILES GENERATED\n",
            light_rule,
            "Data files:\n",
            "  1. preprocessed_features_full.parquet - All features\n",
            "  2. preprocessed_features_pca.parquet - PCA features (use for clustering)\n\n",
            "Visualization files:\n",
            "  3. 01_feature_distributions_raw.png\n",
            "  4. 02_standardization_comparison.png\n",
            "  5. 03_pca_comprehensive_analysis.png\n",
            "  6. 04_pca_feature_space.png\n",
            "  7. 05_umap_comprehensive.png\n",
            "  8. 06_data_distribution.png\n\n",
            "Report:\n",
            "  9. preprocessing_summary.txt (this file)\n",
        ])

        # ---- Next steps ----
        f.writelines([
            "\n" + heavy_rule,
            "NEXT STEPS\n",
            light_rule,
            "1. Review the visualizations to understand your data\n",
            "2. Use 'preprocessed_features_pca.parquet' for clustering\n",
            "3. Try different clustering algorithms:\n",
            "   - K-means (fast, assumes spherical clusters)\n",
            "   - HDBSCAN (finds clusters of varying density)\n",
            "   - Gaussian Mixture Models (soft clustering)\n",
            "4. Use UMAP coordinates (umap_x, umap_y) for visualization\n",
            "5. Evaluate clusters using silhouette score, Davies-Bouldin index\n",
        ])

    print(f"   ✓ Saved summary report")

In [13]:
def main():
    """Run the whole preprocessing pipeline end to end.

    Steps: load the fused features, split metadata from features, replace
    missing feature values with 0, standardize, reduce with PCA
    (``PCA_VARIANCE``), embed with UMAP (``UMAP_N_NEIGHBORS`` /
    ``UMAP_MIN_DIST``), and write all figures, both parquet files and the
    text report into ``OUTPUT_BASE``. Progress and a final summary are
    printed; nothing is returned.
    """
    print("=" * 70)
    print("FEATURE PREPROCESSING: Standardization + PCA + UMAP")
    print("=" * 70)

    # Load data
    df = load_all_fused_features()

    # Separate metadata and features
    metadata, features, yamnet_cols, eco_cols = separate_metadata_features(df)

    # Check for missing values
    missing = features.isnull().sum().sum()
    if missing > 0:
        print(f"\n⚠️ Found {missing} missing values, filling with 0")
        features = features.fillna(0)

    # Create output directory
    OUTPUT_BASE.mkdir(parents=True, exist_ok=True)

    # 1. Plot raw feature distributions
    plot_feature_distributions(features, yamnet_cols, eco_cols, OUTPUT_BASE)

    # 2. Standardization (the fitted scaler is not needed afterwards)
    features_scaled, _ = standardize_features(features)

    # 3. Plot standardization comparison
    plot_standardized_comparison(features, features_scaled, yamnet_cols, eco_cols, OUTPUT_BASE)

    # 4. PCA (the fitted PCA object is not needed afterwards)
    features_pca, _, explained_var = apply_pca(features_scaled, variance_threshold=PCA_VARIANCE)

    # 5. PCA analysis plots
    plot_pca_analysis(explained_var, OUTPUT_BASE)

    # 6. PCA feature space plots
    plot_pca_feature_space(features_pca, metadata, OUTPUT_BASE)

    # 7. UMAP for visualization
    # Forward the configured settings explicitly; without this the
    # UMAP_* constants in the configuration cell would have no effect.
    umap_df, _ = apply_umap_for_viz(
        features_pca,
        n_neighbors=UMAP_N_NEIGHBORS,
        min_dist=UMAP_MIN_DIST,
    )

    # 8. UMAP visualizations
    plot_umap_visualizations(umap_df, metadata, OUTPUT_BASE)

    # 9. Data distribution plots
    plot_logger_date_distribution(metadata, OUTPUT_BASE)

    # 10. Save preprocessed data
    save_preprocessed_data(metadata, features_scaled, features_pca, umap_df, OUTPUT_BASE)

    # 11. Create summary report
    create_summary_report(metadata, features, features_pca, explained_var, OUTPUT_BASE)

    # Summary
    print("\n" + "=" * 70)
    print("✅ PREPROCESSING COMPLETE")
    print("=" * 70)

    print(f"\n📊 Summary:")
    print(f"   Original dimensions: {len(features.columns)}")
    print(f"   PCA dimensions: {len(features_pca.columns)}")
    print(f"   Reduction: {(1 - len(features_pca.columns)/len(features.columns))*100:.1f}%")
    print(f"   Variance preserved: {np.sum(explained_var)*100:.2f}%")

    print(f"\n💾 Output files (9 total):")
    print(f"   📄 Data files:")
    print(f"      • preprocessed_features_full.parquet")
    print(f"      • preprocessed_features_pca.parquet ⭐ (use this for clustering)")
    print(f"   📊 Visualization files:")
    print(f"      • 01_feature_distributions_raw.png")
    print(f"      • 02_standardization_comparison.png")
    print(f"      • 03_pca_comprehensive_analysis.png")
    print(f"      • 04_pca_feature_space.png")
    print(f"      • 05_umap_comprehensive.png")
    print(f"      • 06_data_distribution.png")
    print(f"   📝 Report:")
    print(f"      • preprocessing_summary.txt")


# Run the pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()


FEATURE PREPROCESSING: Standardization + PCA + UMAP
📂 Loading fused features...


Loading files:   0%|          | 0/271 [00:00<?, ?it/s]

Loading files: 100%|██████████| 271/271 [00:08<00:00, 33.04it/s]


   ✓ Loaded 50,000 clips
   ✓ Shape: (50000, 1046)

📊 Feature breakdown:
   YAMNet features: 1024
   Ecoacoustic features: 17
   Total features: 1041

📊 Creating feature distribution plots...
   ✓ Saved feature distribution plots

🔧 Standardizing features...
   ✓ Features standardized
   Mean: -9.75e-18
   Std: 1.00

📊 Creating standardization comparison plots...
   ✓ Saved standardization comparison

🔬 Applying PCA (keeping 95.0% variance)...
   ✓ Reduced from 1041 to 39 dimensions
   ✓ Explained variance: 95.03%
   First 10 components explain: 87.37%

📊 Creating PCA analysis plots...
   ✓ Saved PCA analysis plots

📊 Creating PCA feature space plots...
   ✓ Saved PCA feature space plots

🗺️ Applying UMAP for visualization...
   n_neighbors=15, min_dist=0.1
   ✓ 2D embedding created

📊 Creating UMAP visualization plots...
   ✓ Saved comprehensive UMAP visualizations

📊 Creating data distribution plots...
   ✓ Saved data distribution plots

💾 Saving preprocessed data...
   ✓ Full datase