In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Paths
HOME = Path(os.environ["HOME"])
REPO_ROOT = HOME / "Uni-stuff/semester-2/applied_Ml/reef_zmsc"

CLUSTERS_PATH = REPO_ROOT / "data/clusters/clusters_hdbscan.parquet"
FUSED_FEATURES_PATH = REPO_ROOT / "data/features/embeds_fused_pilot/PAPCA"
OUTPUT_BASE = REPO_ROOT / "data/auto_labels"
PLOTS_DIR = REPO_ROOT / "plots/auto_labels"

# Label categories
CATEGORIES = {
    'biological': 'Biological sounds (fish, marine mammals, invertebrates)',
    'anthropogenic': 'Human-made sounds (boats, ships, construction)',
    'ambient': 'Natural environmental sounds (waves, weather, currents)',
    'unknown': 'Uncertain or mixed sounds'
}


def load_cluster_features():
    """Load clusters with their original ecoacoustic features."""
    print("Loading clustered data with features...")
    
    # Load all fused features
    all_data = []
    for parquet_file in FUSED_FEATURES_PATH.rglob("features.parquet"):
        df = pd.read_parquet(parquet_file)
        all_data.append(df)
    
    fused_df = pd.concat(all_data, ignore_index=True)
    
    # Load cluster assignments
    clusters_df = pd.read_parquet(CLUSTERS_PATH)
    
    # Merge
    merged = clusters_df.merge(
        fused_df, 
        on=['logger', 'date', 'start_s'], 
        how='left'
    )
    
    print(f"Loaded {len(merged):,} clips with features")
    return merged


def compute_cluster_feature_profiles(df):
    """Compute feature statistics per cluster."""
    print("\nComputing cluster feature profiles...")
    
    # Ecoacoustic feature columns
    eco_features = [
        'spectral_centroid_mean', 'spectral_centroid_std',
        'spectral_bandwidth_mean', 'spectral_rolloff_mean',
        'spectral_flatness_mean', 'spectral_contrast_mean',
        'aci', 'spectral_entropy', 'temporal_entropy',
        'zcr_mean', 'rms_mean', 'rms_std', 'dynamic_range_db',
        'snr_db', 'low_freq_energy', 'mid_freq_energy', 'high_freq_energy'
    ]
    
    profiles = []
    
    for cluster_id in sorted(df['cluster'].unique()):
        if cluster_id == -1:
            continue
        
        cluster_data = df[df['cluster'] == cluster_id]
        
        profile = {'cluster_id': cluster_id, 'size': len(cluster_data)}
        
        # Compute mean and std for each feature
        for feat in eco_features:
            if feat in cluster_data.columns:
                profile[f'{feat}_mean'] = cluster_data[feat].mean()
                profile[f'{feat}_std'] = cluster_data[feat].std()
        
        profiles.append(profile)
    
    return pd.DataFrame(profiles)


def apply_labeling_rules(profiles):
    """
    Apply weak supervision rules based on acoustic features.
    Returns label and confidence for each cluster.
    """
    print("\nApplying auto-labeling rules...")
    
    labels = []
    
    for _, row in profiles.iterrows():
        cluster_id = row['cluster_id']
        
        # Extract key features (already normalized 0-1)
        low_freq = row.get('low_freq_energy_mean_mean', 0.5)
        mid_freq = row.get('mid_freq_energy_mean_mean', 0.3)
        high_freq = row.get('high_freq_energy_mean_mean', 0.2)
        
        spectral_flatness = row.get('spectral_flatness_mean_mean', 0.5)
        aci = row.get('aci_mean', 0.5)
        temporal_entropy = row.get('temporal_entropy_mean_mean', 0.5)
        snr = row.get('snr_db_mean', 0.5)
        dynamic_range = row.get('dynamic_range_db_mean', 0.5)
        spectral_centroid = row.get('spectral_centroid_mean_mean', 0.5)
        
        # Initialize scores
        bio_score = 0
        anthro_score = 0
        ambient_score = 0
        
        # === BIOLOGICAL RULES ===
        
        # Rule B1: High-frequency energy (snapping shrimp signature)
        if high_freq > 0.15:  # Lowered threshold
            bio_score += 3
            if high_freq > 0.25:
                bio_score += 2  # Strong signal
        
        # Rule B2: High ACI (complex temporal patterns = biological activity)
        if aci > 0.45:
            bio_score += 3
            if aci > 0.6:
                bio_score += 2
        
        # Rule B3: Mid-frequency + high dynamic range (fish vocalizations)
        if mid_freq > 0.35 and dynamic_range > 0.25:
            bio_score += 3
        
        # Rule B4: High temporal entropy (variable biological sounds)
        if temporal_entropy > 0.65:
            bio_score += 2
        
        # Rule B5: Moderate-to-high spectral centroid (not deep rumbles)
        if 0.15 < spectral_centroid < 0.4:
            bio_score += 1
        
        # === ANTHROPOGENIC RULES ===
        
        # Rule A1: Strong low-frequency dominance + tonal (boats/ships)
        if low_freq > 0.6 and spectral_flatness < 0.1:
            anthro_score += 4
        
        # Rule A2: Very low ACI (constant drone)
        if aci < 0.35:
            anthro_score += 2
            if aci < 0.25:
                anthro_score += 2
        
        # Rule A3: Low temporal entropy + low frequency (machinery)
        if temporal_entropy < 0.5 and low_freq > 0.55:
            anthro_score += 3
        
        # Rule A4: Very tonal signature (engines)
        if spectral_flatness < 0.05:
            anthro_score += 2
        
        # === AMBIENT RULES ===
        
        # Rule AM1: High spectral flatness (noise-like)
        if spectral_flatness > 0.3:
            ambient_score += 3
            if spectral_flatness > 0.5:
                ambient_score += 2
        
        # Rule AM2: Low SNR (diffuse sound)
        if snr < 0.35:
            ambient_score += 2
        
        # Rule AM3: Low dynamic range (constant level)
        if dynamic_range < 0.15:
            ambient_score += 2
        
        # Rule AM4: Balanced frequency distribution (broadband)
        freq_balance = abs(low_freq - 0.33) + abs(mid_freq - 0.33) + abs(high_freq - 0.33)
        if freq_balance < 0.3:  # All frequencies roughly equal
            ambient_score += 2
        
        # === PENALTY RULES (prevent misclassification) ===
        
        # Penalize anthro if high-frequency present (boats are low-freq)
        if high_freq > 0.2:
            anthro_score = max(0, anthro_score - 2)
        
        # Penalize bio if too tonal (biology is more varied)
        if spectral_flatness < 0.08:
            bio_score = max(0, bio_score - 2)
        
        # Determine label based on scores
        scores = {
            'biological': bio_score,
            'anthropogenic': anthro_score,
            'ambient': ambient_score
        }
        
        max_score = max(scores.values())
        
        # Abstain if no clear winner
        if max_score < 3:
            label = 'unknown'
            confidence = 0.2
        else:
            label = max(scores, key=scores.get)
            
            # Confidence based on score margin
            sorted_scores = sorted(scores.values(), reverse=True)
            margin = sorted_scores[0] - sorted_scores[1] if len(sorted_scores) > 1 else sorted_scores[0]
            
            # Higher threshold for confidence
            confidence = min(margin / 6.0, 0.95)
        
        labels.append({
            'cluster_id': cluster_id,
            'auto_label': label,
            'confidence': confidence,
            'bio_score': bio_score,
            'anthro_score': anthro_score,
            'ambient_score': ambient_score
        })
    
    return pd.DataFrame(labels)

In [5]:
def apply_labels_to_clips(df, cluster_labels):
    """Apply cluster labels to all clips."""
    print("\nPropagating labels to clips...")
    
    # Merge cluster labels
    df_labeled = df.merge(
        cluster_labels[['cluster_id', 'auto_label', 'confidence']], 
        left_on='cluster', 
        right_on='cluster_id', 
        how='left'
    )
    
    # Handle noise
    df_labeled.loc[df_labeled['cluster'] == -1, 'auto_label'] = 'unknown'
    df_labeled.loc[df_labeled['cluster'] == -1, 'confidence'] = 0.0
    
    return df_labeled

In [6]:
def filter_by_confidence(df_labeled, min_confidence=0.3):
    """Create high-confidence pseudo-labeled dataset."""
    print(f"\nFiltering by confidence >= {min_confidence}...")
    
    high_conf = df_labeled[df_labeled['confidence'] >= min_confidence].copy()
    
    print(f"High-confidence clips: {len(high_conf):,} / {len(df_labeled):,} ({len(high_conf)/len(df_labeled)*100:.1f}%)")
    
    label_dist = high_conf['auto_label'].value_counts()
    print("\nLabel distribution (high confidence):")
    for label, count in label_dist.items():
        print(f"  {label}: {count:,} ({count/len(high_conf)*100:.1f}%)")
    
    return high_conf

In [7]:
def create_visualizations(cluster_labels, df_labeled, high_conf):
    """Create visualizations of auto-labeling results."""
    PLOTS_DIR.mkdir(parents=True, exist_ok=True)
    
    fig = plt.figure(figsize=(16, 10))
    
    # 1. Cluster label distribution
    ax1 = fig.add_subplot(2, 3, 1)
    label_counts = cluster_labels['auto_label'].value_counts()
    colors_map = {'biological': 'green', 'anthropogenic': 'red', 'ambient': 'blue', 'unknown': 'gray'}
    colors = [colors_map.get(label, 'gray') for label in label_counts.index]
    ax1.bar(range(len(label_counts)), label_counts.values, color=colors)
    ax1.set_xticks(range(len(label_counts)))
    ax1.set_xticklabels(label_counts.index, rotation=45)
    ax1.set_ylabel('Number of Clusters')
    ax1.set_title('Clusters by Auto-Label')
    ax1.grid(axis='y', alpha=0.3)
    
    # 2. Confidence distribution per label
    ax2 = fig.add_subplot(2, 3, 2)
    for label in cluster_labels['auto_label'].unique():
        label_data = cluster_labels[cluster_labels['auto_label'] == label]
        ax2.hist(label_data['confidence'], bins=20, alpha=0.5, label=label)
    ax2.set_xlabel('Confidence')
    ax2.set_ylabel('Frequency')
    ax2.set_title('Confidence Distribution by Label')
    ax2.legend()
    ax2.grid(axis='y', alpha=0.3)
    
    # 3. Clips per label (all vs high-confidence)
    ax3 = fig.add_subplot(2, 3, 3)
    all_counts = df_labeled['auto_label'].value_counts()
    high_counts = high_conf['auto_label'].value_counts()
    
    x = np.arange(len(all_counts))
    width = 0.35
    ax3.bar(x - width/2, all_counts.values, width, label='All clips', alpha=0.7)
    ax3.bar(x + width/2, [high_counts.get(label, 0) for label in all_counts.index], 
            width, label='High confidence', alpha=0.7)
    ax3.set_xticks(x)
    ax3.set_xticklabels(all_counts.index, rotation=45)
    ax3.set_ylabel('Number of Clips')
    ax3.set_title('Clips by Label (All vs High-Conf)')
    ax3.legend()
    ax3.grid(axis='y', alpha=0.3)
    
    # 4. Score distribution
    ax4 = fig.add_subplot(2, 3, 4)
    score_cols = ['bio_score', 'anthro_score', 'ambient_score']
    for col in score_cols:
        ax4.hist(cluster_labels[col], bins=10, alpha=0.5, label=col.replace('_score', ''))
    ax4.set_xlabel('Rule Score')
    ax4.set_ylabel('Frequency')
    ax4.set_title('Rule Scores Distribution')
    ax4.legend()
    
    # 5. Confidence vs cluster size
    ax5 = fig.add_subplot(2, 3, 5)
    scatter = ax5.scatter(cluster_labels['cluster_id'], cluster_labels['confidence'],
                         c=cluster_labels['auto_label'].map(colors_map), alpha=0.6)
    ax5.set_xlabel('Cluster ID')
    ax5.set_ylabel('Confidence')
    ax5.set_title('Confidence per Cluster')
    ax5.axhline(0.5, color='r', linestyle='--', alpha=0.5, label='Threshold')
    ax5.legend()
    
    # 6. Label transitions (sankey-style bar)
    ax6 = fig.add_subplot(2, 3, 6)
    filtered_out = len(df_labeled) - len(high_conf)
    retained_by_label = high_conf['auto_label'].value_counts()
    
    data = [filtered_out] + list(retained_by_label.values)
    labels_bar = ['Filtered\n(low conf)'] + [f'{label}\n(high conf)' for label in retained_by_label.index]
    colors_bar = ['lightgray'] + [colors_map.get(label, 'gray') for label in retained_by_label.index]
    
    ax6.bar(range(len(data)), data, color=colors_bar)
    ax6.set_xticks(range(len(data)))
    ax6.set_xticklabels(labels_bar, rotation=45, ha='right')
    ax6.set_ylabel('Number of Clips')
    ax6.set_title('Filtering by Confidence')
    ax6.grid(axis='y', alpha=0.3)
    
    plt.tight_layout()
    plot_path = PLOTS_DIR / "auto_labeling_overview.png"
    plt.savefig(plot_path, dpi=150, bbox_inches='tight')
    plt.close()
    
    print(f"\nPlot saved: {plot_path.relative_to(REPO_ROOT)}")

In [8]:
def save_results(cluster_labels, df_labeled, high_conf):
    """Save auto-labeled datasets."""
    OUTPUT_BASE.mkdir(parents=True, exist_ok=True)
    
    # Save cluster-level labels
    cluster_labels.to_csv(OUTPUT_BASE / "cluster_auto_labels.csv", index=False)
    print(f"\nCluster labels saved: {OUTPUT_BASE / 'cluster_auto_labels.csv'}")
    
    # Save all clips with labels
    df_labeled.to_parquet(OUTPUT_BASE / "clips_auto_labeled_all.parquet", index=False)
    print(f"All labeled clips: {OUTPUT_BASE / 'clips_auto_labeled_all.parquet'}")
    
    # Save high-confidence subset (for training)
    high_conf.to_parquet(OUTPUT_BASE / "clips_auto_labeled_high_confidence.parquet", index=False)
    print(f"High-confidence clips: {OUTPUT_BASE / 'clips_auto_labeled_high_confidence.parquet'}")
    
    # Save summary
    with open(OUTPUT_BASE / "labeling_summary.txt", 'w') as f:
        f.write("AUTO-LABELING SUMMARY\n")
        f.write("="*60 + "\n\n")
        
        f.write(f"Total clips: {len(df_labeled):,}\n")
        f.write(f"High-confidence clips: {len(high_conf):,} ({len(high_conf)/len(df_labeled)*100:.1f}%)\n\n")
        
        f.write("All clips label distribution:\n")
        for label, count in df_labeled['auto_label'].value_counts().items():
            f.write(f"  {label}: {count:,} ({count/len(df_labeled)*100:.1f}%)\n")
        
        f.write("\nHigh-confidence label distribution:\n")
        for label, count in high_conf['auto_label'].value_counts().items():
            f.write(f"  {label}: {count:,} ({count/len(high_conf)*100:.1f}%)\n")
    
    print(f"Summary saved: {OUTPUT_BASE / 'labeling_summary.txt'}")

In [9]:
def main():
    print("="*60)
    print("AUTO-LABELING WITH WEAK SUPERVISION")
    print("="*60)
    
    # Load data
    df = load_cluster_features()
    
    # Compute cluster feature profiles
    profiles = compute_cluster_feature_profiles(df)
    
    # Apply labeling rules
    cluster_labels = apply_labeling_rules(profiles)
    
    print("\nCluster labeling results:")
    print(cluster_labels.groupby('auto_label').size())
    
    # Apply labels to all clips
    df_labeled = apply_labels_to_clips(df, cluster_labels)
    
    # Filter by confidence
    high_conf = filter_by_confidence(df_labeled, min_confidence=0.5)
    
    # Visualize
    create_visualizations(cluster_labels, df_labeled, high_conf)
    
    # Save
    save_results(cluster_labels, df_labeled, high_conf)
    
    print("\n" + "="*60)
    print("AUTO-LABELING COMPLETE")
    print("="*60)
    print("\nNext step: Train classifier on high-confidence pseudo-labels")
    print(f"  Training data: {len(high_conf):,} clips")


if __name__ == "__main__":
    main()

AUTO-LABELING WITH WEAK SUPERVISION
Loading clustered data with features...
Loaded 786,180 clips with features

Computing cluster feature profiles...

Applying auto-labeling rules...

Cluster labeling results:
auto_label
anthropogenic    177
dtype: int64

Propagating labels to clips...

Filtering by confidence >= 0.5...
High-confidence clips: 19,523 / 786,180 (2.5%)

Label distribution (high confidence):
  anthropogenic: 19,523 (100.0%)

Plot saved: plots/auto_labels/auto_labeling_overview.png

Cluster labels saved: /home/sparch/Uni-stuff/semester-2/applied_Ml/reef_zmsc/data/auto_labels/cluster_auto_labels.csv
All labeled clips: /home/sparch/Uni-stuff/semester-2/applied_Ml/reef_zmsc/data/auto_labels/clips_auto_labeled_all.parquet
High-confidence clips: /home/sparch/Uni-stuff/semester-2/applied_Ml/reef_zmsc/data/auto_labels/clips_auto_labeled_high_confidence.parquet
Summary saved: /home/sparch/Uni-stuff/semester-2/applied_Ml/reef_zmsc/data/auto_labels/labeling_summary.txt

AUTO-LABELING