In [24]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib
import warnings
import gc
warnings.filterwarnings('ignore')

In [25]:
class Config:
    """Configuration with fixed settings"""
    
    ROOT_DIR = Path("~/Uni-stuff/semester-2/applied_Ml/reef_zmsc").expanduser()
    
    CLUSTERED_DATA = ROOT_DIR / "data/clustering/results_50k/clustered_data_kmeans.parquet"
    PREPROCESSED_DATA = ROOT_DIR / "data/features/embeds_preprocessed_50k/preprocessed_features_pca.parquet"
    
    OUTPUT_DIR = ROOT_DIR / "data/autolabeling_fixed"
    MODEL_DIR = OUTPUT_DIR / "models"
    RESULTS_DIR = OUTPUT_DIR / "results"
    
    # FIXED SETTINGS
    CONFIDENCE_THRESHOLD = 0.6  # Changed from 0.7 to 0.6 (more inclusive)
    USE_SAMPLE = False  # Use full dataset
    
    FEATURE_COLS = [f"pca_{i}" for i in range(39)]
    RANDOM_STATE = 42


In [32]:
class ImprovedRuleEngine:
    """
    Refined rules based on your cluster characteristics:
    - Cluster 0: 278,988 clips (96.5%) - Dominant cluster
    - Cluster 1: 2,330 clips (0.8%) - Small cluster  
    - Cluster 2: 4,198 clips (1.5%) - Small cluster
    """
    
    def __init__(self):
        print("\n📋 Improved Rule Engine")
        print("   Rules tuned for your cluster distribution")
    
    def label_cluster_refined(self, cluster_id, cluster_pca_features):
        """
        Refined heuristics based on cluster size and PCA patterns
        """
        
        n_clips = len(cluster_pca_features)
        pca_means = cluster_pca_features[Config.FEATURE_COLS].mean()
        pca_stds = cluster_pca_features[Config.FEATURE_COLS].std()
        
        scores = {}
        
        # Cluster 0: DOMINANT cluster (96.5% of data)
        # Most likely AMBIENT (baseline reef sounds)
        if cluster_id == 0:
            if n_clips > 100000:  # Very large cluster
                scores['AMBIENT'] = 0.85  # High confidence for dominant cluster
                scores['BIO'] = 0.10
                scores['HUMAN'] = 0.05
            else:
                scores['AMBIENT'] = 0.70
                scores['BIO'] = 0.20
                scores['HUMAN'] = 0.10
        
        # Cluster 1 & 2: SMALL clusters (outliers/specific sounds)
        # More likely biological or anthropogenic
        else:
            # Check PCA patterns
            high_variance = pca_stds.mean() > 0.5
            pca0_positive = pca_means[0] > 0
            pca0_negative = pca_means[0] < 0
            
            # High variance + positive PCA0 → likely biological
            if high_variance and pca0_positive:
                scores['BIO'] = 0.75
                scores['HUMAN'] = 0.15
                scores['AMBIENT'] = 0.10
            
            # Low variance + negative PCA0 → likely anthropogenic
            elif not high_variance and pca0_negative:
                scores['HUMAN'] = 0.75
                scores['BIO'] = 0.10
                scores['AMBIENT'] = 0.15
            
            # Default for small clusters
            else:
                if cluster_id == 1:
                    scores['BIO'] = 0.65
                    scores['HUMAN'] = 0.25
                    scores['AMBIENT'] = 0.10
                else:  # cluster_id == 2
                    scores['BIO'] = 0.70
                    scores['HUMAN'] = 0.20
                    scores['AMBIENT'] = 0.10
        
        best_category = max(scores, key=scores.get)
        confidence = scores[best_category]
        
        return best_category, confidence, scores

In [27]:
def load_data_fixed():
    """Load data with proper merge to avoid duplicates"""
    
    print("\n" + "=" * 80)
    print("LOADING DATA (FIXED)")
    print("=" * 80)
    
    # Load clustered data
    print(f"\n📥 Loading clustered data...")
    clustered_df = pd.read_parquet(Config.CLUSTERED_DATA)
    print(f"   ✅ {len(clustered_df):,} clips")
    print(f"   Columns: {list(clustered_df.columns)}")
    
    # Load PCA features  
    print(f"\n📥 Loading PCA features...")
    pca_df = pd.read_parquet(Config.PREPROCESSED_DATA)
    print(f"   ✅ {len(pca_df):,} clips")
    print(f"   Columns: {list(pca_df.columns)}")
    
    # Check for common columns
    common_cols = set(clustered_df.columns) & set(pca_df.columns)
    print(f"\n🔍 Common columns: {common_cols}")
    
    # Proper merge on ALL common identifying columns
    merge_cols = ['filepath']
    if 'logger' in common_cols:
        merge_cols.append('logger')
    if 'date' in common_cols:
        merge_cols.append('date')
    
    print(f"\n🔗 Merging on: {merge_cols}")
    
    # Merge with inner join (no duplicates)
    merged_df = clustered_df.merge(
        pca_df,
        on=merge_cols,
        how='inner',
        suffixes=('', '_pca')
    )
    
    print(f"   ✅ Merged: {len(merged_df):,} clips")
    
    # Sanity check
    if len(merged_df) > len(clustered_df):
        print(f"\n⚠️  WARNING: Merge created duplicates!")
        print(f"   Expected: {len(clustered_df):,}")
        print(f"   Got: {len(merged_df):,}")
        print(f"   Removing duplicates...")
        merged_df = merged_df.drop_duplicates(subset=['filepath'])
        print(f"   ✅ After dedup: {len(merged_df):,} clips")
    
    # Verify cluster distribution
    print(f"\n📊 Cluster distribution:")
    cluster_counts = merged_df['cluster'].value_counts().sort_index()
    for cluster_id, count in cluster_counts.items():
        pct = (count / len(merged_df)) * 100
        print(f"   Cluster {cluster_id}: {count:6,} clips ({pct:5.1f}%)")
    
    # Memory cleanup
    del clustered_df, pca_df
    gc.collect()
    
    return merged_df


In [28]:
def auto_label_clusters_fixed(merged_df):
    """Auto-label with improved rules"""
    
    print("\n" + "=" * 80)
    print("AUTO-LABELING CLUSTERS (IMPROVED)")
    print("=" * 80)
    
    rule_engine = ImprovedRuleEngine()
    cluster_labels = []
    
    unique_clusters = sorted(merged_df['cluster'].unique())
    print(f"\n📊 Found {len(unique_clusters)} clusters")
    
    for cluster_id in unique_clusters:
        print(f"\n{'─' * 60}")
        print(f"Cluster {cluster_id}")
        print(f"{'─' * 60}")
        
        cluster_data = merged_df[merged_df['cluster'] == cluster_id]
        n_clips = len(cluster_data)
        pct = (n_clips / len(merged_df)) * 100
        print(f"  Clips: {n_clips:,} ({pct:.1f}% of dataset)")
        
        # Sample large clusters for analysis
        if n_clips > 5000:
            print(f"  ⚠️  Large cluster, sampling 5000 for rule evaluation")
            cluster_sample = cluster_data.sample(5000, random_state=42)
        else:
            cluster_sample = cluster_data
        
        # Apply improved rules
        category, confidence, scores = rule_engine.label_cluster_refined(
            cluster_id, cluster_sample
        )
        
        print(f"\n  Rule Scores:")
        for cat, score in sorted(scores.items(), key=lambda x: -x[1]):
            bar = '█' * int(score * 40)
            print(f"    {cat:8s}: {score:.3f} {bar}")
        
        print(f"\n  ✅ ASSIGNED: {category}")
        print(f"     CONFIDENCE: {confidence:.3f}")
        
        if confidence > Config.CONFIDENCE_THRESHOLD:
            status = "✅ HIGH (included)"
        else:
            status = "⚠️  LOW (excluded)"
        print(f"     STATUS: {status}")
        
        cluster_labels.append({
            'cluster': cluster_id,
            'category': category,
            'confidence': confidence,
            'n_clips': n_clips,
            'pct_of_data': pct,
            **{f'score_{cat}': score for cat, score in scores.items()}
        })
    
    cluster_labels_df = pd.DataFrame(cluster_labels)
    
    # Summary
    print("\n" + "=" * 80)
    print("CLUSTER LABELING SUMMARY")
    print("=" * 80)
    
    high_conf = cluster_labels_df[cluster_labels_df['confidence'] > Config.CONFIDENCE_THRESHOLD]
    low_conf = cluster_labels_df[cluster_labels_df['confidence'] <= Config.CONFIDENCE_THRESHOLD]
    
    print(f"\n  High confidence (c > {Config.CONFIDENCE_THRESHOLD}): {len(high_conf)} clusters")
    print(f"  Low confidence: {len(low_conf)} clusters")
    
    # Calculate coverage
    high_conf_clips = high_conf['n_clips'].sum()
    total_clips = cluster_labels_df['n_clips'].sum()
    coverage = (high_conf_clips / total_clips) * 100
    
    print(f"\n  📊 Training data coverage: {high_conf_clips:,} / {total_clips:,} ({coverage:.1f}%)")
    
    if len(low_conf) > 0:
        print(f"\n  ⚠️  Excluded clusters:")
        for _, row in low_conf.iterrows():
            print(f"     Cluster {row['cluster']}: {row['category']} (c={row['confidence']:.3f}, {row['n_clips']:,} clips)")
    
    # Check if we have enough classes
    categories_included = high_conf['category'].unique()
    print(f"\n  📋 Categories in training: {list(categories_included)}")
    
    if len(categories_included) < 2:
        print(f"\n  ❌ ERROR: Only 1 category after filtering!")
        print(f"     Cannot train classifier with single class")
        print(f"  💡 SOLUTION: Lowering confidence threshold to include more clusters")
        
        # Emergency: lower threshold
        emergency_threshold = 0.5
        print(f"\n  🚨 Using emergency threshold: {emergency_threshold}")
        high_conf = cluster_labels_df[cluster_labels_df['confidence'] > emergency_threshold]
        print(f"     Now have {len(high_conf)} clusters")
        print(f"     Categories: {list(high_conf['category'].unique())}")
        
        Config.CONFIDENCE_THRESHOLD = emergency_threshold
        
        if len(high_conf['category'].unique()) < 2:
            print(f"\n  ❌ Still only 1 category. Using ALL clusters regardless of confidence.")
            high_conf = cluster_labels_df
    
    return cluster_labels_df, high_conf


In [30]:
def train_classifier_fixed(merged_df, cluster_labels_df):
    """Train with safeguards against single-class issue"""
    
    print("\n" + "=" * 80)
    print("TRAINING CLASSIFIER")
    print("=" * 80)
    
    # Merge labels
    training_df = merged_df.merge(
        cluster_labels_df[['cluster', 'category', 'confidence']],
        on='cluster'
    )
    
    # Filter by confidence
    training_df = training_df[training_df['confidence'] > Config.CONFIDENCE_THRESHOLD]
    
    print(f"\n  Total training samples: {len(training_df):,}")
    
    # Check category distribution
    category_counts = training_df['category'].value_counts()
    print(f"\n  Category distribution:")
    for cat, count in category_counts.items():
        pct = (count / len(training_df)) * 100
        print(f"    {cat:8s}: {count:6,} ({pct:5.1f}%)")
    
    # Ensure we have at least 2 classes
    if len(category_counts) < 2:
        print(f"\n  ❌ ERROR: Only {len(category_counts)} category!")
        print(f"     Cannot train classifier")
        print(f"\n  💡 WORKAROUND: Using pseudo-binary classification")
        print(f"     Will treat all data as positive class '{category_counts.index[0]}'")
        print(f"     and create synthetic negative examples")
        
        # Create synthetic negative class by perturbing features
        positive_class = category_counts.index[0]
        synthetic_df = training_df.copy()
        
        # Add noise to PCA features
        for col in Config.FEATURE_COLS:
            noise = np.random.normal(0, 0.5, len(synthetic_df))
            synthetic_df[col] = synthetic_df[col] + noise
        
        synthetic_df['category'] = 'SYNTHETIC_NEGATIVE'
        synthetic_df['confidence'] = 0.5
        
        training_df = pd.concat([training_df, synthetic_df], ignore_index=True)
        
        print(f"  ✅ Added {len(synthetic_df):,} synthetic negative examples")
        print(f"  New total: {len(training_df):,}")
    
    # Prepare data
    X = training_df[Config.FEATURE_COLS].values
    y = training_df['category'].values
    sample_weights = training_df['confidence'].values
    
    # Split
    X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
        X, y, sample_weights,
        test_size=0.2,
        random_state=Config.RANDOM_STATE,
        stratify=y
    )
    
    print(f"\n  Train: {len(X_train):,} | Test: {len(X_test):,}")
    
    # Train
    print(f"\n{'─' * 60}")
    print(f"Training: Logistic Regression")
    print(f"{'─' * 60}")
    
    model = LogisticRegression(
        penalty='l2',
        solver='lbfgs',
        max_iter=500,
        random_state=Config.RANDOM_STATE,
        multi_class='multinomial'
    )
    
    model.fit(X_train, y_train, sample_weight=w_train)
    
    # Evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f"\n  ✅ Test Accuracy: {accuracy:.4f}")
    print(f"\n  Classification Report:")
    print(classification_report(y_test, y_pred, digits=3))
    
    return model, training_df


def save_results_fixed(model, cluster_labels_df, training_df):
    """Save results"""
    
    print("\n" + "=" * 80)
    print("SAVING RESULTS")
    print("=" * 80)
    
    Config.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    Config.MODEL_DIR.mkdir(parents=True, exist_ok=True)
    Config.RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    
    # Save cluster labels
    cluster_path = Config.RESULTS_DIR / "cluster_labels.csv"
    cluster_labels_df.to_csv(cluster_path, index=False)
    print(f"\n  ✅ Cluster labels: {cluster_path}")
    
    # Save model
    model_path = Config.MODEL_DIR / "classifier.joblib"
    joblib.dump(model, model_path)
    print(f"  ✅ Model: {model_path}")
    
    # Save training summary
    summary_path = Config.RESULTS_DIR / "training_summary.txt"
    with open(summary_path, 'w') as f:
        f.write("Training Summary\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Total clips: {len(training_df):,}\n")
        f.write(f"\nCategory distribution:\n")
        for cat, count in training_df['category'].value_counts().items():
            pct = (count / len(training_df)) * 100
            f.write(f"  {cat}: {count:,} ({pct:.1f}%)\n")
        f.write(f"\nMean confidence: {training_df['confidence'].mean():.3f}\n")
        f.write(f"Clusters used: {training_df['cluster'].nunique()}\n")
    
    print(f"  ✅ Summary: {summary_path}")
    print(f"\n📁 Results saved to: {Config.OUTPUT_DIR}")

In [33]:
def main():
    """Run fixed pipeline"""
    
    print("\n" + "=" * 80)
    print("🔧 FIXED AUTO-LABELING PIPELINE")
    print("=" * 80)
    
    print(f"\n⚙️  Configuration:")
    print(f"   Confidence threshold: >{Config.CONFIDENCE_THRESHOLD}")
    print(f"   Full dataset: {not Config.USE_SAMPLE}")
    
    try:
        # Load data with proper merge
        merged_df = load_data_fixed()
        
        # Auto-label with improved rules
        cluster_labels_df, high_conf = auto_label_clusters_fixed(merged_df)
        
        # Train classifier with safeguards
        model, training_df = train_classifier_fixed(merged_df, high_conf)
        
        # Save results
        save_results_fixed(model, cluster_labels_df, training_df)
        
        print("\n" + "=" * 80)
        print("✅ PIPELINE COMPLETE!")
        print("=" * 80)
        
        print(f"\n📊 Final Results:")
        print(f"   Clusters labeled: {len(cluster_labels_df)}")
        print(f"   Training clips: {len(training_df):,}")
        print(f"   Categories: {list(training_df['category'].unique())}")
        print(f"   Files saved to: {Config.OUTPUT_DIR}")
        
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()


🔧 FIXED AUTO-LABELING PIPELINE

⚙️  Configuration:
   Confidence threshold: >0.6
   Full dataset: True

LOADING DATA (FIXED)

📥 Loading clustered data...
   ✅ 50,000 clips
   Columns: ['filepath', 'start_s', 'end_s', 'logger', 'date', 'cluster']

📥 Loading PCA features...
   ✅ 50,000 clips
   Columns: ['filepath', 'start_s', 'end_s', 'logger', 'date', 'pca_0', 'pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7', 'pca_8', 'pca_9', 'pca_10', 'pca_11', 'pca_12', 'pca_13', 'pca_14', 'pca_15', 'pca_16', 'pca_17', 'pca_18', 'pca_19', 'pca_20', 'pca_21', 'pca_22', 'pca_23', 'pca_24', 'pca_25', 'pca_26', 'pca_27', 'pca_28', 'pca_29', 'pca_30', 'pca_31', 'pca_32', 'pca_33', 'pca_34', 'pca_35', 'pca_36', 'pca_37', 'pca_38', 'umap_x', 'umap_y']

🔍 Common columns: {'end_s', 'date', 'start_s', 'filepath', 'logger'}

🔗 Merging on: ['filepath', 'logger', 'date']
   ✅ Merged: 285,516 clips

   Expected: 50,000
   Got: 285,516
   Removing duplicates...
   ✅ After dedup: 15,392 clips

📊 Clust