In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings('ignore')


In [2]:
class Config:
    """Configuration for two-stage detector"""
    
    ROOT_DIR = Path("~/Uni-stuff/semester-2/applied_Ml/reef_zmsc").expanduser()
    
    # Existing Stage 1 model (AMBIENT vs BIO)
    STAGE1_MODEL = ROOT_DIR / "data/autolabeling_fixed/models/classifier.joblib"
    
    # Input data
    CLUSTERED_DATA = ROOT_DIR / "data/clustering/results_50k/clustered_data_kmeans.parquet"
    PREPROCESSED_DATA = ROOT_DIR / "data/features/embeds_preprocessed_50k/preprocessed_features_pca.parquet"
    
    # Output directory
    OUTPUT_DIR = ROOT_DIR / "data/two_stage_detector"
    MODEL_DIR = OUTPUT_DIR / "models"
    RESULTS_DIR = OUTPUT_DIR / "results"
    
    # Feature columns
    FEATURE_COLS = [f"pca_{i}" for i in range(39)]
    
    RANDOM_STATE = 42


In [3]:
class HumanSoundDetector:
    """
    Detects anthropogenic sounds (boats, engines, equipment)
    within clips classified as AMBIENT by Stage 1.
    
    Uses acoustic signatures known for human sounds:
    - Low frequency rumble (20-200 Hz) → boat engines
    - Tonal/harmonic patterns → vessels, machinery
    - Sustained energy → continuous operation
    """
    
    def __init__(self):
        """Initialize with acoustic rules for human sounds"""
        self.rules = self._define_human_rules()
        self.model = None
        self.threshold = 0.6  # Confidence threshold for HUMAN classification
    
    def _define_human_rules(self):
        """
        Define acoustic signatures for human/anthropogenic sounds
        Based on marine acoustic monitoring literature
        """
        rules = {
            'boat_engine': {
                'description': 'Boat/vessel engine (low freq rumble, 20-200 Hz)',
                'pca_signature': {
                    'pca_0': (-2.0, -0.5),  # Very negative (low freq indicator)
                    'pca_2': (-1.5, 1.5),   # Mid-range
                    'variance': (None, 0.4)  # Low variance (sustained)
                }
            },
            'vessel_propeller': {
                'description': 'Vessel propeller (tonal harmonics, 50-500 Hz)',
                'pca_signature': {
                    'pca_0': (-1.5, 0.0),
                    'pca_1': (-1.0, 1.0),
                    'variance': (None, 0.5)
                }
            },
            'machinery': {
                'description': 'Underwater machinery/equipment',
                'pca_signature': {
                    'pca_3': (None, -1.0),  # Specific pattern
                    'pca_4': (None, -1.0),
                    'variance': (None, 0.3)  # Very low variance
                }
            }
        }
        return rules
    
    def calculate_human_score(self, features_df):
        """
        Calculate likelihood that clips are HUMAN sounds
        
        Args:
            features_df: DataFrame with PCA features
        
        Returns:
            Array of human likelihood scores (0-1)
        """
        
        n_clips = len(features_df)
        human_scores = np.zeros(n_clips)
        
        for rule_name, rule_def in self.rules.items():
            rule_match = np.ones(n_clips, dtype=bool)
            
            sig = rule_def['pca_signature']
            
            # Check PCA component ranges
            for pca_col, (min_val, max_val) in sig.items():
                if pca_col == 'variance':
                    # Check overall variance across components
                    variance = features_df[Config.FEATURE_COLS].std(axis=1)
                    if min_val is not None:
                        rule_match &= (variance >= min_val)
                    if max_val is not None:
                        rule_match &= (variance <= max_val)
                else:
                    if pca_col in features_df.columns:
                        vals = features_df[pca_col]
                        if min_val is not None:
                            rule_match &= (vals >= min_val)
                        if max_val is not None:
                            rule_match &= (vals <= max_val)
            
            # Add to score (normalized)
            human_scores += rule_match.astype(float) / len(self.rules)
        
        return human_scores
    
    def find_human_candidates(self, ambient_df, top_n=500):
        """
        From AMBIENT-labeled clips, find most likely HUMAN sounds
        
        Args:
            ambient_df: DataFrame of clips labeled AMBIENT by Stage 1
            top_n: Number of top candidates to extract
        
        Returns:
            DataFrame of likely HUMAN sounds
        """
        
        print(f"\n🔍 Searching for HUMAN sounds in {len(ambient_df):,} AMBIENT clips...")
        
        # Calculate human scores
        human_scores = self.calculate_human_score(ambient_df)
        ambient_df = ambient_df.copy()
        ambient_df['human_score'] = human_scores
        
        # Get top candidates
        human_candidates = ambient_df.nlargest(top_n, 'human_score')
        
        print(f"\n📊 Human score distribution:")
        print(f"   Mean: {human_scores.mean():.3f}")
        print(f"   Std:  {human_scores.std():.3f}")
        print(f"   Max:  {human_scores.max():.3f}")
        print(f"   Top {top_n} scores: {human_candidates['human_score'].min():.3f} - {human_candidates['human_score'].max():.3f}")
        
        # Show sample files for manual verification
        print(f"\n📁 Top 10 HUMAN candidates (for manual verification):")
        for idx, row in human_candidates.head(10).iterrows():
            print(f"   {row['filepath']} (score: {row['human_score']:.3f})")
        
        return human_candidates
    
    def create_training_data(self, ambient_df, human_candidates, negative_sample_size=1000):
        """
        Create training data for HUMAN detector
        
        Args:
            ambient_df: All AMBIENT clips
            human_candidates: Likely HUMAN sounds
            negative_sample_size: How many true AMBIENT to include
        
        Returns:
            X, y for training
        """
        
        print(f"\n📦 Creating HUMAN detector training data...")
        
        # Positive class: Top HUMAN candidates
        human_df = human_candidates.copy()
        human_df['stage2_label'] = 'HUMAN'
        
        # Negative class: Sample from remaining AMBIENT
        # Exclude human candidates
        true_ambient = ambient_df[~ambient_df.index.isin(human_candidates.index)]
        true_ambient_sample = true_ambient.sample(
            min(negative_sample_size, len(true_ambient)),
            random_state=Config.RANDOM_STATE
        )
        true_ambient_sample = true_ambient_sample.copy()
        true_ambient_sample['stage2_label'] = 'AMBIENT'
        
        # Combine
        training_df = pd.concat([human_df, true_ambient_sample], ignore_index=True)
        
        print(f"\n   HUMAN samples: {len(human_df):,}")
        print(f"   AMBIENT samples: {len(true_ambient_sample):,}")
        print(f"   Total: {len(training_df):,}")
        print(f"   Class balance: {len(human_df) / len(training_df):.1%} HUMAN")
        
        return training_df
    
    def train(self, X_train, y_train, X_test, y_test):
        """Train the Stage 2 HUMAN detector"""
        
        print(f"\n{'=' * 60}")
        print(f"TRAINING STAGE 2: HUMAN DETECTOR")
        print(f"{'=' * 60}")
        
        print(f"\n  Training samples: {len(X_train):,}")
        print(f"  Test samples: {len(X_test):,}")
        print(f"\n  Training distribution:")
        for label, count in pd.Series(y_train).value_counts().items():
            print(f"    {label}: {count:,}")
        
        # Train Random Forest (better for imbalanced data)
        self.model = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            min_samples_split=20,
            class_weight='balanced',  # Handle imbalance
            random_state=Config.RANDOM_STATE,
            n_jobs=-1
        )
        
        self.model.fit(X_train, y_train)
        
        # Evaluate
        y_pred = self.model.predict(X_test)
        y_proba = self.model.predict_proba(X_test)
        
        from sklearn.metrics import accuracy_score
        accuracy = accuracy_score(y_test, y_pred)
        
        print(f"\n  ✅ Test Accuracy: {accuracy:.4f}")
        print(f"\n  Classification Report:")
        print(classification_report(y_test, y_pred, digits=3))
        
        # Show confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        print(f"\n  Confusion Matrix:")
        print(f"                Predicted")
        print(f"                AMBIENT  HUMAN")
        print(f"  Actual AMBIENT  {cm[0,0]:5d}  {cm[0,1]:5d}")
        print(f"  Actual HUMAN    {cm[1,0]:5d}  {cm[1,1]:5d}")
        
        return self.model
    
    def predict(self, features, return_proba=False):
        """Predict HUMAN vs AMBIENT"""
        
        if self.model is None:
            raise ValueError("Model not trained! Call train() first.")
        
        if return_proba:
            return self.model.predict_proba(features)
        else:
            return self.model.predict(features)



In [4]:
class TwoStageDetector:
    """
    Complete two-stage hierarchical classifier
    
    Stage 1: Separate BIO from everything else
    Stage 2: Within non-BIO, separate HUMAN from AMBIENT
    """
    
    def __init__(self, stage1_model_path, stage2_detector=None):
        """
        Initialize with Stage 1 model
        
        Args:
            stage1_model_path: Path to Stage 1 classifier (AMBIENT vs BIO)
            stage2_detector: HumanSoundDetector instance (optional)
        """
        
        print(f"\n{'=' * 80}")
        print(f"TWO-STAGE HIERARCHICAL DETECTOR")
        print(f"{'=' * 80}")
        
        # Load Stage 1 model
        print(f"\n📥 Loading Stage 1 model (AMBIENT vs BIO)...")
        self.stage1_model = joblib.load(stage1_model_path)
        print(f"   ✅ Loaded from: {stage1_model_path}")
        
        # Stage 2 detector
        self.stage2_detector = stage2_detector
        if stage2_detector is not None:
            print(f"   ✅ Stage 2 HUMAN detector ready")
        else:
            print(f"   ⚠️  Stage 2 HUMAN detector not provided (will be trained)")
    
    def predict(self, features_df, return_confidence=False):
        """
        Run complete two-stage prediction
        
        Args:
            features_df: DataFrame with PCA features
            return_confidence: If True, also return confidence scores
        
        Returns:
            predictions (and optionally confidences)
        """
        
        X = features_df[Config.FEATURE_COLS].values
        
        # Stage 1: AMBIENT vs BIO
        stage1_pred = self.stage1_model.predict(X)
        stage1_proba = self.stage1_model.predict_proba(X)
        
        # Initialize final predictions
        final_pred = stage1_pred.copy()
        final_confidence = stage1_proba.max(axis=1)
        
        # Stage 2: For AMBIENT clips, check if HUMAN
        if self.stage2_detector is not None and self.stage2_detector.model is not None:
            ambient_mask = (stage1_pred == 'AMBIENT')
            
            if ambient_mask.sum() > 0:
                # Run HUMAN detector on AMBIENT clips
                X_ambient = X[ambient_mask]
                stage2_pred = self.stage2_detector.predict(X_ambient)
                stage2_proba = self.stage2_detector.predict(X_ambient, return_proba=True)
                
                # Update predictions
                final_pred[ambient_mask] = stage2_pred
                
                # Update confidence (min of both stages for HUMAN)
                human_in_stage2 = (stage2_pred == 'HUMAN')
                if human_in_stage2.sum() > 0:
                    stage1_conf = stage1_proba[ambient_mask][human_in_stage2].max(axis=1)
                    stage2_conf = stage2_proba[human_in_stage2].max(axis=1)
                    final_confidence[ambient_mask][human_in_stage2] = np.minimum(stage1_conf, stage2_conf)
        
        if return_confidence:
            return final_pred, final_confidence
        else:
            return final_pred
    
    def predict_batch(self, features_df):
        """Predict with detailed output"""
        
        predictions, confidences = self.predict(features_df, return_confidence=True)
        
        results_df = features_df.copy()
        results_df['predicted_category'] = predictions
        results_df['confidence'] = confidences
        
        return results_df



In [5]:
def main():
    """Build and train two-stage detector"""
    
    print("\n" + "=" * 80)
    print("TWO-STAGE DETECTOR TRAINING PIPELINE")
    print("=" * 80)
    
    # Create output directories
    Config.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    Config.MODEL_DIR.mkdir(parents=True, exist_ok=True)
    Config.RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    
    # ========================================================================
    # LOAD DATA
    # ========================================================================
    
    print("\n" + "=" * 80)
    print("LOADING DATA")
    print("=" * 80)
    
    # Load clustered data
    print(f"\n📥 Loading clustered data...")
    clustered_df = pd.read_parquet(Config.CLUSTERED_DATA)
    print(f"   ✅ {len(clustered_df):,} clips")
    
    # Load PCA features
    print(f"\n📥 Loading PCA features...")
    pca_df = pd.read_parquet(Config.PREPROCESSED_DATA)
    print(f"   ✅ {len(pca_df):,} clips")
    
    # Merge (proper deduplication)
    print(f"\n🔗 Merging...")
    merged_df = clustered_df.merge(
        pca_df,
        on=['filepath', 'logger', 'date'],
        how='inner'
    )
    merged_df = merged_df.drop_duplicates(subset=['filepath'])
    print(f"   ✅ {len(merged_df):,} unique clips")
    
    # ========================================================================
    # STAGE 1: RUN EXISTING MODEL
    # ========================================================================
    
    print("\n" + "=" * 80)
    print("STAGE 1: AMBIENT vs BIO CLASSIFICATION")
    print("=" * 80)
    
    # Load Stage 1 model
    stage1_model = joblib.load(Config.STAGE1_MODEL)
    
    # Predict
    X = merged_df[Config.FEATURE_COLS].values
    stage1_predictions = stage1_model.predict(X)
    
    merged_df['stage1_prediction'] = stage1_predictions
    
    print(f"\n📊 Stage 1 Results:")
    for category, count in pd.Series(stage1_predictions).value_counts().items():
        pct = (count / len(stage1_predictions)) * 100
        print(f"   {category:8s}: {count:6,} ({pct:5.1f}%)")
    
    # ========================================================================
    # STAGE 2: BUILD HUMAN DETECTOR
    # ========================================================================
    
    print("\n" + "=" * 80)
    print("STAGE 2: HUMAN SOUND DETECTION")
    print("=" * 80)
    
    # Initialize HUMAN detector
    human_detector = HumanSoundDetector()
    
    # Get AMBIENT clips from Stage 1
    ambient_clips = merged_df[merged_df['stage1_prediction'] == 'AMBIENT']
    print(f"\n📊 AMBIENT clips from Stage 1: {len(ambient_clips):,}")
    
    # Find HUMAN candidates
    human_candidates = human_detector.find_human_candidates(
        ambient_clips,
        top_n=min(500, int(len(ambient_clips) * 0.05))  # Top 5% or 500
    )
    
    # Create training data
    training_df = human_detector.create_training_data(
        ambient_clips,
        human_candidates,
        negative_sample_size=len(human_candidates) * 2  # 1:2 ratio
    )
    
    # Prepare features
    X = training_df[Config.FEATURE_COLS].values
    y = training_df['stage2_label'].values
    
    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=Config.RANDOM_STATE,
        stratify=y
    )
    
    # Train Stage 2
    human_detector.train(X_train, y_train, X_test, y_test)
    
    # ========================================================================
    # SAVE MODELS
    # ========================================================================
    
    print("\n" + "=" * 80)
    print("SAVING MODELS")
    print("=" * 80)
    
    # Save Stage 2 detector
    stage2_path = Config.MODEL_DIR / "stage2_human_detector.joblib"
    joblib.dump(human_detector, stage2_path)
    print(f"\n  ✅ Stage 2 detector: {stage2_path}")
    
    # Create and save complete pipeline
    two_stage = TwoStageDetector(Config.STAGE1_MODEL, human_detector)
    pipeline_path = Config.MODEL_DIR / "two_stage_pipeline.joblib"
    joblib.dump(two_stage, pipeline_path)
    print(f"  ✅ Complete pipeline: {pipeline_path}")
    
    # ========================================================================
    # TEST ON FULL DATASET
    # ========================================================================
    
    print("\n" + "=" * 80)
    print("TESTING ON FULL DATASET")
    print("=" * 80)
    
    # Run two-stage prediction
    final_predictions = two_stage.predict(merged_df)
    
    print(f"\n📊 Final Classification Results:")
    for category, count in pd.Series(final_predictions).value_counts().items():
        pct = (count / len(final_predictions)) * 100
        print(f"   {category:8s}: {count:6,} ({pct:5.1f}%)")
    
    # Save results
    merged_df['final_category'] = final_predictions
    results_path = Config.RESULTS_DIR / "two_stage_predictions.parquet"
    merged_df.to_parquet(results_path, index=False)
    print(f"\n  ✅ Results saved: {results_path}")
    
    # Summary
    summary_path = Config.RESULTS_DIR / "summary.txt"
    with open(summary_path, 'w') as f:
        f.write("Two-Stage Detector Summary\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Total clips: {len(merged_df):,}\n\n")
        f.write("Stage 1 (AMBIENT vs BIO):\n")
        for cat, count in merged_df['stage1_prediction'].value_counts().items():
            f.write(f"  {cat}: {count:,}\n")
        f.write("\nFinal (with HUMAN detection):\n")
        for cat, count in merged_df['final_category'].value_counts().items():
            f.write(f"  {cat}: {count:,}\n")
    
    print(f"  ✅ Summary: {summary_path}")
    
    print("\n" + "=" * 80)
    print("✅ TWO-STAGE DETECTOR COMPLETE!")
    print("=" * 80)
    
    print(f"\n📦 Outputs:")
    print(f"   • Stage 2 detector: {stage2_path}")
    print(f"   • Complete pipeline: {pipeline_path}")
    print(f"   • Predictions: {results_path}")
    
    print(f"\n🎯 Next steps:")
    print(f"   1. Review HUMAN candidates manually")
    print(f"   2. Listen to top-scored files")
    print(f"   3. Refine rules if needed")
    print(f"   4. Deploy for real-time monitoring")


if __name__ == "__main__":
    main()


TWO-STAGE DETECTOR TRAINING PIPELINE

LOADING DATA

📥 Loading clustered data...
   ✅ 50,000 clips

📥 Loading PCA features...
   ✅ 50,000 clips

🔗 Merging...
   ✅ 15,392 unique clips

STAGE 1: AMBIENT vs BIO CLASSIFICATION

📊 Stage 1 Results:
   AMBIENT : 14,524 ( 94.4%)
   BIO     :    868 (  5.6%)

STAGE 2: HUMAN SOUND DETECTION

📊 AMBIENT clips from Stage 1: 14,524

🔍 Searching for HUMAN sounds in 14,524 AMBIENT clips...

📊 Human score distribution:
   Mean: 0.000
   Std:  0.000
   Max:  0.000
   Top 500 scores: 0.000 - 0.000

📁 Top 10 HUMAN candidates (for manual verification):
   /home/sparch/Uni-stuff/semester-2/applied_Ml/reef_zmsc/data/wav/PAPCA_test/2802/20080226/wav/47C3ED9D.wav (score: 0.000)
   /home/sparch/Uni-stuff/semester-2/applied_Ml/reef_zmsc/data/wav/PAPCA_test/2802/20080226/wav/47C3E311.wav (score: 0.000)
   /home/sparch/Uni-stuff/semester-2/applied_Ml/reef_zmsc/data/wav/PAPCA_test/2802/20080226/wav/47C3E695.wav (score: 0.000)
   /home/sparch/Uni-stuff/semester-2/ap