In [9]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, cohen_kappa_score
import joblib
import warnings
warnings.filterwarnings('ignore')


In [10]:
class Config:
    """Paths and constants shared by the validation tests."""

    # Project root; the leading "~" is expanded to the current user's home.
    ROOT_DIR = Path("~/Uni-stuff/semester-2/applied_Ml/reef_zmsc").expanduser()

    # --- Inputs -----------------------------------------------------------
    # Clips evaluated by the robustness test (Test 1)
    TEST_PREDICTIONS = ROOT_DIR.joinpath("data/model_testing/results/test_predictions.csv")

    # Serialized stage-1 classifier
    STAGE1_MODEL = ROOT_DIR.joinpath("data/autolabeling_fixed/models/classifier.joblib")

    # Tables merged into the training set of the model-agreement test (Test 2)
    TRAINING_DATA = ROOT_DIR.joinpath("data/autolabeling_fixed/results/cluster_labels.csv")
    CLUSTERED_DATA = ROOT_DIR.joinpath("data/clustering/results_50k/clustered_data_kmeans.parquet")
    PREPROCESSED_DATA = ROOT_DIR.joinpath(
        "data/features/embeds_preprocessed_50k/preprocessed_features_pca.parquet"
    )

    # --- Outputs ----------------------------------------------------------
    OUTPUT_DIR = ROOT_DIR.joinpath("data/model_validation")

    # --- Constants --------------------------------------------------------
    # Seed passed to the estimators and to the train/test split
    RANDOM_STATE = 42
    # Feature column names: pca_0 ... pca_38
    FEATURE_COLS = [f"pca_{i}" for i in range(39)]


In [11]:
class RobustnessTest:
    """
    Test model robustness to small perturbations of its input features.

    The perturbations (Gaussian noise, scaling, constant shift, random
    zeroing) are applied directly to the feature vectors handed to
    ``model.predict``; no audio is processed here. A robust model should
    predict the same category for the original and the perturbed features.
    """

    # Perturbations exercised by test_consistency, in reporting order
    PERTURBATION_TYPES = ('noise', 'scale', 'shift', 'dropout')
    # Strengths tried for every perturbation unless the caller overrides them
    DEFAULT_STRENGTHS = (0.01, 0.05, 0.10)

    def __init__(self, model, random_state=None):
        """
        Args:
            model: Fitted estimator exposing ``predict(X)``
            random_state: Optional int seed for the random perturbations.
                ``None`` (default) draws from NumPy's global random state,
                so ``np.random.seed(...)`` still controls the results.
        """
        self.model = model
        # np.random (module) and RandomState expose the same sampling
        # methods, so either can serve as the generator below.
        self.rng = np.random if random_state is None else np.random.RandomState(random_state)

    def add_noise(self, features, noise_level=0.05):
        """
        Add zero-mean Gaussian noise to features

        The noise standard deviation of each feature column is
        ``noise_level`` times that column's standard deviation in
        ``features``, so the perturbation is comparable across features
        with different scales.

        Args:
            features: Feature array, typically (n_samples, n_features)
            noise_level: Std of noise relative to feature std

        Returns:
            Noisy features
        """
        feature_std = np.std(np.asarray(features, dtype=float), axis=0)
        # Constant columns (or a single-row batch) have std 0; fall back to
        # unit scale so they still receive noise instead of none at all.
        feature_std = np.where(feature_std > 0, feature_std, 1.0)
        noise = self.rng.standard_normal(np.shape(features)) * noise_level * feature_std
        return features + noise

    def scale_features(self, features, scale_factor=1.1):
        """
        Multiply every feature by a constant factor

        Args:
            features: Feature array
            scale_factor: Multiplication factor

        Returns:
            Scaled features
        """
        return features * scale_factor

    def perturb_features(self, features, perturbation_type='noise', strength=0.05):
        """
        Apply one of the supported perturbations to features

        Args:
            features: Feature array
            perturbation_type: One of 'noise', 'scale', 'shift', 'dropout'
            strength: Perturbation strength. Relative noise std for 'noise',
                relative change (factor ``1 + strength``) for 'scale',
                absolute offset for 'shift', and the probability of zeroing
                each value for 'dropout'.

        Returns:
            Perturbed features

        Raises:
            ValueError: If ``perturbation_type`` is not supported
        """
        if perturbation_type == 'noise':
            return self.add_noise(features, strength)

        if perturbation_type == 'scale':
            return self.scale_features(features, 1 + strength)

        if perturbation_type == 'shift':
            # Shift all features by the same small amount
            return features + strength

        if perturbation_type == 'dropout':
            # Keep each value with probability (1 - strength), zero the rest
            keep_mask = self.rng.random_sample(np.shape(features)) > strength
            return features * keep_mask

        raise ValueError(f"Unknown perturbation: {perturbation_type}")

    def test_consistency(self, features_df, n_tests=5, feature_cols=None, strengths=None):
        """
        Test model consistency under multiple perturbations

        For every perturbation type and strength the features are perturbed
        ``n_tests`` times, and the fraction of samples whose prediction is
        unchanged is recorded.

        Args:
            features_df: DataFrame containing the feature columns
            n_tests: Number of perturbation runs per (type, strength) pair
            feature_cols: Feature column names; defaults to
                ``Config.FEATURE_COLS``
            strengths: Iterable of strengths; defaults to
                ``DEFAULT_STRENGTHS``

        Returns:
            Dictionary with 'overall_robustness' (mean agreement over all
            types and strengths) and 'by_perturbation' (per type, a list of
            {'strength', 'mean', 'std'} dicts)

        Raises:
            ValueError: If ``n_tests`` < 1 or ``features_df`` has no rows
        """
        if n_tests < 1:
            raise ValueError(f"n_tests must be at least 1, got {n_tests}")
        if feature_cols is None:
            feature_cols = Config.FEATURE_COLS
        if strengths is None:
            strengths = self.DEFAULT_STRENGTHS

        X = features_df[list(feature_cols)].values
        if len(X) == 0:
            # An empty batch would otherwise yield NaN agreement scores
            raise ValueError("features_df is empty; nothing to test")

        print("\n" + "=" * 80)
        print("TEST 1: CONSISTENCY / ROBUSTNESS TESTING")
        print("=" * 80)

        print(f"\n📊 Testing {len(features_df)} clips with {n_tests} perturbations each")

        # Reference predictions on the unperturbed features
        original_preds = self.model.predict(X)

        results = {}

        for pert_type in self.PERTURBATION_TYPES:
            print(f"\n{'─' * 60}")
            print(f"Testing: {pert_type.upper()}")
            print(f"{'─' * 60}")

            agreements = []

            for strength in strengths:
                # Fraction of unchanged predictions for each repeated run
                test_agreements = []
                for _ in range(n_tests):
                    X_perturbed = self.perturb_features(X, pert_type, strength)
                    perturbed_preds = self.model.predict(X_perturbed)
                    test_agreements.append(float(np.mean(original_preds == perturbed_preds)))

                mean_agreement = float(np.mean(test_agreements))
                std_agreement = float(np.std(test_agreements))

                agreements.append({
                    'strength': strength,
                    'mean': mean_agreement,
                    'std': std_agreement
                })

                print(f"   Strength {strength:.2f}: {mean_agreement:.3f} ± {std_agreement:.3f} agreement")

            results[pert_type] = agreements

        # Overall summary
        print("\n" + "=" * 80)
        print("ROBUSTNESS SUMMARY")
        print("=" * 80)

        # Unweighted mean over every (type, strength) combination
        overall_robustness = float(np.mean([
            entry['mean']
            for entries in results.values()
            for entry in entries
        ]))

        print(f"\n📊 Overall Robustness Score: {overall_robustness:.3f}")

        if overall_robustness > 0.95:
            print("   🌟 EXCELLENT! Model is very robust")
            print("      Predictions are stable under perturbations")
        elif overall_robustness > 0.85:
            print("   ✅ GOOD! Model is reasonably robust")
            print("      Most predictions remain stable")
        elif overall_robustness > 0.75:
            print("   ⚠️  FAIR. Model has moderate robustness")
            print("      Some sensitivity to perturbations")
        else:
            print("   ❌ POOR. Model is not robust")
            print("      Predictions change easily with small changes")

        print("\n💡 Interpretation:")
        print("   • High robustness (>0.9) = Model learned meaningful patterns")
        print("   • Low robustness (<0.8) = Model may be overfitting to noise")

        return {
            'overall_robustness': overall_robustness,
            'by_perturbation': results
        }

In [12]:
class ModelAgreementTest:
    """
    Train multiple different models on same data
    Check if they agree on predictions

    High agreement = Features and labels are good
    Low agreement = Problems with data or labels
    """

    def __init__(self):
        # name -> {'model', 'train_acc', 'test_acc', 'train_pred', 'test_pred'}
        self.models = {}
        # name -> test-set predictions (filled by train_all_models)
        self.predictions = {}

    def create_models(self):
        """
        Create diverse set of models
        Different algorithms, different biases

        Returns:
            Dictionary mapping a display name to an unfitted estimator
        """

        models = {
            # multi_class is deliberately left at its default: the argument
            # is deprecated in recent scikit-learn releases, and with the
            # lbfgs solver the default already fits a multinomial model for
            # multi-class targets.
            'Logistic Regression': LogisticRegression(
                penalty='l2',
                solver='lbfgs',
                max_iter=500,
                random_state=Config.RANDOM_STATE
            ),

            'Random Forest': RandomForestClassifier(
                n_estimators=100,
                max_depth=10,
                random_state=Config.RANDOM_STATE,
                n_jobs=-1
            ),

            'Gradient Boosting': GradientBoostingClassifier(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.1,
                random_state=Config.RANDOM_STATE
            ),

            'SVM': SVC(
                kernel='rbf',
                C=1.0,
                random_state=Config.RANDOM_STATE
            ),

            'Neural Network': MLPClassifier(
                hidden_layer_sizes=(50, 30),
                max_iter=500,
                random_state=Config.RANDOM_STATE
            )
        }

        return models

    def train_all_models(self, X_train, y_train, X_test, y_test):
        """
        Train all models on same data

        Args:
            X_train, y_train: Training data
            X_test, y_test: Test data

        Returns:
            Dictionary mapping model name to a dict with the fitted 'model',
            'train_acc', 'test_acc', 'train_pred' and 'test_pred'. The same
            dictionary is stored on ``self.models``.
        """

        # Build the estimators once; the count below reuses this dict
        models = self.create_models()

        print("\n" + "=" * 80)
        print("TEST 2: MODEL AGREEMENT TESTING")
        print("=" * 80)

        print(f"\n📊 Training {len(models)} different models...")
        print(f"   Training samples: {len(X_train):,}")
        print(f"   Test samples: {len(X_test):,}")

        results = {}

        for name, model in models.items():
            print(f"\n{'─' * 60}")
            print(f"Training: {name}")
            print(f"{'─' * 60}")

            # Train
            model.fit(X_train, y_train)

            # Predict
            train_pred = model.predict(X_train)
            test_pred = model.predict(X_test)

            # Evaluate
            train_acc = accuracy_score(y_train, train_pred)
            test_acc = accuracy_score(y_test, test_pred)

            print(f"   Train accuracy: {train_acc:.4f}")
            print(f"   Test accuracy:  {test_acc:.4f}")

            # More than 10 points lost between train and test is flagged
            if test_acc < train_acc - 0.1:
                print("   ⚠️  Significant drop (overfitting?)")
            else:
                print("   ✅ Good generalization")

            results[name] = {
                'model': model,
                'train_acc': train_acc,
                'test_acc': test_acc,
                'train_pred': train_pred,
                'test_pred': test_pred
            }

        self.models = results
        self.predictions = {name: entry['test_pred'] for name, entry in results.items()}
        return results

    @staticmethod
    def _vote_fractions(prediction_arrays):
        """Per sample, the share of models that voted for the most popular label."""
        stacked = np.stack(prediction_arrays)  # (n_models, n_samples)
        # Encode labels as integers so votes can be counted with array ops;
        # the flattened form keeps the inverse 1-D on every NumPy version.
        _, codes = np.unique(stacked.ravel(), return_inverse=True)
        codes = codes.reshape(stacked.shape)

        top_votes = np.zeros(stacked.shape[1], dtype=int)
        for label_code in range(int(codes.max()) + 1):
            top_votes = np.maximum(top_votes, (codes == label_code).sum(axis=0))
        return top_votes / stacked.shape[0]

    def analyze_agreement(self, X_test):
        """
        Analyze agreement between different models

        Uses the test-set predictions stored by ``train_all_models``.

        Args:
            X_test: Test features (only its length is used)

        Returns:
            Agreement metrics: 'mean_agreement', 'agreement_matrix',
            'unanimous', 'majority', 'split' and 'mean_kappa'

        Raises:
            ValueError: If fewer than two models have been trained, if
                ``X_test`` is empty, or if a model's stored predictions do
                not match ``len(X_test)``
        """

        model_names = list(self.models.keys())
        n_models = len(model_names)
        if n_models < 2:
            raise ValueError(
                "analyze_agreement needs at least two trained models; "
                "call train_all_models() first"
            )

        n_samples = len(X_test)
        if n_samples == 0:
            raise ValueError("X_test is empty; cannot analyze agreement")

        # Get all predictions
        all_predictions = {name: np.asarray(self.models[name]['test_pred'])
                           for name in model_names}
        for name, preds in all_predictions.items():
            if len(preds) != n_samples:
                raise ValueError(
                    f"{name} has {len(preds)} test predictions but X_test has {n_samples} rows"
                )

        print("\n" + "=" * 80)
        print("MODEL AGREEMENT ANALYSIS")
        print("=" * 80)

        # Pairwise agreement
        print("\n📊 Pairwise Agreement Matrix:")
        print(f"\n{'Model':<20}", end='')
        for name in model_names:
            print(f"{name[:12]:<14}", end='')
        print()
        print(f"{'-' * (20 + 14 * n_models)}")

        agreement_matrix = np.zeros((n_models, n_models))

        for i, name1 in enumerate(model_names):
            print(f"{name1:<20}", end='')

            for j, name2 in enumerate(model_names):
                agreement = (all_predictions[name1] == all_predictions[name2]).mean()
                agreement_matrix[i, j] = agreement
                # Diagonal entries are exactly 1.0, so one format covers all cells
                print(f"{agreement:<14.3f}", end='')
            print()

        # Overall agreement: mean over distinct model pairs (upper triangle)
        upper_triangle = agreement_matrix[np.triu_indices_from(agreement_matrix, k=1)]
        mean_agreement = upper_triangle.mean()

        print(f"\n📊 Overall Model Agreement: {mean_agreement:.3f}")

        # Interpretation
        print("\n💡 Interpretation:")
        if mean_agreement > 0.95:
            print("   🌟 EXCELLENT! Models strongly agree")
            print("      → Features are highly informative")
            print("      → Pseudo-labels are very reliable")
            print("      → High confidence in predictions")
        elif mean_agreement > 0.85:
            print("   ✅ GOOD! Models mostly agree")
            print("      → Features capture meaningful patterns")
            print("      → Pseudo-labels are reasonable")
            print("      → Confident in most predictions")
        elif mean_agreement > 0.75:
            print("   ⚠️  FAIR. Moderate agreement")
            print("      → Some ambiguity in data")
            print("      → Pseudo-labels may have errors")
            print("      → Verify disagreement cases")
        else:
            print("   ❌ POOR. Models disagree significantly")
            print("      → Features may not be informative enough")
            print("      → Pseudo-labels may be unreliable")
            print("      → Need better training data")

        # Find disagreement cases
        print("\n🔍 Analyzing disagreements...")

        # For each test sample: share of models backing the most common label
        agreement_counts = self._vote_fractions(
            [all_predictions[name] for name in model_names]
        )

        # Classify samples by agreement level
        unanimous = int((agreement_counts == 1.0).sum())
        majority = int(((agreement_counts >= 0.6) & (agreement_counts < 1.0)).sum())
        split = int((agreement_counts < 0.6).sum())

        print(f"\n   Unanimous (all models agree): {unanimous} ({unanimous/n_samples*100:.1f}%)")
        print(f"   Majority (≥60% agree):         {majority} ({majority/n_samples*100:.1f}%)")
        print(f"   Split (<60% agree):            {split} ({split/n_samples*100:.1f}%)")

        if split > 0:
            print(f"\n   ⚠️  {split} samples have low agreement")
            print("      These are ambiguous cases - check manually")

        # Cohen's Kappa (inter-rater reliability) for every distinct pair
        print("\n📊 Cohen's Kappa (Inter-Model Agreement):")
        kappas = []
        for i in range(n_models):
            for j in range(i + 1, n_models):
                name1, name2 = model_names[i], model_names[j]
                kappa = cohen_kappa_score(all_predictions[name1], all_predictions[name2])
                kappas.append(kappa)
                print(f"   {name1[:15]:15s} vs {name2[:15]:15s}: κ = {kappa:.3f}")

        mean_kappa = np.mean(kappas)
        print(f"\n   Mean Cohen's Kappa: {mean_kappa:.3f}")

        if mean_kappa > 0.8:
            print("   🌟 Almost perfect agreement")
        elif mean_kappa > 0.6:
            print("   ✅ Substantial agreement")
        elif mean_kappa > 0.4:
            print("   ⚠️  Moderate agreement")
        else:
            print("   ❌ Slight/poor agreement")

        return {
            'mean_agreement': mean_agreement,
            'agreement_matrix': agreement_matrix,
            'unanimous': unanimous,
            'majority': majority,
            'split': split,
            'mean_kappa': mean_kappa
        }

In [13]:
def _require_columns(df, columns, source):
    """Raise a descriptive KeyError if ``df`` lacks any of ``columns``."""
    missing = [col for col in columns if col not in df.columns]
    if missing:
        shown = ", ".join(map(str, missing[:5])) + (" ..." if len(missing) > 5 else "")
        raise KeyError(f"{source} is missing {len(missing)} required column(s): {shown}")


def run_validation_tests():
    """
    Run both validation tests and write a short text summary

    Test 1 perturbs the features of the test clips and checks that the saved
    stage-1 model keeps its predictions. Test 2 trains several classifiers
    on the labelled training clips and measures how much they agree.

    Returns:
        Dictionary with 'robustness' (Test 1 results), 'agreement' (Test 2
        results) and 'combined_score' (mean of the two headline scores)

    Raises:
        FileNotFoundError: If any configured input file does not exist
        KeyError: If a loaded table lacks the configured feature columns
        ValueError: If no training clips remain after filtering
    """

    print("\n" + "=" * 80)
    print("🧪 ADVANCED MODEL VALIDATION")
    print("No ground truth labels required!")
    print("=" * 80)

    # Fail fast, before any expensive work, if an input is missing
    required_inputs = {
        'test predictions': Config.TEST_PREDICTIONS,
        'stage-1 model': Config.STAGE1_MODEL,
        'cluster labels': Config.TRAINING_DATA,
        'clustered data': Config.CLUSTERED_DATA,
        'preprocessed features': Config.PREPROCESSED_DATA,
    }
    missing_inputs = [
        f"{label}: {path}"
        for label, path in required_inputs.items()
        if not Path(path).exists()
    ]
    if missing_inputs:
        raise FileNotFoundError("Missing input file(s):\n  " + "\n  ".join(missing_inputs))

    Config.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # ========================================================================
    # LOAD DATA
    # ========================================================================

    print("\n📥 Loading data...")

    # Test data (for Test 1)
    test_predictions = pd.read_csv(Config.TEST_PREDICTIONS)
    _require_columns(test_predictions, Config.FEATURE_COLS, "Test predictions table")
    print(f"   Test clips: {len(test_predictions)}")

    # Training data (for Test 2)
    cluster_labels = pd.read_csv(Config.TRAINING_DATA)
    clustered_df = pd.read_parquet(Config.CLUSTERED_DATA)
    pca_df = pd.read_parquet(Config.PREPROCESSED_DATA)

    # Attach each clip's cluster category, then its features
    training_df = clustered_df.merge(cluster_labels[['cluster', 'category']], on='cluster')
    training_df = training_df.merge(pca_df, on=['filepath', 'logger', 'date'], how='inner')
    training_df = training_df.drop_duplicates(subset=['filepath'])

    # Keep only clips whose cluster was labelled AMBIENT or BIO
    training_df = training_df[training_df['category'].isin(['AMBIENT', 'BIO'])]

    print(f"   Training clips: {len(training_df)}")

    if training_df.empty:
        raise ValueError("No AMBIENT/BIO training clips remain after merging and filtering")
    # If both merged tables carried feature columns, pandas would have
    # suffixed them (_x/_y); catch that here with a readable message.
    _require_columns(training_df, Config.FEATURE_COLS, "Merged training table")

    # ========================================================================
    # TEST 1: ROBUSTNESS
    # ========================================================================

    # NOTE: joblib files are pickle-based and execute code on load; only
    # load model files from a trusted source.
    model = joblib.load(Config.STAGE1_MODEL)

    # Seed NumPy's global random state so the random perturbations, and
    # therefore the reported robustness score, are repeatable across runs.
    np.random.seed(Config.RANDOM_STATE)

    robustness_test = RobustnessTest(model)
    robustness_results = robustness_test.test_consistency(test_predictions, n_tests=5)

    # ========================================================================
    # TEST 2: MODEL AGREEMENT
    # ========================================================================

    # Prepare training/test split
    X = training_df[Config.FEATURE_COLS].values
    y = training_df['category'].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=Config.RANDOM_STATE,
        stratify=y
    )

    agreement_test = ModelAgreementTest()
    model_results = agreement_test.train_all_models(X_train, y_train, X_test, y_test)
    agreement_results = agreement_test.analyze_agreement(X_test)

    # ========================================================================
    # COMBINED SUMMARY
    # ========================================================================

    print("\n" + "=" * 80)
    print("🎯 COMBINED VALIDATION SUMMARY")
    print("=" * 80)

    robustness_score = robustness_results['overall_robustness']
    agreement_score = agreement_results['mean_agreement']

    print("\n📊 Validation Scores:")
    print(f"   Robustness:     {robustness_score:.3f} (consistency under perturbations)")
    print(f"   Model Agreement: {agreement_score:.3f} (different models agree)")

    # Combined confidence: unweighted mean of the two scores
    combined_score = (robustness_score + agreement_score) / 2
    print(f"\n   Combined Confidence Score: {combined_score:.3f}")

    print("\n💡 Overall Assessment:")
    if combined_score > 0.9:
        print("   🌟 EXCELLENT VALIDATION")
        print("      ✅ Model is robust")
        print("      ✅ Multiple models agree")
        print("      ✅ High confidence in predictions")
        print("      → Ready for deployment!")
    elif combined_score > 0.8:
        print("   ✅ GOOD VALIDATION")
        print("      ✅ Model is reasonably robust")
        print("      ✅ Models mostly agree")
        print("      → Suitable for most applications")
    elif combined_score > 0.7:
        print("   ⚠️  FAIR VALIDATION")
        print("      ⚠️  Some robustness issues")
        print("      ⚠️  Some model disagreement")
        print("      → Verify critical predictions manually")
    else:
        print("   ❌ POOR VALIDATION")
        print("      ❌ Model is not robust")
        print("      ❌ Models disagree significantly")
        print("      → Need to improve training data or features")

    # The written verdict must follow the measured score; 0.85 is the lower
    # bound of the "GOOD" band printed by the robustness test.
    if robustness_score > 0.85:
        robustness_verdict = "  Model predictions remain stable under perturbations\n\n"
    else:
        robustness_verdict = "  Model predictions are sensitive to perturbations\n\n"

    n_test = len(X_test)
    summary_lines = [
        "Model Validation Summary\n",
        "=" * 80 + "\n\n",
        f"Robustness Score: {robustness_score:.3f}\n",
        f"Model Agreement Score: {agreement_score:.3f}\n",
        f"Combined Confidence: {combined_score:.3f}\n\n",
        "Test 1 - Robustness:\n",
        robustness_verdict,
        "Test 2 - Model Agreement:\n",
        f"  {len(model_results)} different models trained\n",
        f"  Mean agreement: {agreement_score:.3f}\n",
        f"  Mean Cohen's Kappa: {agreement_results['mean_kappa']:.3f}\n\n",
        f"Unanimous predictions: {agreement_results['unanimous']}/{n_test}\n",
        f"Majority predictions: {agreement_results['majority']}/{n_test}\n",
        f"Split predictions: {agreement_results['split']}/{n_test}\n",
    ]

    # Save results (explicit encoding keeps the file platform-independent)
    summary_path = Config.OUTPUT_DIR / "validation_summary.txt"
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.writelines(summary_lines)

    print(f"\n💾 Results saved: {summary_path}")

    print("\n" + "=" * 80)
    print("✅ VALIDATION COMPLETE!")
    print("=" * 80)

    return {
        'robustness': robustness_results,
        'agreement': agreement_results,
        'combined_score': combined_score
    }


# Entry point: run the full validation pipeline when this code is executed
# directly (as a script, or as a notebook cell, where __name__ is "__main__").
# The returned results dictionary is discarded here.
if __name__ == "__main__":
    run_validation_tests()


🧪 ADVANCED MODEL VALIDATION
No ground truth labels required!

📥 Loading data...
   Test clips: 20
   Training clips: 15392

TEST 1: CONSISTENCY / ROBUSTNESS TESTING

📊 Testing 20 clips with 5 perturbations each

────────────────────────────────────────────────────────────
Testing: NOISE
────────────────────────────────────────────────────────────
   Strength 0.01: 1.000 ± 0.000 agreement
   Strength 0.05: 1.000 ± 0.000 agreement
   Strength 0.10: 1.000 ± 0.000 agreement

────────────────────────────────────────────────────────────
Testing: SCALE
────────────────────────────────────────────────────────────
   Strength 0.01: 1.000 ± 0.000 agreement
   Strength 0.05: 1.000 ± 0.000 agreement
   Strength 0.10: 1.000 ± 0.000 agreement

────────────────────────────────────────────────────────────
Testing: SHIFT
────────────────────────────────────────────────────────────
   Strength 0.01: 1.000 ± 0.000 agreement
   Strength 0.05: 1.000 ± 0.000 agreement
   Strength 0.10: 1.000 ± 0.000 agreem