In [None]:
def collate_fn_cnn(batch):
    """Collate variable-length (audio, label) samples into one padded batch.

    Args:
        batch: Sequence of ``(audio, label)`` tuples. Each ``audio`` is a
            tensor whose first dimension is its length; each ``label`` is a
            tensor.

    Returns:
        Tuple ``(padded_audio, lengths, labels)``:
            * ``padded_audio`` - the audios padded to the longest one in the
              batch, batch dimension first.
            * ``lengths`` - ``LongTensor`` holding the unpadded length of
              every audio, in batch order.
            * ``labels`` - the labels stacked along a new first dimension.
    """
    waveforms, targets = zip(*batch)

    # Record the true lengths before padding hides them,
    # e.g. [44100, 88200, 22050].
    lengths = torch.LongTensor(list(map(len, waveforms)))

    # Pad every clip up to the longest clip in this batch,
    # e.g. three clips -> shape (3, 88200).
    padded_audio = nn.utils.rnn.pad_sequence(waveforms, batch_first=True)

    return padded_audio, lengths, torch.stack(targets)

In [None]:
class AudioCNN(nn.Module):
    """Mel-spectrogram front end followed by a small 2-D CNN classifier.

    The network consists of:
      * ``self.spec`` - a ``MelspectrogramStretch`` transform applied to the
        raw audio inside ``forward``;
      * ``self.net['convs']`` - ``num_layers + 1`` blocks of
        Conv2d(3x3) -> BatchNorm2d -> ReLU -> MaxPool2d(2), followed by a
        final AvgPool2d(2);
      * ``self.net['dense']`` - Linear -> ReLU -> Dropout -> Linear producing
        one raw logit per class.

    Args:
        classes: Number of output classes.
        config: Optional dict of hyper-parameters. Keys read here:
            ``hop_length``, ``num_mels``, ``fft_length``, ``norm``,
            ``stretch_param``, ``hidden_channels``, ``num_layers``,
            ``cnn_dropout`` and ``padding``. Missing keys fall back to the
            defaults visible in ``__init__``.
    """

    def __init__(self, classes=10, config=None):
        super(AudioCNN, self).__init__()

        # ``None`` instead of ``{}`` as default so instances never share one
        # mutable dict.
        config = {} if config is None else config

        self.num_classes = classes
        self.config = config

        self.net = nn.ModuleDict()

        self.input_channels = 1

        # Spectrogram transform
        self.spec = MelspectrogramStretch(
            hop_length=config.get('hop_length', None),
            num_mels=config.get('num_mels', 128),
            fft_length=config.get('fft_length', 2048),
            norm=config.get('norm', 'whiten'),
            stretch_param=config.get('stretch_param', [0.4, 0.4])
        )

        # CNN parameters
        self.hidden_channels = self.config.get('hidden_channels', 32)
        self.num_layers = self.config.get('num_layers', 3)
        self.cnn_dropout = self.config.get('cnn_dropout', 0.3)

        # NOTE(review): 'padding' is read from the config but the conv layers
        # below still use padding=0; confirm whether it should be wired in.
        self.padding = self.config.get('padding', 0)

        # Convolutional body. Input shape: [batch, channel, frequency, time].
        # The layout of this Sequential (four flat modules, then nested
        # Sequentials, then the pooling) determines the state_dict keys, so it
        # must not be reshuffled if old checkpoints are to stay loadable.
        self.net['convs'] = nn.Sequential(
            # Block 1: input channel -> hidden channels
            nn.Conv2d(self.input_channels, self.hidden_channels, kernel_size=3, padding=0),
            nn.BatchNorm2d(self.hidden_channels),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Blocks 2 .. num_layers + 1: hidden channels -> hidden channels
            *[nn.Sequential(
                nn.Conv2d(self.hidden_channels, self.hidden_channels, kernel_size=3, padding=0),
                nn.BatchNorm2d(self.hidden_channels),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2)
            ) for _ in range(self.num_layers)],

            # Final pooling
            nn.AvgPool2d(kernel_size=(2, 2)),
        )

        # Probe the conv body with a dummy spectrogram (400 time frames) to
        # learn the feature-map size the dense head has to accept. The probe
        # runs in eval mode so the all-zero dummy does not leak into the
        # BatchNorm running statistics.
        convs = self.net['convs']
        convs.eval()
        with torch.no_grad():
            dummy_input = torch.zeros(1, self.input_channels, self.spec.n_mels, 400)
            dummy_output = convs(dummy_input)
        convs.train()

        # (freq, time) size of the feature map the dense head was built for.
        self._feature_map_size = tuple(dummy_output.shape[-2:])
        self.final_flatten_size = dummy_output.view(1, -1).size(1)

        # Classification head; its first Linear layer matches the flattened
        # size measured above.
        self.net['dense'] = nn.Sequential(
            nn.Linear(self.final_flatten_size, self.hidden_channels),
            nn.ReLU(),
            nn.Dropout(p=self.cnn_dropout),
            nn.Linear(self.hidden_channels, self.num_classes)  # Final output layer for classification
        )

    def forward(self, audio, lengths):
        """Compute class logits for a batch of audio.

        Args:
            audio: Audio tensor. A 2-D input gets a channel dimension
                inserted at position 1 before the spectrogram transform.
            lengths: Per-sample lengths, forwarded to the spectrogram
                transform.

        Returns:
            Raw (un-normalised) logits, one row per sample, suitable for
            cross-entropy loss.
        """
        # Add channel dimension: (batch, time) -> (batch, 1, time)
        if audio.dim() == 2:
            audio = audio.unsqueeze(1)

        audio = audio.float()

        # Mel spectrogram; the returned lengths are not used further here.
        x, lengths = self.spec(audio, lengths)

        # CNN processing: (batch, channel, freq, time)
        x = self.net['convs'](x)

        # The dense head expects the feature-map size measured in __init__.
        # Batches are padded to their own longest clip, so the time axis may
        # differ; pool to the expected size instead of failing with a shape
        # mismatch. When the size already matches, nothing changes.
        if tuple(x.shape[-2:]) != self._feature_map_size:
            x = nn.functional.adaptive_avg_pool2d(x, self._feature_map_size)

        # Flatten everything except the batch dimension.
        x = x.reshape(x.size(0), -1)

        # Classification: (batch, classes)
        x = self.net['dense'](x)

        # Return raw logits for Cross Entropy Loss
        return x

    def train(*args, **kwargs):
        """Switch train/eval mode, or (legacy call style) run the training loop.

        This name used to be a static training-loop function, which shadowed
        ``nn.Module.train`` and made ``model.train()`` / ``model.eval()``
        raise ``TypeError``. Both call styles are supported now:

        * ``model.train()``, ``model.train(False)``, ``model.train(mode=...)``
          behave exactly like ``nn.Module.train`` (this is also what
          ``model.eval()`` calls internally).
        * ``AudioCNN.train(model=..., train_loader=..., ...)`` - the original
          static signature - is forwarded to :meth:`fit`.

        New code should call :meth:`fit` for training.
        """
        bound_to_module = bool(args) and isinstance(args[0], nn.Module)

        if bound_to_module and len(args) <= 2 and set(kwargs) <= {'mode'}:
            return nn.Module.train(*args, **kwargs)

        # Legacy static call made through an instance: drop the implicitly
        # bound instance so the remaining arguments line up with ``fit``.
        if bound_to_module and (
            'model' in kwargs
            or (len(args) > 1 and isinstance(args[1], nn.Module))
        ):
            args = args[1:]

        return AudioCNN.fit(*args, **kwargs)

    @staticmethod
    def _run_epoch(model, loader, criterion, device, optimizer=None, desc=None):
        """Run one pass over ``loader``; optimise when ``optimizer`` is given.

        Returns:
            Tuple ``(mean_batch_loss, accuracy)`` where the loss is averaged
            over batches and the accuracy over samples.
        """
        is_training = optimizer is not None
        running_loss = 0
        correct = 0
        seen = 0

        with torch.set_grad_enabled(is_training):
            pbar = tqdm(loader, desc=desc, leave=False)
            for audio, lengths, labels in pbar:
                audio = audio.to(device)
                lengths = lengths.to(device)
                labels = labels.to(device)

                if is_training:
                    optimizer.zero_grad()

                outputs = model(audio, lengths)
                loss = criterion(outputs, labels)

                if is_training:
                    loss.backward()
                    optimizer.step()

                # Track metrics
                running_loss += loss.item()
                pred = outputs.argmax(dim=1)
                correct += (pred == labels).sum().item()
                seen += labels.size(0)

                pbar.set_postfix({'loss': f'{loss.item():.4f}'})

        return running_loss / len(loader), correct / seen

    @staticmethod
    def fit(model, train_loader, val_loader, test_fold, history, config, fold_dir, epochs: int, optimizer, criterion, device):
        """Train ``model`` and checkpoint the epoch with the lowest validation loss.

        Args:
            model: Module called as ``model(audio, lengths)``.
            train_loader: Iterable of ``(audio, lengths, labels)`` training batches.
            val_loader: Iterable of ``(audio, lengths, labels)`` validation batches.
            test_fold: Fold identifier stored in the checkpoint.
            history: Object exposing ``add(name, value)``; receives
                ``train_loss``, ``train_accuracy``, ``val_loss`` and
                ``val_accuracy`` once per epoch.
            config: Stored verbatim in the checkpoint.
            fold_dir: Directory in which ``best_model.pth`` is written.
            epochs: Number of epochs to run.
            optimizer: Optimizer stepping the model parameters.
            criterion: Loss called as ``criterion(outputs, labels)``.
            device: Device every batch is moved to.

        Returns:
            Tuple ``(best_val_loss, best_val_acc)``. ``best_val_acc`` is the
            validation accuracy of the epoch with the lowest validation loss,
            not necessarily the highest accuracy seen.
        """
        best_val_loss = float('inf')
        best_val_acc = 0.0

        for epoch in range(1, epochs + 1):
            print(f"\nEpoch {epoch}/{epochs}")
            print("-" * 40)

            # TRAIN
            model.train()
            avg_train_loss, train_acc = AudioCNN._run_epoch(
                model, train_loader, criterion, device,
                optimizer=optimizer, desc='Training'
            )

            # VALIDATE
            model.eval()
            avg_val_loss, val_acc = AudioCNN._run_epoch(
                model, val_loader, criterion, device, desc='Validation'
            )

            # Log metrics
            history.add('train_loss', avg_train_loss)
            history.add('train_accuracy', train_acc)
            history.add('val_loss', avg_val_loss)
            history.add('val_accuracy', val_acc)

            print(f"Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.4f}")
            print(f"Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.4f}")

            # Save best model for this fold
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                best_val_acc = val_acc
                torch.save({
                    'fold': test_fold,
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'val_loss': best_val_loss,
                    'val_acc': best_val_acc,
                    'config': config
                }, os.path.join(fold_dir, 'best_model.pth'))
                print(f"✓ Best model saved (Val Loss: {best_val_loss:.4f})")

        return best_val_loss, best_val_acc

class AudioCNNDataset(Dataset):
    """Index-aligned pairing of audio clips and their labels.

    Args:
        audio_data: Indexable collection of audio clips (tensors or anything
            ``torch.FloatTensor`` accepts).
        labels: Indexable collection of labels, aligned with ``audio_data``.
    """

    def __init__(self, audio_data, labels):
        self.audio_data = audio_data
        self.labels = labels

    def __len__(self):
        """Return the number of audio clips."""
        return len(self.audio_data)

    def __getitem__(self, idx):
        """Return ``(audio, label)`` for ``idx``, both as tensors.

        Non-tensor audio is converted with ``torch.FloatTensor``; audio with
        more than one dimension has its size-1 dimensions squeezed away.
        Non-tensor labels become a 0-dim ``LongTensor`` element.
        """
        waveform, target = self.audio_data[idx], self.labels[idx]

        if not isinstance(waveform, torch.Tensor):
            waveform = torch.FloatTensor(waveform)
        waveform = waveform.squeeze() if waveform.dim() > 1 else waveform

        if isinstance(target, torch.Tensor):
            return waveform, target
        return waveform, torch.LongTensor([target])[0]

In [None]:
def train_audio_cnn_cross_validation(
    df,
    audio_base_path,
    config,
    epochs=20,
    batch_size=32,
    lr=0.001,
    device=None,
    num_classes=10
):
    """
    10-Fold Cross-Validation Pipeline for UrbanSound8K

    In each iteration:
    - 1 fold for test
    - 1 fold for validation (the fold after the test fold, wrapping 10 -> 1)
    - 8 folds for training

    For every fold the model with the lowest validation loss is reloaded and
    evaluated on the test fold. Per-fold artefacts (checkpoint, metrics,
    classification report, history, plots) are written to
    ``saved_cv/<timestamp>/fold_<k>``; aggregate results go to
    ``saved_cv/<timestamp>``.

    Parameters:
    -----------
    df : pd.DataFrame
        UrbanSound8K metadata (must have 'fold' column)
    audio_base_path : str
        Path to audio files
    config : dict
        Model configuration; also saved as JSON, so it must be
        JSON-serialisable. 'sample_rate' (default 22050) is read here.
    epochs : int
        Training epochs per fold
    batch_size : int
        Batch size
    lr : float
        Learning rate
    device : str or torch.device, optional
        Device to train on, e.g. 'cuda' or 'cpu'. Defaults to CPU when None.
    num_classes : int
        Number of classes

    Returns:
    --------
    dict with:
        - fold_results: list of results per fold
        - cumulative_confusion_matrix: confusion matrix summed over all folds
        - mean_accuracy: mean accuracy over 10 folds
        - std_accuracy: standard deviation of accuracy
        - mean_/std_ of precision, recall and F1 (macro and weighted)
        - save_dir: directory where results were saved
    """

    # Honour the caller's device; fall back to CPU when none is given.
    device = torch.device("cpu") if device is None else torch.device(device)

    print(f"Using device: {device}")

    # Create save directory
    timestamp = datetime.now().strftime("%m%d_%H%M%S")
    save_dir = f"saved_cv/{timestamp}"
    os.makedirs(save_dir, exist_ok=True)
    print(f"Save directory: {save_dir}")

    # Save configuration
    with open(os.path.join(save_dir, 'config.json'), 'w') as f:
        json.dump(config, f, indent=2)

    # Initialize results storage
    fold_results = []
    all_fold_accuracies = []
    cumulative_confusion_matrix = np.zeros((num_classes, num_classes), dtype=int)

    sample_rate = config.get('sample_rate', 22050)
    class_labels = list(range(num_classes))

    print("\n" + "="*60)
    print("10-FOLD CROSS-VALIDATION")
    print("="*60)
    print("Scheme: 1 fold test, 1 fold validation, 8 folds training")
    print("="*60)

    # Iterate through 10 folds
    for test_fold in range(1, 11):
        # Determine validation fold (next fold, wrapping around)
        val_fold = (test_fold % 10) + 1

        # Training folds are all others
        train_folds = [f for f in range(1, 11) if f != test_fold and f != val_fold]

        print(f"\n{'#'*60}")
        print(f"FOLD {test_fold}/10")
        print(f"{'#'*60}")
        print(f"Test fold: {test_fold}")
        print(f"Validation fold: {val_fold}")
        print(f"Training folds: {train_folds}")

        # Create fold-specific save directory
        fold_dir = os.path.join(save_dir, f'fold_{test_fold}')
        os.makedirs(fold_dir, exist_ok=True)

        # ============================================
        # STEP 1: PREPARE DATA FOR THIS FOLD
        # ============================================
        print("\n" + "="*60)
        print("STEP 1: Preparing Data")
        print("="*60)

        # Split dataframe by folds
        train_df = df[df['fold'].isin(train_folds)]
        val_df = df[df['fold'] == val_fold]
        test_df = df[df['fold'] == test_fold]

        print(f"Train samples: {len(train_df)}")
        print(f"Val samples: {len(val_df)}")
        print(f"Test samples: {len(test_df)}")

        # Preprocess data; only the training split gets the augmentation
        # probabilities.
        print("Preprocessing training data...")
        train_data = preprocess_dataset(
            train_df, audio_base_path,
            mode='train',
            channel_mode='mono',
            sr=sample_rate,
            noise_prob=0.5,
            crop_prob=0.5,
            augment_prob=0.5
        )

        print("Preprocessing validation data...")
        val_data = preprocess_dataset(
            val_df, audio_base_path,
            mode='val',
            channel_mode='mono',
            sr=sample_rate
        )

        print("Preprocessing test data...")
        test_data = preprocess_dataset(
            test_df, audio_base_path,
            mode='val',
            channel_mode='mono',
            sr=sample_rate
        )

        # Create datasets and loaders
        train_dataset = AudioCNNDataset(train_data['audio'], train_data['labels'])
        val_dataset = AudioCNNDataset(val_data['audio'], val_data['labels'])
        test_dataset = AudioCNNDataset(test_data['audio'], test_data['labels'])

        train_loader = DataLoader(
            train_dataset, batch_size=batch_size, shuffle=True,
            collate_fn=collate_fn_cnn, num_workers=0
        )
        val_loader = DataLoader(
            val_dataset, batch_size=batch_size, shuffle=False,
            collate_fn=collate_fn_cnn, num_workers=0
        )
        test_loader = DataLoader(
            test_dataset, batch_size=batch_size, shuffle=False,
            collate_fn=collate_fn_cnn, num_workers=0
        )

        # ============================================
        # STEP 2: CREATE MODEL
        # ============================================
        print("\n" + "="*60)
        print("STEP 2: Creating Model")
        print("="*60)

        # A fresh model per fold so no weights leak between folds.
        model = AudioCNN(classes=num_classes, config=config)
        model = model.to(device)

        total_params = sum(p.numel() for p in model.parameters())
        print(f"Total parameters: {total_params:,}")

        # ============================================
        # STEP 3: TRAIN MODEL
        # ============================================
        print("\n" + "="*60)
        print("STEP 3: Training Model")
        print("="*60)

        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        history = TrainingHistory()

        best_val_loss, best_val_acc = AudioCNN.train(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            test_fold=test_fold,
            history=history,
            config=config,
            fold_dir=fold_dir,
            epochs=epochs,
            optimizer=optimizer,
            criterion=criterion,
            device=device
        )

        # ============================================
        # STEP 4: TEST ON HELD-OUT FOLD
        # ============================================
        print("\n" + "="*60)
        print("STEP 4: Testing on Held-Out Fold")
        print("="*60)

        # Load best model for this fold; map_location keeps the load working
        # when the checkpoint tensors were saved from a different device.
        checkpoint = torch.load(
            os.path.join(fold_dir, 'best_model.pth'),
            map_location=device
        )
        model.load_state_dict(checkpoint['model_state_dict'])

        model.eval()
        test_loss = 0
        test_correct = 0
        test_total = 0

        all_predictions = []
        all_targets = []

        with torch.no_grad():
            test_pbar = tqdm(test_loader, desc='Testing')
            for audio, lengths, labels in test_pbar:
                audio = audio.to(device)
                lengths = lengths.to(device)
                labels = labels.to(device)

                outputs = model(audio, lengths)
                loss = criterion(outputs, labels)

                test_loss += loss.item()
                pred = outputs.argmax(dim=1)
                test_correct += (pred == labels).sum().item()
                test_total += labels.size(0)

                all_predictions.extend(pred.cpu().numpy())
                all_targets.extend(labels.cpu().numpy())

                test_pbar.set_postfix({'loss': f'{loss.item():.4f}'})

        avg_test_loss = test_loss / len(test_loader)
        test_acc = test_correct / test_total

        print(f"\nTest Loss: {avg_test_loss:.4f}")
        print(f"Test Accuracy: {test_acc:.4f}")

        # Compute all metrics for this fold
        test_metrics = compute_metrics(all_predictions, all_targets, num_classes)
        print_metrics(test_metrics, prefix="Test ")

        # Compute confusion matrix for this fold
        fold_confusion_matrix = confusion_matrix(all_targets, all_predictions, labels=class_labels)

        # Add to cumulative confusion matrix
        cumulative_confusion_matrix += fold_confusion_matrix

        # Store results with all metrics
        fold_result = {
            'fold': test_fold,
            'test_fold': test_fold,
            'val_fold': val_fold,
            'train_folds': train_folds,
            'test_loss': avg_test_loss,
            'test_accuracy': test_metrics['accuracy'],
            'test_precision_macro': test_metrics['precision_macro'],
            'test_precision_weighted': test_metrics['precision_weighted'],
            'test_recall_macro': test_metrics['recall_macro'],
            'test_recall_weighted': test_metrics['recall_weighted'],
            'test_f1_macro': test_metrics['f1_macro'],
            'test_f1_weighted': test_metrics['f1_weighted'],
            'test_precision_per_class': test_metrics['precision_per_class'],
            'test_recall_per_class': test_metrics['recall_per_class'],
            'test_f1_per_class': test_metrics['f1_per_class'],
            'best_val_accuracy': best_val_acc,
            'best_val_loss': best_val_loss,
            'confusion_matrix': fold_confusion_matrix.tolist()
        }
        fold_results.append(fold_result)
        all_fold_accuracies.append(test_acc)

        # Save fold results
        with open(os.path.join(fold_dir, 'fold_results.json'), 'w') as f:
            json.dump(fold_result, f, indent=2)

        # Save classification report. ``labels`` is passed explicitly so the
        # report still lines up with ``target_names`` when a fold happens to
        # contain no sample (and no prediction) of some class.
        class_names = [f'Class_{i}' for i in range(num_classes)]
        report = classification_report(all_targets, all_predictions,
                                       labels=class_labels,
                                       target_names=class_names,
                                       digits=4)
        with open(os.path.join(fold_dir, 'classification_report.txt'), 'w') as f:
            f.write(f"Classification Report - Fold {test_fold}\n")
            f.write("="*60 + "\n")
            f.write(report)

        # Save history and plot
        history.save(os.path.join(fold_dir, 'history.json'))
        history.plot(save_path=os.path.join(fold_dir, 'training_curves.png'))

        # Plot confusion matrix for this fold
        plot_confusion_matrix(
            fold_confusion_matrix,
            save_path=os.path.join(fold_dir, 'confusion_matrix.png'),
            title=f'Confusion Matrix - Fold {test_fold}'
        )

        print(f"\nFold {test_fold} completed!")
        print(f"Results saved to: {fold_dir}")

        # Drop per-fold objects before the next fold allocates its own data.
        del train_data, val_data, test_data
        del train_dataset, val_dataset, test_dataset
        del train_loader, val_loader, test_loader

        del model
        del optimizer

        gc.collect()
        torch.cuda.empty_cache()
        plt.close('all')

    # ============================================
    # FINAL RESULTS ACROSS ALL FOLDS
    # ============================================
    print("\n" + "="*60)
    print("CROSS-VALIDATION RESULTS")
    print("="*60)

    # Compute aggregate metrics: mean and std over folds for every metric.
    # The order here is also the key order of the saved JSON summary.
    summary_rows = (
        ("Accuracy:              ", 'accuracy'),
        ("Precision (macro):     ", 'precision_macro'),
        ("Recall (macro):        ", 'recall_macro'),
        ("F1-Score (macro):      ", 'f1_macro'),
        ("Precision (weighted):  ", 'precision_weighted'),
        ("Recall (weighted):     ", 'recall_weighted'),
        ("F1-Score (weighted):   ", 'f1_weighted'),
    )
    stats = {}
    for _, metric in summary_rows:
        per_fold_values = [r['test_' + metric] for r in fold_results]
        stats['mean_' + metric] = np.mean(per_fold_values)
        stats['std_' + metric] = np.std(per_fold_values)

    # Render the report lines once; they are printed and written to file.
    aggregate_lines = [
        f"{label}{stats['mean_' + metric]:.4f} ± {stats['std_' + metric]:.4f}"
        for label, metric in summary_rows
    ]
    table_header = f"{'Fold':<6} {'Acc':<8} {'Prec(M)':<10} {'Rec(M)':<10} {'F1(M)':<10}"
    fold_lines = [
        f"{i:<6} {result['test_accuracy']:<8.4f} "
        f"{result['test_precision_macro']:<10.4f} "
        f"{result['test_recall_macro']:<10.4f} "
        f"{result['test_f1_macro']:<10.4f}"
        for i, result in enumerate(fold_results, 1)
    ]

    # Print summary
    print(f"\nMetrics per fold:")
    print(table_header)
    print("-" * 50)
    for line in fold_lines:
        print(line)

    print("\n" + "="*60)
    print("AGGREGATE RESULTS (Mean ± Std)")
    print("="*60)
    for line in aggregate_lines:
        print(line)
    print("="*60)

    # Plot cumulative confusion matrix
    plot_confusion_matrix(
        cumulative_confusion_matrix,
        save_path=os.path.join(save_dir, 'cumulative_confusion_matrix.png'),
        title='Cumulative Confusion Matrix (10 Folds)',
        normalize=False
    )

    # Also plot normalized version
    plot_confusion_matrix(
        cumulative_confusion_matrix,
        save_path=os.path.join(save_dir, 'cumulative_confusion_matrix_normalized.png'),
        title='Cumulative Confusion Matrix (Normalized)',
        normalize=True
    )

    # Save final summary with all metrics (plain floats/lists for JSON)
    final_results = {'fold_results': fold_results}
    final_results.update({key: float(value) for key, value in stats.items()})
    final_results['all_fold_accuracies'] = [float(a) for a in all_fold_accuracies]
    final_results['cumulative_confusion_matrix'] = cumulative_confusion_matrix.tolist()
    final_results['config'] = config

    with open(os.path.join(save_dir, 'cross_validation_results.json'), 'w') as f:
        json.dump(final_results, f, indent=2)

    # Save summary text report
    with open(os.path.join(save_dir, 'summary_report.txt'), 'w') as f:
        f.write("="*60 + "\n")
        f.write("10-FOLD CROSS-VALIDATION SUMMARY\n")
        f.write("="*60 + "\n\n")

        f.write("Aggregate Results (Mean ± Std):\n")
        f.write("-"*60 + "\n")
        for line in aggregate_lines:
            f.write(line + "\n")
        f.write("\n" + "="*60 + "\n\n")

        f.write("Per-Fold Results:\n")
        f.write("-"*60 + "\n")
        f.write(table_header + "\n")
        f.write("-"*60 + "\n")
        for line in fold_lines:
            f.write(line + "\n")

    # Plot accuracy across folds
    plot_fold_accuracies(
        all_fold_accuracies,
        save_path=os.path.join(save_dir, 'fold_accuracies.png')
    )

    # Plot all metrics across folds
    plot_all_metrics_across_folds(
        fold_results,
        save_path=os.path.join(save_dir, 'all_metrics_across_folds.png')
    )

    print(f"\nAll results saved to: {save_dir}")

    results = {
        'fold_results': fold_results,
        'cumulative_confusion_matrix': cumulative_confusion_matrix,
    }
    # Mean/std for every aggregated metric (macro and weighted).
    results.update(stats)
    results['save_dir'] = save_dir
    return results

In [None]:
import pandas as pd
import json 

# Load dataset
df = pd.read_csv('UrbanSound8K/metadata/UrbanSound8K.csv')
audio_base_path = 'UrbanSound8K/audio'

# Define configuration
config = {
    # Spectrogram settings
    'sample_rate': 22050,
    'num_mels': 128,
    'fft_length': 2048,
    'hop_length': 512,
    'norm': 'whiten',
    'stretch_param': [0.4, 0.4],

    # CNN settings
    'hidden_channels': 32,
    'num_layers': 3,
    'cnn_dropout': 0.3,
}

# Standalone model instance. It must be created after `config` is defined;
# building it first raised NameError on a fresh kernel. The cross-validation
# run below creates its own model per fold and does not use this one.
model = AudioCNN(classes=10, config=config)

# Run complete pipeline
results = train_audio_cnn_cross_validation(
    df=df,
    audio_base_path=audio_base_path,
    config=config,
    epochs=20,
    batch_size=32,
    lr=0.001,
    num_classes=10
)

print(f"Model saved to: {results['save_dir']}")