In [None]:
import os.path as osp

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch_geometric.data import Dataset, Data
from torch_geometric.nn import GCNConv
from torch_geometric.nn import knn_graph

In [None]:
class AudioGraphDataset(Dataset):
    """
    PyG dataset that turns pre-extracted audio features into kNN graphs.

    Every processed file ``data_{i}.pt`` stores one ``Data`` object holding all
    nodes plus the train/test/validation node indices of one cross-validation
    fold. In extended mode (``enable=True``) the ten folds are written once per
    neighbour count in ``_EXTENDED_K_VALUES``.
    """

    # Number of cross-validation folds written per neighbour count.
    # Assumes the CSV `fold` column is numbered 1..10 — TODO confirm.
    _NUM_FOLDS = 10
    # Neighbour counts swept in extended mode: 5, 15, ..., 105 (11 values).
    _EXTENDED_K_VALUES = range(5, 115, 10)

    def __init__(self, root, path, k_neigh=5, enable=False, transform=None, pre_transform=None, pre_filter=None):
        """
        Initialize the Audio Graph Dataset
        Args:
            root: Root directory for processed files
            path: List containing paths to [csv_file, hdf5_file]
            k_neigh: Number of nearest neighbors for graph construction
            enable: Flag for extended processing mode
            transform: Transform to be applied to the data
            pre_transform: Transform to be applied before processing
            pre_filter: Filter to be applied before processing
        """
        # These attributes must exist before super().__init__ runs, because the
        # base class reads the file-name properties (and may call process()).
        self.csv_path = path[0]
        self.hdf5_path = path[1]
        self.enable = enable
        self.k_neigh = k_neigh
        super().__init__(root, transform, pre_transform, pre_filter)

    @property
    def raw_file_names(self):
        # Return the actual filenames without full paths
        return [osp.basename(self.csv_path), osp.basename(self.hdf5_path)]

    @property
    def raw_dir(self) -> str:
        # Return the directory containing the raw files
        # (taken from the CSV path; the HDF5 file is presumed to sit next to it).
        return osp.dirname(self.csv_path)

    @property
    def processed_file_names(self):
        # One file per fold in standard mode; one file per (k, fold) pair in
        # extended mode. The count is derived from the same constants that
        # process() iterates over, so the two can never disagree.
        if not self.enable:
            count = self._NUM_FOLDS
        else:
            count = len(self._EXTENDED_K_VALUES) * self._NUM_FOLDS
        return [f'data_{i}.pt' for i in range(count)]

    def download(self):
        # No download needed as we're using local files
        pass

    def process(self):
        """
        Process the raw data into graph format
        - Loads features and labels from HDF5
        - Performs cross-validation splits
        - Creates graph structure using kNN
        - Saves processed graphs
        """
        # Load metadata using the full path; _process_folds reads self.df.
        self.df = pd.read_csv(self.csv_path)

        # np.array(...) copies the datasets into memory, so the HDF5 file can be
        # closed as soon as the tensors are built.
        with h5py.File(self.hdf5_path, 'r') as h5_file:
            X_tensor = torch.tensor(np.array(h5_file['features']))
            Y_tensor = torch.tensor(np.array(h5_file['labels']))

        if not self.enable:
            # Standard processing mode - 10 fold cross validation
            self._process_folds(X_tensor, Y_tensor, 0, self._NUM_FOLDS)
        else:
            # Extended processing mode - varying k neighbors. The n-th k value
            # writes files data_{n*10} .. data_{n*10 + 9}.
            for block, k in enumerate(self._EXTENDED_K_VALUES):
                self._process_folds(
                    X_tensor, Y_tensor, block * self._NUM_FOLDS, self._NUM_FOLDS, k)

    def _process_folds(self, X_tensor, Y_tensor, start_idx, num_folds, k_neighbors=None):
        """
        Process data for multiple cross-validation folds
        Args:
            X_tensor: Feature tensor for all nodes
            Y_tensor: Label tensor for all nodes
            start_idx: Index of the first output file (data_{start_idx}.pt)
            num_folds: Number of consecutive folds/files to write
            k_neighbors: Neighbor count; falls back to self.k_neigh when None
        """
        test_idx = 1
        val_idx = 2
        k = k_neighbors if k_neighbors is not None else self.k_neigh

        for j in range(start_idx, start_idx + num_folds):
            # Get train/test/validation splits
            idx_train, idx_test, idx_val = self._cross_validation_split(
                self.df, test_idx, val_idx)

            # Normalize features using training-set statistics only
            X_tensor_norm = self._normalize_features(X_tensor, idx_train)

            # Create graph data object
            data = self._create_graph(X_tensor_norm, Y_tensor,
                                      idx_train, idx_test, idx_val, k)

            # Save processed data
            torch.save(data, osp.join(self.processed_dir, f'data_{j}.pt'))

            # Rotate both fold ids within 1.._NUM_FOLDS (10 wraps back to 1),
            # so neither of them can ever point at a non-existent fold 0.
            test_idx = test_idx % self._NUM_FOLDS + 1
            val_idx = val_idx % self._NUM_FOLDS + 1

    def _normalize_features(self, X_tensor, idx_train):
        """
        Normalize features using min-max scaling based on training set

        A single global minimum/maximum (over all training rows and all feature
        columns) is used. If the training features are constant the range is
        zero; the data is then only shifted instead of dividing by zero.
        """
        # idx_train holds integer row positions (see _cross_validation_split).
        train_rows = X_tensor[torch.tensor(np.array(idx_train), dtype=torch.long)]
        mini = train_rows.min()
        maxi = train_rows.max()
        span = maxi - mini
        if span == 0:
            span = 1
        return (X_tensor - mini) / span

    def _cross_validation_split(self, df, idx_test, idx_val):
        """
        Create train/test/validation splits using fold information
        Args:
            df: Metadata frame with a `fold` column
            idx_test: Fold id used as the test set
            idx_val: Fold id used as the validation set
        Returns:
            (train, test, validation) pandas indices of row positions
        """
        # Positional index so the returned labels can index the feature tensor.
        df = df.reset_index(drop=True)

        in_test = df.fold == idx_test
        in_val = df.fold == idx_val

        idx_test_mask = df.index[in_test]
        idx_val_mask = df.index[in_val]
        # Remaining rows are the training set
        idx_train_mask = df.index[~in_test & ~in_val]

        return idx_train_mask, idx_test_mask, idx_val_mask

    def _create_graph(self, X_tensor, Y_tensor, idx_train, idx_test, idx_val, k):
        """
        Create a graph using feature tensors and kNN

        Despite their names, train_mask/test_mask/val_mask are stored as integer
        index tensors (row positions), not boolean masks.
        """
        # Convert indices to tensors (via NumPy, so empty splits stay integer)
        train_mask = torch.tensor(np.array(idx_train), dtype=torch.long)
        test_mask = torch.tensor(np.array(idx_test), dtype=torch.long)
        val_mask = torch.tensor(np.array(idx_val), dtype=torch.long)

        # Create edge connections using kNN
        # NOTE(review): `batch=Y_tensor` makes knn_graph look for neighbors only
        # among nodes that share the same label, for validation/test nodes too,
        # so label information is encoded in the edges — confirm this is
        # intended. The batch vector may also need to be sorted (rows grouped by
        # label); verify against the knn_graph documentation.
        edge_index = knn_graph(X_tensor, k=k, batch=Y_tensor, loop=False)

        # Create and return the graph data object
        return Data(
            x=X_tensor,
            edge_index=edge_index,
            y=Y_tensor,
            train_mask=train_mask,
            test_mask=test_mask,
            val_mask=val_mask
        )

    def len(self):
        return len(self.processed_file_names)

    def get(self, idx):
        """Load and return a specific graph"""
        # NOTE(review): recent PyTorch releases default torch.load to
        # weights_only=True, which may refuse to unpickle Data objects —
        # confirm against the installed version.
        return torch.load(osp.join(self.processed_dir, f'data_{idx}.pt'))

In [None]:
# Example usage: build the dataset from the metadata CSV and the HDF5 feature
# file. `enable` is left at its default (False), so the standard 10-fold set
# of graphs is used, each built with k_neigh=5 nearest neighbours.
# NOTE(review): both paths are relative to the working directory — confirm the
# notebook is started from the folder that contains these files.
dataset = AudioGraphDataset(
    root='processed_data',
    path=['nepali_music_metadata.csv', 'nepali_features.hdf5'],
    k_neigh=5
)

In [None]:
# Access a specific graph: index 0 maps to data_0.pt, i.e. the first
# cross-validation fold written by AudioGraphDataset.
graph = dataset[0]

In [None]:
class GCNAudioClassifier(torch.nn.Module):
    """Three-layer GCN that maps node features to per-class log-probabilities."""

    def __init__(self, num_features, hidden_channels, num_classes, dropout=0.3):
        """
        Build the layers of the classifier.
        Args:
            num_features: Size of each input node feature vector
            hidden_channels: Width of the two hidden graph convolutions
            num_classes: Number of output classes
            dropout: Dropout probability applied after each hidden layer
        """
        super().__init__()
        # Graph convolutions: input -> hidden -> hidden -> classes.
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, num_classes)
        self.dropout = dropout
        # One batch-norm layer per hidden convolution.
        self.bn1 = torch.nn.BatchNorm1d(hidden_channels)
        self.bn2 = torch.nn.BatchNorm1d(hidden_channels)

    def forward(self, x, edge_index):
        """Return log-softmax class scores for every node."""
        hidden = x
        # Both hidden stages share the same recipe:
        # convolution -> batch norm -> ReLU -> dropout.
        for conv, norm in ((self.conv1, self.bn1), (self.conv2, self.bn2)):
            hidden = conv(hidden, edge_index)
            hidden = norm(hidden)
            hidden = F.relu(hidden)
            hidden = F.dropout(hidden, p=self.dropout, training=self.training)

        # The output convolution produces one logit per class.
        logits = self.conv3(hidden, edge_index)
        return F.log_softmax(logits, dim=1)

In [None]:
def train_model(model, data, optimizer, epochs=200, patience=20):
    """
    Train the GCN model with early stopping on the validation loss
    Args:
        model: GCN model instance
        data: PyG Data object containing the graph
        optimizer: PyTorch optimizer
        epochs: Maximum number of training epochs
        patience: Number of consecutive epochs without a new best validation
            loss after which training stops
    Returns:
        lists of training/validation losses and accuracies (one entry per
        epoch that was run): train_losses, val_losses, train_accs, val_accs
    """
    train_losses, val_losses = [], []
    train_accs, val_accs = [], []

    # The node subsets and their targets do not change between epochs, so
    # index them once instead of on every iteration.
    train_idx, val_idx = data.train_mask, data.val_mask
    y_train, y_val = data.y[train_idx], data.y[val_idx]
    y_train_cpu, y_val_cpu = y_train.cpu(), y_val.cpu()

    # Early stopping state
    best_val_loss = float('inf')
    epochs_without_improvement = 0

    for epoch in range(epochs):
        # Training step: loss is computed on training nodes only
        model.train()
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        loss = F.nll_loss(out[train_idx], y_train)
        loss.backward()
        optimizer.step()

        # Evaluation: second forward pass in eval mode, so dropout/batch-norm
        # behave as at inference time
        model.eval()
        with torch.no_grad():
            out = model(data.x, data.edge_index)
            train_out, val_out = out[train_idx], out[val_idx]

            # Training metrics
            train_loss = F.nll_loss(train_out, y_train).item()
            train_acc = accuracy_score(y_train_cpu, train_out.argmax(dim=1).cpu())

            # Validation metrics
            val_loss = F.nll_loss(val_out, y_val).item()
            val_acc = accuracy_score(y_val_cpu, val_out.argmax(dim=1).cpu())

        # Append metrics
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accs.append(train_acc)
        val_accs.append(val_acc)

        # Early stopping check (only a strictly lower loss counts as progress).
        # The model is left in its last-epoch state, not the best one.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print(f"Early stopping at epoch {epoch + 1}")
                break

        # Print progress
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1:03d}, '
                  f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, '
                  f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

    return train_losses, val_losses, train_accs, val_accs

In [None]:
def evaluate_model(model, data):
    """
    Score a trained model on the test nodes of a graph.
    Args:
        model: Trained GCN model
        data: PyG Data object containing the graph (uses x, edge_index, y and
            test_mask)
    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1';
        precision/recall/f1 are weighted averages over the classes
    """
    model.eval()
    with torch.no_grad():
        logits = model(data.x, data.edge_index)

        # Restrict predictions and targets to the held-out test nodes.
        test_nodes = data.test_mask
        y_pred = logits[test_nodes].argmax(dim=1).cpu()
        y_true = data.y[test_nodes].cpu()

        acc = accuracy_score(y_true, y_pred)
        prec, rec, f_score, _support = precision_recall_fscore_support(
            y_true, y_pred, average='weighted')

    metrics = {}
    metrics['accuracy'] = acc
    metrics['precision'] = prec
    metrics['recall'] = rec
    metrics['f1'] = f_score
    return metrics

In [None]:
from collections import defaultdict

def train_k_fold(dataset, hidden_channels=32, dropout=0.3, epochs=200, lr=0.01, num_classes=10):
    """
    Perform k-fold cross validation training with L2 regularization

    One fresh model is trained and evaluated per graph in ``dataset``. No
    learning-rate scheduler is applied: train_model only steps the optimizer.
    Args:
        dataset: AudioGraphDataset instance (one graph per fold)
        hidden_channels: Number of hidden units in GCN
        dropout: Dropout rate
        epochs: Number of epochs per fold
        lr: Learning rate
        num_classes: Number of output classes of the classifier
    Returns:
        fold_metrics: dict mapping each test metric name to a list with one
            value per fold
        curves: tuple (all_train_losses, all_val_losses, all_train_accs,
            all_val_accs), each a list with one per-epoch curve per fold
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Store metrics for each fold
    fold_metrics = defaultdict(list)
    all_train_losses = []
    all_val_losses = []
    all_train_accs = []
    all_val_accs = []

    num_folds = len(dataset)
    print(f"Training on {num_folds} folds...")

    for fold in range(num_folds):
        print(f"\nTraining Fold {fold + 1}/{num_folds}")

        # Get graph data for current fold
        data = dataset[fold].to(device)

        # The input width is read from the graph itself, so the model always
        # matches the stored feature dimension.
        model = GCNAudioClassifier(
            num_features=data.x.size(-1),
            hidden_channels=hidden_channels,
            num_classes=num_classes,
            dropout=dropout
        ).to(device)

        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)  # L2 regularization

        # Train on this fold
        train_losses, val_losses, train_accs, val_accs = train_model(
            model=model,
            data=data,
            optimizer=optimizer,
            epochs=epochs
        )

        # Store training curves
        all_train_losses.append(train_losses)
        all_val_losses.append(val_losses)
        all_train_accs.append(train_accs)
        all_val_accs.append(val_accs)

        # Evaluate on test set
        test_metrics = evaluate_model(model, data)
        for metric, value in test_metrics.items():
            fold_metrics[metric].append(value)

        print(f"Fold {fold + 1} Test Metrics:")
        for metric, value in test_metrics.items():
            print(f"{metric.capitalize()}: {value:.4f}")

    # Calculate average metrics across folds
    print("\nOverall Cross-Validation Results:")
    for metric, values in fold_metrics.items():
        mean_value = np.mean(values)
        std_value = np.std(values)
        print(f"{metric.capitalize()}: {mean_value:.4f} ± {std_value:.4f}")

    return fold_metrics, (all_train_losses, all_val_losses, all_train_accs, all_val_accs)

In [None]:
def plot_k_fold_curves(all_train_losses, all_val_losses, all_train_accs, all_val_accs):
    """
    Plot the fold-averaged learning curves with a one-standard-deviation band.

    Each argument is a list with one per-epoch curve per fold. Folds that ran
    for fewer epochs are padded with NaN, and the NaN-aware mean/std ignore
    the padding. Losses go on the left axis, accuracies on the right.
    """
    _fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # The longest training-loss curve defines the common number of epochs.
    n_epochs = max(len(curve) for curve in all_train_losses)

    def summarize(curves):
        """Stack NaN-padded curves and return their per-epoch (mean, std)."""
        stacked = np.array([
            np.pad(curve, (0, n_epochs - len(curve)),
                   mode='constant', constant_values=np.nan)
            for curve in curves
        ])
        return np.nanmean(stacked, axis=0), np.nanstd(stacked, axis=0)

    # Compute all statistics before drawing anything.
    train_loss_stats = summarize(all_train_losses)
    val_loss_stats = summarize(all_val_losses)
    train_acc_stats = summarize(all_train_accs)
    val_acc_stats = summarize(all_val_accs)

    epochs = range(1, len(train_loss_stats[0]) + 1)

    def draw_panel(ax, series, y_label, title):
        """Draw each (label, (mean, std)) series as a line plus shaded band."""
        for label, (mean, std) in series:
            ax.plot(epochs, mean, label=label)
            ax.fill_between(epochs, mean - std, mean + std, alpha=0.2)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend()
        ax.set_title(title)

    draw_panel(
        loss_ax,
        [('Train Loss', train_loss_stats), ('Validation Loss', val_loss_stats)],
        'Loss',
        'Average Training and Validation Loss',
    )
    draw_panel(
        acc_ax,
        [('Train Accuracy', train_acc_stats), ('Validation Accuracy', val_acc_stats)],
        'Accuracy',
        'Average Training and Validation Accuracy',
    )

    plt.tight_layout()
    plt.show()

In [None]:
# Run cross-validation on the dataset built above: one model is trained and
# evaluated per graph/fold. The hyper-parameters below are the same values as
# train_k_fold's defaults, spelled out here so they are easy to tune.
fold_metrics, curves = train_k_fold(
    dataset=dataset,
    hidden_channels=32,  # Reduced hidden channels
    dropout=0.3,         # Adjusted dropout
    epochs=200,
    lr=0.01
)

# Unpack the per-fold learning curves (one list of per-epoch values per fold)
all_train_losses, all_val_losses, all_train_accs, all_val_accs = curves

# Plot average learning curves across folds (mean with a std-dev band)
plot_k_fold_curves(all_train_losses, all_val_losses, all_train_accs, all_val_accs)