In [1]:
import os.path as osp
import pandas as pd
import numpy as np
import torch
import h5py
from torch_geometric.data import Dataset, Data
from torch_geometric.nn import knn_graph

In [2]:
class AudioGraphDataset(Dataset):
    """Single-graph dataset built from a feature matrix stored in an HDF5 file.

    Each row of the HDF5 ``features`` array becomes one node; nodes are linked
    to their ``k_neigh`` nearest neighbours (computed on the normalised
    features). Train/val/test membership is stored as boolean node masks on
    the one ``Data`` object the dataset contains.

    Note: the processed graph is cached as ``data.pt`` under the dataset's
    processed directory, and ``process`` is driven by the base class. Whether
    a changed ``k_neigh``/``split_ratio``/``seed`` triggers re-processing is
    up to the base class -- presumably an existing cache is reused; verify,
    and delete the cache file if in doubt.
    """

    def __init__(self, root, path, k_neigh=5, split_ratio=(0.7, 0.15), transform=None, seed=None):
        """
        Initialize the Audio Graph Dataset
        Args:
            root: Root directory for processed files
            path: List containing paths to [csv_file, hdf5_file]
            k_neigh: Number of nearest neighbors for graph construction
            split_ratio: Tuple of (train_ratio, val_ratio); the remainder is
                the test split
            transform: Optional transform forwarded to the base class
            seed: Optional integer seed for the random split. ``None`` (the
                default) draws from torch's global RNG.
        Raises:
            ValueError: If the ratios are negative or sum to more than 1.
        """
        train_ratio, val_ratio = split_ratio
        if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1:
            raise ValueError(
                f"split_ratio must be non-negative and sum to at most 1, got {split_ratio}"
            )

        # These must be set before the base-class constructor runs, because
        # process() reads them.
        self.csv_path = path[0]
        self.hdf5_path = path[1]
        self.k_neigh = k_neigh
        self.split_ratio = split_ratio
        self.seed = seed
        super().__init__(root, transform)

    @property
    def raw_file_names(self):
        """Base names of the CSV and HDF5 input files."""
        return [osp.basename(self.csv_path), osp.basename(self.hdf5_path)]

    @property
    def processed_file_names(self):
        """Name of the single cached graph file."""
        return ['data.pt']

    def process(self):
        """Build the k-NN graph with split masks and save it to ``data.pt``.

        Raises:
            ValueError: If features and labels disagree on the number of rows,
                or if the training split would be empty.
        """
        # Only the HDF5 file feeds the graph; the metadata CSV is not needed
        # to build it, so it is no longer read here.
        with h5py.File(self.hdf5_path, 'r') as hf:
            features = torch.tensor(hf['features'][:], dtype=torch.float32)
            labels = torch.tensor(hf['labels'][:], dtype=torch.long)

        num_nodes = features.size(0)
        if labels.size(0) != num_nodes:
            raise ValueError(
                f"features has {num_nodes} rows but labels has {labels.size(0)}"
            )

        # Random node permutation; seeded only when the caller asked for it.
        generator = None
        if self.seed is not None:
            generator = torch.Generator().manual_seed(self.seed)
        indices = torch.randperm(num_nodes, generator=generator)

        train_end = int(num_nodes * self.split_ratio[0])
        val_end = train_end + int(num_nodes * self.split_ratio[1])
        if train_end == 0:
            raise ValueError(
                "training split is empty; increase split_ratio[0] or provide more samples"
            )

        # Boolean node masks for the three splits.
        train_mask = torch.zeros(num_nodes, dtype=torch.bool)
        val_mask = torch.zeros(num_nodes, dtype=torch.bool)
        test_mask = torch.zeros(num_nodes, dtype=torch.bool)

        train_mask[indices[:train_end]] = True
        val_mask[indices[train_end:val_end]] = True
        test_mask[indices[val_end:]] = True

        # Standardise with statistics from the training nodes only.
        train_features = features[train_mask]
        mean = train_features.mean(dim=0)
        std = train_features.std(dim=0)
        # A constant feature gives std == 0 and a single training row gives
        # NaN; dividing by either would poison the matrix, so such features
        # are only centred (scale 1).
        std = torch.where(std > 0, std, torch.ones_like(std))
        features = (features - mean) / std

        # k-NN edges are computed over ALL nodes (train, val and test).
        edge_index = knn_graph(features, k=self.k_neigh, loop=False)

        data = Data(
            x=features,
            edge_index=edge_index,
            y=labels,
            train_mask=train_mask,
            val_mask=val_mask,
            test_mask=test_mask,
            # Kept so inference code can apply the same normalisation.
            feature_mean=mean,
            feature_std=std,
        )

        torch.save(data, osp.join(self.processed_dir, 'data.pt'))

    def len(self):
        """The dataset always holds exactly one graph."""
        return 1  # Single graph dataset

    def get(self, idx):
        """Load the cached graph from disk; ``idx`` is ignored."""
        # The file holds a pickled Data object written by process(), not a
        # bare tensor/state_dict, so weights-only loading cannot read it.
        # Only load caches you produced yourself.
        return torch.load(osp.join(self.processed_dir, 'data.pt'), weights_only=False)

In [3]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Three stacked graph-convolution layers producing per-node log-probabilities.

    Layer widths are ``num_features -> 64 -> 32 -> num_classes``. ReLU and
    dropout (p=0.5) follow the first two layers only.
    """

    def __init__(self, num_features, num_classes):
        """
        Args:
            num_features: Size of each node's input feature vector
            num_classes: Number of output classes
        """
        super().__init__()
        first_width, second_width = 64, 32
        self.conv1 = GCNConv(num_features, first_width)
        self.conv2 = GCNConv(first_width, second_width)
        self.conv3 = GCNConv(second_width, num_classes)
        self.dropout = torch.nn.Dropout(0.5)

    def forward(self, data):
        """Return log-softmax class scores for every node of ``data``.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``
        """
        hidden = data.x
        edges = data.edge_index

        # Hidden layers: convolution, then ReLU, then dropout.
        for conv in (self.conv1, self.conv2):
            hidden = self.dropout(F.relu(conv(hidden, edges)))

        # Output layer: no activation or dropout before the log-softmax.
        scores = self.conv3(hidden, edges)
        return F.log_softmax(scores, dim=1)

In [4]:
def train(model, data, optimizer):
    """Run one full-graph optimisation step and return the training loss.

    The model is evaluated on the whole graph, but the negative log-likelihood
    is computed only on the nodes selected by ``data.train_mask``.

    Args:
        model: Module mapping ``data`` to per-node log-probabilities
        data: Graph object with ``y`` and ``train_mask``
        optimizer: Optimizer bound to ``model``'s parameters
    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    log_probs = model(data)
    selected = data.train_mask
    step_loss = F.nll_loss(log_probs[selected], data.y[selected])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

def evaluate(model, data):
    """Compute accuracy on the train, validation and test node masks.

    Args:
        model: Module mapping ``data`` to per-node class scores
        data: Graph object with ``y``, ``train_mask``, ``val_mask``, ``test_mask``
    Returns:
        Tuple ``(train_acc, val_acc, test_acc)`` of Python floats.
    """
    model.eval()

    with torch.no_grad():
        predicted = model(data).argmax(dim=1)

        def masked_accuracy(mask):
            # Fraction of nodes under `mask` whose predicted class equals the label.
            return (predicted[mask] == data.y[mask]).float().mean()

        per_split = [
            masked_accuracy(split_mask)
            for split_mask in (data.train_mask, data.val_mask, data.test_mask)
        ]

    return tuple(score.item() for score in per_split)

def run_training(root, path, num_epochs=200, checkpoint_path='best_model.pth'):
    """Train a GCN on the audio graph and report the best-validation checkpoint.

    Each epoch runs one optimisation step and prints loss plus train/val/test
    accuracy. Whenever validation accuracy improves, the model's
    ``state_dict`` is written to ``checkpoint_path``. After training, that
    checkpoint is reloaded and evaluated once more.

    Args:
        root: Root directory passed to ``AudioGraphDataset``
        path: ``[csv_file, hdf5_file]`` passed to ``AudioGraphDataset``
        num_epochs: Number of training epochs
        checkpoint_path: Where the best ``state_dict`` is saved
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load dataset (a single graph)
    dataset = AudioGraphDataset(root=root, path=path)
    data = dataset[0].to(device)

    # Size the output layer by the largest label id rather than by the number
    # of distinct labels: nll_loss indexes the output by label value, so a gap
    # in the ids (e.g. labels {0, 2}) would otherwise be out of range. For
    # contiguous ids 0..C-1 both give C.
    num_classes = int(data.y.max().item()) + 1
    model = GCN(num_features=data.num_features, num_classes=num_classes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint, even if validation accuracy is 0.
    best_val_acc = float('-inf')
    checkpoint_saved = False
    for epoch in range(1, num_epochs + 1):
        loss = train(model, data, optimizer)
        train_acc, val_acc, test_acc = evaluate(model, data)

        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True

        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, '
              f'Train: {train_acc:.4f}, Val: {val_acc:.4f}, Test: {test_acc:.4f}')

    print('\nFinal Results:')
    if checkpoint_saved:
        # The file holds only tensors (a state_dict), so weights-only loading
        # is sufficient and avoids unpickling arbitrary objects.
        state = torch.load(checkpoint_path, map_location=device, weights_only=True)
        model.load_state_dict(state)
    else:
        # Happens with num_epochs == 0 or a NaN validation accuracy (empty
        # validation split). Never fall back to a stale file from an earlier run.
        print('No checkpoint was written during this run; reporting the current model.')
    train_acc, val_acc, test_acc = evaluate(model, data)
    print(f'Best Model - Train: {train_acc:.4f}, Val: {val_acc:.4f}, Test: {test_acc:.4f}')

In [5]:
if __name__ == "__main__":
    # Train for 200 epochs; processed graph files go under ./processed_data.
    # The second argument is [metadata CSV, HDF5 feature file].
    run_training(
        './processed_data',
        ['nepali_music_metadata.csv', 'nepali_features.hdf5'],
        200,
    )

Processing...
Done!
  return torch.load(osp.join(self.processed_dir, 'data.pt'))


Epoch: 001, Loss: 2.5913, Train: 0.2348, Val: 0.2290, Test: 0.2651
Epoch: 002, Loss: 4.9368, Train: 0.3756, Val: 0.3610, Test: 0.3488
Epoch: 003, Loss: 3.9707, Train: 0.5452, Val: 0.5170, Test: 0.5137
Epoch: 004, Loss: 3.0368, Train: 0.5474, Val: 0.5328, Test: 0.5468
Epoch: 005, Loss: 3.4758, Train: 0.5940, Val: 0.5859, Test: 0.5915
Epoch: 006, Loss: 3.1075, Train: 0.6196, Val: 0.6216, Test: 0.6098
Epoch: 007, Loss: 2.5600, Train: 0.6240, Val: 0.6166, Test: 0.6056
Epoch: 008, Loss: 2.2971, Train: 0.6432, Val: 0.6349, Test: 0.6123
Epoch: 009, Loss: 2.1712, Train: 0.6480, Val: 0.6307, Test: 0.6305
Epoch: 010, Loss: 2.0167, Train: 0.6528, Val: 0.6448, Test: 0.6413
Epoch: 011, Loss: 1.9106, Train: 0.6752, Val: 0.6656, Test: 0.6587
Epoch: 012, Loss: 1.6155, Train: 0.6750, Val: 0.6747, Test: 0.6678
Epoch: 013, Loss: 1.5939, Train: 0.6777, Val: 0.6739, Test: 0.6578
Epoch: 014, Loss: 1.4183, Train: 0.6921, Val: 0.6896, Test: 0.6686
Epoch: 015, Loss: 1.2795, Train: 0.6910, Val: 0.6830, Test: 0.

  model.load_state_dict(torch.load('best_model.pth'))


## Testing 

In [6]:
import torch
import numpy as np
import pandas as pd
from pathlib import Path

In [7]:
class AudioTester:
    """Classify raw ``.wav`` files with a trained graph model.

    Pipeline: YAMNet embeddings (mean-pooled per file) -> standardisation with
    the training statistics -> k-NN graph over the test clips -> model
    inference. The graph contains only the clips passed to ``process_audio``.
    """

    def __init__(self, model_path, train_stats, k_neigh=5, device='cuda', model=None):
        """
        Args:
            model_path: Checkpoint written with ``torch.save``; either a
                ``state_dict`` (what the training cell saves) or a whole module
            train_stats: Tuple ``(mean, std)`` of per-feature training statistics
            k_neigh: Number of nearest neighbours for the test graph
            device: Device to run on; falls back to CPU when CUDA is
                requested but unavailable
            model: Optional un-trained module to load a ``state_dict`` into.
                When omitted, a ``GCN`` is built with sizes inferred from the
                checkpoint and ``train_stats``.
        """
        # Imported here because nothing else in the notebook imports it.
        import tensorflow_hub as hub

        if torch.device(device).type == 'cuda' and not torch.cuda.is_available():
            device = 'cpu'
        self.device = device
        self.k_neigh = k_neigh
        self.train_mean, self.train_std = train_stats
        self.model = self._load_model(model_path, model).to(device).eval()

        # YamNet initialization (match your training setup)
        self.yamnet = hub.load('https://tfhub.dev/google/yamnet/1')

    def _load_model(self, model_path, model=None):
        """Return a module with the checkpoint's weights loaded."""
        checkpoint = torch.load(model_path, map_location=self.device)
        if isinstance(checkpoint, torch.nn.Module):
            # The file contained a whole pickled module.
            return checkpoint

        # Otherwise treat it as a state_dict, which is what training saves.
        if model is None:
            try:
                # Assumes the output layer registers a 1-D `bias` of length
                # num_classes under 'conv3.bias' -- TODO confirm for the
                # installed torch_geometric version.
                num_classes = int(checkpoint['conv3.bias'].shape[0])
            except (KeyError, TypeError, AttributeError) as exc:
                raise ValueError(
                    "cannot infer the architecture from the checkpoint; "
                    "pass an instantiated network via `model=`"
                ) from exc
            # The mean holds one entry per input feature.
            num_features = int(torch.as_tensor(self.train_mean).shape[-1])
            model = GCN(num_features, num_classes)
        model.load_state_dict(checkpoint)
        return model

    def _normalize(self, features):
        """Standardise ``features`` with the training mean/std as a float32 CPU tensor.

        Raises:
            ValueError: If ``features`` is not a non-empty 2-D array.
        """
        x = torch.as_tensor(np.asarray(features), dtype=torch.float32)
        if x.dim() != 2 or x.size(0) == 0:
            raise ValueError("no audio features to process (expected a non-empty 2-D array)")

        mean = torch.as_tensor(self.train_mean, dtype=torch.float32, device='cpu')
        std = torch.as_tensor(self.train_std, dtype=torch.float32, device='cpu')
        # A zero (or NaN) std would turn the feature into inf/NaN; centre only.
        std = torch.where(std > 0, std, torch.ones_like(std))
        return (x - mean) / std

    def process_audio(self, audio_dir):
        """Process raw audio files into graph format

        Returns:
            A ``Data`` object with ``x``, ``edge_index`` and ``paths`` (one
            file path per node, in node order).
        Raises:
            ValueError: If no file under ``audio_dir`` yielded features.
        """
        # Feature extraction
        features, paths = self._extract_features(audio_dir)

        # Normalize using training statistics
        x = self._normalize(features)

        # Create test graph (neighbours are searched among the test clips only)
        edge_index = knn_graph(x, k=self.k_neigh, loop=False)

        return Data(
            x=x.to(self.device),
            edge_index=edge_index.to(self.device),
            paths=paths
        )

    def _extract_features(self, audio_dir):
        """Extract YamNet features matching training setup

        Returns:
            ``(features, paths)``: an array with one mean-pooled embedding per
            successfully processed file, and the matching file paths.
        """
        # Imported here because nothing else in the notebook imports them.
        import librosa
        from tqdm.auto import tqdm

        features = []
        paths = []

        # Sorted so node order does not depend on filesystem enumeration order.
        wav_files = sorted(Path(audio_dir).glob('**/*.wav'))
        for audio_path in tqdm(wav_files):
            try:
                # Match your training feature extraction
                audio = librosa.load(audio_path, sr=16000, mono=True)[0]
                _, embeddings, _ = self.yamnet(audio)
                # Average the frame-level embeddings into one vector per file.
                features.append(np.mean(embeddings.numpy(), axis=0))
                paths.append(str(audio_path))
            except Exception as e:
                # Best effort: one unreadable file must not abort the batch.
                print(f"Skipping {audio_path}: {str(e)}")

        return np.array(features), paths

    def predict(self, test_data):
        """Run inference on processed test graph

        Returns:
            DataFrame with one row per node: ``path``, ``prediction`` (class
            index), ``confidence`` (highest class probability) and
            ``probabilities`` (full per-class vector).
        """
        with torch.no_grad():
            outputs = self.model(test_data)
            preds = outputs.argmax(dim=1).cpu().numpy()
            # softmax is invariant to the per-row shift log_softmax applies,
            # so this is correct for both logits and log-probabilities.
            probs = torch.softmax(outputs, dim=1).cpu().numpy()

        return pd.DataFrame({
            'path': test_data.paths,
            'prediction': preds,
            'confidence': probs.max(axis=1),
            'probabilities': list(probs)
        })
