In [None]:
!pip install torch torchaudio librosa scikit-learn tqdm

In [12]:
import torch
import torchaudio
import librosa

import os
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch import nn
from tqdm import tqdm
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
import torch.optim as optim
from sklearn.model_selection import train_test_split


# Custom Dataset Class for UrbanSound8K
class UrbanSound8KDataset(Dataset):
    def __init__(self, audio_dir, file, sample_rate=22050, transform=None):
        """
        Args:
            audio_dir (str): Path to the directory containing audio files.
            file (str): Path to the CSV file containing metadata (with fsID, classID, etc.).
            sample_rate (int): The sample rate to resample audio to (default is 22050).
            transform (callable, optional): A function/transform to apply to the audio (e.g., MFCC).
        """
        self.audio_dir = audio_dir
        self.metadata = pd.read_csv('/kaggle/input/urbansound8k/UrbanSound8K.csv')
        self.sample_rate = sample_rate
        self.transform = transform
        
        # Label encoding for classID to numerical labels
        self.label_encoder = LabelEncoder()
        self.metadata['classID'] = self.label_encoder.fit_transform(self.metadata['class'])

    def __len__(self):
        """Return the total number of samples in the dataset."""
        return len(self.metadata)

    def extract_features(self, X):
        result = np.array([])

        # MFCC
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=self.sample_rate, n_mfcc=40).T, axis=0)
        result = np.hstack((result, mfccs))

        # Chroma_STFT
        stft = np.abs(librosa.stft(X))
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=self.sample_rate, n_chroma=32, window="hamming", n_fft=1024).T, axis=0)
        result = np.hstack((result, chroma))

        # Mel Spectrogram
        mel = np.mean(librosa.feature.melspectrogram(y=X, sr=self.sample_rate, n_mels=128, fmax=8000, n_fft=1024, hop_length=512, window="hamming").T, axis=0)
        result = np.hstack((result, mel))

        # Zero Crossing Rate
        Z = np.mean(librosa.feature.zero_crossing_rate(y=X), axis=1)
        result = np.hstack((result, Z))

        # Root Mean Square Energy
        rms = np.mean(librosa.feature.rms(y=X).T, axis=0)
        result = np.hstack((result, rms))

        return result

    def __getitem__(self, idx):
        """Return the sample (audio, label, metadata) at index `idx`."""
        # Get the metadata for the current sample
        row = self.metadata.iloc[idx]
        start_time = row['start']
        end_time = row['end']
        fold = row['fold']
        file_name = row['slice_file_name']
        label = row['classID']
        
        # Load the audio file using librosa
        audio_path = os.path.join(self.audio_dir, f"fold{fold}", file_name)
        waveform, sample_rate = librosa.load(audio_path, sr=self.sample_rate)
        
        # Resample if the sample rate does not match the desired rate
        if sample_rate != self.sample_rate:
            waveform = librosa.resample(waveform, sample_rate, self.sample_rate)
        
        # Extract features
        features = self.extract_features(waveform)
        
        # Convert features to tensor and label to long
        features_tensor = torch.tensor(features, dtype=torch.float32)
        label_tensor = torch.tensor(label, dtype=torch.long)  # Ensure label is of type long

        sample = {
            'features': features_tensor,
            'start': start_time,
            'end': end_time,
            'fold': fold,
            'file_name': file_name,
            'label': label_tensor
        }
        
        return sample



class AudioClassifierANN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(AudioClassifierANN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_classes)  # Raw logits, no softmax here

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)  # Raw logits
        return x



# Create function to split dataset into train, validation, and test sets
def create_datasets(audio_dir, file, test_size=0.2, val_size=0.2):
    dataset = UrbanSound8KDataset(audio_dir=audio_dir, file=file)
    
    # Splitting the data into train, validation, and test sets
    train_metadata, temp_metadata = train_test_split(dataset.metadata, test_size=test_size + val_size, stratify=dataset.metadata['classID'])
    val_metadata, test_metadata = train_test_split(temp_metadata, test_size=test_size / (test_size + val_size), stratify=temp_metadata['classID'])
    
    # Create Dataset instances for train, validation, and test
    train_dataset = UrbanSound8KDataset(audio_dir=audio_dir, file=train_metadata)
    val_dataset = UrbanSound8KDataset(audio_dir=audio_dir, file=val_metadata)
    test_dataset = UrbanSound8KDataset(audio_dir=audio_dir, file=test_metadata)
    
    return train_dataset, val_dataset, test_dataset


# Create DataLoaders for train, validation, and test datasets
def create_dataloaders(train_dataset, val_dataset, test_dataset, batch_size=16):
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    return train_loader, val_loader, test_loader




def train(model, train_loader, criterion, optimizer, device):
    print("Starting training for the epoch...")
    model.train()
    running_loss = 0.0
    correct_preds = 0
    total_preds = 0

    # Initialize tqdm for progress bar
    with tqdm(train_loader, desc="Training", unit="batch", ncols=100) as pbar:
        for batch in pbar:
            features = batch['features'].to(device)
            labels = batch['label'].to(device)

            # Flatten features for input to the ANN (if necessary)
            features = features.view(features.size(0), -1)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(features)

            # Calculate the loss
            loss = criterion(outputs, labels)
            running_loss += loss.item()

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            total_preds += labels.size(0)
            correct_preds += (predicted == labels).sum().item()

            # Update progress bar with current loss and accuracy
            avg_loss = running_loss / (pbar.n + 1)  # Average loss till now
            accuracy = (correct_preds / total_preds) * 100
            pbar.set_postfix(loss=avg_loss, accuracy=accuracy)

    # Return epoch's average loss and accuracy
    avg_loss = running_loss / len(train_loader)
    accuracy = correct_preds / total_preds * 100
    return avg_loss, accuracy


# Define testing/validation function
def test(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct_preds = 0
    total_preds = 0

    with torch.no_grad():
        for batch in dataloader:
            features = batch['features'].to(device)
            labels = batch['label'].to(device)

            # Flatten features for input to the ANN (if necessary)
            features = features.view(features.size(0), -1)

            # Forward pass
            outputs = model(features)

            # Calculate the loss
            loss = criterion(outputs, labels)
            running_loss += loss.item()

            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            total_preds += labels.size(0)
            correct_preds += (predicted == labels).sum().item()

    # Print statistics
    avg_loss = running_loss / len(dataloader)
    accuracy = correct_preds / total_preds * 100
    return avg_loss, accuracy

# Hann

In [2]:
if __name__ == '__main__':
    # Load datasets and DataLoaders
    audio_dir = '/kaggle/input/urbansound8k'
    file = '/kaggle/input/urbansound8k/UrbanSound8K.csv'

    # Split dataset into train, validation, and test sets
    train_dataset, val_dataset, test_dataset = create_datasets(audio_dir, file)

    # Create DataLoaders for each set
    train_loader, val_loader, test_loader = create_dataloaders(train_dataset, val_dataset, test_dataset)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    # Initialize model, loss function, and optimizer
    input_size = 40 + 32 + 128 + 1 + 1  # Features for MFCC, Chroma, Mel, ZCR, RMS
    num_classes = len(train_dataset.label_encoder.classes_)  # Number of sound classes
    model = AudioClassifierANN(input_size=input_size, num_classes=num_classes).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Initialize best validation accuracy and model checkpoint
    best_val_acc = 0.0
    best_model_path = '/kaggle/working/best_model.pth'  # Path where the best model will be saved

    # Training loop
    num_epochs = 10
    for epoch in range(num_epochs):
        train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = test(model, val_loader, criterion, device)
        
        print(f'Epoch {epoch+1}/{num_epochs}, '
              f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.2f}%, '
              f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.2f}%')

        # Save the model if it has better validation accuracy
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), best_model_path)
            print(f'Saved best model with Val Accuracy: {val_acc:.2f}%')

    # Load the best model for final evaluation
    model.load_state_dict(torch.load(best_model_path))
    model.to(device)

    # Evaluate on the test set
    test_loss, test_acc = test(model, test_loader, criterion, device)
    print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.2f}%')

cuda
Starting training for the epoch...


  return pitch_tuning(
Training: 100%|████████████████████████| 546/546 [13:01<00:00,  1.43s/batch, accuracy=55, loss=1.33]
  labels = torch.tensor(batch['label']).to(device)


Epoch 1/10, Train Loss: 1.3291, Train Accuracy: 54.98%, Val Loss: 1.0251, Val Accuracy: 66.23%
Saved best model with Val Accuracy: 66.23%
Starting training for the epoch...


Training: 100%|███████████████████████| 546/546 [09:19<00:00,  1.02s/batch, accuracy=73, loss=0.833]


Epoch 2/10, Train Loss: 0.8335, Train Accuracy: 72.96%, Val Loss: 0.6512, Val Accuracy: 79.82%
Saved best model with Val Accuracy: 79.82%
Starting training for the epoch...


Training: 100%|█████████████████████| 546/546 [09:19<00:00,  1.02s/batch, accuracy=79.9, loss=0.624]


Epoch 3/10, Train Loss: 0.6238, Train Accuracy: 79.88%, Val Loss: 0.4886, Val Accuracy: 84.05%
Saved best model with Val Accuracy: 84.05%
Starting training for the epoch...


Training: 100%|█████████████████████| 546/546 [09:20<00:00,  1.03s/batch, accuracy=83.7, loss=0.498]


Epoch 4/10, Train Loss: 0.4983, Train Accuracy: 83.68%, Val Loss: 0.4311, Val Accuracy: 85.52%
Saved best model with Val Accuracy: 85.52%
Starting training for the epoch...


Training: 100%|█████████████████████| 546/546 [09:17<00:00,  1.02s/batch, accuracy=86.2, loss=0.412]


Epoch 5/10, Train Loss: 0.4120, Train Accuracy: 86.21%, Val Loss: 0.3441, Val Accuracy: 88.67%
Saved best model with Val Accuracy: 88.67%
Starting training for the epoch...


Training:   2%|▍                     | 11/546 [00:12<10:03,  1.13s/batch, accuracy=88.1, loss=0.298]


KeyboardInterrupt: 

# Hamming

In [13]:
if __name__ == '__main__':
    # Load datasets and DataLoaders
    audio_dir = '/kaggle/input/urbansound8k'
    file = '/kaggle/input/urbansound8k/UrbanSound8K.csv'

    # Split dataset into train, validation, and test sets
    train_dataset, val_dataset, test_dataset = create_datasets(audio_dir, file)

    # Create DataLoaders for each set
    train_loader, val_loader, test_loader = create_dataloaders(train_dataset, val_dataset, test_dataset)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    # Initialize model, loss function, and optimizer
    input_size = 40 + 32 + 128 + 1 + 1  # Features for MFCC, Chroma, Mel, ZCR, RMS
    num_classes = len(train_dataset.label_encoder.classes_)  # Number of sound classes
    model = AudioClassifierANN(input_size=input_size, num_classes=num_classes).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Initialize best validation accuracy and model checkpoint
    best_val_acc = 0.0
    best_model_path = '/kaggle/working/best_model_hamming.pth'  # Path where the best model will be saved

    # Training loop
    num_epochs = 5
    for epoch in range(num_epochs):
        train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = test(model, val_loader, criterion, device)
        
        print(f'Epoch {epoch+1}/{num_epochs}, '
              f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.2f}%, '
              f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.2f}%')

        # Save the model if it has better validation accuracy
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), best_model_path)
            print(f'Saved best model with Val Accuracy: {val_acc:.2f}%')

    # Load the best model for final evaluation
    model.load_state_dict(torch.load(best_model_path))
    model.to(device)

    # Evaluate on the test set
    test_loss, test_acc = test(model, test_loader, criterion, device)
    print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.2f}%')

cuda
Starting training for the epoch...


  return pitch_tuning(
Training: 100%|██████████████████████| 546/546 [09:24<00:00,  1.03s/batch, accuracy=55.8, loss=1.32]


Epoch 1/5, Train Loss: 1.3210, Train Accuracy: 55.78%, Val Loss: 0.8546, Val Accuracy: 72.34%
Saved best model with Val Accuracy: 72.34%
Starting training for the epoch...


Training: 100%|███████████████████████| 546/546 [09:23<00:00,  1.03s/batch, accuracy=73, loss=0.817]


Epoch 2/5, Train Loss: 0.8169, Train Accuracy: 72.96%, Val Loss: 0.6706, Val Accuracy: 79.29%
Saved best model with Val Accuracy: 79.29%
Starting training for the epoch...


Training: 100%|█████████████████████| 546/546 [09:21<00:00,  1.03s/batch, accuracy=80.2, loss=0.618]


Epoch 3/5, Train Loss: 0.6184, Train Accuracy: 80.20%, Val Loss: 0.5094, Val Accuracy: 83.34%
Saved best model with Val Accuracy: 83.34%
Starting training for the epoch...


Training: 100%|█████████████████████| 546/546 [09:19<00:00,  1.02s/batch, accuracy=83.4, loss=0.492]


Epoch 4/5, Train Loss: 0.4918, Train Accuracy: 83.39%, Val Loss: 0.4852, Val Accuracy: 83.30%
Starting training for the epoch...


Training: 100%|█████████████████████| 546/546 [09:18<00:00,  1.02s/batch, accuracy=86.3, loss=0.414]


Epoch 5/5, Train Loss: 0.4136, Train Accuracy: 86.26%, Val Loss: 0.3623, Val Accuracy: 87.71%
Saved best model with Val Accuracy: 87.71%


  model.load_state_dict(torch.load(best_model_path))


Test Loss: 0.3623, Test Accuracy: 87.71%
