# Environmental Sound Classification (UrbanSound8K dataset)

## Goal
Build an RNN model and a Transformer model that classify the 10 urban sound classes of the UrbanSound8K dataset.

## 0. Imports

In [4]:
import soundata
import numpy as np
import torchaudio
import torch
from tqdm import tqdm
import librosa

## 1. Data preparation

In [None]:
class DataLoader:
    """Load UrbanSound8K through soundata and convert each clip to features.

    The pipeline is: download/validate the dataset, bring every clip to a
    common sample rate, channel count and length, extract log-mel, RMS energy
    and MFCC features, and sort the results into train/val/test by fold.

    Config keys read by this class: 'sample_rate', 'max_duration', 'n_fft',
    'hop_length', 'n_mels', 'n_mfcc', 'train_folds', 'val_folds',
    'test_folds'.

    NOTE(review): a 'resample' key is passed in the notebook config but is
    never read here; resampling always happens when the rates differ.
    NOTE(review): the class name shadows torch.utils.data.DataLoader if that
    is imported by name later in the notebook.
    """

    SPLITS = ('train', 'val', 'test')

    def __init__(self, config):
        """Store the configuration; no I/O happens until download_and_validate()."""
        self.config = config
        self.dataset = None
        self.clips = None
        self.feature_data = self._empty_feature_store()
        # Resample transforms keyed by (source rate, target rate), so the
        # resampling kernel is built once per rate pair instead of per clip.
        self._resamplers = {}

    @classmethod
    def _empty_feature_store(cls):
        """Return a fresh, empty per-split container for features/labels/folds."""
        return {
            split: {'features': [], 'labels': [], 'folds': []}
            for split in cls.SPLITS
        }

    @staticmethod
    def _has_entries(report):
        """Return True if a (possibly nested) validation report lists any file."""
        if isinstance(report, dict):
            return any(DataLoader._has_entries(value) for value in report.values())
        return bool(report)

    def download_and_validate(self):
        """Initialise the dataset, download it if files are missing, load clips.

        validate() reports missing files through its return value rather than
        by raising, so the returned report is inspected; an exception during
        validation is still treated as "data not usable, download it".
        """
        print("Initializing dataset")
        self.dataset = soundata.initialize('urbansound8k')

        try:
            # in case it is already downloaded
            print("Validating dataset")
            report = self.dataset.validate()
        except Exception as e:
            # Best effort: any validation failure falls back to downloading.
            print(f"Validation failed: {e}")
            needs_download = True
        else:
            # Expected shape is (missing_files, invalid_checksums); anything
            # else is treated as "nothing reported missing".
            missing_files = report[0] if isinstance(report, tuple) and report else None
            needs_download = self._has_entries(missing_files)

        if needs_download:
            print("Downloading dataset")
            self.dataset.download()

        self.clips = self.dataset.load_clips()
        print(f"Loaded {len(self.clips)} audio clips")

    def get_fold_number(self, clip_id):
        """Return the fold of ``clip_id`` as an int, or None if it is unknown.

        The public ``fold`` attribute of the clip is preferred; the private
        ``_clip_metadata`` dict is only used as a fallback.
        """
        clip = self.clips[clip_id]
        fold = getattr(clip, 'fold', None)
        if fold is None:
            metadata = getattr(clip, '_clip_metadata', None)
            if isinstance(metadata, dict):
                fold = metadata.get('fold')
        return None if fold is None else int(fold)

    def preprocess_audio(self, audio, original):
        """Return ``audio`` as a mono signal at the target rate and fixed length.

        Args:
            audio: waveform array; multi-channel input is assumed to be laid
                out as (channels, samples), as librosa.to_mono expects.
            original: sample rate of ``audio`` in Hz.

        Returns:
            1-D array with exactly ``max_duration * sample_rate`` samples.
        """
        audio = np.asarray(audio)

        # 1. channel normalization: down-mix first so that resampling runs on
        # a single channel (both steps are linear, so the order is equivalent)
        if audio.ndim > 1:
            audio = librosa.to_mono(audio)

        # 2. resample so every clip shares the same sample rate
        target_rate = self.config['sample_rate']
        if original != target_rate:
            key = (original, target_rate)
            resampler = self._resamplers.get(key)
            if resampler is None:
                resampler = torchaudio.transforms.Resample(
                    orig_freq=original,     # original sample rate
                    new_freq=target_rate    # target sample rate
                )
                self._resamplers[key] = resampler
            audio = resampler(torch.as_tensor(audio, dtype=torch.float32)).numpy()

        # 3. duration standardization: target length = duration * sample rate
        target_samples = int(self.config['max_duration'] * target_rate)

        if len(audio) < target_samples:
            # shorter than target: pad with zeros (silence) at the end only
            audio = np.pad(audio, (0, target_samples - len(audio)))
        else:
            # longer than (or equal to) target: keep the beginning portion
            audio = audio[:target_samples]
        return audio

    def extract_features(self, audio):
        """Compute the feature set for one preprocessed waveform.

        Returns:
            dict with keys 'waveform' (the input), 'log_mel', 'energy' and
            'mfcc'. With the notebook config the recorded shapes are
            (64, 126), (1, 126) and (20, 126) respectively.
        """
        # mel spectrogram
        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=self.config['sample_rate'],
            n_fft=self.config['n_fft'],
            hop_length=self.config['hop_length'],
            n_mels=self.config['n_mels']
        )
        # power -> dB (log) scale; ref=np.max makes each clip peak at 0 dB
        log_mel = librosa.power_to_db(mel_spec, ref=np.max)

        # RMS -> per-frame loudness estimate, framed like the spectrogram
        energy = librosa.feature.rms(
            y=audio,
            frame_length=self.config['n_fft'],
            hop_length=self.config['hop_length']
        )

        # MFCCs -> Mel Frequency Cepstral Coefficients,
        # a compact spectral representation
        mfcc = librosa.feature.mfcc(
            y=audio,
            sr=self.config['sample_rate'],
            n_mfcc=self.config['n_mfcc'],
            n_fft=self.config['n_fft'],
            hop_length=self.config['hop_length']
        )
        return {
            'waveform': audio,
            'log_mel': log_mel,
            'energy': energy,
            'mfcc': mfcc
        }

    def process_dataset(self):
        """Preprocess every loaded clip and file its features under its split.

        Previously collected features are discarded first, so calling this
        method again does not duplicate samples.

        Raises:
            RuntimeError: if download_and_validate() has not loaded the clips.
        """
        if self.clips is None:
            raise RuntimeError("No clips loaded; call download_and_validate() first")

        print("Processing dataset")
        self.feature_data = self._empty_feature_store()
        skipped = 0
        for clip_id, clip in tqdm(self.clips.items()):
            # Resolve the split before touching the audio so clips that
            # belong to no split are never decoded or processed.
            fold = self.get_fold_number(clip_id)
            split = self.get_split(fold)
            if split is None:
                skipped += 1
                continue

            audio, sr = clip.audio
            class_name = clip.tags.labels[0]
            features = self.extract_features(self.preprocess_audio(audio, sr))

            bucket = self.feature_data[split]
            bucket['features'].append(features)
            bucket['labels'].append(class_name)
            bucket['folds'].append(fold)

        if skipped:
            print(f"Skipped {skipped} clips whose fold is not assigned to any split")

    def get_split(self, fold):
        """Map a fold number to 'train', 'val' or 'test'; None if unassigned."""
        if fold in self.config['train_folds']:
            return 'train'
        elif fold in self.config['val_folds']:
            return 'val'
        elif fold in self.config['test_folds']:
            return 'test'
        return None

    def get_processed_data(self):
        """Return the collected data with every list converted to a numpy array.

        The result has the same split -> {'features', 'labels', 'folds'}
        layout as ``feature_data``; 'features' becomes an object array of the
        per-clip feature dicts.
        """
        return {
            split: {field: np.array(values) for field, values in store.items()}
            for split, store in self.feature_data.items()
        }


In [6]:
# Settings shared by preprocessing, feature extraction and the fold split.
# 4.0 s at 16 kHz gives 64000 samples per clip; with a hop of 512 that is
# 126 frames per feature matrix (see the shapes printed further down).
config = dict(
    # audio standardization
    sample_rate=16000,
    max_duration=4.0,
    # spectral analysis
    n_fft=2048,
    hop_length=512,
    n_mels=64,
    n_mfcc=20,
    # fold-based split: folds 1-6 train, 7-8 validation, 9-10 test
    train_folds=[1, 2, 3, 4, 5, 6],
    val_folds=[7, 8],
    test_folds=[9, 10],
    resample=True,
)

# Run the pipeline: fetch/verify the data, then featurize every clip.
data_loader = DataLoader(config)
data_loader.download_and_validate()
data_loader.process_dataset()

Initializing dataset
Validating dataset


100%|██████████| 1/1 [00:00<00:00, 335.89it/s]
100%|██████████| 8732/8732 [00:28<00:00, 309.09it/s]
INFO: Success: the dataset is complete and all files are valid.
INFO: --------------------


Loaded 8732 audio clips
Processing dataset


100%|██████████| 8732/8732 [05:34<00:00, 26.11it/s]


In [7]:
# Pull the numpy-converted splits out of the loader and bind each field
# (features, labels, folds) to its own module-level name.
processed_data = data_loader.get_processed_data()

# training data
train_split = processed_data['train']
train_features, train_labels, train_folds = (
    train_split['features'], train_split['labels'], train_split['folds']
)

# validation data
val_split = processed_data['val']
val_features, val_labels, val_folds = (
    val_split['features'], val_split['labels'], val_split['folds']
)

# test data
test_split = processed_data['test']
test_features, test_labels, test_folds = (
    test_split['features'], test_split['labels'], test_split['folds']
)

In [None]:
# Summary of the three splits: sizes, per-feature shapes of one sample,
# and which classes and folds ended up in each split.
summary_rows = (
    ("Training", train_features, train_labels, train_folds),
    ("Validation", val_features, val_labels, val_folds),
    ("Test", test_features, test_labels, test_folds),
)

# number of samples per split
print("\nDataset statistics:")
for split_name, feats, _, _ in summary_rows:
    print(f"{split_name} set: {len(feats)} samples")

# shapes of every feature of the first training sample
print("\nFeature Shapes (first sample):")
first_sample = train_features[0]
for title, key in (("Waveform", 'waveform'), ("Log Mel", 'log_mel'),
                   ("MFCCs", 'mfcc'), ("Energy", 'energy')):
    print(f"{title}: {first_sample[key].shape}")

# distinct class labels present in each split
print("\nClass distribution:")
for split_name, _, labels, _ in summary_rows:
    print(f"{split_name} classes: {np.unique(labels)}")

# distinct fold numbers present in each split
print("\nFold distribution:")
for split_name, _, _, folds in summary_rows:
    print(f"{split_name} folds: {np.unique(folds)}")


Dataset statistics:
Training set: 5435 samples
Validation set: 1644 samples
Test set: 1653 samples

Feature Shapes (first sample):
Waveform: (64000,)
Log-Mel: (64, 126)
MFCCs: (20, 126)
Energy: (1, 126)

Class distribution:
Training classes: ['air_conditioner' 'car_horn' 'children_playing' 'dog_bark' 'drilling'
 'engine_idling' 'gun_shot' 'jackhammer' 'siren' 'street_music']
Validation classes: ['air_conditioner' 'car_horn' 'children_playing' 'dog_bark' 'drilling'
 'engine_idling' 'gun_shot' 'jackhammer' 'siren' 'street_music']
Test classes: ['air_conditioner' 'car_horn' 'children_playing' 'dog_bark' 'drilling'
 'engine_idling' 'gun_shot' 'jackhammer' 'siren' 'street_music']

Fold distribution:
Training folds: [1 2 3 4 5 6]
Validation folds: [7 8]
Test folds: [ 9 10]


## 2. Building the Model

### 2.1 RNN Model

### 2.2 Transformer Model

## 3. Comparison