In [11]:
from data_loader import data_loader
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import os
import librosa
import numpy as np
import soundfile as sf
import random
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, Gain, PolarityInversion, TimeMask, FrequencyMask, SpecFrequencyMask, RoomSimulator, AddBackgroundNoise
from IPython.display import Audio

In [12]:
timemask = TimeMask(min_band_part=0.1, max_band_part=1, p=1)
freqmask = FrequencyMask(min_frequency_band=0.1, max_frequency_band=1, p=1)

def noise(data):
    noise_amp = 0.035*np.random.uniform()*np.amax(data)
    data = data + noise_amp*np.random.normal(size=data.shape[0])
    return data

def shift(data):
    shift_range = int(np.random.uniform(low=-5, high = 5)*1000)
    return np.roll(data, shift_range)

def pitch(data, pitch_factor=0.7):
    return librosa.effects.pitch_shift(data, 16000, pitch_factor)

def spec_augment(spec: np.ndarray, num_mask=2, 
                 freq_masking_max_percentage=0.15, time_masking_max_percentage=0.2):

    spec = spec.copy()
    for i in range(num_mask):
        all_frames_num, all_freqs_num = spec.shape
        freq_percentage = random.uniform(0.0, freq_masking_max_percentage)
        
        num_freqs_to_mask = int(freq_percentage * all_freqs_num)
        f0 = np.random.uniform(low=0.0, high=all_freqs_num - num_freqs_to_mask)
        f0 = int(f0)
        spec[:, f0:f0 + num_freqs_to_mask] = 0

        time_percentage = random.uniform(0.0, time_masking_max_percentage)
        
        num_frames_to_mask = int(time_percentage * all_frames_num)
        t0 = np.random.uniform(low=0.0, high=all_frames_num - num_frames_to_mask)
        t0 = int(t0)
        spec[t0:t0 + num_frames_to_mask, :] = 0
    
    return spec


  freqmask = FrequencyMask(min_frequency_band=0.1, max_frequency_band=1, p=1)


In [13]:
dl = data_loader(
  '../Datasets/Crema/',
  '../Datasets/Ravdess/',
  '../Datasets/Savee/',
  '../Datasets/Tess/',
  '../Datasets/BanglaSER',
  '../Datasets/SUBESCO'
)

In [17]:
tess_df = dl.get_banglaser_df()
dir = '../Datasets/custom_db/bser'
# Path(dir).mkdir(parents=True, exist_ok=True)
tess_df.shape

(1467, 2)

In [18]:
train, val, test = dl.split_df(tess_df, ratio_train=0.7, ratio_val=0.10, ratio_test=0.20)

In [19]:
train['Emotion'].value_counts(), val['Emotion'].value_counts(), test['Emotion'].value_counts()

(sad         214
 angry       214
 happy       214
 surprise    214
 neutral     170
 Name: Emotion, dtype: int64,
 sad         31
 surprise    31
 happy       31
 angry       30
 neutral     24
 Name: Emotion, dtype: int64,
 angry       62
 sad         61
 surprise    61
 happy       61
 neutral     49
 Name: Emotion, dtype: int64)

In [20]:
def move_files(df: pd.DataFrame, folder):
    dest_folder = dir + '/' + folder + '/'
    Path(dest_folder).mkdir(parents=True, exist_ok=True)
    for i, row in tqdm(df.iterrows(), total=df.shape[0]):
        data, sr = librosa.load(row['File_Path'], sr=None)
        data, _ = librosa.effects.trim(data, top_db=25)
        if sr != 16000:
            data = librosa.resample(data, orig_sr=sr, target_sr=16000)
        # src = os.path.abspath(row['File_Path'])
        dest = os.path.abspath(dest_folder + row['Emotion'] + '_' + str(i) + '.wav')
        sf.write(dest, data, 16000)
        # os.system(f'cp {src} {dest}')

In [21]:
move_files(train, 'train')
move_files(val, 'val')
move_files(test, 'test')

100%|██████████| 1026/1026 [02:41<00:00,  6.34it/s]
100%|██████████| 147/147 [00:23<00:00,  6.20it/s]
100%|██████████| 294/294 [00:44<00:00,  6.58it/s]


## Augmentation

In [22]:
audio = []
emo = []

for dirs, _, files in os.walk(dir + '/train'):
    for file in files:
        path = os.path.join(dirs, file)
        audio.append(path)
        emo.append(file.split('_')[0])

aug_df = pd.DataFrame({'File_Path': audio, 'Emotion': emo})
aug_df

Unnamed: 0,File_Path,Emotion
0,../Datasets/custom_db/bser/train/angry_1290.wav,angry
1,../Datasets/custom_db/bser/train/happy_873.wav,happy
2,../Datasets/custom_db/bser/train/angry_346.wav,angry
3,../Datasets/custom_db/bser/train/angry_827.wav,angry
4,../Datasets/custom_db/bser/train/neutral_325.wav,neutral
...,...,...
1021,../Datasets/custom_db/bser/train/neutral_1457.wav,neutral
1022,../Datasets/custom_db/bser/train/sad_1332.wav,sad
1023,../Datasets/custom_db/bser/train/happy_888.wav,happy
1024,../Datasets/custom_db/bser/train/sad_1230.wav,sad


In [23]:
Path(dir + '/augment').mkdir(parents=True, exist_ok=True)

for i, row in tqdm(aug_df.iterrows(), total=aug_df.shape[0]):
    data, sr = librosa.load(row['File_Path'], sr=None)

    if sr != 16000:
        Audio(data, rate=sr)
        raise Exception(f'Sample rate is {sr} Hz of file: ' + row['File_Path'])

    data_noise = noise(data)
    sf.write(dir + '/augment' + '/' + row['Emotion'] + '_' + str(i) + '_' + 'noise', data_noise, sr, format='wav')

    data_ts = shift(data)
    sf.write(dir + '/augment' + '/' + row['Emotion'] + '_' + str(i) + '_' + 'timeshift', data_ts, sr, format='wav')

    data_ps = pitch(data)
    sf.write(dir + '/augment' + '/' + row['Emotion'] + '_' + str(i) + '_' + 'pitchshift', data_ps, sr, format='wav')

    try:
        data_tm = timemask(data, sr)
        sf.write(dir + '/augment' + '/' + row['Emotion'] + '_' + str(i) + '_' + 'timemask', data_tm, sr, format='wav')
    except Exception as e:
        print('Error while applying timemask of', row['File_Path'], 'Error:', e)

    try:
        data_fm = freqmask(data, sr)
        sf.write(dir + '/augment' + '/' + row['Emotion'] + '_' + str(i) + '_' + 'freqmask', data_fm, sr, format='wav')
    except Exception as e:
        print('Error while applying freqmask of', row['File_Path'], 'Error:', e)



  return librosa.effects.pitch_shift(data, 16000, pitch_factor)
 38%|███▊      | 389/1026 [01:53<03:07,  3.40it/s]

Error while applying freqmask of ../Datasets/custom_db/bser/train/surprise_501.wav Error: empty range for randrange() (16, 8, -8)


 97%|█████████▋| 999/1026 [05:08<00:07,  3.81it/s]

Error while applying freqmask of ../Datasets/custom_db/bser/train/sad_784.wav Error: empty range for randrange() (16, 0, -16)


100%|██████████| 1026/1026 [05:17<00:00,  3.23it/s]
