In [55]:
import pandas as pd

In [56]:
# Read the UrbanSound8K metadata table and preview its first rows
metadata_csv = 'UrbanSound8K/metadata/UrbanSound8K.csv'
df = pd.read_csv(metadata_csv)
df.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [57]:
# Build each clip's path as '/fold<fold>/<slice_file_name>'
fold_dir = '/fold' + df['fold'].astype(str)
file_name = df['slice_file_name'].astype(str)
df['relative_path'] = fold_dir + '/' + file_name
df.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class,relative_path
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark,/fold5/100032-3-0-0.wav
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing,/fold5/100263-2-0-117.wav
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing,/fold5/100263-2-0-121.wav
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing,/fold5/100263-2-0-126.wav
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing,/fold5/100263-2-0-137.wav


In [58]:
# Narrow the frame to the clip path and its numeric class label
keep_cols = ['relative_path', 'classID']
df = df[keep_cols]
df.head()

Unnamed: 0,relative_path,classID
0,/fold5/100032-3-0-0.wav,3
1,/fold5/100263-2-0-117.wav,2
2,/fold5/100263-2-0-121.wav,2
3,/fold5/100263-2-0-126.wav,2
4,/fold5/100263-2-0-137.wav,2


In [59]:
import math, random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio

In [60]:
# Load two clips from fold1; each load yields a (waveform, sample_rate) pair
clip1_path = 'UrbanSound8K/audio/fold1/102305-6-0-0.wav'
clip2_path = 'UrbanSound8K/audio/fold1/101415-3-0-8.wav'
sig1, sr1 = torchaudio.load(clip1_path)
sig2, sr2 = torchaudio.load(clip2_path)

In [61]:
# Compare the shapes of the two loaded waveforms
print(
    f'shape of signal1 : {sig1.shape}',
    f'shape of signal2 : {sig2.shape}',
    sep='\n',
)

shape of signal1 : torch.Size([2, 115101])
shape of signal2 : torch.Size([1, 192000])


In [62]:
# Display the first clip's waveform tensor
sig1

tensor([[0.0000e+00, 0.0000e+00, 1.2207e-04,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [3.0518e-05, 3.6621e-04, 8.5449e-04,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]])

In [63]:
# Display the second clip's waveform tensor
sig2

tensor([[ 0.3343,  0.3311,  0.3259,  ..., -0.0013, -0.0009, -0.0009]])

In [64]:
# Sample rate returned by torchaudio.load for the first clip
sr1

44100

In [65]:
# Sample rate returned by torchaudio.load for the second clip
sr2

48000

In [66]:
# Mono -> stereo conversion (1 audio channel -> 2 audio channels)
# sig2 is mono: its shape is [1, 192000], i.e. a single audio channel

# Stack the only channel on top of itself along the channel dimension
resig2 = torch.cat((sig2, sig2), dim=0)
resig2.shape

torch.Size([2, 192000])

In [69]:
# Standardize sample rate
# sr2 is 48000 and sr1 is 44100
# Convert sr2 to sr1
# Resample works along the last (time) dimension, so every channel is
# converted by a single call

def resample(aud, newsr):
    """Resample an audio clip to a new sample rate.

    Args:
        aud: ``(sig, sr)`` tuple holding the waveform tensor (time is the
            last dimension) and its current sample rate.
        newsr: Target sample rate.

    Returns:
        ``aud`` itself, unchanged, when the clip is already at ``newsr``;
        otherwise a new ``(resampled_sig, newsr)`` tuple with the same
        channels as the input.
    """
    sig, sr = aud

    if sr == newsr:
        # Already at the requested rate: hand back the input untouched
        return aud

    # Build the transform once and apply it to the whole tensor instead of
    # constructing one identical transform per channel slice.
    resampler = torchaudio.transforms.Resample(sr, newsr)
    return (resampler(sig), newsr)

# Pair the stereo version of the second clip with its sample rate,
# then bring it to 44100
aud = (resig2, sr2)
aud2 = resample(aud, newsr=44100)

In [71]:
# Report shape and sample rate of both clips after resampling
print(
    f'sig1 shape : {sig1.shape}',
    f'sample rate of 1st audio : {sr1}',
    f'sig2 shape : {aud2[0].shape}',
    f'sample rate of second audio : {aud2[1]}',
    sep='\n',
)

sig1 shape : torch.Size([2, 115101])
sample rate of 1st audio : 44100
sig2 shape : torch.Size([2, 176400])
sample rate of second audio : 44100


In [72]:
# Pair the first clip's waveform with its sample rate, same layout as aud2
aud1 = sig1, sr1


In [73]:
# Display the (waveform, sample rate) pair for the first clip
aud1

(tensor([[0.0000e+00, 0.0000e+00, 1.2207e-04,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [3.0518e-05, 3.6621e-04, 8.5449e-04,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00]]),
 44100)

In [74]:
# Display the resampled (waveform, sample rate) pair for the second clip
aud2

(tensor([[ 0.3193,  0.3366,  0.3220,  ..., -0.0013, -0.0010, -0.0009],
         [ 0.3193,  0.3366,  0.3220,  ..., -0.0013, -0.0010, -0.0009]]),
 44100)

In [None]:
# Resize to the same length
# Pad (or truncate) the signal to a fixed length 'max_ms' in milliseconds

def pad_trunc(aud, max_ms):
    """Force a clip to a fixed length by truncating or zero-padding it.

    Args:
        aud: ``(sig, sr)`` tuple; ``sig`` is a 2-D tensor (rows x samples)
            and ``sr`` is its sample rate.
        max_ms: Target duration in milliseconds.

    Returns:
        ``(sig, sr)`` where ``sig`` has exactly ``sr // 1000 * max_ms``
        samples per row. Longer signals are cut at the end; shorter ones get
        zeros split at random between the beginning and the end.
    """
    sig, sr = aud
    num_rows, sig_len = sig.shape
    # Whole samples per millisecond times the duration. The floor division
    # drops the fractional part of sr / 1000 (e.g. 44100 -> 44 per ms).
    max_len = sr // 1000 * max_ms

    if sig_len > max_len:
        # Truncate the signal to the given length
        sig = sig[:, :max_len]

    elif sig_len < max_len:
        # Length of padding to add at the beginning and end of the signal
        pad_begin_len = random.randint(0, max_len - sig_len)
        pad_end_len = max_len - sig_len - pad_begin_len

        # Pad with 0s
        pad_begin = torch.zeros((num_rows, pad_begin_len))
        pad_end = torch.zeros((num_rows, pad_end_len))

        sig = torch.cat((pad_begin, sig, pad_end), 1)

    # Return on every path: truncated, padded, or already the right length
    return (sig, sr)

In [111]:
# Small 2x5 tensor for stepping through the padding logic by hand
# (this rebinds sig1, replacing the waveform loaded earlier)
toy_rows = [[1, 2, 3, 4, 5], [2, 4, 5, 6, 7]]
sig1 = torch.tensor(toy_rows)


In [112]:
# Split the tensor's size into its row count and per-row length
num_rows, sig_len = sig1.size()

In [113]:
# Number of values per row in the toy tensor
sig_len

5

In [114]:
# Target length for this hand-run example (plays the role of max_len in pad_trunc)
max_len = 23

In [115]:
# True means the signal is shorter than the target, i.e. the padding case
sig_len < max_len

True

In [116]:
# Pick at random how many of the missing positions go before the signal
missing = max_len - sig_len
pad_begin_len = random.randint(0, missing)
pad_begin_len

13

In [117]:
# Whatever is not padded at the beginning is padded at the end
pad_end_len = max_len - (sig_len + pad_begin_len)
pad_end_len

5

In [119]:
# Leading block of zeros, one row per signal row
pad_begin = torch.zeros(num_rows, pad_begin_len)
pad_begin

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [120]:
# Trailing block of zeros, one row per signal row
pad_end = torch.zeros(num_rows, pad_end_len)
pad_end

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])

In [121]:
# Join leading zeros, the signal, and trailing zeros along the column dimension
sig = torch.cat([pad_begin, sig1, pad_end], dim=1)

In [122]:
# Show the padded result: zeros before and after the original values
sig

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 3., 4., 5.,
         0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 4., 5., 6., 7.,
         0., 0., 0., 0., 0.]])