In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import librosa
import librosa.display
import IPython.display as ipd

# Working directory at start-up; later cells prepend it to each file's path.
pwd = os.getcwd()

# Load the four saved corpus arrays: RAVDESS, SAVEE, TESS and CREMA.
ravdess_data, savee_data, tess_data, crema_data = (
    np.load(npy_name)
    for npy_name in ('ravdess.npy', 'savee.npy', 'tess.npy', 'crema.npy')
)

# Stack the arrays row-wise and expose them as one frame with named columns.
metadata_columns = ['label', 'gender', 'pathname', 'filename']
all_data = np.vstack([ravdess_data, savee_data, tess_data, crema_data])
df = pd.DataFrame(data=all_data, columns=metadata_columns)
# Quick look at the frame (only rendered when it is the cell's last expression).
df.head(), df.shape

# save all data into npy file
# np.save("all_datasets.npy", df)
def plot_spec(y, sr, hop_size, y_axis):
    """Render a 2-D array with ``librosa.display.specshow`` on a new figure.

    Parameters
    ----------
    y : the data handed to ``specshow`` for display.
    sr : sampling rate forwarded to ``specshow``.
    hop_size : hop length in samples, forwarded as ``hop_length``.
    y_axis : y-axis type forwarded to ``specshow``.

    The figure is 10x7 inches, the x-axis is shown as time, and a colorbar
    whose ticks are formatted as ``'%+2.0f dB'`` is attached. Returns nothing.
    """
    fig, ax = plt.subplots(figsize=(10, 7))
    image = librosa.display.specshow(
        y, sr=sr, hop_length=hop_size, x_axis='time', y_axis=y_axis, ax=ax
    )
    fig.colorbar(image, ax=ax, format='%+2.0f dB')

In [24]:
# Load every wav file listed in `df` (via its 'pathname' and 'filename' columns)
# with librosa and cut its STFT / mel / MFCC features into temporal chunks.
frame_size = 4096  # STFT window length, in samples
hop_size = 1024  # STFT hop length, in samples
temporal_chunk_size = 20  # number of temporal bins (STFT frames) per chunk
mel_bands = 128  # number of mel bands
silence_threshold = 40  # dB below peak treated as silence when trimming
in_dB = True  # convert the STFT and mel spectrograms to dB
mfcc_coefficients = 40  # number of MFCC coefficients

# The total number of temporal bins per file is roughly total_samples / hop_size.

data = []

for sample_index in range(df.shape[0]):
    pathname = df['pathname'][sample_index]
    filename = df['filename'][sample_index]
    gender = df['gender'][sample_index]
    label = df['label'][sample_index]

    wav, sr = librosa.load(pwd + pathname + filename)
    if sr != 22050:
        raise ValueError("Sample rate is not 22050Hz")

    # Strip leading/trailing silence before computing any feature.
    trimmed_wav, _ = librosa.effects.trim(wav, top_db=silence_threshold)

    # Magnitude spectrogram, plus its square (power) for the mel/MFCC features.
    S_audio = librosa.stft(trimmed_wav, n_fft=frame_size, hop_length=hop_size)
    y_audio = np.abs(S_audio)
    power_spec = y_audio ** 2

    # `melspectrogram` uses a supplied `S` as-is, and `power_to_db` expects power,
    # so the mel spectrogram has to be built from the power spectrogram rather
    # than from the raw magnitudes.
    mel_spec = librosa.feature.melspectrogram(
        S=power_spec, sr=sr, n_fft=frame_size, hop_length=hop_size, n_mels=mel_bands)

    # MFCCs are derived from the log-power mel spectrogram.
    mfccs = librosa.feature.mfcc(
        S=librosa.power_to_db(mel_spec), sr=sr, n_mfcc=mfcc_coefficients)

    if in_dB:
        # Magnitudes need the amplitude conversion (20*log10); the mel
        # spectrogram is power and needs the power conversion (10*log10).
        y_audio = librosa.amplitude_to_db(y_audio, ref=np.max)
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

    # Put time on the first axis so the features can be sliced by frame.
    _y = y_audio.T
    _mel_spec = mel_spec.T
    _mfccs = mfccs.T

    # Chunk start frames: every `temporal_chunk_size` frames, with the final
    # start pulled back so the last chunk ends exactly at the last frame.
    # The start is clamped at 0: a clip shorter than one chunk would otherwise
    # get a negative start, which slices from the end and drops leading frames.
    # Such a clip yields a single chunk with fewer than `temporal_chunk_size`
    # frames.
    n_frames = len(_y)
    last_start = max(n_frames - temporal_chunk_size, 0)
    split_indices = sorted(
        {min(start, last_start) for start in range(0, n_frames, temporal_chunk_size)})

    # One record per chunk: [stft, mel, mfcc, gender, label], with the feature
    # slices transposed back to (bins, frames).
    data.extend(
        [
            _y[i:i + temporal_chunk_size].T,
            _mel_spec[i:i + temporal_chunk_size].T,
            _mfccs[i:i + temporal_chunk_size].T,
            gender,
            label,
        ]
        for i in split_indices
    )



In [32]:
# Collect the per-chunk records accumulated in `data` into a frame with named columns.
processed_columns = ['stft_data', 'mel_data', 'mfcc_data', 'gender', 'label']
processed_dataset = pd.DataFrame(data=data, columns=processed_columns)

#plot the stft at index 0 of processed_dataset
# stfts = processed_dataset['mfcc_data'][:10]
# for stft in stfts:
#     # print(stft[0].shape)
#     plot_spec(stft, hop_size=hop_size, sr=sr, y_axis='mel')

In [33]:
# Write the processed frame to disk in .npy format.
# NOTE(review): np.save stores an array, so presumably only the frame's values
# (not its column names) end up in the file — confirm against the loading code.
np.save(file="processed_dataset.npy", arr=processed_dataset)