# Audio Preprocessing
### Annotation and MFCC feature extraction

In [1]:
import librosa
import os
import json

In [2]:
# Root folder of the dataset and the JSON file the extracted features are written to.
DATASET_PATH = "speech_commands_v0.01"
JSON_PATH = "speech_commands_v0.01_data.json"
# Number of audio samples kept per file; 22050 samples is 1 sec of audio in librosa.
SAMPLES_TO_CONSIDER = 22_050

In [4]:
def prepare_dataset(dataset_path, json_path, n_mfcc=13, hop_length=512, n_fft=2048):
    """Extract MFCCs from the audio files under ``dataset_path`` and save them as JSON.

    Every directory below ``dataset_path`` is treated as one class: its folder
    name is appended to ``"mappings"`` and its position in that list is the
    integer label of the files it contains. Files shorter than
    ``SAMPLES_TO_CONSIDER`` samples are skipped; longer ones are truncated to
    exactly that many samples before feature extraction.

    Args:
        dataset_path: Root folder of the dataset (one sub-folder per class).
        json_path: Path of the JSON file to write.
        n_mfcc: Number of MFCC coefficients to extract per frame.
        hop_length: Number of samples between successive frames.
        n_fft: Window size, in samples, used for the FFT.

    The JSON file contains four parallel structures:
        "mappings": class names, indexed by label.
        "labels":   integer target for each stored audio file.
        "MFCCs":    transposed MFCC matrix of each stored audio file, as nested lists.
        "files":    path of each stored audio file.
    """
    data = {
        "mappings": [],
        "labels": [],
        "MFCCs": [],
        "files": [],
    }

    # os.walk yields the root as a str, so normalise path-like input before comparing.
    root = os.fspath(dataset_path)

    # Walk through the folder structure recursively; each step yields a
    # 3-tuple (dirpath, dirnames, filenames).
    for dirpath, dirnames, filenames in os.walk(dataset_path):
        # Sorting in place makes os.walk visit sub-folders in a stable order,
        # so label indices are reproducible across runs and operating systems.
        dirnames.sort()

        # The root itself is not a class; compare by value, not by identity.
        if dirpath == root:
            continue

        # The class name is the last path component (portable across
        # Windows and POSIX separators), e.g. dataset/down -> down.
        label = os.path.basename(dirpath)
        data["mappings"].append(label)
        label_index = len(data["mappings"]) - 1
        print(f"Processing {label}")

        # Loop through all files of this class and extract MFCCs.
        for f in sorted(filenames):
            file_path = os.path.join(dirpath, f)

            # Load the audio file.
            signal, sr = librosa.load(file_path)

            # Skip clips shorter than the required length.
            if len(signal) < SAMPLES_TO_CONSIDER:
                continue

            # Enforce a fixed length: keep only the first SAMPLES_TO_CONSIDER samples.
            signal = signal[:SAMPLES_TO_CONSIDER]

            # Pass the actual sample rate so the mel filter bank matches the signal.
            MFCCs = librosa.feature.mfcc(
                y=signal, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length, n_fft=n_fft
            )

            # Store the data; the matrix is transposed and converted to
            # nested lists so that it is JSON-serialisable.
            data["labels"].append(label_index)
            data["MFCCs"].append(MFCCs.T.tolist())
            data["files"].append(file_path)
            # Show some progress.
            print(f"{file_path}: {label_index}")

    # Store everything in the JSON file.
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)
        
# Run the preprocessing only when this file is executed as a script.
if __name__ == "__main__":
    prepare_dataset(dataset_path=DATASET_PATH, json_path=JSON_PATH)

Processing five
speech_commands_v0.01\five\004ae714_nohash_0.wav: 0
speech_commands_v0.01\five\00b01445_nohash_1.wav: 0
speech_commands_v0.01\five\00f0204f_nohash_0.wav: 0
speech_commands_v0.01\five\012c8314_nohash_1.wav: 0
speech_commands_v0.01\five\0132a06d_nohash_0.wav: 0
speech_commands_v0.01\five\0132a06d_nohash_1.wav: 0
speech_commands_v0.01\five\0132a06d_nohash_2.wav: 0
speech_commands_v0.01\five\0132a06d_nohash_3.wav: 0
speech_commands_v0.01\five\0132a06d_nohash_4.wav: 0
speech_commands_v0.01\five\0135f3f2_nohash_0.wav: 0
speech_commands_v0.01\five\0137b3f4_nohash_0.wav: 0
speech_commands_v0.01\five\0137b3f4_nohash_1.wav: 0
speech_commands_v0.01\five\0137b3f4_nohash_2.wav: 0
speech_commands_v0.01\five\0137b3f4_nohash_3.wav: 0
speech_commands_v0.01\five\0137b3f4_nohash_4.wav: 0
speech_commands_v0.01\five\01648c51_nohash_0.wav: 0
speech_commands_v0.01\five\01bb6a2a_nohash_0.wav: 0
speech_commands_v0.01\five\01bb6a2a_nohash_1.wav: 0
speech_commands_v0.01\five\01bb6a2a_nohash_2.wav