In [109]:
import json
import os
from pathlib import Path
import librosa
import numpy as np
from typing import Dict, List
from tqdm import tqdm

In [110]:
# --- Segmentation / feature configuration -----------------------------------
SAMPLE_RATE = 10000        # target sample rate (Hz) handed to librosa.load
SEGMENT_DURATION = 0.0256  # length of one analysis segment, in seconds
OVERLAP_DURATION = 50      # overlap between consecutive segments, in % (0 = no overlap)
NUM_MFCC = 16              # number of MFCC coefficients kept per frame

# FFT size: smallest power of two that holds one whole segment.
N_FFT = 1 << int(np.ceil(np.log2(SEGMENT_DURATION * SAMPLE_RATE)))

# Distance in samples between the starts of two consecutive segments.
SEGMENT_HOP_LENGTH = int(SAMPLE_RATE * SEGMENT_DURATION * (1 - OVERLAP_DURATION / 100))
MFCC_HOP_LENGTH = SEGMENT_HOP_LENGTH

# Classes to export; a class's position in this list is its integer label.
CLASSES = ["RMT", "DPR", "HRK"]

# Output file stem, e.g. "data_25ms_h=50%_16mfcc_RMT-DPR-HRK":
# overlap percentage followed by the hyphen-separated class names.
FILE_NAME = f"data_25ms_h={OVERLAP_DURATION}%_16mfcc_" + "-".join(CLASSES)

In [111]:
# Root folder of the raw recordings and the JSON file the features are written to.
DATASET_PATH = "./data/original_dataset"
JSON_PATH = f"./data/{FILE_NAME}.json"

print(JSON_PATH)

./data/data_25ms_h=50%_16mfcc_RMT-DPR-HRK.json


In [112]:
# Sanity check: show the FFT size and the segment hop (both in samples) derived from the configuration.
print(f"n_fft: {N_FFT}, hop_length: {SEGMENT_HOP_LENGTH}")

n_fft: 256, hop_length: 128


In [113]:
def extract_mfcc(file_path: str, num_mfcc: int, n_fft: int, hop_length: int) -> List[List[List[float]]]:
    """Cut an audio file into fixed-length segments and compute MFCCs for each.

    The file is loaded at ``SAMPLE_RATE`` and split into windows of
    ``SEGMENT_DURATION`` seconds whose start positions are ``hop_length``
    samples apart. Only complete windows are used; trailing samples that do
    not fill a whole window are dropped.

    Args:
        file_path: Path of the audio file to read.
        num_mfcc: Number of MFCC coefficients to compute per frame.
        n_fft: FFT size passed to ``librosa.feature.mfcc``.
        hop_length: Step in samples between consecutive segment starts. The
            same value is also passed to ``librosa.feature.mfcc`` as its
            frame hop.

    Returns:
        One entry per segment. Each entry is that segment's transposed MFCC
        matrix converted to nested lists (frames x coefficients). An empty
        list is returned when the recording is shorter than one segment.

    Raises:
        ValueError: If ``hop_length`` is not a positive number of samples.
    """
    if hop_length <= 0:
        raise ValueError(f"hop_length must be a positive number of samples, got {hop_length}")

    signal, _ = librosa.load(file_path, sr=SAMPLE_RATE)
    samples_per_segment = int(SAMPLE_RATE * SEGMENT_DURATION)

    # Count samples directly instead of rebuilding the count from the duration
    # in seconds: the float round-trip can land just below an integer and
    # silently drop the final segment.
    last_start_limit = len(signal) - samples_per_segment
    if last_start_limit < 0:
        # Not even one full segment available.
        return []
    num_segments = last_start_limit // hop_length + 1

    mfccs = []
    for segment_index in range(num_segments):
        start = hop_length * segment_index
        finish = start + samples_per_segment  # always <= len(signal) by construction
        mfcc = librosa.feature.mfcc(
            y=signal[start:finish],
            sr=SAMPLE_RATE,
            n_mfcc=num_mfcc,
            n_fft=n_fft,
            hop_length=hop_length,
            center=False,  # no padding: frames are taken from inside the segment only
        )
        mfccs.append(mfcc.T.tolist())
    return mfccs

In [None]:
def calculate_delta(mfcc_a: list, mfcc_b: list):
    """Return the element-wise difference ``mfcc_b - mfcc_a`` as a list.

    The result has one entry per element of ``mfcc_a``; ``mfcc_b`` is indexed
    at the same positions, so it must be at least as long as ``mfcc_a``.
    """
    return [mfcc_b[pos] - mfcc_a[pos] for pos in range(len(mfcc_a))]

In [None]:
def calculate_deltas(mfccs: list):
    """Compute per-segment temporal differences of a feature sequence.

    ``mfccs`` is a sequence of segments; only the first frame of each segment
    (``mfccs[i][0]``) is used. For segment ``i`` the delta is always
    "later minus earlier":

    * first segment:     ``mfccs[1][0] - mfccs[0][0]``
    * interior segments: ``mfccs[i + 1][0] - mfccs[i - 1][0]``
    * last segment:      ``mfccs[-1][0] - mfccs[-2][0]``

    The interior difference spans two hops and is intentionally left
    unscaled (it is not divided by 2).

    Args:
        mfccs: Nested lists indexed as ``[segment][frame][coefficient]``.

    Returns:
        Nested lists indexed as ``[segment][0][coefficient]`` with one entry
        per input segment. An empty input gives ``[]``; a single segment
        gives an all-zero delta.
    """
    num_segments = len(mfccs)
    last_index = num_segments - 1

    deltamfccs = []
    for index in range(num_segments):
        # Clamp the neighbours at both ends so the edges fall back to a
        # one-sided difference (and a lone segment to a zero delta).
        earlier = mfccs[max(index - 1, 0)][0]
        later = mfccs[min(index + 1, last_index)][0]
        deltamfccs.append([[after - before for before, after in zip(earlier, later)]])

    return deltamfccs

In [116]:
def save_mfcc(dataset_path: str, json_path: str, selected_classes: List[str], voc_only: bool = True, num_mfcc: int = 16, n_fft: int = 256, hop_length: int = 256) -> None:
    """Extract MFCC, delta and delta-delta features from a dataset and save them as JSON.

    Every sub-folder of ``dataset_path`` is treated as one class, named after
    the folder. The folders ``DPR`` and ``HRK`` are merged into a single class
    when ``"DPR+HRK"`` is listed in ``selected_classes``. Folders whose class
    is not selected are skipped.

    The JSON object written to ``json_path`` has these keys:

    * ``mapping``: class names; ``mapping[i]`` is the name of label ``i``.
    * ``labels``: integer label of each segment.
    * ``mfcc`` / ``deltamfcc`` / ``deltadeltamfcc``: features of each segment.
    * ``files``: source file path of each segment.

    Args:
        dataset_path: Root folder containing one sub-folder per class.
        json_path: Destination JSON file; its parent folder is created if missing.
        selected_classes: Class names to export; list position is the label.
        voc_only: When True, process only files whose name contains ``"voc"``;
            when False, process only files whose name does not.
        num_mfcc: Number of MFCC coefficients per frame.
        n_fft: FFT size used for the MFCC computation.
        hop_length: Step in samples between consecutive segments.
    """
    data: Dict[str, List] = {
        # Labels are numbered by position in selected_classes, so the mapping
        # must use that same order rather than the order folders are visited in.
        "mapping": list(selected_classes),
        "labels": [],
        "mfcc": [],
        "deltamfcc": [],
        "deltadeltamfcc": [],
        "files": [],
    }
    dataset_path = Path(dataset_path)

    # Create a dictionary to map the selected classes to their respective labels
    class_to_label = {c: i for i, c in enumerate(selected_classes)}

    # Create the output folder now so a long run cannot fail at the final write.
    Path(json_path).parent.mkdir(parents=True, exist_ok=True)

    # Walk through dataset path
    for dirpath, dirnames, filenames in os.walk(dataset_path):
        dirnames.sort()  # in-place sort makes os.walk visit sub-folders in a fixed order
        if dirpath == str(dataset_path):
            continue

        semantic_label = Path(dirpath).name
        if semantic_label in ["DPR", "HRK"] and "DPR+HRK" in selected_classes:
            semantic_label = "DPR+HRK"
        if semantic_label not in selected_classes:
            continue

        print(f"\nProcessing: {semantic_label}")

        for f in tqdm(sorted(filenames), desc=f"Processing files in {semantic_label}", leave=False):
            if ('voc' in f) != voc_only:
                continue

            file_path = str(Path(dirpath) / f)
            mfccs = extract_mfcc(file_path, num_mfcc, n_fft, hop_length)
            if not mfccs:
                # Nothing to store for a file that produced no segments.
                continue

            deltamfccs = calculate_deltas(mfccs)
            deltadeltamfccs = calculate_deltas(deltamfccs)
            data["mfcc"].extend(mfccs)
            data["deltamfcc"].extend(deltamfccs)
            data["deltadeltamfcc"].extend(deltadeltamfccs)
            data["labels"].extend([class_to_label[semantic_label]] * len(mfccs))
            data["files"].extend([file_path] * len(mfccs))
            print(f"{file_path}, segments: {len(mfccs)}")

    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

    print("\nDONE")

Idea: concatenate the MFCC, delta-MFCC and delta-delta-MFCC features into a single array so that a 2D CNN can be used.

In [117]:
# Run the full extraction with the notebook-level configuration and write the JSON file.
save_mfcc(
    DATASET_PATH,
    JSON_PATH,
    selected_classes=CLASSES,
    voc_only=True,
    num_mfcc=NUM_MFCC,
    n_fft=N_FFT,
    hop_length=SEGMENT_HOP_LENGTH,
)


Processing: DPR


Processing files in DPR:   8%|▊         | 2/26 [00:37<07:35, 18.97s/it]

data\original_dataset\DPR\011106nt1_intvoc.wav, segments: 17053


Processing files in DPR:  15%|█▌        | 4/26 [01:22<07:39, 20.87s/it]

data\original_dataset\DPR\013106ab3_intvoc.wav, segments: 21474


Processing files in DPR:  23%|██▎       | 6/26 [01:56<06:23, 19.17s/it]

data\original_dataset\DPR\020106nt1_intvoc.wav, segments: 16467


Processing files in DPR:  31%|███       | 8/26 [02:27<05:17, 17.66s/it]

data\original_dataset\DPR\021605nt3_intvoc.wav, segments: 14458


Processing files in DPR:  38%|███▊      | 10/26 [03:15<05:20, 20.06s/it]

data\original_dataset\DPR\022505nt1_intvoc.wav, segments: 20377


Processing files in DPR:  46%|████▌     | 12/26 [03:30<03:39, 15.66s/it]

data\original_dataset\DPR\040505nt2_intvoc.wav, segments: 4631


Processing files in DPR:  54%|█████▍    | 14/26 [03:51<02:47, 13.96s/it]

data\original_dataset\DPR\051105nt1_intvoc.wav, segments: 8602


Processing files in DPR:  62%|██████▏   | 16/26 [04:33<02:42, 16.22s/it]

data\original_dataset\DPR\061405nt1_intvoc.wav, segments: 20163


Processing files in DPR:  69%|██████▉   | 18/26 [04:38<01:36, 12.04s/it]

data\original_dataset\DPR\072604nt1_intvoc.wav, segments: 2561


Processing files in DPR:  77%|███████▋  | 20/26 [05:20<01:28, 14.68s/it]

data\original_dataset\DPR\090105nt1_intvoc.wav, segments: 17063


Processing files in DPR:  85%|████████▍ | 22/26 [05:24<00:43, 10.83s/it]

data\original_dataset\DPR\092503hnf_intvoc.wav, segments: 1522


Processing files in DPR:  92%|█████████▏| 24/26 [06:13<00:30, 15.01s/it]

data\original_dataset\DPR\101805ab1_intvoc.wav, segments: 18391


                                                                        

data\original_dataset\DPR\110403hnf_intvoc.wav, segments: 11442

Processing: HRK


Processing files in HRK:  10%|█         | 2/20 [00:37<05:34, 18.60s/it]

data\original_dataset\HRK\020806nt1_intvoc.wav, segments: 14159


Processing files in HRK:  20%|██        | 4/20 [01:04<04:10, 15.67s/it]

data\original_dataset\HRK\030905nt2_intvoc.wav, segments: 10522


Processing files in HRK:  30%|███       | 6/20 [01:38<03:48, 16.31s/it]

data\original_dataset\HRK\032405nt2_intvoc.wav, segments: 12980


Processing files in HRK:  40%|████      | 8/20 [02:52<04:54, 24.55s/it]

data\original_dataset\HRK\051105nt3_intvoc.wav, segments: 32642


Processing files in HRK:  50%|█████     | 10/20 [03:58<04:34, 27.44s/it]

data\original_dataset\HRK\072604nt2_intvoc.wav, segments: 31341


Processing files in HRK:  60%|██████    | 12/20 [04:56<03:44, 28.09s/it]

data\original_dataset\HRK\080304nt2_intvoc.wav, segments: 28274


Processing files in HRK:  70%|███████   | 14/20 [05:35<02:31, 25.27s/it]

data\original_dataset\HRK\081204nt1_intvoc.wav, segments: 18697


Processing files in HRK:  80%|████████  | 16/20 [07:20<02:15, 33.89s/it]

data\original_dataset\HRK\083004nt1_intvoc.wav, segments: 49102


Processing files in HRK:  90%|█████████ | 18/20 [07:25<00:48, 24.08s/it]

data\original_dataset\HRK\091704nt1_intvoc.wav, segments: 1324


                                                                        

data\original_dataset\HRK\103105nt2_intvoc.wav, segments: 24416

Processing: RMT


Processing files in RMT:  10%|█         | 2/20 [00:31<04:40, 15.56s/it]

data\original_dataset\RMT\010506nt1_intvoc.wav, segments: 12831


Processing files in RMT:  20%|██        | 4/20 [01:22<05:42, 21.39s/it]

data\original_dataset\RMT\011706nt1_intvoc.wav, segments: 17434


Processing files in RMT:  30%|███       | 6/20 [02:32<06:26, 27.59s/it]

data\original_dataset\RMT\021406nt2_intvoc.wav, segments: 21802


Processing files in RMT:  40%|████      | 8/20 [04:05<07:00, 35.07s/it]

data\original_dataset\RMT\070805nt1_intvoc.wav, segments: 28563


Processing files in RMT:  50%|█████     | 10/20 [04:49<05:03, 30.36s/it]

data\original_dataset\RMT\091305nt1_intvoc.wav, segments: 13541


Processing files in RMT:  60%|██████    | 12/20 [05:37<03:45, 28.21s/it]

data\original_dataset\RMT\092005nt1_intvoc.wav, segments: 15133


Processing files in RMT:  70%|███████   | 14/20 [05:49<02:06, 21.02s/it]

data\original_dataset\RMT\092403rms_intvoc.wav, segments: 3709


Processing files in RMT:  80%|████████  | 16/20 [05:51<00:58, 14.64s/it]

data\original_dataset\RMT\100803hnf_intvoc.wav, segments: 567


Processing files in RMT:  90%|█████████ | 18/20 [06:02<00:23, 11.81s/it]

data\original_dataset\RMT\100903hnf_intvoc.wav, segments: 3431


                                                                        

data\original_dataset\RMT\121803hnf_intvoc.wav, segments: 45449





DONE
