In [1]:
import json
import os
from pathlib import Path
import librosa
import numpy as np
from typing import Dict, List
from tqdm import tqdm

In [2]:
# --- Framing / feature parameters ---------------------------------------
SAMPLE_RATE = 10000        # samples per second; used as the load rate for every file
SEGMENT_DURATION = 0.0256  # length of one analysis segment, in seconds
OVERLAP_DURATION = 50      # overlap between consecutive segments, in percent (0 = none)
NUM_MFCC = 16              # number of MFCC coefficients requested per frame

# FFT size: smallest power of two that covers the samples of one segment.
N_FFT = 2 ** int(np.ceil(np.log2(SEGMENT_DURATION * SAMPLE_RATE)))
# Distance between the starts of two consecutive segments, in samples.
SEGMENT_HOP_LENGTH = int(SAMPLE_RATE * SEGMENT_DURATION * (1 - OVERLAP_DURATION / 100))
MFCC_HOP_LENGTH = SEGMENT_HOP_LENGTH

# --- Classes and output naming ------------------------------------------
CLASSES = ["RMT", "DPR", "HRK"]

# Output file stem: fixed prefix, the overlap percentage, then the class
# names joined with dashes (e.g. "..._RMT-DPR-HRK").
FILE_NAME = "data_25ms_h=" + str(OVERLAP_DURATION) + "%_16mfcc_" + "-".join(CLASSES)

In [3]:
# Root folder that is walked for audio files; sub-directory names act as class labels.
DATASET_PATH = "./data/original_dataset"
# Destination of the extracted features, named after the current configuration.
JSON_PATH = f"./data/{FILE_NAME}.json"

print(JSON_PATH)

./data/data_25ms_h=50%_16mfcc_RMT-DPR-HRK.json


In [4]:
# Sanity check: show the FFT size and segment hop (both in samples) derived from the configuration.
print(f"n_fft: {N_FFT}, hop_length: {SEGMENT_HOP_LENGTH}")

n_fft: 256, hop_length: 128


In [5]:
def extract_mfcc(file_path: str, num_mfcc: int, n_fft: int, hop_length: int) -> List[List[List[float]]]:
    """Split an audio file into fixed-length segments and compute MFCCs for each.

    The file is loaded at ``SAMPLE_RATE`` and cut into segments of
    ``int(SAMPLE_RATE * SEGMENT_DURATION)`` samples whose starts are
    ``hop_length`` samples apart. MFCCs are computed independently for every
    segment with ``center=False``, so no padding is added around a segment.

    Args:
        file_path: Path of the audio file to load.
        num_mfcc: Number of MFCC coefficients to compute per frame.
        n_fft: FFT window size, in samples.
        hop_length: Step between consecutive segment starts, in samples. The
            same value is forwarded to librosa as the frame hop inside a segment.

    Returns:
        One entry per segment; each entry is the transposed MFCC matrix of that
        segment as nested lists (frames x coefficients). When ``n_fft`` equals
        the segment length this is a single frame per segment. An empty list is
        returned when the file is shorter than one segment.

    Raises:
        ValueError: If ``hop_length`` is not a positive integer.
    """
    if hop_length <= 0:
        raise ValueError(f"hop_length must be a positive number of samples, got {hop_length}")

    signal, _ = librosa.load(file_path, sr=SAMPLE_RATE)
    samples_per_segment = int(SAMPLE_RATE * SEGMENT_DURATION)
    total_samples = len(signal)

    # A clip shorter than one segment cannot produce a full analysis window;
    # skip it instead of handing librosa a too-short slice.
    if total_samples < samples_per_segment:
        return []

    # Count segments with integer arithmetic on the real sample count. Going
    # through a float duration can land just below an exact multiple and drop
    # the last segment.
    num_segments = (total_samples - samples_per_segment) // hop_length + 1

    mfccs = []
    for segment_index in range(num_segments):
        start = segment_index * hop_length
        # By construction start + samples_per_segment <= total_samples.
        segment = signal[start:start + samples_per_segment]
        mfcc = librosa.feature.mfcc(
            y=segment,
            sr=SAMPLE_RATE,
            n_mfcc=num_mfcc,
            n_fft=n_fft,
            hop_length=hop_length,
            center=False,
        )
        # librosa returns (coefficients, frames); store as (frames, coefficients).
        mfccs.append(mfcc.T.tolist())
    return mfccs

In [6]:
def calculate_delta(mfcc_a: list, mfcc_b: list):
    """Return the element-wise difference ``mfcc_b - mfcc_a`` of two MFCC vectors.

    Each argument may be either a flat vector ``[c0, c1, ...]`` or a
    single-frame segment ``[[c0, c1, ...]]`` (the nested form that
    ``mfcc.T.tolist()`` yields for a one-frame segment). Nested inputs are
    unwrapped to their first frame before subtracting.

    Args:
        mfcc_a: Coefficients that are subtracted.
        mfcc_b: Coefficients that are subtracted from.

    Returns:
        A flat list with one difference per coefficient. Empty inputs give an
        empty list.

    Raises:
        ValueError: If the two vectors do not have the same number of coefficients.
    """
    def _first_frame(frame):
        # Unwrap [[c0, c1, ...]] to [c0, c1, ...]; leave flat vectors untouched.
        if len(frame) > 0 and isinstance(frame[0], (list, tuple, np.ndarray)):
            return frame[0]
        return frame

    vec_a = _first_frame(mfcc_a)
    vec_b = _first_frame(mfcc_b)
    if len(vec_a) != len(vec_b):
        raise ValueError(
            f"MFCC vectors must have the same length, got {len(vec_a)} and {len(vec_b)}"
        )
    return [coef_b - coef_a for coef_a, coef_b in zip(vec_a, vec_b)]

In [7]:
def calculate_deltas(mfccs: list):
    """Compute delta features along the segment axis of a list of MFCC segments.

    ``mfccs`` is a list with one entry per segment, every entry being a nested
    list of identical shape (the format returned by ``extract_mfcc``). The
    result has the same nesting and shape, so it can be stored next to the
    MFCCs or passed back into this function to obtain delta-delta features.

    The difference is taken between neighbouring segments:

    * first segment: forward difference ``mfccs[1] - mfccs[0]``
    * last segment: backward difference ``mfccs[-1] - mfccs[-2]``
    * interior segments: central difference ``(mfccs[i + 1] - mfccs[i - 1]) / 2``

    Args:
        mfccs: Per-segment MFCC values as nested lists of equal shape.

    Returns:
        Nested lists of the same shape as ``mfccs`` holding the deltas. An
        empty input gives an empty list; a single segment has no neighbour to
        differ from and gives all-zero deltas.
    """
    frames = np.asarray(mfccs, dtype=float)
    num_segments = frames.shape[0]

    if num_segments == 0:
        return []
    if num_segments == 1:
        # np.gradient needs at least two points along the differentiated axis.
        return np.zeros_like(frames).tolist()

    # np.gradient uses one-sided differences at both ends and central
    # differences in between, which is exactly the scheme described above.
    return np.gradient(frames, axis=0).tolist()

In [8]:
def save_mfcc(dataset_path: str, json_path: str, selected_classes: List[str], voc_only: bool = True, num_mfcc: int = 16, n_fft: int = 256, hop_length: int = 256) -> None:
    """Extract MFCCs for the selected classes and save them, with labels, as JSON.

    Every sub-directory of ``dataset_path`` is treated as one class, named
    after the directory. If ``"DPR+HRK"`` is among ``selected_classes``, the
    ``DPR`` and ``HRK`` directories are both stored under that merged class.
    Directories whose class is not selected are skipped.

    The JSON object contains:

    * ``"mapping"``: the class names, in the order of ``selected_classes``, so
      that ``mapping[label]`` is the class name of ``label``.
    * ``"labels"``: one integer label per segment (index into ``"mapping"``).
    * ``"mfcc"``: one MFCC entry per segment, as returned by ``extract_mfcc``.
    * ``"files"``: the source file path of every segment.
    * ``"deltamfcc"`` / ``"deltadeltamfcc"``: written as empty lists; delta
      features are not computed here.

    Args:
        dataset_path: Root directory containing one sub-directory per class.
        json_path: Destination JSON file; its parent directory is created if missing.
        selected_classes: Class names to export; the position in this list is the label.
        voc_only: If True, only files whose name contains ``"voc"`` are processed;
            if False, only files whose name does not contain it.
        num_mfcc: Number of MFCC coefficients per frame.
        n_fft: FFT window size, in samples.
        hop_length: Step between consecutive segments, in samples.
    """
    # The mapping is fixed up front from ``selected_classes`` so that it always
    # agrees with ``class_to_label``. Filling it in directory-walk order would
    # leave ``mapping[label]`` pointing at the wrong class.
    data: Dict[str, List] = {
        "mapping": list(selected_classes),
        "labels": [],
        "mfcc": [],
        "deltamfcc": [],
        "deltadeltamfcc": [],
        "files": [],
    }
    dataset_path = Path(dataset_path)
    class_to_label = {c: i for i, c in enumerate(selected_classes)}

    # Make sure the output location exists before the long extraction starts,
    # not after it has finished.
    Path(json_path).parent.mkdir(parents=True, exist_ok=True)

    for dirpath, dirnames, filenames in os.walk(dataset_path):
        # Sorting in place makes os.walk visit sub-directories in a stable order.
        dirnames.sort()
        if dirpath == str(dataset_path):
            continue

        semantic_label = Path(dirpath).name
        if semantic_label in ("DPR", "HRK") and "DPR+HRK" in selected_classes:
            semantic_label = "DPR+HRK"
        if semantic_label not in selected_classes:
            continue

        label = class_to_label[semantic_label]
        print(f"\nProcessing: {semantic_label}")

        for f in tqdm(sorted(filenames), desc=f"Processing files in {semantic_label}", leave=False):
            if ('voc' in f) != voc_only:
                continue
            file_path = str(Path(dirpath) / f)
            mfccs = extract_mfcc(file_path, num_mfcc, n_fft, hop_length)
            data["mfcc"].extend(mfccs)
            data["labels"].extend([label] * len(mfccs))
            data["files"].extend([file_path] * len(mfccs))
            # tqdm.write keeps the progress bar intact, unlike a bare print().
            tqdm.write(f"{file_path}, segments: {len(mfccs)}")

    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

    print("\nDONE")

Idea: concatenate the MFCC, delta-MFCC and delta-delta-MFCC features so that the combined features can be used as input to the 2D CNN.

In [None]:
# Run the extraction over the whole dataset with the notebook-level settings.
save_mfcc(
    DATASET_PATH,
    JSON_PATH,
    selected_classes=CLASSES,
    voc_only=True,
    num_mfcc=NUM_MFCC,
    n_fft=N_FFT,
    hop_length=SEGMENT_HOP_LENGTH,
)


Processing: DPR


Processing files in DPR:   0%|          | 0/26 [00:00<?, ?it/s]

Processing files in DPR:   8%|▊         | 2/26 [00:48<09:36, 24.02s/it]

data\original_dataset\DPR\011106nt1_intvoc.wav, segments: 17053


Processing files in DPR:  15%|█▌        | 4/26 [01:34<08:39, 23.60s/it]

data\original_dataset\DPR\013106ab3_intvoc.wav, segments: 21474


Processing files in DPR:  23%|██▎       | 6/26 [02:10<07:00, 21.05s/it]

data\original_dataset\DPR\020106nt1_intvoc.wav, segments: 16467


Processing files in DPR:  31%|███       | 8/26 [02:42<05:42, 19.03s/it]

data\original_dataset\DPR\021605nt3_intvoc.wav, segments: 14458


Processing files in DPR:  38%|███▊      | 10/26 [03:26<05:20, 20.04s/it]

data\original_dataset\DPR\022505nt1_intvoc.wav, segments: 20377


Processing files in DPR:  46%|████▌     | 12/26 [03:36<03:29, 14.98s/it]

data\original_dataset\DPR\040505nt2_intvoc.wav, segments: 4631


Processing files in DPR:  54%|█████▍    | 14/26 [03:57<02:42, 13.51s/it]

data\original_dataset\DPR\051105nt1_intvoc.wav, segments: 8602
