# Packages and libraries

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import sys
import math
import json
import pathlib
import librosa
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from IPython.display import Audio, display
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split


# Constants

In [2]:
SAMPLE_RATE = 16000
BATCH_SIZE = 32
VALIDATION_SPLIT = 0.2
SEED = 42

# Files Paths

## Train and test directory paths

In [3]:
train_data_dir = pathlib.Path('Dataset/Train_1')
test_data_dir = pathlib.Path('Dataset/Test')

train_files = list(train_data_dir.glob('*.wav'))
test_files = list(test_data_dir.glob('*.wav'))

## Directory structure

In [4]:
def print_directory_tree(root_dir, indent=''):
    print(indent + os.path.basename(root_dir) + os.path.sep)
    indent += '    '
    for item in os.listdir(root_dir):
        item_path = os.path.join(root_dir, item)
        if os.path.isdir(item_path):
            print_directory_tree(item_path, indent)


## Train and test

In [5]:
print_directory_tree(train_data_dir)

Train_1/
    emabega/
    mumaaso/
    yimirira/
    ddyo/
    gaali/
    kkono/


In [6]:
print_directory_tree(test_data_dir)

Test/
    emabega/
    mumaaso/
    yimirira/
    ddyo/
    gaali/
    kkono/


In [None]:
def prepare_dataset(dataset_path, json_path, n_mfcc=13, hop_length=512, n_fft=2048):
    data = {
        'mappings': [],
        'labels': [],
        'MFCCs': [],
        'files': []
    }

    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):
        # check if we are not at the root director
        if dirpath is not dataset_path:
            # update the mappings
            category = dirpath.split("/")[-1]
            data['mappings'].append(category)
            print(f"Processing {category}")

            # loop through all the filenames and extract the MFCCs
            for f in filenames:
                file_path = os.path.join(dirpath, f)
                signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)

                # ensure the audio file is at least 2 second
                if len(signal) >= SAMPLE_RATE:
                    # ensure the signal is at least 2 second
                    signal = signal[:SAMPLE_RATE]

                    # extract the MFCCs
                    MFCCs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
                    data['MFCCs'].append(MFCCs.T.tolist())
                    data['labels'].append(i-1)
                    data['files'].append(file_path)
                    print(f"{file_path}: {i-1}")
    
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)


if __name__ == "__main__":
    prepare_dataset(TRAIN_DATASET_PATH, TRAIN_JSON_PATH)
    prepare_dataset(TEST_DATASET_PATH, TEST_JSON_PATH)

In [7]:
def extract_mfccs_and_labels(root_folder):
    mfccs = []
    labels = []
    
    for subdir, _, files in os.walk(root_folder):
        for file in files:
            file_path = os.path.join(subdir, file)
            if file_path.endswith('.wav'):
                audio, sr = librosa.load(file_path, sr=None)
                mfcc = librosa.feature.mfcc(y=audio, sr=sr)
                mfccs.append(mfcc.T)
                labels.append(os.path.basename(subdir))
    
    return mfccs, labels


## Train

In [8]:
train_mfccs, labels = extract_mfccs_and_labels(train_data_dir)

In [9]:

for mfcc in train_mfccs:
    print(mfcc.shape)

(53, 20)
(27, 20)
(59, 20)
(45, 20)
(41, 20)
(42, 20)
(34, 20)
(48, 20)
(57, 20)
(44, 20)
(46, 20)
(44, 20)
(47, 20)
(52, 20)
(52, 20)
(63, 20)
(33, 20)
(53, 20)
(57, 20)
(36, 20)
(41, 20)
(42, 20)
(51, 20)
(56, 20)
(44, 20)
(48, 20)
(54, 20)
(52, 20)
(52, 20)
(55, 20)
(62, 20)
(40, 20)
(32, 20)
(47, 20)
(41, 20)
(56, 20)
(63, 20)
(49, 20)
(47, 20)
(57, 20)
(41, 20)
(41, 20)
(30, 20)
(51, 20)
(43, 20)
(41, 20)
(37, 20)
(47, 20)
(39, 20)
(45, 20)
(53, 20)
(44, 20)
(42, 20)
(62, 20)
(30, 20)
(59, 20)
(35, 20)
(40, 20)
(63, 20)
(52, 20)
(36, 20)
(23, 20)
(35, 20)
(58, 20)
(53, 20)
(36, 20)
(57, 20)
(32, 20)
(52, 20)
(33, 20)
(36, 20)
(38, 20)
(54, 20)
(26, 20)
(31, 20)
(53, 20)
(39, 20)
(58, 20)
(25, 20)
(54, 20)
(33, 20)
(47, 20)
(40, 20)
(25, 20)
(41, 20)
(47, 20)
(54, 20)
(44, 20)
(44, 20)
(30, 20)
(48, 20)
(28, 20)
(36, 20)
(19, 20)
(43, 20)
(41, 20)
(40, 20)
(52, 20)
(54, 20)
(19, 20)
(42, 20)
(45, 20)
(39, 20)
(45, 20)
(57, 20)
(44, 20)
(51, 20)
(53, 20)
(25, 20)
(50, 20)
(52, 20)
(

In [12]:
labels

['emabega',
 'emabega',
 'emabega',
 'emabega',
 'emabega',
 'emabega',
 'emabega',
 'emabega',
 'emabega',
 'emabega',
 'emabega',
 'emabega',
 'emabega',
 'emabega',
 'emabega',
 'emabega',
 'emabega',
 'emabega',
 'emabega',
 'emabega',
 'mumaaso',
 'mumaaso',
 'mumaaso',
 'mumaaso',
 'mumaaso',
 'mumaaso',
 'mumaaso',
 'mumaaso',
 'mumaaso',
 'mumaaso',
 'mumaaso',
 'mumaaso',
 'mumaaso',
 'mumaaso',
 'mumaaso',
 'mumaaso',
 'mumaaso',
 'mumaaso',
 'mumaaso',
 'mumaaso',
 'yimirira',
 'yimirira',
 'yimirira',
 'yimirira',
 'yimirira',
 'yimirira',
 'yimirira',
 'yimirira',
 'yimirira',
 'yimirira',
 'yimirira',
 'yimirira',
 'yimirira',
 'yimirira',
 'yimirira',
 'yimirira',
 'yimirira',
 'yimirira',
 'yimirira',
 'yimirira',
 'ddyo',
 'ddyo',
 'ddyo',
 'ddyo',
 'ddyo',
 'ddyo',
 'ddyo',
 'ddyo',
 'ddyo',
 'ddyo',
 'ddyo',
 'ddyo',
 'ddyo',
 'ddyo',
 'ddyo',
 'ddyo',
 'ddyo',
 'ddyo',
 'ddyo',
 'ddyo',
 'gaali',
 'gaali',
 'gaali',
 'gaali',
 'gaali',
 'gaali',
 'gaali',
 'gaali',


## Test

In [10]:
# test_mfccs = extract_mfccs(test_data_dir)

In [11]:
# for mfcc in mfccs:
#     print(test_data_dir.shape)