<a href="https://colab.research.google.com/github/TevinMusau/Integrating_Voice_to_Mobile_Payment_Systems_Using_Convolutional_Neural_Networks-A_Case_of_MPESA/blob/model/Keyword_Spotting_Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import librosa
import os
import json

# Data Preprocessing

- Extract an audio feature for each audio sample (MFCC)
- Store this in a JSON file

In [None]:
# Mount Google Drive so the dataset can be read from it and the output JSON written back to it
from google.colab import drive
drive.mount('/content/drive')

# Root folder of the dataset; prepare_dataset treats each sub-directory as one keyword class
DATASET_PATH = "/content/drive/MyDrive/ICS_PROJECT/Datasets/Dataset_1"

# Location where the preprocessed data (mappings, labels, MFCCs, file names) is stored as JSON
JSON_PATH = "/content/drive/MyDrive/ICS_PROJECT/Modules/Keyword_Spotting/Outputs/prepared_data.json"

# How many samples of each clip to consider for preprocessing.
# 22050 samples is 1 second of audio at librosa's default sampling rate (22050 Hz),
# which is the rate librosa.load resamples to when no `sr` is given.
SAMPLES_TO_CONSIDER = 22050

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def prepare_dataset(dataset_path, json_path, n_mfcc = 13, hop_length = 512, n_fft = 2048):
  """Extract MFCCs from every audio clip under ``dataset_path`` and save them as JSON.

  Every sub-directory of ``dataset_path`` is treated as one keyword class: its
  folder name is appended to ``"mappings"`` and the position of that name in
  ``"mappings"`` is the integer label given to each clip found in the folder.
  Files placed directly in ``dataset_path`` are ignored. Clips shorter than the
  module-level ``SAMPLES_TO_CONSIDER`` are skipped; longer clips are truncated
  to their first ``SAMPLES_TO_CONSIDER`` samples so that every stored MFCC
  matrix has the same number of frames.

  Args:
    dataset_path: Path (str or path-like) to the dataset root folder.
    json_path: Path of the JSON file the preprocessed data is written to.
    n_mfcc: Number of MFCC coefficients to extract per frame.
    hop_length: Number of samples between successive analysis frames.
    n_fft: Length of the FFT window, in samples.

  Returns:
    None. The result is written to ``json_path`` as a JSON object with keys:
      "mappings": keyword (folder) names; the list index is the label value,
      "labels":   one integer label per stored clip,
      "MFCCs":    one list of frames per stored clip, each frame holding
                  ``n_mfcc`` coefficients,
      "files":    path of the audio file each entry was computed from.
  """
  # os.walk yields plain strings, so normalise path-like arguments (e.g.
  # pathlib.Path) once; otherwise the root comparison below can never match.
  dataset_path = os.fspath(dataset_path)

  # data dictionary to store all the data we extract
  data = {
      "mappings": [],         # map key words to numbers
      "labels": [],           # target value (outputs) for the above mappings
      "MFCCs": [],            # inputs
      "files": []             # file name with path
  }

  # loop through all the sub directories of the dataset
  # NOTE: the label numbering follows the order in which os.walk visits the
  # folders; the "mappings" list in the output records that order.
  for dirpath, _dirnames, filenames in os.walk(dataset_path):

    # skip the root level: only sub directories represent keywords.
    # Compare by value -- identity (`is`) only works by accident for strings.
    if dirpath == dataset_path:
      continue

    # update the mappings, e.g. dataset/down -> "down"
    keyword = os.path.basename(dirpath)
    data["mappings"].append(keyword)

    # the label is the index of this keyword in the mappings list
    label = len(data["mappings"]) - 1
    print(f"Processing {keyword}")

    # loop through all filenames and extract MFCCs
    for f in filenames:

      # get file path
      file_path = os.path.join(dirpath, f)

      # load the audio file (librosa resamples to its default rate)
      signal, sample_rate = librosa.load(file_path)

      # ignore clips that are shorter than the required length
      if len(signal) < SAMPLES_TO_CONSIDER:
        continue

      # enforce a fixed-length signal: keep the first samples, drop the rest
      signal = signal[:SAMPLES_TO_CONSIDER]

      # extract the MFCCs; `y` must be passed by keyword (it is keyword-only
      # in librosa >= 0.10) and `sr` is given so it matches the loaded audio
      MFCCs = librosa.feature.mfcc(y = signal, sr = sample_rate, n_mfcc = n_mfcc,
                                   hop_length = hop_length, n_fft = n_fft)

      # store the data
      data["labels"].append(label)
      data["MFCCs"].append(MFCCs.T.tolist()) # transpose to (frames, coefficients) and cast to a list
      data["files"].append(file_path)
      print(f"{file_path}: {label}")

  # store in a JSON file
  # open a new file at "json_path" in write mode
  with open(json_path, "w") as fp:
    json.dump(data, fp, indent = 4)

In [None]:
# Run the preprocessing over the dataset at DATASET_PATH and write the result to JSON_PATH
prepare_dataset(DATASET_PATH, JSON_PATH)