In [1]:
# Mounting google drive

from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# using pydub to add padding to files

!pip install pydub



In [2]:
# Import all libraries here

import librosa
import os
import numpy as np
import json

In [None]:
# CONSTANTS

PATH_TO_DRIVE = "/content/drive/My Drive"
DATASET_PATH = PATH_TO_DRIVE + "/datasets/LSTM-noise-speech-classify-data"
JSON_PATH = PATH_TO_DRIVE + "/datasets/noise-speech-data-processed.json"
SAMPLE_RATE = 22050

In [None]:
# Function to save MFCC to JSON

def save_mfcc(dataset_path, json_path, n_mfcc=13, n_fft=2048, hop_length=512):

  # Object to store data
  data = {
      "type": [],
      "mfcc": [],
      "labels": []
  }

  # Loop through all the types of audio
  for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):

    # ensure we are not at the root level
    if dirpath is not dataset_path:

      # save the labels
      dirpath_components = dirpath.split("/") # a/b => ["a", "b"]
      label = dirpath_components[-1]
      data["type"].append(label)
      print("\nProcessing {}...".format(label))

      file_count = 0
      for f in filenames:

        # Get path of the audio file
        file_path = os.path.join(dirpath, f)

        # If duration < 5 secs add padding
        silence = AudioSegment.silent(duration=1000)
        audio = AudioSegment.from_wav(file_path)
        if(audio.duration_seconds < 5):
          # print("{} - {}".format(f, audio.duration_seconds))
          pad_ms = 1000 * (5 - audio.duration_seconds)
          silence = AudioSegment.silent(duration=pad_ms)
          padded = audio + silence
          padded.export(file_path, format='wav')

        # If duration > 5 secs trim the audio file
        if(audio.duration_seconds > 5):
          first_5_sec = audio[:5000]
          first_5_sec.export(file_path, format='wav')
 
        new_audio = AudioSegment.from_wav(file_path)
        if(new_audio.duration_seconds != 5):
          print(new_audio.duration_seconds)
          # return

        # Load audio file with librosa
        signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)

        # Process files, extract MFCCs
        mfcc = librosa.feature.mfcc(signal,
                                    sr=sr,
                                    n_fft=n_fft,
                                    n_mfcc=n_mfcc,
                                    hop_length=hop_length)
        mfcc = mfcc.T

        print(mfcc.shape)

        data["mfcc"].append(mfcc.tolist())
        data["labels"].append(i-1)

        file_count += 1
        print("{} : {} - {}".format(file_count, f, label))

  # Store data as JSON
  with open(json_path, "w") as fp:
    json.dump(data, fp=fp, indent=4)

In [None]:
# Run the function

save_mfcc(DATASET_PATH, JSON_PATH)


Processing speech...
(216, 13)
1 : arctic_b0238.wav - speech
(216, 13)
2 : arctic_a0512.wav - speech
(216, 13)
3 : arctic_a0248.wav - speech
(216, 13)
4 : arctic_a0086.wav - speech
(216, 13)
5 : arctic_a0302.wav - speech
(216, 13)
6 : arctic_a0482.wav - speech
(216, 13)
7 : arctic_a0285.wav - speech
(216, 13)
8 : arctic_a0140.wav - speech
(216, 13)
9 : arctic_b0142.wav - speech
(216, 13)
10 : arctic_a0322.wav - speech
(216, 13)
11 : arctic_a0161.wav - speech
(216, 13)
12 : arctic_a0151.wav - speech
(216, 13)
13 : arctic_b0104.wav - speech
(216, 13)
14 : arctic_b0085.wav - speech
(216, 13)
15 : arctic_a0286.wav - speech
(216, 13)
16 : arctic_a0108.wav - speech
(216, 13)
17 : arctic_a0379.wav - speech
(216, 13)
18 : arctic_a0481.wav - speech
(216, 13)
19 : arctic_a0107.wav - speech
(216, 13)
20 : arctic_a0511.wav - speech
(216, 13)
21 : arctic_b0247.wav - speech
(216, 13)
22 : arctic_a0330.wav - speech
(216, 13)
23 : arctic_b0304.wav - speech
(216, 13)
24 : arctic_a0293.wav - speech
(21