This Jupyter notebook processes the datasets and creates summary tables. These tables are used to select the data used during training.

Expected datasets (paths are relative to the project working directory):

datasets/cv-corpus-6.1-2020-12-11

datasets/speech_commands_v0.01_full

datasets/fourthbrain

datasets/heyfourthbrain

datasets/hellofourthbrain

# Load relevant libraries

In [None]:
from google.colab import drive
import pandas as pd
import os
import librosa
import warnings
import numpy as np
import scipy.io.wavfile as wav
import glob
import shutil
import random
import tensorflow as tf
import soundfile as sf
import matplotlib.pyplot as plt
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout, BatchNormalization

import datetime
warnings.simplefilter("ignore", UserWarning)

from IPython.display import Audio

import soundfile

!pip install textgrid
import textgrid

!pip install pydub
from pydub import AudioSegment

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score 
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve


Collecting textgrid
  Downloading TextGrid-1.5-py3-none-any.whl (10.0 kB)
Installing collected packages: textgrid
Successfully installed textgrid-1.5
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the project folder on Drive so that relative dataset paths resolve.
os.chdir('/content/drive/MyDrive/SpotifyFourthBrainPartnerProject/custom_wakeword_engine/')

In [None]:
# Folder (relative to the working directory) that holds all datasets.
database_folder = 'datasets'
# Set to True to (re)generate the summary tables; while False, the summary
# cells guarded by this flag do nothing.
create_summary = False

# Create summary for 'hey fourthbrain' database


In [None]:
if create_summary:
  # Build a one-row-per-file summary of the 'hey fourthbrain' recordings and
  # save it as a tab-separated table next to the audio files.
  heyfourthbrain_files = glob.glob(database_folder+'/heyfourthbrain/*.wav')
  heyfourthbrain_validated = database_folder+'/heyfourthbrain/heyfourthbrain_validated.txt'

  # Each clip is loaded as mono at this rate; its length in seconds is the
  # number of loaded samples divided by the rate.
  sample_rate = 16000

  # Collect plain records and build the DataFrame once. DataFrame.append was
  # removed in pandas 2.0 and copied the whole table on every call.
  records = []
  for wav_path in heyfourthbrain_files:
    sounddata = librosa.load(wav_path, sr=sample_rate, mono=True)[0]
    records.append({
      'path' : wav_path,
      'filename' : os.path.basename(wav_path),
      'sentence' : ' heyfourthbrain ',
      'label' : 'heyfourthbrain',
      'lentgh(sec)' : len(sounddata)/sample_rate,
    })

  # Explicit columns fix the column order and still produce a header-only
  # file when no .wav files are found.
  df = pd.DataFrame(records, columns = ['path', 'filename','sentence', 'label', 'lentgh(sec)'])

  df.to_csv(heyfourthbrain_validated,sep='\t',index=False)

# Create summary for 'fourthbrain' database


In [None]:
if create_summary:
  # Build a one-row-per-file summary of the 'fourthbrain' recordings and
  # save it as a tab-separated table next to the audio files.
  fourthbrain_files = glob.glob(database_folder+'/fourthbrain/*.wav')
  fourthbrain_validated = database_folder+'/fourthbrain/fourthbrain_validated.txt'

  # Each clip is loaded as mono at this rate; its length in seconds is the
  # number of loaded samples divided by the rate.
  sample_rate = 16000

  # Collect plain records and build the DataFrame once. DataFrame.append was
  # removed in pandas 2.0 and copied the whole table on every call.
  records = []
  for wav_path in fourthbrain_files:
    sounddata = librosa.load(wav_path, sr=sample_rate, mono=True)[0]
    records.append({
      'path' : wav_path,
      'filename' : os.path.basename(wav_path),
      'sentence' : ' fourthbrain ',
      'label' : 'fourthbrain',
      'lentgh(sec)' : len(sounddata)/sample_rate,
    })

  # Explicit columns fix the column order and still produce a header-only
  # file when no .wav files are found.
  df = pd.DataFrame(records, columns = ['path', 'filename','sentence', 'label', 'lentgh(sec)'])

  df.to_csv(fourthbrain_validated,sep='\t',index=False)

# Create summary for MCV data set

In [None]:
if create_summary:
  # Summarise the Mozilla Common Voice (MCV) clips that are present in the
  # local subset_cv folder and split the table into two TSV files.
  mcv_folder = database_folder+'/cv-corpus-6.1-2020-12-11'
  mcv = mcv_folder+'/validated.tsv'
  mcv_validated = mcv_folder+'/mcv_validated.tsv'
  mcv_negtest = mcv_folder+'/mcv_negtest.tsv'

  df = pd.read_csv(mcv,sep='\t')

  # The 'path' column of validated.tsv is used as the bare file name; keep it
  # as 'filename' and turn 'path' into a path inside the local subset folder.
  df ['filename'] = df['path']
  df ['path'] = mcv_folder+'/subset_cv/'+ df ['filename']
  # NOTE(review): pandas.read_csv parses the string 'NA' as a missing value by
  # default, so readers of the saved tables may see NaN here - confirm that
  # downstream code expects this.
  df ['label'] = 'NA'
  df = df[['path', 'filename','sentence', 'label']]

  # Check which audio files were copied from the full database. The folder is
  # derived from database_folder (not hard-coded) so it follows the config.
  candid_paths = [os.path.basename(mp3_path) for mp3_path in glob.glob(mcv_folder+'/subset_cv/*.mp3')]

  df = df[df['filename'].isin(candid_paths)].reset_index(drop=True)

  # The first 1001 rows (index labels 0..1000 inclusive) go to the 'negtest'
  # table, the remainder to the 'validated' table; the two never overlap.
  n_negtest = 1001
  df_negtest = df.iloc[:n_negtest]
  df_val = df.iloc[n_negtest:]

  df_val.to_csv(mcv_validated,sep='\t',index=False)
  df_negtest.to_csv(mcv_negtest,sep='\t',index=False)

# Create summary for SpeechCommand (SC) data set


In [None]:
if create_summary:
  # Summarise the Speech Commands data set: every sub-folder name is used as
  # a label and every file inside it becomes one row of the table.
  sc_folder = database_folder+'/speech_commands_v0.01_full'
  sc_validated = sc_folder+'/sc_validated.tsv'

  # Only directories count as labels; loose files in the root are ignored.
  sc_labels = [ name for name in os.listdir(sc_folder) if os.path.isdir(os.path.join(sc_folder, name)) ]

  # Collect plain records and build the DataFrame once. DataFrame.append was
  # removed in pandas 2.0 and copied the whole table on every call.
  records = []
  for sc_label in sc_labels:
    file_names = os.listdir(sc_folder+'/'+sc_label)
    for file_name in file_names:
      path_name = sc_folder+'/'+sc_label+'/'+file_name
      records.append({
        'path' : path_name,
        'filename' : file_name,
        'sentence' : ' '+sc_label+' ',
        'label' : sc_label,
        # Length is recorded as a fixed 1 second; the audio is not measured.
        'lentgh(sec)' : 1,
      })

  # Explicit columns fix the column order and still produce a header-only
  # file when no label folders or files are found.
  df = pd.DataFrame(records, columns=['path', 'filename','sentence', 'label','lentgh(sec)'])
  df.to_csv(sc_validated,sep='\t',index=False)

# Create summary for 'hello fourthbrain' database


In [None]:
if create_summary:
  # Build a one-row-per-file summary of the 'hello fourthbrain' recordings and
  # save it as a tab-separated table next to the audio files.
  hellofourthbrain_files = glob.glob(database_folder+'/hellofourthbrain/*.wav')
  hellofourthbrain_validated = database_folder+'/hellofourthbrain/hellofourthbrain_validated.txt'

  # Each clip is loaded as mono at this rate; its length in seconds is the
  # number of loaded samples divided by the rate.
  sample_rate = 16000

  # Collect plain records and build the DataFrame once. DataFrame.append was
  # removed in pandas 2.0 and copied the whole table on every call.
  records = []
  for wav_path in hellofourthbrain_files:
    sounddata = librosa.load(wav_path, sr=sample_rate, mono=True)[0]
    records.append({
      'path' : wav_path,
      'filename' : os.path.basename(wav_path),
      'sentence' : ' hellofourthbrain ',
      'label' : 'hellofourthbrain',
      'lentgh(sec)' : len(sounddata)/sample_rate,
    })

  # Explicit columns fix the column order and still produce a header-only
  # file when no .wav files are found.
  df = pd.DataFrame(records, columns = ['path', 'filename','sentence', 'label', 'lentgh(sec)'])

  df.to_csv(hellofourthbrain_validated,sep='\t',index=False)