# Notebook Data Preparation

This notebook shows how to extract data from wave files using extractor, and how the data is divided between test and training sets

## Chunk Data Prep

In [9]:
from extractor import extractor
import os
import importlib
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from collections import OrderedDict
import pprint as p

In [10]:
importlib.reload(extractor)
wav_data = "/home/ec2-user/SageMaker/voxforge/wav" #Path to .wav file data
languages = {"english" : 0, "spanish" : 1, "french" : 2, "italian" : 3, "german" : 4}
TOTAL_FRAMES = 150
MIN_ENERGY = 12
npy_data    = "/home/ec2-user/SageMaker/npy"


extractor.TOTAL_FRAMES = TOTAL_FRAMES # lenght of audio each clip will be divided into
extractor.MIN_ENERGY = MIN_ENERGY # Minimum threshold of energy, zero's out anything not within threshold

In [11]:
test = lambda data : data[:len(data) // 4]
train = lambda data : data[len(data) // 4 :]

##### The Block Below Will Taka a While

In [12]:
data_y = []
data_x = []
# num_samples = 100
# get the path for num_samples .wav files
for language in languages:
    
    files = []
    path = os.path.join(wav_data, language)
    for f in train(os.listdir(path)):
        files.append(os.path.join(path, f))
    
    x, y = extractor.make_feature_set(files, languages[language])
    data_x.append(x)
    data_y.append(y)

skipped file /home/ec2-user/SageMaker/voxforge/wav/english/anonymous-20080731-jyq-a0573.wav
skipped file /home/ec2-user/SageMaker/voxforge/wav/spanish/sg_ac_enrique-20141114-kij-es-0014.wav


In [13]:
data_x = np.concatenate(data_x, axis = 0)
data_y = np.concatenate(data_y, axis = 0)

In [14]:
shape_x, shape_y = np.shape(data_x), np.shape(data_y)
print(shape_x, shape_y)

(320890, 3, 150, 13) (320890,)


In [15]:
# Convert from Sup_Seq x 3 x TOTAL_FRAMES x n_coeff
#         to   Sup_Seq x TOTAL_FRAMES x n_coeff * 3
shape_x = np.shape(data_x)
print(shape_x)
print(f'total length of audio = {shape_x[0] * (TOTAL_FRAMES/100) / 3600} hours' )

(320890, 3, 150, 13)
total length of audio = 133.70416666666668 hours


In [16]:
def shuffle_in_unison(a, b):
    rng_state = np.random.get_state()
    np.random.shuffle(a)
    np.random.set_state(rng_state)
    np.random.shuffle(b)

In [17]:
def label_distribution(data_y):
    counts = Counter(data_y)
    total = sum(counts.values())
    weights = {k : counts[k] / total for k in counts}
    #print(weights)
    return(weights)


In [18]:
# Copy and paste the weights below onto the train script
label_distribution(data_y)

Counter({0: 141305, 1: 68145, 2: 48961, 4: 38634, 3: 23845})


{0: 0.4403533921281436,
 1: 0.21236249181962666,
 2: 0.1525787653089844,
 3: 0.07430895322384617,
 4: 0.12039639751939916}

In [19]:
shuffle_in_unison(data_x, data_y)
shape_x = np.shape(data_x)

In [20]:
with open(os.path.join(npy_data, 'train_x.npy'), 'wb') as file:
    np.save(file, data_x)
with open(os.path.join(npy_data,'train_y.npy'), 'wb') as file:
    np.save(file, data_y)

##### The Block Below Will Taka a While

In [21]:
data_y = []
data_x = []
# num_samples = 100
# get the path for num_samples .wav files
for language in languages:
    
    files = []
    path = os.path.join(wav_data, language)
    for f in test(os.listdir(path)):
        files.append(os.path.join(path, f))
    
    x, y = extractor.make_feature_set(files, languages[language])
    data_x.append(x)
    data_y.append(y)
data_x = np.concatenate(data_x, axis = 0)
data_y = np.concatenate(data_y, axis = 0)

In [22]:
shape_x, shape_y = np.shape(data_x), np.shape(data_y)
print(shape_x, shape_y)
print(f'total length of audio = {shape_x[0] * (TOTAL_FRAMES/100) / 3600} hours' )
label_distribution(data_y)

(106555, 3, 150, 13) (106555,)
total length of audio = 44.39791666666667 hours
Counter({0: 46928, 1: 22733, 2: 16349, 4: 12664, 3: 7881})


{0: 0.44041105532354186,
 1: 0.21334522077800197,
 2: 0.15343249964806907,
 3: 0.07396180376331472,
 4: 0.11884942048707241}

In [23]:
shuffle_in_unison(data_x, data_y)

In [24]:
with open(os.path.join(npy_data,'test_x.npy'), 'wb') as file:
    np.save(file, data_x)
with open(os.path.join(npy_data,'test_y.npy'), 'wb') as file:
    np.save(file, data_y)

# Full Utterance Data Prep

In [25]:
from extractor import extractor
import os
import importlib
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from collections import OrderedDict
import pprint as p

# Define IAM role
import boto3
import re
import os
import numpy as np
import pandas as pd
import importlib
from sagemaker import get_execution_role
import sagemaker as sage
from time import gmtime, strftime
import time

In [26]:
# Data Specific
importlib.reload(extractor)
wav_data   = "/home/ec2-user/SageMaker/voxforge/wav" #Path to .wav file data
npy_data    = "/home/ec2-user/SageMaker/npy"
languages  = {"english" : 0, "spanish" : 1, "french" : 2, "italian" : 3, "german" : 4}
MIN_ENERGY = 12

extractor.MIN_ENERGY = MIN_ENERGY 
max_padding = 800

# Sage Maker Specific
role         = get_execution_role()
sess         = sage.Session()
bucket       = 'oosv-multilingual-bucket' # feel free to change the bucket

In [27]:
def adjust_samples(X, max_padding): 
    out = []
    for idx, x in enumerate(X):
        if len(x[0]) > max_padding:
            shorted = np.array([x[0][:int(max_padding)],
                       x[1][:int(max_padding)],
                       x[2][:int(max_padding)]])
            out.append(shorted)

        else:
            shape = np.shape(x)
            zeros = np.zeros([shape[0], int(max_padding) - shape[1], shape[2]])
            out.append(np.concatenate((zeros, x), axis = 1))
    return np.array(out)

##### The Block Below Will Taka a While

In [None]:
data_y = []
data_x = []
# num_samples = 100
# get the path for num_samples .wav files
for language in languages:
    
    files = []
    path = os.path.join(wav_data, language)
    for f in train(os.listdir(path)):
        files.append(os.path.join(path, f))
    
    x, y = extractor.make_feature_set(files, languages[language], chunk=False)
    data_x.append(x)
    data_y.append(y)

skipped file /home/ec2-user/SageMaker/voxforge/wav/english/anonymous-20080731-jyq-a0573.wav
skipped file /home/ec2-user/SageMaker/voxforge/wav/spanish/sg_ac_enrique-20141114-kij-es-0014.wav


In [36]:
print(len(data_x), len(data_y))
print(np.shape(data_x))
#data_y = np.concatenate(data_y)
label_distribution(data_y)
# all utterance lengths are different but a single sample of X shaped
# deltas (3) x length_utterance x ceptra
# the first index is tha language index, second is utterance index
# languages can't be concatenated to be shuffled until all utterances are the same length

118870 118870
(118870, 3, 800, 13)
Counter({0: 61166, 4: 17618, 2: 16761, 1: 16485, 3: 6840})


{1: 0.13868091192058551,
 4: 0.14821233280053842,
 0: 0.514562126693026,
 2: 0.14100277614200388,
 3: 0.05754185244384622}

In [32]:
data_x_ = []
data_x_ = adjust_samples(lang, max_padding)

data_x_ = np.concatenate(data_x_)
print(np.shape(data_x_))
print(np.shape(data_y))

IndexError: tuple index out of range

[1 4 4 ... 3 4 4]


In [None]:
# if the value above looks right, replace data_x
data_x = data_x_
shuffle_in_unison(data_x, data_y)
with open(os.path.join(npy_data ,'full_train_x.npy'), 'wb') as npy:
    np.save(npy, data_x)
    
with open(os.path.join(npy_data ,'full_train_y.npy'), 'wb') as npy:
    np.save(npy, data_y)

##### The Block Below Will Taka a While

In [38]:
data_y = []
data_x = []
# num_samples = 100
# get the path for num_samples .wav files
for language in languages:
    
    files = []
    path = os.path.join(wav_data, language)
    for f in test(os.listdir(path)):
        files.append(os.path.join(path, f))
    
    x, y = extractor.make_feature_set(files, languages[language], chunk=False)
    data_x.append(x)
    data_y.append(y)

In [39]:
print(len(data_x), len(data_y))
data_y = np.concatenate(data_y)
print(np.shape(data_x[0][0])) 
label_distribution(data_y)

5 5
(3, 455, 13)
Counter({0: 20387, 4: 5872, 2: 5585, 1: 5497, 3: 2278})


{0: 0.5145763396350236,
 1: 0.13874656099346272,
 2: 0.14096771750927586,
 3: 0.0574976652616169,
 4: 0.1482117166006209}

In [40]:
data_x_ = []
for lang in data_x:
    data_x_.append(adjust_samples(lang, max_padding))

data_x_ = np.concatenate(data_x_)
print(np.shape(data_x_))
print(np.shape(data_y))

(39619, 3, 800, 13)
(39619,)


In [41]:
data_x = data_x_
shuffle_in_unison(data_x, data_y)
with open(os.path.join(npy_data ,'full_test_x.npy'), 'wb') as npy:
    np.save(npy, data_x)
    
with open(os.path.join(npy_data ,'full_test_y.npy'), 'wb') as npy:
    np.save(npy, data_y)

# Uploading Train and Test Files to Bucket

In [42]:
channels = ['train', 'test']
files = ['full_{0}_y.npy', 'full_{0}_x.npy', '{0}_x.npy', '{0}_y.npy']
def upload_data(file_name, file_path, channel):
    target = f'data/{channel}/{channel}_5/'
    s3 = boto3.resource('s3')
    s3.Bucket(bucket).upload_file(file_path, target + file_name)

##### The Block Below Will Taka a While

In [43]:
string = '/full_{0}_x.npy'
for channel in ['train', 'test']:
    for file in files:
        upload_data(file.format(channel), os.path.join(npy_data, file.format(channel)), channel)
        