In [1]:

# Define IAM role
import boto3
import re

import os
import numpy as np
import pandas as pd
from sagemaker import get_execution_role

role = get_execution_role()

import sagemaker as sage
from time import gmtime, strftime

sess = sage.Session()
bucket = 'oosv-multilingual-bucket'
snippet_length = 75
test_data_location = None
train_data_location = None

In [39]:
def get_data(file_name, bucket):
    prefix = '/tmp/data/'
    s3 = boto3.resource('s3')
    s3.Bucket(bucket).download_file('data/' + file_name, prefix + file_name)
    df = pd.read_csv(prefix + file_name)
    os.remove(prefix + file_name)
    return df.as_matrix()

In [None]:
data_en = get_data('english.csv', bucket)
data_es = get_data('spanish.csv', bucket)
data_en = data_en[(len(data_en) % snippet_length):]
data_es = data_es[(len(data_es) % snippet_length):]

In [None]:
dist_en = int(len(data_en) * 3 / 4) - int(len(data_en) * 3 / 4) % snippet_length
dist_es = int(len(data_es) * 3 / 4) - int(len(data_es) * 3 / 4) % snippet_length
print(dist_en)
print(dist_es)

In [None]:
train_en = data_en[:dist_en]
train_es = data_es[:dist_es]
test_en = data_en[dist_en:]
test_es = data_es[dist_es:]
print(len(train_en) % snippet_length)
print(len(train_es) % snippet_length)
print(len(test_en) % snippet_length)
print(len(test_es) % snippet_length)

In [None]:
train_dir = '/tmp/data/train'
test_dir = '/tmp/data/test'
pd.DataFrame(train_en).to_csv(train_dir + '/english.csv', index=False)
pd.DataFrame(train_es).to_csv(train_dir + '/spanish.csv', index=False)
pd.DataFrame(train_en).to_csv(test_dir  + '/english.csv', index=False)
pd.DataFrame(train_es).to_csv(test_dir  + '/spanish.csv', index=False)

In [None]:
train_key = 'data/train'
test_key = 'data/test'

In [None]:
train_data_location = sess.upload_data(train_dir, bucket=bucket, key_prefix=train_key)
test_data_location = sess.upload_data(test_dir, bucket=bucket, key_prefix=test_key)
#s3://{bucker}/{train_key}

In [23]:
# hyperparameters: n_clusters = 30, cov_type='full', iter = 100, snippet_length=75, languages=2
data_location = f's3://{bucket}/data'
hyper_params = {'n_clusters' : [10,10], 'iter' : 100}

In [24]:
account = sess.boto_session.client('sts').get_caller_identity()['Account']
region = sess.boto_session.region_name
image = '{}.dkr.ecr.{}.amazonaws.com/gmm-image-3:latest'.format(account, region)
 
gmm_model = sage.estimator.Estimator(image,
                       role, 1, 'ml.c4.4xlarge',
                       train_volume_size=50,
                       output_path="s3://{}/output".format(bucket),
                       sagemaker_session=sess,
                       hyperparameters=hyper_params)

gmm_model.fit(data_location, wait = True)
print(gmm_model)

INFO:sagemaker:Creating training-job with name: gmm-image-3-2018-10-04-16-23-57-971


2018-10-04 16:23:58 Starting - Starting the training job...
Launching requested ML instances...
Preparing the instances for training.........
2018-10-04 16:26:00 Downloading - Downloading input data.........
2018-10-04 16:27:52 Training - Downloading the training image...
Training image download completed. Training in progress.
  import imp[0m
[31mdebug : False[0m
[31mdebug2: False[0m
  os.path.join(train_data, language)[0m
[31m[SUCCESS] data parsed[0m
[31m[10, 10][0m
[31m[SUCCESS] model created[0m
[31m[SUCCESS] model trained[0m
[31m[SUCCESS] checkpoint saved[0m
  os.path.join(test_data, language)[0m
[31m[SUCCESS] acc: {'english': 0.7996314968920205, 'spanish': 0.7570300516957337}[0m
[31m[SUCCESS] model saved[0m

2018-10-04 17:41:16 Uploading - Uploading generated training model
2018-10-04 17:41:22 Completed - Training job completed
Billable seconds: 4522
<sagemaker.estimator.Estimator object at 0x7fb53fd68208>


In [31]:
import time
variants = [{'n_clusters' : [30,50]}, \
            {'n_clusters' : [40,40]}, \
            {'n_clusters' : [30,40]}, \
            {'n_clusters' : [30,30]}, \
            {'n_clusters' : [60,50]}, \
            {'n_clusters' : [60,40]}, \
            {'n_clusters' : [60,30]}, \
            {'n_clusters' : [50,40]}, \
            {'n_clusters' : [50,30]}, \
            {'n_clusters' : [40,40]}]
'''
Saved clusters, but not performance tests done
            {'n_clusters' : [60,60]}, \
            {'n_clusters' : [50,60]}, \
            {'n_clusters' : [40,60]}, \
            {'n_clusters' : [30,60]}, \
            {'n_clusters' : [50,50]}, \
            {'n_clusters' : [40,50]}, \
'''

"\nSaved clusters, but not performance tests done\n            {'n_clusters' : [60,60]},             {'n_clusters' : [50,60]},             {'n_clusters' : [40,60]},             {'n_clusters' : [30,60]},             {'n_clusters' : [50,50]},             {'n_clusters' : [40,50]}, "

In [None]:
for count, variant in enumerate(variants, 1):
    gmm_model = sage.estimator.Estimator(image,
                       role, 1, 'ml.c4.4xlarge',
                       train_volume_size=50,
                       output_path="s3://{}/output".format(bucket),
                       sagemaker_session=sess,
                       hyperparameters=variant)
    gmm_model.fit(data_location, wait = False)
    if count % 4 == 0:        
        time.sleep(9 * 60 * 60) #sleep for 9 hours before starting more jobs
        # I am so sorry for how much compute power this will consume


INFO:sagemaker:Creating training-job with name: gmm-image-1-2018-09-27-18-55-05-299
INFO:sagemaker:Creating training-job with name: gmm-image-1-2018-09-27-18-55-05-484
INFO:sagemaker:Creating training-job with name: gmm-image-1-2018-09-27-18-55-08-524
INFO:sagemaker:Creating training-job with name: gmm-image-1-2018-09-27-18-55-10-857


In [78]:
client = boto3.client('sagemaker') 

In [79]:
image = '{}.dkr.ecr.{}.amazonaws.com/gmm-image-2:latest'.format(account, region)
some_model = 'gmm-image-1-2018-09-27-18-55-05-299'
folder = 'output'
model_bucket = f's3://{bucket}/{folder}/{some_model}/output/model.tar.gz'
container_des = {
        'Image': image,
        'ModelDataUrl': model_bucket
        } 

In [80]:
model = client.create_model(ModelName = 'oosv-demo-gmm', 
                            PrimaryContainer=container_des,
                            ExecutionRoleArn=role)

In [81]:
response = client.create_endpoint_config(
    EndpointConfigName='oosv-demo-config',
    ProductionVariants=[
        {
            'VariantName': 'gmm1',
            'ModelName': 'oosv-demo-gmm',
            'InitialInstanceCount': 1,
            'InstanceType': 'ml.t2.medium',
        }
    ]
)
print(response)

{'EndpointConfigArn': 'arn:aws:sagemaker:us-west-2:367698673629:endpoint-config/oosv-demo-config', 'ResponseMetadata': {'RequestId': '2c203ca5-b181-42be-867c-c4f3f7e6c3f3', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '2c203ca5-b181-42be-867c-c4f3f7e6c3f3', 'content-type': 'application/x-amz-json-1.1', 'content-length': '97', 'date': 'Fri, 28 Sep 2018 18:24:24 GMT'}, 'RetryAttempts': 0}}


In [82]:
endpoint = client.create_endpoint(EndpointName='oosv-demo-endpoint', EndpointConfigName='oosv-demo-config')
print(endpoint)

{'EndpointArn': 'arn:aws:sagemaker:us-west-2:367698673629:endpoint/oosv-demo-endpoint', 'ResponseMetadata': {'RequestId': '1b496d93-387b-4061-b896-fe4161125b9d', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '1b496d93-387b-4061-b896-fe4161125b9d', 'content-type': 'application/x-amz-json-1.1', 'content-length': '86', 'date': 'Fri, 28 Sep 2018 18:24:30 GMT'}, 'RetryAttempts': 0}}


In [86]:
client.describe_endpoint(EndpointName = 'oosv-demo-endpoint')

{'EndpointName': 'oosv-demo-endpoint',
 'EndpointArn': 'arn:aws:sagemaker:us-west-2:367698673629:endpoint/oosv-demo-endpoint',
 'EndpointConfigName': 'oosv-demo-config',
 'EndpointStatus': 'Creating',
 'CreationTime': datetime.datetime(2018, 9, 28, 18, 24, 30, 249000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2018, 9, 28, 18, 24, 30, 249000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': 'c131574a-2138-4095-9b08-329be6f43682',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c131574a-2138-4095-9b08-329be6f43682',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '258',
   'date': 'Fri, 28 Sep 2018 18:29:06 GMT'},
  'RetryAttempts': 0}}

In [77]:
# clean up!!!!!
del_config = client.delete_endpoint_config(EndpointConfigName = 'oosv-demo-config')
del_ndpnt = client.delete_endpoint(EndpointName = 'oosv-demo-endpoint')
del_cl = client.delete_model(ModelName = 'oosv-demo-gmm')
print(del_config)
print(del_ndpnt)
print(del_cl)

{'ResponseMetadata': {'RequestId': 'ad059b43-3334-4d46-9a58-5b36f67eef9f', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'ad059b43-3334-4d46-9a58-5b36f67eef9f', 'content-type': 'application/x-amz-json-1.1', 'content-length': '0', 'date': 'Fri, 28 Sep 2018 18:24:07 GMT'}, 'RetryAttempts': 0}}
{'ResponseMetadata': {'RequestId': 'e5614a89-7da5-4595-abdd-5bd29c92fdcd', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'e5614a89-7da5-4595-abdd-5bd29c92fdcd', 'content-type': 'application/x-amz-json-1.1', 'content-length': '0', 'date': 'Fri, 28 Sep 2018 18:24:07 GMT'}, 'RetryAttempts': 0}}
{'ResponseMetadata': {'RequestId': '6ee9febd-881e-433f-8311-4bb574189f85', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '6ee9febd-881e-433f-8311-4bb574189f85', 'content-type': 'application/x-amz-json-1.1', 'content-length': '0', 'date': 'Fri, 28 Sep 2018 18:24:07 GMT'}, 'RetryAttempts': 0}}


In [110]:
from extractor import extractor
import tarfile
import pickle
import boto3
import pickle
import numpy as np

saved_model = 'gmm-image-1-2018-09-27-18-55-05-299'
bucket = 'oosv-multilingual-bucket'

s3 = boto3.resource('s3')

def get_wav(file_name, bucket):
    file = '/tmp/wav/' + file_name
    s3.Bucket(bucket).download_file('data/wav/' + file_name, file)
    return file
def get_model(bucket):
    #model_bucket
    model = '/tmp/model.tar.gz'
    s3.Bucket(bucket).download_file(f'output/{some_model}/output/model.tar.gz', model)
    return model

print(some_model)
print(bucket)

gmm-image-1-2018-09-27-18-55-05-299
oosv-multilingual-bucket


In [109]:
file_name = 'demo_en.wav'
wav = get_wav(file_name, bucket)
features = extractor.get_features(wav)
print(np.shape(features))
features = np.swapaxes(features, 0, 2)
features = features.reshape((75, 39))
print(np.shape(features))
model = get_model(bucket)
tf = tarfile.open(model)
tf.extractall()
tf.close()
print(os.listdir())
lang = {0 : 'english', 1 : 'spanish'}
with open('gmm_clust-[30, 50].plk', 'rb') as file:
    model = pickle.load(file)
lang[model.predict(features)]



(3, 75, 13)
(75, 39)
['.git', 'container_gmm', 'Training DNN.ipynb', 'Untitled.ipynb', 'gmm_es', 'gmm_clust-[30, 50].plk', 'extractor', 'gmm_sage_maker_prototype.ipynb', 'container_dnn', '.ipynb_checkpoints', '.gitignore', 'prototype_notebook.ipynb', 'accuracy.json', 'python_speech_features', 'gmm_prototype.ipynb', 'README.md', 'gmm_en', 'models', 'Training GMM.ipynb']




'english'

In [105]:
file_name = 'demo_en.wav'
wav = get_wav(file_name, bucket)
features = extractor.get_features(wav)
print(np.shape(features))
features = np.swapaxes(features, 0, 2)
features = features.reshape((75, 39))
print(np.shape(features))
model = get_model(bucket)
tf = tarfile.open(model)
tf.extractall()
tf.close()
print(os.listdir())
lang = {0 : 'english', 1 : 'spanish'}
with open('gmm_clust-[30, 50].plk', 'rb') as file:
    model = pickle.load(file)
lang[model.predict(features)]

['.git', 'container_gmm', 'Training DNN.ipynb', 'Untitled.ipynb', 'gmm_es', 'gmm_clust-[30, 50].plk', 'extractor', 'gmm_sage_maker_prototype.ipynb', 'container_dnn', '.ipynb_checkpoints', '.gitignore', 'prototype_notebook.ipynb', 'accuracy.json', 'python_speech_features', 'gmm_prototype.ipynb', 'README.md', 'gmm_en', 'models', 'Training GMM.ipynb']
