In [1]:

# Imports: standard library first, then third-party, then the SageMaker SDK.
import os
import re
from time import gmtime, strftime

import boto3
import numpy as np
import pandas as pd
import sagemaker as sage
from sagemaker import get_execution_role

# IAM role handed to the SageMaker estimators created later in the notebook.
role = get_execution_role()

# Shared SageMaker session and the notebook-wide constants.
sess = sage.Session()
bucket = 'oosv-dnn'    # S3 bucket the CSVs are read from and job output is written to
snippet_length = 75    # array lengths are trimmed to whole multiples of this value


In [6]:
def get_data(file_name, bucket):
    """Download ``data/<file_name>`` from an S3 bucket and return it as a NumPy array.

    The object is staged under ``/tmp/data/``, parsed with ``pd.read_csv``,
    and the staged copy is deleted again before the function returns.

    Args:
        file_name: Name of the CSV object beneath the ``data/`` key prefix.
        bucket: Name of the S3 bucket to download from.

    Returns:
        numpy.ndarray with the values of the parsed CSV.
    """
    prefix = '/tmp/data/'
    # The download needs the staging directory to exist already.
    os.makedirs(prefix, exist_ok=True)
    local_path = prefix + file_name

    s3 = boto3.resource('s3')
    s3.Bucket(bucket).download_file('data/' + file_name, local_path)
    try:
        df = pd.read_csv(local_path)
    finally:
        # Remove the staged copy even when parsing fails.
        os.remove(local_path)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the equivalent
    # accessor and also works on older pandas releases.
    return df.values

In [8]:
# Fetch both language matrices, then drop just enough leading rows from each
# that its length becomes an exact multiple of snippet_length.
data_en, data_es = [
    matrix[len(matrix) % snippet_length:]
    for matrix in (get_data(name, bucket) for name in ('english.csv', 'spanish.csv'))
]

In [35]:
# Row index separating train from test: three quarters of the rows,
# rounded down to a whole number of snippets.
dist_en = (len(data_en) * 3 // 4) // snippet_length * snippet_length
dist_es = (len(data_es) * 3 // 4) // snippet_length * snippet_length
print(dist_en)
print(dist_es)

3704175
1320225


In [37]:
# Everything before the split point is training data, the remainder is test data.
train_en, test_en = data_en[:dist_en], data_en[dist_en:]
train_es, test_es = data_es[:dist_es], data_es[dist_es:]

# Sanity check: each printed remainder should be 0 (whole snippets only).
for part in (train_en, train_es, test_en, test_es):
    print(len(part) % snippet_length)

0
0
0
0


In [38]:
train_dir = '/tmp/data/train'
test_dir = '/tmp/data/test'

# to_csv does not create missing directories, so make sure both exist.
for directory in (train_dir, test_dir):
    os.makedirs(directory, exist_ok=True)

pd.DataFrame(train_en).to_csv(train_dir + '/english.csv', index=False)
pd.DataFrame(train_es).to_csv(train_dir + '/spanish.csv', index=False)
# The test directory must receive the held-out split; writing train_* here
# would make the test set a copy of the training set.
pd.DataFrame(test_en).to_csv(test_dir + '/english.csv', index=False)
pd.DataFrame(test_es).to_csv(test_dir + '/spanish.csv', index=False)

In [39]:
# S3 key prefixes under which the local train/test directories are uploaded.
train_key, test_key = 'data/train', 'data/test'

# upload_data returns a location for each upload; keep both for later cells.
train_data_location = sess.upload_data(path=train_dir, bucket=bucket, key_prefix=train_key)
test_data_location = sess.upload_data(path=test_dir, bucket=bucket, key_prefix=test_key)

In [16]:
# hyperparameters: n_clusters = 30, cov_type='full', iter = 100, snippet_length=75, languages=2

# hyper_params = {'n_clusters' : [50,50]}

In [24]:
# Build the ECR image URI from the caller's account id and the session region.
account = sess.boto_session.client('sts').get_caller_identity()['Account']
region = sess.boto_session.region_name
image = '{}.dkr.ecr.{}.amazonaws.com/dnn-image-1:latest'.format(account, region)

# Defined here so the cell runs on a fresh kernel: the only other definition of
# hyper_params in the notebook is commented out.
# NOTE(review): value taken from that commented-out line; confirm it is the intended baseline.
hyper_params = {'n_clusters': [50, 50]}

gmm_model = sage.estimator.Estimator(image,
                       role,
                       train_instance_count=1,
                       train_instance_type='ml.c4.4xlarge',
                       train_volume_size=50,
                       output_path="s3://{}/output".format(bucket),
                       sagemaker_session=sess,
                       hyperparameters=hyper_params)

# `data_location` is never defined in this notebook; train on the uploaded
# training split, whose S3 location is held in train_data_location.
gmm_model.fit(train_data_location)

In [40]:
# Hyperparameter sweep: every pairing of 30/40/50/60 for the two 'n_clusters'
# entries, i.e. 16 distinct combinations. The final entry used to repeat
# [40, 40], which left [40, 30] out of the grid and trained one model twice.
variants = [{'n_clusters': [60, 60]},
            {'n_clusters': [50, 60]},
            {'n_clusters': [40, 60]},
            {'n_clusters': [30, 60]},
            {'n_clusters': [50, 50]},
            {'n_clusters': [40, 50]},
            {'n_clusters': [30, 50]},
            {'n_clusters': [40, 40]},
            {'n_clusters': [30, 40]},
            {'n_clusters': [30, 30]},
            {'n_clusters': [60, 50]},
            {'n_clusters': [60, 40]},
            {'n_clusters': [60, 30]},
            {'n_clusters': [50, 40]},
            {'n_clusters': [50, 30]},
            {'n_clusters': [40, 30]}]

In [None]:
# Launch one training job per hyperparameter variant.
for count, variant in enumerate(variants, 1):
    gmm_model = sage.estimator.Estimator(image,
                       role,
                       train_instance_count=1,
                       train_instance_type='ml.c4.4xlarge',
                       train_volume_size=50,
                       output_path="s3://{}/output".format(bucket),
                       sagemaker_session=sess,
                       hyperparameters=variant)
    # Jobs are submitted in groups of four: the first three calls return
    # immediately, and every fourth call blocks until that job finishes.
    # `data_location` is never defined in this notebook; the uploaded training
    # split (train_data_location) is the input.
    gmm_model.fit(train_data_location, wait=(count % 4 == 0))

INFO:sagemaker:Creating training-job with name: gmm-image-1-2018-09-22-02-35-21-818
INFO:sagemaker:Creating training-job with name: gmm-image-1-2018-09-22-02-35-22-056
INFO:sagemaker:Creating training-job with name: gmm-image-1-2018-09-22-02-35-22-938
INFO:sagemaker:Creating training-job with name: gmm-image-1-2018-09-22-02-35-31-732


......................
  data_x.append(pd.read_csv(os.path.join(training_path, language)).as_matrix())[0m
[31m[SUCCESS] data parsed[0m
[31m<class 'list'>[0m
[31m[30, 60][0m
[31m[SUCCESS] model created[0m
[31mshape before: (3704175, 39)[0m
[31mshape after: (49389, 75, 39)[0m
