# Notebook for training a Costume Mixed LSTM Deep Network

###### Model Specifically trains a costume LSTM model
see model_params for reference

In [25]:

# Define IAM role
import boto3
import re
import os
import numpy as np
import pandas as pd
import importlib
from sagemaker import get_execution_role
import sagemaker as sage
from time import gmtime, strftime
from sagemaker.pytorch import PyTorch
import time

role         = get_execution_role()
sess         = sage.Session()
bucket       = 'oosv-multilingual-bucket'
TOTAL_FRAMES = 150


# this is where to find training and testing date and their respective channels
# when the instance launches, it will create a folder 
# /opt/ml/input/data/{channel}/ where all files in the buckets below are will be copied over
# TODO: move debug data to a debug bucket, currently this is downloading all data in folders which is bad
train_args = {
    'training' : f's3://{bucket}/data/train',
    'validation'  : f's3://{bucket}/data/test'
}
debug_args = {
    'training' : f's3://{bucket}/data/debug/train',
    'validation'  : f's3://{bucket}/data/debug/test'
}

In [27]:
# Debugging single instance
model_params_debug={ 
    'n_features'      : 39,
    'n_hidden'        : 512, 
    'languages'       : 2,
    'frames'          : TOTAL_FRAMES,
    'dropout'         : 0,
    'lstm_layers'     : 1, 
    'linear_layers'   : 1,
    'bidirectional'   : True,
    'lr'              : 0.001,
    'batch-size'      : 100,
    'epoch'           : 5,
    'backend'         : 'gloo',
    'test-batch-size' : 1000
}
estimator_debug = PyTorch(entry_point='train.py',
                    role=role,
                    train_instance_count=1,
                    train_instance_type='ml.p3.8xlarge',
                    train_volume_size = 70,
                    source_dir='deep_learning',
                    output_path= f's3://{bucket}/output',
                    framework_version=0.4,
                    base_job_name="sage-maker-debug-mixedlstm",
                    hyperparameters=model_params_debug)

estimator_debug.fit(debug_args, wait = True)

INFO:sagemaker:Creating training-job with name: sage-maker-debug-mixedlstm-2018-10-22-21-31-40-743


2018-10-22 21:31:41 Starting - Starting the training job...
2018-10-22 21:31:45 Starting - Launching requested ML instances.........
2018-10-22 21:33:22 Starting - Preparing the instances for training......
2018-10-22 21:34:41 Downloading - Downloading input data
2018-10-22 21:34:41 Training - Downloading the training image...
2018-10-22 21:35:07 Training - Training image download completed. Training in progress..
[31mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[31mbash: no job control in this shell[0m
[31m2018-10-22 21:35:08,527 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[31m2018-10-22 21:35:08,604 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[31m2018-10-22 21:35:08,812 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[31m2018-10-22 21:35:09,096 sagemaker-containers INFO     Module train does not provide a setup.py. 


2018-10-22 21:36:09 Uploading - Uploading generated training model
2018-10-22 21:36:09 Failed - Training job failed
[31mNCCL version 2.1.15+cuda9.0[0m
[31mException during training: cuda runtime error (59) : device-side assert triggered at /pytorch/aten/src/THC/generic/THCTensorMath.cu:26[0m
[31mTraceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/train.py", line 460, in <module>
    train(parser.parse_args())
  File "/usr/local/lib/python3.5/dist-packages/train.py", line 250, in train
    loss.backward()
  File "/usr/local/lib/python3.5/dist-packages/torch/tensor.py", line 93, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/usr/local/lib/python3.5/dist-packages/torch/autograd/__init__.py", line 89, in backward
    allow_unreachable=True)  # allow_unreachable flag[0m
[31mRuntimeError: cuda runtime error (59) : device-side assert triggered at /pytorch/aten/src/THC/generic/THCTensorMath.cu:26
[0m
[31m2018

ValueError: Error training sage-maker-debug-mixedlstm-2018-10-22-21-31-40-743: Failed Reason: AlgorithmError: ExecuteUserScriptError:
Command "/usr/bin/python -m train --backend gloo --batch-size 100 --bidirectional True --dropout 0 --epoch 5 --frames 150 --languages 2 --linear_layers 1 --lr 0.001 --lstm_layers 1 --n_features 39 --n_hidden 512 --test-batch-size 1000"
Get train data loader
Get test data loader
/usr/local/lib/python3.5/dist-packages/models/lstm.py:132: UserWarning: RNN module weights are not part of single contiguous chunk of memory. This means they need to be compacted at every call, possibly greatly increasing memory usage. To compact weights again call flatten_parameters().
  self.hidden))
/pytorch/aten/src/THCUNN/ClassNLLCriterion.cu:105: void cunn_ClassNLLCriterion_updateOutput_kernel(Dtype *, Dtype *, Dtype *, long *, Dtype *, int, int, int, int, long) [with Dtype = float, Acctype = float]: block: [0,0,0], thread: [8,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/THCUNN/ClassNLLCriterion.cu:105: void cunn_ClassNLLCriterion_updateOutput_kernel(Dtype *, Dtype *, 

In [28]:
# 512 hidden dimensions, lr = 0.001, single lstm single direction, single linear layer
# Reason: Understanding simple LSTM performance
model_params_1={ 
    'n_features'      : 39,
    'n_hidden'        : 512, 
    'languages'       : 3,
    'frames'          : TOTAL_FRAMES,
    'dropout'         : 0,  # Needs 2 LSTM Layers for it to be non-zero
    'lstm_layers'     : 1,  
    'linear_layers'   : 1,  # Minimum of 1
    'bidirectional'   : False,
    'lr'              : 0.0001,
    'batch-size'      : 100,
    'epoch'           : 20,
    'backend'         : 'gloo',
    'test-batch-size' : 1000
}

# Google: 2560 hidden dimensions, 4 linear,  lr = 0.0001, single lstm single direction. Same as google Paper using fusion
# Reason: Seeing how a paper ML Algorithm performs, although orignal document used 3000 hours of data
model_params_2={ 
    'n_features'      : 39,
    'n_hidden'        : 2560, 
    'languages'       : 3,
    'frames'          : TOTAL_FRAMES,
    'dropout'         : 0,
    'lstm_layers'     : 1, 
    'linear_layers'   : 4,
    'bidirectional'   : False,
    'lr'              : 0.0001,
    'batch-size'      : 1000,
    'epoch'           : 10,
    'backend'         : 'gloo',
    'test-batch-size' : 1000
}

# 512 hidden dimensions, 1 linear,  lr = 0.001, double lstm single direction, dropout = 0.5
# Reason: the point of this model is to see how dropout affects performance
model_params_3={ 
    'n_features'      : 39,
    'n_hidden'        : 512, 
    'languages'       : 3,
    'frames'          : TOTAL_FRAMES,
    'dropout'         : 0.5,
    'lstm_layers'      : 2, 
    'linear_layers'   : 1,
    'bidirectional'   : False,
    'lr'              : 0.001,
    'batch-size'      : 100,
    'epoch'           : 20,
    'backend'         : 'gloo',
    'test-batch-size' : 1000
}

# 512 hidden dimensions, 1 linear,  lr = 0.001, single BiLSTM, dropout = 0.0
# Reason: the point of this model is to see how BiLSTM affects performance
model_params_4={ 
    'n_features'      : 39,
    'n_hidden'        : 512, 
    'languages'       : 3,
    'frames'          : TOTAL_FRAMES,
    'dropout'         : 0,
    'lstm_layers'     : 1, 
    'linear_layers'   : 1,
    'bidirectional'   : True,
    'lr'              : 0.001,
    'batch-size'      : 100,
    'epoch'           : 20,
    'backend'         : 'gloo',
    'test-batch-size' : 1000
}

In [29]:
# Build an estimtor for each type of parameter dictionary
#
# Entry_point represents a .py file that will run as script/executable
#
# role=sagemaker.get_execution_role()
#
# train_instance_type and train_intance_count are self explantory
#
# Volume size is how much non-ram memory is to be used
#
# source_dir = 'deep_learning', representing which directory
#    from the working directory of this notebook to copy over
#
# output_path = which bucket and folder to save the output model and any other meta data to
#    output path will save a {training_job_name}/output/model.tar.gz folder, model.tar.gz will have model and metadata
#
# hyperparameters = represents any way to possibly 
#    modify the training job externally, these are past in as arguments

estimator1 = PyTorch(entry_point = 'train.py',
                    role = role,
                    train_instance_count = 1,
                    train_instance_type = 'ml.p3.16xlarge',
                    train_volume_size = 70,
                    source_dir = 'deep_learning',
                    output_path = f's3://{bucket}/MixedLSTM',
                    framework_version=0.4,
                    base_job_name="MixedLSTM",
                    hyperparameters = model_params_1)

# Google based model
estimator2 = PyTorch(entry_point = 'train.py',
                    role = role,
                    train_instance_count = 12,
                    train_instance_type = 'ml.p3.16xlarge',
                    train_volume_size = 70,
                    source_dir = 'deep_learning',
                    output_path = f's3://{bucket}/MixedLSTM',
                    framework_version=0.4,
                    base_job_name="MixedLSTM",
                    hyperparameters = model_params_2)

estimator3 = PyTorch(entry_point = 'train.py',
                    role = role,
                    train_instance_count = 1,
                    train_instance_type = 'ml.p3.16xlarge',
                    train_volume_size = 70,
                    source_dir = 'deep_learning',
                    output_path = f's3://{bucket}/MixedLSTM',
                    framework_version=0.4,
                    base_job_name="MixedLSTM",
                    hyperparameters = model_params_3)

estimator4 = PyTorch(entry_point = 'train.py',
                    role = role,
                    train_instance_count = 1,
                    train_instance_type = 'ml.p3.16xlarge',
                    train_volume_size = 70,
                    source_dir = 'deep_learning',
                    output_path = f's3://{bucket}/MixedLSTM',
                    framework_version=0.4,
                    base_job_name="MixedLSTM",
                    hyperparameters = model_params_4)

In [30]:
estimator1.fit(train_args, wait = True)

# estimator3.fit(train_args, wait = False)

# estimator4.fit(train_args, wait = False)

# estimator2.fit(train_args, wait = False)

INFO:sagemaker:Creating training-job with name: MixedLSTM-2018-10-22-21-51-30-545


2018-10-22 21:51:31 Starting - Starting the training job...
2018-10-22 21:51:32 Starting - Launching requested ML instances...............
2018-10-22 21:54:04 Starting - Preparing the instances for training......
2018-10-22 21:55:27 Downloading - Downloading input data............
2018-10-22 21:57:33 Training - Training image download completed. Training in progress..
[31mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[31mbash: no job control in this shell[0m
[31m2018-10-22 21:57:34,248 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[31m2018-10-22 21:57:34,324 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[31m2018-10-22 21:57:34,533 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[31m2018-10-22 21:57:34,884 sagemaker-containers INFO     Module train does not provide a setup.py. [0m
[31mGenerating setup.py[0m
[31m2018-10-2

[31mNCCL version 2.1.15+cuda9.0[0m
[31m2018-10-22 22:09:16,835 sagemaker-containers INFO     Reporting training SUCCESS[0m

2018-10-22 22:09:24 Uploading - Uploading generated training model
2018-10-22 22:09:24 Completed - Training job completed
Billable seconds: 837


In [19]:
# 512 Hidden dimensions, 1 linear layer, lr = 0.001, double layer BiLSTM, dropout=0.5, 5 epochs, 
# Reason: After testing the model with BiLSTM's and Dropout, both resulted on useful gains
# in accuracy, additionally, BiLSTM showed reduced compute time relative to 2 layers LSTM.
# This however, should balance out, depending on performance, lr will be changed
# this model is on hold until the best learning rate is found
model_params_5={ 
    'n_features'      : 39,
    'n_hidden'        : 512, 
    'languages'       : 2,
    'frames'          : TOTAL_FRAMES,
    'dropout'         : 0,
    'lstm_layers'     : 1, 
    'linear_layers'   : 1,
    'bidirectional'   : False,
    'lr'              : 0.0001,
    'batch-size'      : 100,
    'epoch'           : 20,
    'backend'         : 'gloo',
    'test-batch-size' : 1000
}
# Pure reduced learning rate to .0001 learning rate
model_params_6={ 
    'n_features'      : 39,
    'n_hidden'        : 512, 
    'languages'       : 2,
    'frames'          : TOTAL_FRAMES,
    'dropout'         : 0.5,
    'lstm_layers'     : 2, 
    'linear_layers'   : 1,
    'bidirectional'   : False,
    'lr'              : 0.0001,
    'batch-size'      : 100,
    'epoch'           : 20,
    'backend'         : 'gloo',
    'test-batch-size' : 1000
}

# BiLSTM + Dropout, increased number of nodes as BiLSTM will reduce them
model_params_7={ 
    'n_features'      : 39,
    'n_hidden'        : 512, 
    'languages'       : 2,
    'frames'          : TOTAL_FRAMES,
    'dropout'         : 0,
    'lstm_layers'     : 1, 
    'linear_layers'   : 1,
    'bidirectional'   : True,
    'lr'              : 0.0001,
    'batch-size'      : 100,
    'epoch'           : 20,
    'backend'         : 'gloo',
    'test-batch-size' : 1000
}
estimator5 = PyTorch(entry_point='train_mixedlstm.py',
                    role=role,
                    train_instance_count=1,
                    train_instance_type='ml.p3.16xlarge',
                    train_volume_size = 70,
                    source_dir='deep_learning',
                    output_path= f's3://{bucket}/output',
                    framework_version=0.4,
                    hyperparameters=model_params_5)

estimator6 = PyTorch(entry_point='train_mixedlstm.py',
                    role=role,
                    train_instance_count=1,
                    train_instance_type='ml.p3.16xlarge',
                    train_volume_size = 70,
                    source_dir='deep_learning',
                    output_path= f's3://{bucket}/output',
                    framework_version=0.4,
                    hyperparameters=model_params_6)

estimator7 = PyTorch(entry_point='train_mixedlstm.py',
                    role=role,
                    train_instance_count=1,
                    train_instance_type='ml.p3.16xlarge',
                    train_volume_size = 70,
                    source_dir='deep_learning',
                    output_path= f's3://{bucket}/output',
                    framework_version=0.4,
                    hyperparameters=model_params_7)
estimator5.fit(train_args, wait = False)
estimator6.fit(train_args, wait = False)
estimator7.fit(train_args, wait = False)
# TODO: Add a sixth option called "use inception, and that may be 
# what I need, but if Inception then I may have to change
# data to be 3 dimensional... maybe"

INFO:sagemaker:Creating training-job with name: sagemaker-pytorch-2018-10-18-23-50-20-680
INFO:sagemaker:Creating training-job with name: sagemaker-pytorch-2018-10-18-23-50-21-641
INFO:sagemaker:Creating training-job with name: sagemaker-pytorch-2018-10-18-23-50-22-933


In [233]:
# just a function to get data from s3 buckets
def get_data(file_name, bucket, _dir='train'):
    prefix = '/tmp/data/'
    path = f'data/{_dir}/'
    s3 = boto3.resource('s3')
    s3.Bucket(bucket).download_file(path + file_name, prefix + file_name)
    arr = np.load(prefix + file_name)
    os.remove(prefix + file_name)
    return arr

In [83]:
# Run this to check if the data being fed to train_mixedlstm.py is correctly shaped
# Shape should be n_samples x TOTAL_FRAMES (150) x n_features (39)
# there are files saved the same as below but with _sm.npy for doing job completion tests
# before running them with gigabytes worth of data
train_x = get_data('train_x.npy', bucket)
train_y = get_data('train_y.npy', bucket)
test_x  = get_data('test_x.npy', bucket, 'test')
test_y  = get_data('test_y.npy', bucket, 'test')
print(np.shape(train_x))
print(np.shape(train_y))
print(np.shape(test_x))
print(np.shape(test_y))