# Capstone Project - LSTM Modelling

_Author: Yifei Tong_

---
## Goal

The goal of this notebook is to train and evaluate PyTorch LSTM models on the preprocessed financial news data. 

## Steps

1. Upload data
2. Define LSTM models and write the LSTM training code
3. Train LSTM on processed data
4. Deploy models
5. Evaluation

## Step 1: Upload data

In [1]:
import sagemaker

# Session object through which this notebook uploads data and talks to SageMaker.
sagemaker_session = sagemaker.Session()

# Default S3 bucket of the session, and the key prefix the input data is uploaded under.
bucket = sagemaker_session.default_bucket()
prefix = 'financial-news-dataset'

# Execution role of this notebook; passed as `role=` to every estimator below.
role = sagemaker.get_execution_role()

In [2]:
import os

# Local directory whose contents are uploaded (presumably the preprocessed
# train/valid CSVs and word dictionaries named in the estimators below -- TODO confirm).
data_dir = '../data'

# Upload data_dir to the bucket under `prefix`; the returned value is a str
# that is later handed to the tuners as the 'training' channel.
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)

## Step 2: Define LSTM models and write the LSTM training code

_See model.py and train.py in the ./train directory_

In [3]:
# Print ./train/model.py with syntax highlighting (shell command run via IPython `!`).
!pygmentize ./train/model.py

[34mimport[39;49;00m [04m[36mtorch.nn[39;49;00m [34mas[39;49;00m [04m[36mnn[39;49;00m

[34mclass[39;49;00m [04m[32mLSTMRegressor[39;49;00m(nn.Module):
    [33m"""[39;49;00m
[33m    LSTM network that we use to perform regression on financial news data[39;49;00m
[33m    """[39;49;00m
    
    [34mdef[39;49;00m [32m__init__[39;49;00m([36mself[39;49;00m, embedding_dim, hidden_dim, vocab_size, num_layers=[34m1[39;49;00m):
        [33m"""[39;49;00m
[33m        Model initialization: initializing layers[39;49;00m
[33m        """[39;49;00m
        [36msuper[39;49;00m(LSTMRegressor, [36mself[39;49;00m).[32m__init__[39;49;00m()
        
        [36mself[39;49;00m.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = [34m0[39;49;00m)
        [36mself[39;49;00m.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=num_layers)
        [36mself[39;49;00m.linear = nn.Linear(in_features = hidden_dim, out_f

In [4]:
# Print ./train/train.py with syntax highlighting (shell command run via IPython `!`).
!pygmentize ./train/train.py

[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mjson[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mpickle[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m
[34mimport[39;49;00m [04m[36msagemaker_containers[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m
[34mimport[39;49;00m [04m[36mtorch[39;49;00m
[34mimport[39;49;00m [04m[36mtorch.optim[39;49;00m [34mas[39;49;00m [04m[36moptim[39;49;00m
[34mimport[39;49;00m [04m[36mtorch.utils.data[39;49;00m
[34mimport[39;49;00m [04m[36mtorch.nn[39;49;00m [34mas[39;49;00m [04m[36mnn[39;49;00m

[34mfrom[39;49;00m [04m[36mmodel[39;49;00m [34mimport[39;49;00m LSTMRegressor

[34mdef[39;49;00m [32mmodel_fn[39;49;00m(model_dir):
    [33m"""[39;49;00m
[33m    Load PyTorch model from model_dir[39;49;00m
[33m    """[39;49;00m
    
    [34mprint

## Step 3: Create and train a PyTorch LSTM regression estimator

In [5]:
from sagemaker.pytorch import PyTorch
from time import gmtime, strftime

# NOTE: this rebinds `prefix`. Up to here it was the S3 key prefix of the uploaded
# input data; from here on it is the key prefix used in every estimator's output_path.
# Re-running the upload cell after this one would therefore upload to a different prefix.
prefix = 'capstone-project-output'

In [10]:
# Timestamped (UTC) run name; it becomes the last component of the S3 output path.
model_name = "google-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Base estimator for the Google model: runs train/train.py on a single
# ml.p2.xlarge instance with the PyTorch 1.4.0 framework.
google_estimator = PyTorch(
    entry_point='train.py',
    source_dir='train',
    role=role,
    framework_version='1.4.0',
    train_instance_count=1,
    train_instance_type='ml.p2.xlarge',
    output_path='s3://{}/{}/{}'.format(sagemaker_session.default_bucket(), prefix, model_name),
    # Fixed hyperparameters; 'epochs' is also listed in the tuner's search ranges.
    hyperparameters={
        'epochs': 10,
        'hidden_dim': 200,
        'training_data_file': 'google_train.csv',
        'valid_data_file': 'google_valid.csv',
        'word_dict_file': 'google_dict.pickle',
    },
)

In [11]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

# How the tuner extracts the objective metric from the training job's log output.
# The pattern is a raw string so that `\S` is passed to the regex engine verbatim
# instead of being parsed as an (invalid) Python string escape.
metric_definitions = [{'Name': 'validation:loss',
                       'Regex': r'Validation Loss: (\S+)'}]

# Hyperparameter search for the Google model: 20 training jobs, run one at a time,
# minimising the validation loss captured by `metric_definitions`.
# NOTE(review): model.py's constructor argument is `num_layers`; confirm that
# train.py's argument parser really accepts `num_layer` as spelled here.
google_hyperparameter_tuner = HyperparameterTuner(estimator = google_estimator, # The estimator object to use as the basis for the training jobs.
                                               objective_metric_name = 'validation:loss', # The metric used to compare trained models.
                                               metric_definitions = metric_definitions,
                                               objective_type = 'Minimize', # Whether we wish to minimize or maximize the metric.
                                               max_jobs = 20, # The total number of models to train
                                               max_parallel_jobs = 1, # The number of models to train in parallel
                                               hyperparameter_ranges = {
                                                    'epochs': IntegerParameter(10, 20),
                                                    'num_layer': IntegerParameter(1, 4),
                                                    'learning_rate': ContinuousParameter(0.001, 0.01)
                                               })

In [12]:
# Start the Google tuning job, passing the uploaded S3 data as the 'training' channel.
google_hyperparameter_tuner.fit({'training': input_data})

In [None]:
# Block until the tuning job started above has finished (progress dots are printed below).
google_hyperparameter_tuner.wait()

.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [14]:
# Estimator attached to the tuner's best training job (objective: minimise validation:loss).
google_best_estimator = google_hyperparameter_tuner.best_estimator()

2020-08-01 23:06:06 Starting - Preparing the instances for training
2020-08-01 23:06:06 Downloading - Downloading input data
2020-08-01 23:06:06 Training - Training image download completed. Training in progress.
2020-08-01 23:06:06 Uploading - Uploading generated training model
2020-08-01 23:06:06 Completed - Training job completed[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-08-01 23:02:28,594 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-08-01 23:02:28,596 sagemaker-containers INFO     Failed to parse hyperparameter _tuning_objective_metric value validation:loss to Json.[0m
[34mReturning the value itself[0m
[34m2020-08-01 23:02:28,617 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-08-01 23:02:34,853 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m


In [15]:
# Deploy the best Google model to an endpoint backed by one ml.m4.xlarge instance.
# The endpoint is deleted again once the evaluation below is finished.
google_predictor = google_best_estimator.deploy(initial_instance_count=1, instance_type = 'ml.m4.xlarge')

---------------!

In [16]:
import pandas as pd
import numpy as np

In [17]:
# Load the Google test features. `test_dir` is reused by every later evaluation cell.
test_dir = '../test_data'
google_test_file_name = 'google_test.csv'
google_test_file_path = os.path.join(test_dir, google_test_file_name)

google_test_X = pd.read_csv(google_test_file_path)

In [18]:
# Peek at the underlying NumPy array -- this is what gets sent to the endpoint below.
google_test_X.values

array([[ 500,  557,  564, ...,   32,   24,  299],
       [ 500,    9,  254, ..., 1054, 3125,  400],
       [ 216, 1838,   63, ...,    0,    0,    0],
       ...,
       [ 500,  173, 2526, ...,   96,  441,  645],
       [ 311, 2235,  282, ...,    0,    0,    0],
       [ 500,  250,   12, ...,   24,  534,   64]])

In [19]:
def predict(predictor, data, rows=512):
    """Send `data` to an endpoint in batches and return all predictions as one flat array.

    Args:
        predictor: Object exposing ``predict(batch)``; in this notebook, the
            predictor returned by ``estimator.deploy(...)``.
        data: Array whose first axis indexes the samples (anything
            ``np.array_split`` accepts that also has a ``.shape``).
        rows: Maximum number of samples sent per request.

    Returns:
        1-D ``np.ndarray`` holding the flattened predictions of every batch, in
        input order. An empty input yields an empty array without contacting
        the endpoint.
    """
    n_samples = data.shape[0]
    if n_samples == 0:
        # Nothing to score; avoid sending an empty payload to the endpoint.
        return np.array([])

    # Integer ceiling division: the fewest batches such that none exceeds `rows`.
    n_batches = -(-n_samples // rows)
    batches = np.array_split(data, n_batches)

    # Collect per-batch outputs and concatenate once, instead of re-copying the
    # accumulated result on every iteration.
    outputs = [np.asarray(predictor.predict(batch)).ravel() for batch in batches]

    # The leading empty float array keeps the dtype promotion the same as
    # starting from `np.array([])`.
    return np.concatenate([np.array([])] + outputs)

In [20]:
# Score the Google test set against the deployed endpoint, in batches.
google_predictions = predict(google_predictor, google_test_X.values)

In [21]:
# Turn the flat prediction vector into a single-column 2-D array and display it.
google_predictions = google_predictions.reshape(-1,1)
google_predictions

array([[ 4.27331984e-01],
       [ 8.33433926e-01],
       [ 4.87758994e-01],
       [ 1.04991424e+00],
       [ 2.99579024e-01],
       [ 6.49342000e-01],
       [ 4.86716151e-01],
       [ 6.93444371e-01],
       [ 8.05019259e-01],
       [ 7.59111226e-01],
       [ 2.62272090e-01],
       [ 6.61397040e-01],
       [ 6.94407225e-01],
       [ 8.83531034e-01],
       [ 3.16438496e-01],
       [ 6.52886331e-01],
       [-1.96229070e-01],
       [ 7.55518138e-01],
       [ 2.75855035e-01],
       [ 4.07718346e-02],
       [ 1.74774796e-01],
       [ 5.54941714e-01],
       [ 4.12691891e-01],
       [ 2.40472630e-02],
       [-1.36006653e-01],
       [ 1.26368630e+00],
       [ 2.66611159e-01],
       [ 1.38837898e+00],
       [ 2.60843411e-02],
       [ 1.96046829e-01],
       [ 6.99554622e-01],
       [ 6.50359333e-01],
       [ 2.15290219e-01],
       [ 5.76554298e-01],
       [ 7.51254261e-01],
       [ 6.72825217e-01],
       [ 8.36382747e-01],
       [ 2.78035671e-01],
       [ 5.4

In [22]:
# Load the true targets for the Google test set and display them.
google_test_y = pd.read_csv(os.path.join(test_dir, 'google_test_y.csv'))
google_test_y

Unnamed: 0,0
0,1.080505
1,-1.533489
2,1.103292
3,-0.283037
4,-0.591338
5,1.414253
6,-2.010966
7,1.827237
8,1.344002
9,-0.478736


In [23]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error between the true targets and the endpoint's predictions.
google_mae = mean_absolute_error(google_test_y.values, google_predictions)

In [24]:
# Display the Google test-set MAE.
google_mae

1.3030695022355296

Since the y values have been scaled up by 100 to represent percentage change, an MAE of about 1.30 means that our predictions of Google's stock price change are off by a little more than 1 percentage point on average. This is not ideal for stock market prediction.

In [31]:
# Tear down the Google endpoint now that its evaluation is finished.
google_predictor.delete_endpoint()

In [29]:
# Timestamped (UTC) run name; it becomes the last component of the S3 output path.
amazon_model_name = "amazon-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Base estimator for the Amazon model: runs train/train.py on a single
# ml.p2.xlarge instance with the PyTorch 1.4.0 framework.
amazon_estimator = PyTorch(entry_point='train.py',
                          source_dir='train',
                          role=role,
                          framework_version='1.4.0',
                          train_instance_count=1,
                          train_instance_type='ml.p2.xlarge',
                          # Write artifacts under this run's own folder (amazon_model_name),
                          # not under the Google run's `model_name`.
                          output_path='s3://{}/{}/{}'.format(sagemaker_session.default_bucket(), prefix, amazon_model_name),
                          hyperparameters={
                              'epochs': 10,
                              'hidden_dim': 200,
                              'training_data_file': 'amazon_train.csv',
                              'valid_data_file': 'amazon_valid.csv',
                              'word_dict_file': 'amazon_dict.pickle'
                          })

In [32]:
# Hyperparameter search for the Amazon model, configured like the Google one:
# 20 training jobs, run one at a time, minimising 'validation:loss'.
# NOTE(review): model.py's constructor argument is `num_layers`; confirm that
# train.py's argument parser really accepts `num_layer` as spelled here.
amazon_hyperparameter_tuner = HyperparameterTuner(
    estimator=amazon_estimator,                 # base estimator for every training job
    objective_metric_name='validation:loss',    # metric used to rank the trained models
    metric_definitions=metric_definitions,
    objective_type='Minimize',                  # lower validation loss is better
    max_jobs=20,                                # total number of models to train
    max_parallel_jobs=1,                        # number of models trained concurrently
    hyperparameter_ranges={
        'epochs': IntegerParameter(10, 20),
        'num_layer': IntegerParameter(1, 4),
        'learning_rate': ContinuousParameter(0.001, 0.01),
    },
)

In [33]:
# Start the Amazon tuning job, then block until it has finished. Without the
# wait, the following best_estimator() cell would run against a tuning job that
# is still in progress when the notebook is executed top to bottom.
amazon_hyperparameter_tuner.fit({'training': input_data})
amazon_hyperparameter_tuner.wait()

In [36]:
# Estimator attached to the tuner's best training job (objective: minimise validation:loss).
amazon_best_estimator = amazon_hyperparameter_tuner.best_estimator()

2020-08-02 05:45:51 Starting - Preparing the instances for training
2020-08-02 05:45:51 Downloading - Downloading input data
2020-08-02 05:45:51 Training - Training image download completed. Training in progress.
2020-08-02 05:45:51 Uploading - Uploading generated training model
2020-08-02 05:45:51 Completed - Training job completed[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-08-02 05:40:42,555 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-08-02 05:40:42,557 sagemaker-containers INFO     Failed to parse hyperparameter _tuning_objective_metric value validation:loss to Json.[0m
[34mReturning the value itself[0m
[34m2020-08-02 05:40:42,582 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-08-02 05:40:43,196 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m


In [37]:
# Deploy the best Amazon model to an endpoint backed by one ml.m4.xlarge instance.
amazon_predictor = amazon_best_estimator.deploy(initial_instance_count=1, instance_type = 'ml.m4.xlarge')

---------------!

In [38]:
# Load the Amazon test features from the shared test directory.
amazon_test_file_name = 'amazon_test.csv'
amazon_test_file_path = os.path.join(test_dir, amazon_test_file_name)

amazon_test_X = pd.read_csv(amazon_test_file_path)

In [39]:
# Score the Amazon test set against the deployed endpoint, in batches.
amazon_predictions = predict(amazon_predictor, amazon_test_X.values)

In [40]:
# Load the true targets for the Amazon test set.
amazon_test_y = pd.read_csv(os.path.join(test_dir, 'amazon_test_y.csv'))

In [41]:
# Mean absolute error on the Amazon test set. Unlike the Google flow, the flat
# prediction vector is passed without an explicit reshape(-1, 1).
amazon_mae = mean_absolute_error(amazon_test_y.values, amazon_predictions)
amazon_mae

1.3594194957064325

In [42]:
# Tear down the Amazon endpoint now that its evaluation is finished.
amazon_predictor.delete_endpoint()

In [43]:
# Timestamped (UTC) run name; it becomes the last component of the S3 output path.
fb_model_name = "fb-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Base estimator for the Facebook model: runs train/train.py on a single
# ml.p2.xlarge instance with the PyTorch 1.4.0 framework.
fb_estimator = PyTorch(entry_point='train.py',
                          source_dir='train',
                          role=role,
                          framework_version='1.4.0',
                          train_instance_count=1,
                          train_instance_type='ml.p2.xlarge',
                          # Write artifacts under this run's own folder (fb_model_name),
                          # not under the Google run's `model_name`.
                          output_path='s3://{}/{}/{}'.format(sagemaker_session.default_bucket(), prefix, fb_model_name),
                          hyperparameters={
                              'epochs': 10,
                              'hidden_dim': 200,
                              'training_data_file': 'fb_train.csv',
                              'valid_data_file': 'fb_valid.csv',
                              'word_dict_file': 'fb_dict.pickle'
                          })

In [44]:
# Hyperparameter search for the Facebook model, configured like the Google one:
# 20 training jobs, run one at a time, minimising 'validation:loss'.
# NOTE(review): model.py's constructor argument is `num_layers`; confirm that
# train.py's argument parser really accepts `num_layer` as spelled here.
fb_hyperparameter_tuner = HyperparameterTuner(
    estimator=fb_estimator,                     # base estimator for every training job
    objective_metric_name='validation:loss',    # metric used to rank the trained models
    metric_definitions=metric_definitions,
    objective_type='Minimize',                  # lower validation loss is better
    max_jobs=20,                                # total number of models to train
    max_parallel_jobs=1,                        # number of models trained concurrently
    hyperparameter_ranges={
        'epochs': IntegerParameter(10, 20),
        'num_layer': IntegerParameter(1, 4),
        'learning_rate': ContinuousParameter(0.001, 0.01),
    },
)

In [45]:
# Start the Facebook tuning job, then block until it has finished. Without the
# wait, the following best_estimator() cell would run against a tuning job that
# is still in progress when the notebook is executed top to bottom.
fb_hyperparameter_tuner.fit({'training': input_data})
fb_hyperparameter_tuner.wait()

In [46]:
# Estimator attached to the tuner's best training job (objective: minimise validation:loss).
fb_best_estimator = fb_hyperparameter_tuner.best_estimator()

2020-08-02 21:13:23 Starting - Preparing the instances for training
2020-08-02 21:13:23 Downloading - Downloading input data
2020-08-02 21:13:23 Training - Training image download completed. Training in progress.
2020-08-02 21:13:23 Uploading - Uploading generated training model
2020-08-02 21:13:23 Completed - Training job completed[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-08-02 21:09:24,780 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-08-02 21:09:24,781 sagemaker-containers INFO     Failed to parse hyperparameter _tuning_objective_metric value validation:loss to Json.[0m
[34mReturning the value itself[0m
[34m2020-08-02 21:09:24,804 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-08-02 21:09:27,828 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m


In [None]:
# Deploy the best Facebook model to an endpoint backed by one ml.m4.xlarge instance.
fb_predictor = fb_best_estimator.deploy(initial_instance_count=1, instance_type = 'ml.m4.xlarge')

------------

In [48]:
# Load the Facebook test features from the shared test directory.
fb_test_file_name = 'fb_test.csv'
fb_test_file_path = os.path.join(test_dir, fb_test_file_name)
fb_test_X = pd.read_csv(fb_test_file_path)

# Score them in batches against the endpoint, load the true targets, and
# report the mean absolute error as the cell output.
fb_predictions = predict(fb_predictor, fb_test_X.values)
fb_test_y = pd.read_csv(os.path.join(test_dir, 'fb_test_y.csv'))
fb_mae = mean_absolute_error(fb_test_y.values, fb_predictions)
fb_mae

1.724564613839111

In [49]:
# Tear down the Facebook endpoint now that its evaluation is finished.
fb_predictor.delete_endpoint()

In [50]:
# Timestamped (UTC) run name; it becomes the last component of the S3 output path.
msft_model_name = "msft-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Base estimator for the Microsoft model: runs train/train.py on a single
# ml.p2.xlarge instance with the PyTorch 1.4.0 framework.
msft_estimator = PyTorch(entry_point='train.py',
                          source_dir='train',
                          role=role,
                          framework_version='1.4.0',
                          train_instance_count=1,
                          train_instance_type='ml.p2.xlarge',
                          # Write artifacts under this run's own folder (msft_model_name),
                          # not under the Google run's `model_name`.
                          output_path='s3://{}/{}/{}'.format(sagemaker_session.default_bucket(), prefix, msft_model_name),
                          hyperparameters={
                              'epochs': 10,
                              'hidden_dim': 200,
                              'training_data_file': 'msft_train.csv',
                              'valid_data_file': 'msft_valid.csv',
                              'word_dict_file': 'msft_dict.pickle'
                          })

In [51]:
# Hyperparameter search for the Microsoft model, configured like the Google one:
# 20 training jobs, run one at a time, minimising 'validation:loss'.
# NOTE(review): model.py's constructor argument is `num_layers`; confirm that
# train.py's argument parser really accepts `num_layer` as spelled here.
msft_hyperparameter_tuner = HyperparameterTuner(
    estimator=msft_estimator,                   # base estimator for every training job
    objective_metric_name='validation:loss',    # metric used to rank the trained models
    metric_definitions=metric_definitions,
    objective_type='Minimize',                  # lower validation loss is better
    max_jobs=20,                                # total number of models to train
    max_parallel_jobs=1,                        # number of models trained concurrently
    hyperparameter_ranges={
        'epochs': IntegerParameter(10, 20),
        'num_layer': IntegerParameter(1, 4),
        'learning_rate': ContinuousParameter(0.001, 0.01),
    },
)

In [52]:
# Start the Microsoft tuning job, then block until it has finished. Without the
# wait, the following best_estimator() cell would run against a tuning job that
# is still in progress when the notebook is executed top to bottom.
msft_hyperparameter_tuner.fit({'training': input_data})
msft_hyperparameter_tuner.wait()

In [53]:
# Estimator attached to the tuner's best training job (objective: minimise validation:loss).
msft_best_estimator = msft_hyperparameter_tuner.best_estimator()

2020-08-03 05:38:39 Starting - Preparing the instances for training
2020-08-03 05:38:39 Downloading - Downloading input data
2020-08-03 05:38:39 Training - Training image download completed. Training in progress.
2020-08-03 05:38:39 Uploading - Uploading generated training model
2020-08-03 05:38:39 Completed - Training job completed[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-08-03 05:37:23,133 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-08-03 05:37:23,135 sagemaker-containers INFO     Failed to parse hyperparameter _tuning_objective_metric value validation:loss to Json.[0m
[34mReturning the value itself[0m
[34m2020-08-03 05:37:23,160 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-08-03 05:37:24,586 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m


In [None]:
# Deploy the best Microsoft model to an endpoint backed by one ml.m4.xlarge instance.
msft_predictor = msft_best_estimator.deploy(initial_instance_count=1, instance_type = 'ml.m4.xlarge')

--------

In [55]:
# Load the Microsoft test features from the shared test directory.
msft_test_file_name = 'msft_test.csv'
msft_test_file_path = os.path.join(test_dir, msft_test_file_name)
msft_test_X = pd.read_csv(msft_test_file_path)

# Score them in batches against the endpoint, load the true targets, and
# report the mean absolute error as the cell output.
msft_predictions = predict(msft_predictor, msft_test_X.values)
msft_test_y = pd.read_csv(os.path.join(test_dir, 'msft_test_y.csv'))
msft_mae = mean_absolute_error(msft_test_y.values, msft_predictions)
msft_mae

1.1214700235561963

In [56]:
# Tear down the Microsoft endpoint now that its evaluation is finished.
msft_predictor.delete_endpoint()

In [58]:
# Leftover sanity check: the value returned by upload_data is a plain str.
type(input_data)

str