# Capstone Project - Linear Regression Modelling

_Author: Yifei Tong_

---
## Goal

The goal of this notebook is to perform modelling using SageMaker linear regression on the preprocessed financial news data. 

## Steps

1. Upload data
2. Train linear learner on processed data
3. Deploy models
4. Evaluation

In [1]:
import sagemaker

# SageMaker session plus the execution role that is later handed to each estimator.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# Default S3 bucket of this session and the initial key prefix for the project.
bucket = sagemaker_session.default_bucket()
prefix = 'financial-news-dataset'

## Step 1: Format Training Data

In [2]:
import pandas as pd
import os
import numpy as np
# Directory (relative to this notebook) containing the *_train.csv files read by format_training_data.
data_dir = '../data'

In [6]:
def format_training_data(training_data_file, data_directory=None):
    """Load a training CSV and split it into a feature matrix and a label vector.

    The first column of the file is used as the label and every column from the
    third onward as a feature; the second column is not used.

    Args:
        training_data_file: File name of the CSV, relative to the data directory.
        data_directory: Directory containing the file. Defaults to the
            notebook-level ``data_dir`` when omitted.

    Returns:
        Tuple ``(x, label)`` of float32 NumPy arrays. ``x`` is two-dimensional
        with one row per CSV row and one column per CSV column from the third
        onward; ``label`` is one-dimensional with one entry per CSV row.
    """
    if data_directory is None:
        data_directory = data_dir
    df = pd.read_csv(os.path.join(data_directory, training_data_file))

    # Slice columns positionally instead of looping over rows in Python; this
    # also keeps x two-dimensional when the file has a header but no data rows.
    label = df.iloc[:, 0].values.astype('float32')
    x = df.iloc[:, 2:].values.astype('float32')

    return x, label

In [7]:
# Load every company's training split with the same helper: ticker -> (features, labels).
train_sets = {
    ticker: format_training_data('{}_train.csv'.format(ticker))
    for ticker in ('google', 'amazon', 'fb', 'msft')
}

google_X, google_y = train_sets['google']
amazon_X, amazon_y = train_sets['amazon']
fb_X, fb_y = train_sets['fb']
msft_X, msft_y = train_sets['msft']

## Step 2: Train Linear Learner on Processed Data

In [8]:
from sagemaker import LinearLearner
from time import gmtime, strftime
# S3 key prefix used in each estimator's output_path; this overrides the
# 'financial-news-dataset' value assigned in the first code cell.
prefix = 'capstone-project-output'

In [9]:
# Timestamped name used as the last path component of this model's S3 output location.
model_name = "google-linear-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
google_output_path = 's3://{}/{}/{}'.format(sagemaker_session.default_bucket(), prefix, model_name)

# Linear Learner configured as a squared-loss regressor for the Google data set.
google_linear = LinearLearner(
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.xlarge',
    predictor_type='regressor',
    loss='squared_loss',
    output_path=google_output_path,
    sagemaker_session=sagemaker_session,
    epochs=15,
)

In [10]:
# Package the feature matrix and labels as the record set that fit() consumes
# (per the SageMaker SDK docs, record_set serialises the arrays and uploads them to S3).
google_record = google_linear.record_set(google_X, google_y)

In [11]:
# Launch the training job on the uploaded records; the job log is streamed below.
google_linear.fit(google_record)

2020-08-05 01:25:13 Starting - Starting the training job...
2020-08-05 01:25:15 Starting - Launching requested ML instances......
2020-08-05 01:26:18 Starting - Preparing the instances for training......
2020-08-05 01:27:23 Downloading - Downloading input data...
2020-08-05 01:28:12 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[08/05/2020 01:28:15 INFO 140432478881600] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_


2020-08-05 01:28:29 Uploading - Uploading generated training model
2020-08-05 01:28:29 Completed - Training job completed
Training seconds: 66
Billable seconds: 66


In [12]:
# Host the trained Google model on a single-instance endpoint; it is removed
# again by the delete_endpoint() call further down.
google_predictor = google_linear.deploy(initial_instance_count=1, instance_type = 'ml.m4.xlarge')

---------------!

In [13]:
from sklearn.metrics import mean_absolute_error

# Directory holding the *_test.csv feature files and *_test_y.csv target files.
test_dir = '../test_data'

def predict(predictor, data, rows=512):
    """Send ``data`` to an endpoint in batches and return the flattened results.

    Args:
        predictor: Object exposing ``predict(batch)``; called once per batch.
        data: Array whose first axis indexes the samples to score.
        rows: Maximum number of samples sent in a single ``predict`` call.

    Returns:
        One-dimensional NumPy array with the results of every batch, in input
        order. An empty array is returned when ``data`` has no rows.

    Raises:
        ValueError: If ``rows`` is not positive.
    """
    if rows <= 0:
        raise ValueError("rows must be a positive number")

    num_rows = data.shape[0]
    if num_rows == 0:
        # Nothing to score: do not invoke the endpoint with an empty payload.
        return np.array([])

    # Ceiling division: the fewest batches whose size never exceeds `rows`.
    num_batches = int(np.ceil(num_rows / float(rows)))

    # Start from an empty float array and flatten each response, exactly as
    # np.append would, but concatenate once instead of recopying per batch.
    pieces = [np.array([])]
    for batch in np.array_split(data, num_batches):
        pieces.append(np.ravel(predictor.predict(batch)))

    return np.concatenate(pieces)

In [14]:
# Held-out feature file for Google.
google_test_file_name = 'google_test.csv'
google_test_file_path = os.path.join(test_dir, google_test_file_name)
google_test_X = pd.read_csv(google_test_file_path)

# NOTE(review): the recorded run of this cell failed inside predict() with
# ModelError 400 "unable to evaluate payload provided" -- presumably the payload
# built from google_test.csv does not match what the model was trained on
# (column count, missing values); confirm against the endpoint's CloudWatch log.
google_features = google_test_X.values.astype('float32')
google_records = predict(google_predictor, google_features)
google_test_y = pd.read_csv(os.path.join(test_dir, 'google_test_y.csv'))

# Pull the numeric prediction out of each returned record, then score with MAE.
google_predictions = np.array([record.label['score'].float32_tensor.values[0] for record in google_records])
google_mae = mean_absolute_error(google_test_y.values, google_predictions)
google_mae

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from model with message "unable to evaluate payload provided". See https://us-east-2.console.aws.amazon.com/cloudwatch/home?region=us-east-2#logEventViewer:group=/aws/sagemaker/Endpoints/linear-learner-2020-08-05-01-25-13-147 in account 989457217313 for more information.

In [93]:
# Tear down the Google endpoint once it is no longer needed.
google_predictor.delete_endpoint()

In [94]:
# Timestamped name used as the last path component of this model's S3 output location.
amazon_model_name = "amazon-linear-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Linear Learner configured as a squared-loss regressor for the Amazon data set.
# output_path must use amazon_model_name: the previous code passed model_name,
# which is the Google model's name, so Amazon artifacts were filed under the
# Google prefix (and the cell failed if the Google cell had not been run).
amazon_linear = LinearLearner(role=role,
                       train_instance_count=1, 
                       train_instance_type='ml.c4.xlarge',
                       predictor_type='regressor',
                       loss='squared_loss',
                       output_path='s3://{}/{}/{}'.format(sagemaker_session.default_bucket(), prefix, amazon_model_name),
                       sagemaker_session=sagemaker_session,
                       epochs=15)

# Package the Amazon features and labels as the record set that fit() consumes.
amazon_record = amazon_linear.record_set(amazon_X, amazon_y)

# Launch the training job; the job log is streamed below.
amazon_linear.fit(amazon_record)

2020-08-04 04:37:39 Starting - Starting the training job...
2020-08-04 04:37:42 Starting - Launching requested ML instances......
2020-08-04 04:38:45 Starting - Preparing the instances for training.........
2020-08-04 04:40:36 Downloading - Downloading input data
2020-08-04 04:40:36 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[08/04/2020 04:40:54 INFO 140076018857792] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_step': u'auto', u'init_me


2020-08-04 04:41:08 Uploading - Uploading generated training model
2020-08-04 04:41:08 Completed - Training job completed
[34m[2020-08-04 04:40:56.352] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 16, "duration": 126, "num_examples": 5, "num_bytes": 8238780}[0m
[34m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.9167494049072266, "sum": 0.9167494049072266, "min": 0.9167494049072266}}, "EndTime": 1596516056.352836, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 6}, "StartTime": 1596516056.352742}
[0m
[34m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.9621297149658203, "sum": 0.9621297149658203, "min": 0.9621297149658203}}, "EndTime": 1596516056.352925, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 6}, "StartTime": 1596516056.352907}
[0m
[34m#metrics {"Metrics": {"train_mse_obj

Training seconds: 49
Billable seconds: 49


In [95]:
# Host the trained Amazon model on a single-instance endpoint; it is removed
# again by the delete_endpoint() call after evaluation.
amazon_predictor = amazon_linear.deploy(initial_instance_count=1, instance_type = 'ml.m4.xlarge')

---------------!

In [96]:
# Held-out feature file for Amazon.
amazon_test_file_name = 'amazon_test.csv'
amazon_test_file_path = os.path.join(test_dir, amazon_test_file_name)
amazon_test_X = pd.read_csv(amazon_test_file_path)

# Score the test features through the endpoint, then load the matching targets.
amazon_features = amazon_test_X.values.astype('float32')
amazon_records = predict(amazon_predictor, amazon_features)
amazon_test_y = pd.read_csv(os.path.join(test_dir, 'amazon_test_y.csv'))

# Pull the numeric prediction out of each returned record, then score with MAE.
amazon_predictions = np.array([record.label['score'].float32_tensor.values[0] for record in amazon_records])
amazon_mae = mean_absolute_error(amazon_test_y.values, amazon_predictions)
amazon_mae

1.4025801052085767

In [97]:
# Tear down the Amazon endpoint now that its predictions have been scored.
amazon_predictor.delete_endpoint()

In [None]:
# Timestamped name used as the last path component of this model's S3 output location.
fb_model_name = "fb-linear-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Linear Learner configured as a squared-loss regressor for the Facebook data set.
# output_path must use fb_model_name: the previous code passed model_name,
# which is the Google model's name, so Facebook artifacts were filed under the
# Google prefix (and the cell failed if the Google cell had not been run).
fb_linear = LinearLearner(role=role,
                       train_instance_count=1, 
                       train_instance_type='ml.c4.xlarge',
                       predictor_type='regressor',
                       loss='squared_loss',
                       output_path='s3://{}/{}/{}'.format(sagemaker_session.default_bucket(), prefix, fb_model_name),
                       sagemaker_session=sagemaker_session,
                       epochs=15)

# Package the Facebook features and labels as the record set that fit() consumes.
fb_record = fb_linear.record_set(fb_X, fb_y)

# Launch the training job; the job log is streamed below.
fb_linear.fit(fb_record)

2020-08-04 16:40:59 Starting - Starting the training job...
2020-08-04 16:41:02 Starting - Launching requested ML instances......
2020-08-04 16:42:05 Starting - Preparing the instances for training......
2020-08-04 16:43:23 Downloading - Downloading input data...
2020-08-04 16:43:48 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[08/04/2020 16:44:06 INFO 140289191446336] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_step': u'auto', u'init_me


2020-08-04 16:44:19 Uploading - Uploading generated training model
2020-08-04 16:44:19 Completed - Training job completed
Training seconds: 56
Billable seconds: 56


In [None]:
# Host the trained Facebook model on a single-instance endpoint; it is removed
# again by the delete_endpoint() call after evaluation.
fb_predictor = fb_linear.deploy(initial_instance_count=1, instance_type = 'ml.m4.xlarge')

---------------!

In [100]:
# Held-out feature file for Facebook.
fb_test_file_name = 'fb_test.csv'
fb_test_file_path = os.path.join(test_dir, fb_test_file_name)
fb_test_X = pd.read_csv(fb_test_file_path)

# Score the test features through the endpoint, then load the matching targets.
fb_features = fb_test_X.values.astype('float32')
fb_records = predict(fb_predictor, fb_features)
fb_test_y = pd.read_csv(os.path.join(test_dir, 'fb_test_y.csv'))

# Pull the numeric prediction out of each returned record, then score with MAE.
fb_predictions = np.array([record.label['score'].float32_tensor.values[0] for record in fb_records])
fb_mae = mean_absolute_error(fb_test_y.values, fb_predictions)
fb_mae

1.364877235731505

In [101]:
# Tear down the Facebook endpoint now that its predictions have been scored.
fb_predictor.delete_endpoint()

In [None]:
# Timestamped name used as the last path component of this model's S3 output location.
msft_model_name = "msft-linear-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Linear Learner configured as a squared-loss regressor for the Microsoft data set.
# output_path must use msft_model_name: the previous code passed model_name,
# which is the Google model's name, so Microsoft artifacts were filed under the
# Google prefix (and the cell failed if the Google cell had not been run).
msft_linear = LinearLearner(role=role,
                       train_instance_count=1, 
                       train_instance_type='ml.c4.xlarge',
                       predictor_type='regressor',
                       loss='squared_loss',
                       output_path='s3://{}/{}/{}'.format(sagemaker_session.default_bucket(), prefix, msft_model_name),
                       sagemaker_session=sagemaker_session,
                       epochs=15)

# Package the Microsoft features and labels as the record set that fit() consumes.
msft_record = msft_linear.record_set(msft_X, msft_y)

# Launch the training job; the job log is streamed below.
msft_linear.fit(msft_record)

2020-08-04 17:26:14 Starting - Starting the training job...
2020-08-04 17:26:17 Starting - Launching requested ML instances......
2020-08-04 17:27:23 Starting - Preparing the instances for training......
2020-08-04 17:28:21 Downloading - Downloading input data...
2020-08-04 17:29:15 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[08/04/2020 17:29:18 INFO 139753394149184] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_

[34m[08/04/2020 17:29:19 INFO 139753394149184] Saved checkpoint to "/tmp/tmpItT_T5/mx-mod-0000.params"[0m
[34m[08/04/2020 17:29:19 INFO 139753394149184] #progress_metric: host=algo-1, completed 26 % of epochs[0m
[34m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 2, "sum": 2.0, "min": 2}, "Number of Batches Since Last Reset": {"count": 1, "max": 2, "sum": 2.0, "min": 2}, "Number of Records Since Last Reset": {"count": 1, "max": 1529, "sum": 1529.0, "min": 1529}, "Total Batches Seen": {"count": 1, "max": 11, "sum": 11.0, "min": 11}, "Total Records Seen": {"count": 1, "max": 8645, "sum": 8645.0, "min": 8645}, "Max Records Seen Between Resets": {"count": 1, "max": 1529, "sum": 1529.0, "min": 1529}, "Reset Count": {"count": 1, "max": 6, "sum": 6.0, "min": 6}}, "EndTime": 1596562159.506896, "Dimensions": {"Host": "algo-1", "Meta": "training_data_iter", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 3}, "StartTime": 1596562159.456631}
[0m



2020-08-04 17:29:28 Uploading - Uploading generated training model
2020-08-04 17:29:28 Completed - Training job completed
Training seconds: 67
Billable seconds: 67


In [103]:
# Host the trained Microsoft model on a single-instance endpoint; it is removed
# again by the delete_endpoint() call after evaluation.
msft_predictor = msft_linear.deploy(initial_instance_count=1, instance_type = 'ml.m4.xlarge')

---------------!

In [104]:
# Held-out feature file for Microsoft.
msft_test_file_name = 'msft_test.csv'
msft_test_file_path = os.path.join(test_dir, msft_test_file_name)
msft_test_X = pd.read_csv(msft_test_file_path)

# Score the test features through the endpoint, then load the matching targets.
msft_features = msft_test_X.values.astype('float32')
msft_records = predict(msft_predictor, msft_features)
msft_test_y = pd.read_csv(os.path.join(test_dir, 'msft_test_y.csv'))

# Pull the numeric prediction out of each returned record, then score with MAE.
msft_predictions = np.array([record.label['score'].float32_tensor.values[0] for record in msft_records])
msft_mae = mean_absolute_error(msft_test_y.values, msft_predictions)
msft_mae

1.4628215101033162

In [105]:
# Tear down the Microsoft endpoint now that its predictions have been scored.
msft_predictor.delete_endpoint()