In [1]:
import pandas as pd
import boto3
import sagemaker

In [2]:
# Getting SageMaker role & S3 bucket
# session and role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# create an S3 bucket
bucket = sagemaker_session.default_bucket()

In [3]:
# Name of directory including features data
data_dir = 'plagiarism_data'

# setting prefix to be used for uploading data to S3
prefix = 'data-plagiarism'

# upload all data to S3
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)
print(input_data)

s3://sagemaker-eu-central-1-930426367865/data-plagiarism


In [4]:
# confirm that data is in S3 bucket
empty_check = []
for obj in boto3.resource('s3').Bucket(bucket).objects.all():
    empty_check.append(obj.key)
    print(obj.key)

assert len(empty_check) !=0, 'S3 bucket is empty.'
print('Test passed!')

data-plagiarism/test.csv
data-plagiarism/train.csv
Test passed!


In [6]:
# Create estimator instance
from sagemaker.pytorch import PyTorch

output_path = 's3://{}/{}'.format(bucket, prefix)

# instantiate a pytorch estimator
estimator = PyTorch(entry_point='train.py',
                    source_dir='source_pytorch', # this should be just "source" for your code
                    role=role,
                    framework_version='1.0',
                    train_instance_count=1,
                    train_instance_type='ml.p2.xlarge',
                    output_path=output_path,
                    sagemaker_session=sagemaker_session,
                    hyperparameters={
                        'input_dim': 3,  # num of features
                        'hidden_dim': 50,
                        'output_dim': 1,
                        'epochs': 200 # could change to higher
                    })

In [8]:
%%time

# Train estimator on S3 training data
estimator.fit({'train': input_data})

'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


2020-08-23 15:52:11 Starting - Starting the training job...
2020-08-23 15:52:13 Starting - Launching requested ML instances......
2020-08-23 15:53:15 Starting - Preparing the instances for training............
2020-08-23 15:55:41 Downloading - Downloading input data
2020-08-23 15:55:41 Training - Downloading the training image......
2020-08-23 15:56:37 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-08-23 15:56:39,295 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-08-23 15:56:39,320 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-08-23 15:56:42,331 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2020-08-23 15:56:42,561 sagemaker-containers INFO     Module train does not provide a setup.

In [9]:
%%time

from sagemaker.pytorch import PyTorchModel

model = PyTorchModel(model_data=estimator.model_data,
                     role = role,
                     framework_version='1.0',
                     entry_point='predict.py',
                     source_dir='source_pytorch')

# deploy model to create a predictor
predictor = model.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


-------------!CPU times: user 349 ms, sys: 21.5 ms, total: 371 ms
Wall time: 6min 31s


In [10]:
import os

# read in test data
test_data = pd.read_csv(os.path.join(data_dir, "test.csv"), header=None, names=None)

# labels are in the first column
test_y = test_data.iloc[:,0]
test_x = test_data.iloc[:,1:]

In [11]:
print(test_x)

           1         2         3
0   1.000000  0.922280  0.820755
1   0.765306  0.589655  0.621711
2   0.884444  0.180995  0.597458
3   0.619048  0.043243  0.427835
4   0.920000  0.394366  0.775000
5   0.992674  0.973978  0.993056
6   0.412698  0.000000  0.346667
7   0.462687  0.000000  0.189320
8   0.581152  0.000000  0.247423
9   0.584211  0.000000  0.294416
10  0.566372  0.000000  0.258333
11  0.481481  0.022901  0.278912
12  0.619792  0.026596  0.341584
13  0.921739  0.654867  0.929412
14  1.000000  0.922481  1.000000
15  0.861538  0.062827  0.504717
16  0.626168  0.223975  0.558559
17  1.000000  0.968872  0.996700
18  0.383838  0.010309  0.178744
19  1.000000  0.944649  0.854671
20  0.613924  0.000000  0.298343
21  0.972763  0.830040  0.927083
22  0.962810  0.689076  0.909804
23  0.415254  0.000000  0.177419
24  0.532189  0.017467  0.245833


In [12]:
# Generate predicted, class labels
import numpy as np

test_y_preds = np.squeeze(np.round(predictor.predict(test_x)))

tp = np.logical_and(test_y, test_y_preds).sum()
fp = np.logical_and(1-test_y, test_y_preds).sum()
tn = np.logical_and(1-test_y, 1-test_y_preds).sum()
fn = np.logical_and(test_y, 1-test_y_preds).sum()

recall = tp / (tp + fn)
precision = tp / (tp + fp)
accuracy = (tp + tn) / (tp + fp + tn + fn)

print(pd.crosstab(test_y, test_y_preds, rownames=['actuals'], colnames=['predictions']))
print("\n{:<11} {:.3f}".format('Recall:', recall))
print("{:<11} {:.3f}".format('Precision:', precision))
print("{:<11} {:.3f}".format('Accuracy:', accuracy))

# test that model generates the correct number of labels
assert len(test_y_preds)==len(test_y), 'Unexpected number of predictions.'
print('Test passed!')

predictions  0.0  1.0
actuals              
0             10    0
1              0   15

Recall:     1.000
Precision:  1.000
Accuracy:   1.000
Test passed!


In [13]:
# Calculate the test accuracy
accuracy = (tp + tn) / (tp + fp + tn + fn)

print(accuracy)

## print out the array of predicted and true labels
print('\nPredicted class labels: ')
print(test_y_preds)
print('\nTrue class labels: ')
print(test_y.values)

1.0

Predicted class labels: 
[1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 0.
 0.]

True class labels: 
[1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 0]


In [14]:
# Delete endpoint in AWS SageMaker service
predictor.delete_endpoint()

In [15]:
# deleting bucket

bucket_to_delete = boto3.resource('s3').Bucket(bucket)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': '7C2F599B46CDD7C7',
   'HostId': 'h1TLqh+T82bXx/13MXnhFZf9ikOZukRPKvJBFS5quq86FPGdYa+0HeMRoCbDDjR2l2CMuQEsxtk=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'h1TLqh+T82bXx/13MXnhFZf9ikOZukRPKvJBFS5quq86FPGdYa+0HeMRoCbDDjR2l2CMuQEsxtk=',
    'x-amz-request-id': '7C2F599B46CDD7C7',
    'date': 'Sun, 23 Aug 2020 16:07:32 GMT',
    'connection': 'close',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'data-plagiarism/test.csv'},
   {'Key': 'data-plagiarism/train.csv'},
   {'Key': 'data-plagiarism/sagemaker-pytorch-2020-08-23-15-52-11-550/debug-output/training_job_end.ts'},
   {'Key': 'sagemaker-pytorch-2020-08-23-15-52-11-550/source/sourcedir.tar.gz'},
   {'Key': 'data-plagiarism/sagemaker-pytorch-2020-08-23-15-52-11-550/output/model.tar.gz'},
   {'Key': 'sagemaker-pytorch-2020-08-23-15-57-26-786/sourcedir.tar.gz'}]}]