In [1]:
import sagemaker
import boto3

# AWS region taken from the default boto3 session configuration.
region = boto3.Session().region_name

# Low-level SageMaker API client (used further down to poll transform jobs).
sm = boto3.Session().client('sagemaker', region_name=region)

# SageMaker SDK session and the execution role this notebook runs under.
session = sagemaker.Session()
role = sagemaker.get_execution_role()

# S3 location shared by the inputs and outputs of this notebook.
bucket = session.default_bucket()
prefix = 'sagemaker-studio-book/chapter08/winequality'

In [51]:
import io
import json
from time import gmtime, strftime, sleep
from urllib.parse import urlparse

import numpy as np
import pandas as pd
from sklearn.metrics import (classification_report, confusion_matrix,
                             f1_score, precision_score)
# import matplotlib.pyplot as plt
# %matplotlib inline

In [3]:
from sagemaker import AutoML
# Attach to the Autopilot job by name so its candidates can be queried below.
AUTOML_JOB_NAME = 'white-wine-predict-quality'
automl = AutoML.attach(auto_ml_job_name=AUTOML_JOB_NAME)

In [5]:
# How many of the best-scoring candidates to carry into batch inference.
TOP_N_CANDIDATES = 3

# Ask the AutoML job for its candidates, highest objective metric first.
candidates = automl.list_candidates(
    sort_by='FinalObjectiveMetricValue',
    sort_order='Descending',
    max_results=TOP_N_CANDIDATES,
)

# Summarise each selected candidate: its name and the objective it achieved.
for candidate in candidates:
    objective = candidate['FinalAutoMLJobObjectiveMetric']
    print("Candidate name: ", candidate['CandidateName'])
    print("Objective metric name: ", objective['MetricName'])
    print("Objective metric value: ", objective['Value'])
    print('\n')

Candidate name:  white-wine-predict-qualitysZ1CBE-003-1a47413b
Objective metric name:  validation:f1
Objective metric value:  0.4073199927806854


Candidate name:  white-wine-predict-qualitysZ1CBE-093-c37343d7
Objective metric name:  validation:f1
Objective metric value:  0.4049299955368042


Candidate name:  white-wine-predict-qualitysZ1CBE-082-6e1e3463
Objective metric name:  validation:f1
Objective metric value:  0.4048300087451935




In [53]:
# Name of the target column (the values the candidates try to predict).
target_attribute_name = 'quality'

# Local hold-out test set; it still contains the target column, which is
# used later as the ground truth when scoring predictions.
test_data = pd.read_csv('winequality-white-test.csv')

# S3 object used as batch-inference input: the test set WITHOUT the target
# column. NOTE(review): the visible cells neither create nor upload this
# file, so it is assumed to already exist at `test_file` -- confirm.
test_file_basename = 'winequality-white-test-notarget.csv'
test_file = f's3://{bucket}/{prefix}/{test_file_basename}'

In [7]:
# Response fields requested from each candidate model; this list is passed
# to automl.create_model(...) as `inference_response_keys`. Only the
# predicted label is requested here.
inference_response_keys = ['predicted_label']

In [9]:
# Root S3 prefix under which every candidate gets its own results folder.
s3_transform_output_path = 's3://{}/{}/inference-results/'.format(bucket, prefix)

# One batch transformer per candidate, kept in the same order as `candidates`.
transformers = []

for candidate in candidates:
    candidate_name = candidate['CandidateName']

    # Build a model from the candidate, restricted to the requested response keys.
    model = automl.create_model(
        name=candidate_name,
        candidate=candidate,
        inference_response_keys=inference_response_keys,
    )

    # Each candidate writes to <root>/<candidate name>/ so outputs never collide.
    output_path = s3_transform_output_path + candidate_name + '/'

    candidate_transformer = model.transformer(
        instance_count=1,
        instance_type='ml.m5.xlarge',
        assemble_with='Line',
        output_path=output_path,
    )
    transformers.append(candidate_transformer)

print("Setting up {} Batch Transform Jobs in `transformers`".format(len(transformers)))

Setting up 3 Batch Transform Jobs in `transformers`


In [11]:
# Launch every batch transform job without blocking (wait=False), so the
# jobs are all submitted before any of them is polled for completion.
for transformer in transformers:
    transformer.transform(
        data=test_file,
        content_type='text/csv',
        split_type='Line',
        wait=False,
    )
    job_name = transformer._current_job_name
    print("Starting transform job {}".format(job_name))

Starting transform job white-wine-predict-qualitysZ1CBE-003-1a-2021-06-29-00-22-07-845
Starting transform job white-wine-predict-qualitysZ1CBE-093-c3-2021-06-29-00-22-08-294
Starting transform job white-wine-predict-qualitysZ1CBE-082-6e-2021-06-29-00-22-10-787


In [12]:
# Statuses after which a transform job will not change state any more.
# 'Stopped' has to be listed: a job that was stopped never reaches
# 'Completed' or 'Failed', so leaving it out would make this loop run forever.
TERMINAL_JOB_STATUSES = ('Completed', 'Failed', 'Stopped')

# Seconds to wait between two rounds of status polling.
POLL_INTERVAL_SECONDS = 30

# Poll all transform jobs until none of them is still running.
while True:
    job_statuses = [
        sm.describe_transform_job(TransformJobName=t._current_job_name)['TransformJobStatus']
        for t in transformers
    ]
    num_transform_jobs = sum(status not in TERMINAL_JOB_STATUSES for status in job_statuses)
    print("{} out of {} transform jobs are running.".format(num_transform_jobs, len(transformers)))
    if num_transform_jobs == 0:
        # Everything has finished: leave immediately instead of sleeping once more.
        break
    sleep(POLL_INTERVAL_SECONDS)

# Report the final status of every job, plus the failure reason when available.
for transformer in transformers:
    desc = sm.describe_transform_job(TransformJobName=transformer._current_job_name)
    print("Transform job '{}' finished with status {}".format(transformer._current_job_name, desc['TransformJobStatus']))
    if desc['TransformJobStatus'] != 'Completed' and desc.get('FailureReason'):
        print("  Failure reason: {}".format(desc['FailureReason']))

3 out of 3 transform jobs are running.
3 out of 3 transform jobs are running.
3 out of 3 transform jobs are running.
3 out of 3 transform jobs are running.
3 out of 3 transform jobs are running.
3 out of 3 transform jobs are running.
3 out of 3 transform jobs are running.
3 out of 3 transform jobs are running.
3 out of 3 transform jobs are running.
3 out of 3 transform jobs are running.
2 out of 3 transform jobs are running.
0 out of 3 transform jobs are running.
Transform job 'white-wine-predict-qualitysZ1CBE-003-1a-2021-06-29-00-22-07-845' finished with status Completed
Transform job 'white-wine-predict-qualitysZ1CBE-093-c3-2021-06-29-00-22-08-294' finished with status Completed
Transform job 'white-wine-predict-qualitysZ1CBE-082-6e-2021-06-29-00-22-10-787' finished with status Completed


In [15]:
def get_csv_from_s3(s3uri, file_name):
    """Download one object from S3 and return its contents as text.

    Args:
        s3uri: S3 URI of the "folder" that holds the object, for example
            ``s3://bucket/some/prefix/``. Leading and trailing slashes of
            the prefix are ignored; a bare ``s3://bucket`` is accepted too.
        file_name: Name of the object below that prefix.

    Returns:
        The object's body decoded as UTF-8.

    Errors raised by boto3 (for example when the object does not exist)
    are not caught here and propagate to the caller.
    """
    parsed_url = urlparse(s3uri)
    bucket_name = parsed_url.netloc
    key_prefix = parsed_url.path.strip('/')
    # Only join with '/' when there is a prefix; otherwise the key would
    # start with a spurious '/' and point at a different (missing) object.
    object_key = '{}/{}'.format(key_prefix, file_name) if key_prefix else file_name
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket_name, object_key)
    return obj.get()["Body"].read().decode('utf-8')

# Batch transform names each output object after its input object plus '.out'.
transform_output_name = '{}.out'.format(test_file_basename)

# One DataFrame of predictions per transformer, in the order of `transformers`.
predictions = []

for transformer in transformers:
    print(transformer.output_path)
    pred_csv = get_csv_from_s3(transformer.output_path, transform_output_name)
    # The downloaded text is parsed without a header row (header=None).
    predictions.append(pd.read_csv(io.StringIO(pred_csv), header=None))

s3://sagemaker-us-west-2-552106442228/sagemaker-studio-book/chapter08/winequality/inference-results/white-wine-predict-qualitysZ1CBE-003-1a47413b/
s3://sagemaker-us-west-2-552106442228/sagemaker-studio-book/chapter08/winequality/inference-results/white-wine-predict-qualitysZ1CBE-093-c37343d7/
s3://sagemaker-us-west-2-552106442228/sagemaker-studio-book/chapter08/winequality/inference-results/white-wine-predict-qualitysZ1CBE-082-6e1e3463/


In [20]:
# Ground-truth values: the target column of the local test set, used as
# y_true when the candidates' predictions are scored.
labels = test_data[target_attribute_name]

In [46]:
# Score each candidate's batch predictions against the ground-truth labels.
# `predictions` was filled in the order of `transformers`, which were built
# in the order of `candidates`, so zipping pairs every candidate with its
# own output. Needs f1_score, precision_score, classification_report and
# confusion_matrix from sklearn.metrics.
for prediction, candidate in zip(predictions, candidates):
    print("Candidate name: ", candidate['CandidateName'])
    print("Objective metric name: ", candidate['FinalAutoMLJobObjectiveMetric']['MetricName'])
    print("Objective metric value: ", candidate['FinalAutoMLJobObjectiveMetric']['Value'])

    # The prediction frame was read with header=None; take its first column
    # explicitly so the metrics receive a 1-D series rather than a DataFrame.
    predicted_labels = prediction.iloc[:, 0]

    # F1 and precision under both averaging schemes: 'macro' weights every
    # class equally, 'weighted' weights each class by its support.
    scores = {}
    for avg in ['macro', 'weighted']:
        scores[avg] = [f1_score(labels, predicted_labels, average=avg),
                       precision_score(labels, predicted_labels, average=avg)]
        print('f1 = %.2f, Precision = %.2f (%s)' % (scores[avg][0], scores[avg][1], avg))

    # Per-class breakdown and the confusion matrix (rows: true, columns: predicted).
    print(classification_report(labels, predicted_labels))
    print(confusion_matrix(labels, predicted_labels))
    print()

Candidate name:  white-wine-predict-qualitysZ1CBE-003-1a47413b
Objective metric name:  validation:f1
Objective metric value:  0.4073199927806854
f1 = 0.51, Precision = 0.59 (macro)
f1 = 0.67, Precision = 0.68 (weighted)
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.70      0.39      0.50        18
           5       0.63      0.67      0.65       144
           6       0.67      0.77      0.72       215
           7       0.76      0.57      0.65        94
           8       0.78      0.44      0.56        16

    accuracy                           0.67       490
   macro avg       0.59      0.47      0.51       490
weighted avg       0.68      0.67      0.67       490

[[  0   0   3   0   0   0]
 [  0   7   8   3   0   0]
 [  0   2  96  45   1   0]
 [  0   1  37 166  10   1]
 [  0   0   8  31  54   1]
 [  0   0   0   3   6   7]]

Candidate name:  white-wine-predict-qualitysZ1CBE-093-c37343d7
Objective 