In [2]:
import pandas as pd
import numpy as np
import boto3
import sagemaker
import os, sys

print (sagemaker.__version__)

sess   = sagemaker.Session()
bucket = sess.default_bucket()                     
prefix = 'sagemaker/automl-dm'
region = boto3.Session().region_name

# Role when working on a notebook instance
role = "arn:aws:iam::388295382521:role/service-role/AmazonSageMaker-ExecutionRole-20201029T114207"

sm = boto3.Session().client(service_name='sagemaker',region_name=region)
sm_rt = boto3.Session().client('runtime.sagemaker', region_name=region)

2.16.1


In [2]:
data = pd.read_csv('../finalCSVforTraining3.csv', sep=',')
pd.set_option('display.max_columns', 500)     # Make sure we can see all of the columns
pd.set_option('display.max_rows', 50)         # Keep the output on one page
data[:10] # Show the first 10 lines

data.shape # (number of lines, number of columns)

train_data, test_data, _ = np.split(data.sample(frac=1, random_state=123), 
                                                  [int(0.80 * len(data)), int(len(data))])  

# Save to CSV files
train_data.drop('indice',axis=1, inplace=True)
train_data.to_csv('automl-train.csv', index=False, header=True, sep=',') # Need to keep column names

test_data.to_csv('automl-test.csv', index=False, header=True, sep=',')
test_data.set_index('indice', inplace=True)
#test_data.drop(0,inplace=True)
print(test_data[:10])

        number of tokens  number of words in upper case  number of verbs  \
indice                                                                     
2042                1284                             14              156   
3425                 263                              3               27   
423                  364                              2               56   
1638                 102                              0               18   
295                  201                              5               29   
2005                 966                             15              118   
2778                 843                              5               98   
1933                 995                              7              129   
250                   93                              0               15   
1813                 180                              0               23   

        number of subjuntive and imperative verbs  number of nouns  \
indice           

In [3]:
sess.upload_data(path="automl-train.csv", key_prefix=prefix + "/input")

's3://sagemaker-sa-east-1-388295382521/sagemaker/automl-dm/input/automl-train.csv'

In [3]:
job_config = {
    'CompletionCriteria': {
      'MaxRuntimePerTrainingJobInSeconds': 600,
      # 'MaxCandidates': 10,
      'MaxAutoMLJobRuntimeInSeconds': 3600
    },
}

input_data_config = [{
      'DataSource': {
        'S3DataSource': {
          'S3DataType': 'S3Prefix',
          'S3Uri': 's3://{}/{}/input'.format(bucket,prefix)
        }
      },
      'TargetAttributeName': 'classification'  # the column we want to predict
    }
]

output_data_config = { 'S3OutputPath': 's3://{}/{}/output'.format(bucket,prefix) }

# Optional parameters

problem_type = 'BinaryClassification'

job_objective = { 'MetricName': 'F1' }

In [7]:
from time import gmtime, strftime, sleep
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())

auto_ml_job_name = 'automl-dm-' + timestamp_suffix
print('AutoMLJobName: ' + auto_ml_job_name)

sm.create_auto_ml_job(AutoMLJobName=auto_ml_job_name,
                      InputDataConfig=input_data_config,
                      OutputDataConfig=output_data_config,
                      AutoMLJobConfig=job_config,
                      AutoMLJobObjective=job_objective,
                      ProblemType=problem_type,
                      RoleArn=role)

AutoMLJobName: automl-dm-03-02-26-28


{'AutoMLJobArn': 'arn:aws:sagemaker:sa-east-1:388295382521:automl-job/automl-dm-03-02-26-28',
 'ResponseMetadata': {'RequestId': '019f07dd-e6b8-4ed8-91d7-6ef23afea0ea',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '019f07dd-e6b8-4ed8-91d7-6ef23afea0ea',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '92',
   'date': 'Tue, 03 Nov 2020 02:26:28 GMT'},
  'RetryAttempts': 0}}

In [17]:
from time import gmtime, strftime, sleep
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())

In [4]:
auto_ml_job_name = 'automl-dm-03-02-26-28'

In [11]:
%%time
job_run_status = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['AutoMLJobStatus']

print(job_run_status)

while job_run_status not in ('Failed', 'Completed', 'Stopped'):
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response['AutoMLJobStatus']
    
    print (describe_response['AutoMLJobStatus'] + " - " + describe_response['AutoMLJobSecondaryStatus'])
    sleep(60)

InProgress
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
Completed - MaxAutoMLJobRuntimeReached
CPU times: user 225 ms, sys: 55.3 ms, total: 280 ms
Wall time: 25min 3s


In [9]:
job = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
job_candidate_notebook = job['AutoMLJobArtifacts']['CandidateDefinitionNotebookLocation']
job_data_notebook = job['AutoMLJobArtifacts']['DataExplorationNotebookLocation']

print(job_candidate_notebook)
print(job_data_notebook)

s3://sagemaker-sa-east-1-388295382521/sagemaker/automl-dm/output/automl-dm-03-02-26-28/sagemaker-automl-candidates/pr-1-b9d63786add948db918ad2666a9129bbd7321d8f35874070aec63b1c67/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb
s3://sagemaker-sa-east-1-388295382521/sagemaker/automl-dm/output/automl-dm-03-02-26-28/sagemaker-automl-candidates/pr-1-b9d63786add948db918ad2666a9129bbd7321d8f35874070aec63b1c67/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb


In [10]:
%%sh -s $job_candidate_notebook $job_data_notebook
aws s3 cp $1 .
aws s3 cp $2 .

download: s3://sagemaker-sa-east-1-388295382521/sagemaker/automl-dm/output/automl-dm-03-02-26-28/sagemaker-automl-candidates/pr-1-b9d63786add948db918ad2666a9129bbd7321d8f35874070aec63b1c67/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb to ./SageMakerAutopilotCandidateDefinitionNotebook.ipynb
download: s3://sagemaker-sa-east-1-388295382521/sagemaker/automl-dm/output/automl-dm-03-02-26-28/sagemaker-automl-candidates/pr-1-b9d63786add948db918ad2666a9129bbd7321d8f35874070aec63b1c67/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb to ./SageMakerAutopilotDataExplorationNotebook.ipynb


In [12]:
from sagemaker.analytics import ExperimentAnalytics

analytics = ExperimentAnalytics(
    sagemaker_session=sess, 
    experiment_name=auto_ml_job_name+'-aws-auto-ml-job'
)

df = analytics.dataframe()
df

Unnamed: 0,TrialComponentName,DisplayName,SourceArn,SageMaker.ImageUri,SageMaker.InstanceCount,SageMaker.InstanceType,SageMaker.VolumeSizeInGB,_tuning_objective_metric,alpha,colsample_bytree,...,code - MediaType,code - Value,input_channel_mode,job_name,label_col,max_dataset_size,SageMaker.ImageUri - MediaType,SageMaker.ImageUri - Value,ds - MediaType,ds - Value
0,tuning-job-1-7ca4ae7007ee478598-192-cfa0233e-a...,tuning-job-1-7ca4ae7007ee478598-192-cfa0233e-a...,arn:aws:sagemaker:sa-east-1:388295382521:train...,737474898029.dkr.ecr.sa-east-1.amazonaws.com/s...,1.0,ml.m5.4xlarge,50.0,validation:f1,0.000037,0.479084,...,,,,,,,,,,
1,tuning-job-1-7ca4ae7007ee478598-191-edfe3a83-a...,tuning-job-1-7ca4ae7007ee478598-191-edfe3a83-a...,arn:aws:sagemaker:sa-east-1:388295382521:train...,737474898029.dkr.ecr.sa-east-1.amazonaws.com/s...,1.0,ml.m5.4xlarge,50.0,validation:f1,0.837711,0.547698,...,,,,,,,,,,
2,tuning-job-1-7ca4ae7007ee478598-185-fa2a4e9c-a...,tuning-job-1-7ca4ae7007ee478598-185-fa2a4e9c-a...,arn:aws:sagemaker:sa-east-1:388295382521:train...,737474898029.dkr.ecr.sa-east-1.amazonaws.com/s...,1.0,ml.m5.4xlarge,50.0,validation:f1,0.000009,0.387084,...,,,,,,,,,,
3,tuning-job-1-7ca4ae7007ee478598-190-001d8452-a...,tuning-job-1-7ca4ae7007ee478598-190-001d8452-a...,arn:aws:sagemaker:sa-east-1:388295382521:train...,737474898029.dkr.ecr.sa-east-1.amazonaws.com/s...,1.0,ml.m5.4xlarge,50.0,validation:f1,0.239770,0.429856,...,,,,,,,,,,
4,tuning-job-1-7ca4ae7007ee478598-189-431a5583-a...,tuning-job-1-7ca4ae7007ee478598-189-431a5583-a...,arn:aws:sagemaker:sa-east-1:388295382521:train...,737474898029.dkr.ecr.sa-east-1.amazonaws.com/s...,1.0,ml.m5.4xlarge,50.0,validation:f1,0.000135,0.705051,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,automl-dm--dpp0-1-d87b16d1f5004797a81aaf66a511...,automl-dm--dpp0-1-d87b16d1f5004797a81aaf66a511...,arn:aws:sagemaker:sa-east-1:388295382521:train...,737474898029.dkr.ecr.sa-east-1.amazonaws.com/s...,1.0,ml.m5.4xlarge,50.0,,,,...,application/x-code,s3://sagemaker-sa-east-1-388295382521/sagemake...,,,,,,,,
199,automl-dm--dpp1-1-21349515f9e94be3bf7d6952b28c...,automl-dm--dpp1-1-21349515f9e94be3bf7d6952b28c...,arn:aws:sagemaker:sa-east-1:388295382521:train...,737474898029.dkr.ecr.sa-east-1.amazonaws.com/s...,1.0,ml.m5.4xlarge,50.0,,,,...,application/x-code,s3://sagemaker-sa-east-1-388295382521/sagemake...,,,,,,,,
200,automl-dm--dpp4-1-ec8888b84a764309ba6ac9827544...,automl-dm--dpp4-1-ec8888b84a764309ba6ac9827544...,arn:aws:sagemaker:sa-east-1:388295382521:train...,737474898029.dkr.ecr.sa-east-1.amazonaws.com/s...,1.0,ml.m5.4xlarge,50.0,,,,...,application/x-code,s3://sagemaker-sa-east-1-388295382521/sagemake...,,,,,,,,
201,automl-dm--dpp3-1-9a89d1e5ea564ff0aac45047b264...,automl-dm--dpp3-1-9a89d1e5ea564ff0aac45047b264...,arn:aws:sagemaker:sa-east-1:388295382521:train...,737474898029.dkr.ecr.sa-east-1.amazonaws.com/s...,1.0,ml.m5.4xlarge,50.0,,,,...,application/x-code,s3://sagemaker-sa-east-1-388295382521/sagemake...,,,,,,,,


In [13]:
candidates = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name, 
                                                SortBy='FinalObjectiveMetricValue')['Candidates']
index = 1
for candidate in candidates:
  print (str(index) + "  " 
         + candidate['CandidateName'] + "  " 
         + str(candidate['FinalAutoMLJobObjectiveMetric']['Value']))
  index += 1

1  tuning-job-1-7ca4ae7007ee478598-072-0158faa6  0.9694499969482422
2  tuning-job-1-7ca4ae7007ee478598-180-3cf1f6d0  0.967710018157959
3  tuning-job-1-7ca4ae7007ee478598-051-0a34ad38  0.967710018157959
4  tuning-job-1-7ca4ae7007ee478598-063-fba3a84f  0.9659600257873535
5  tuning-job-1-7ca4ae7007ee478598-183-eadebadd  0.9650899767875671
6  tuning-job-1-7ca4ae7007ee478598-171-ee1dbe23  0.9650899767875671
7  tuning-job-1-7ca4ae7007ee478598-152-db0ecff9  0.9650899767875671
8  tuning-job-1-7ca4ae7007ee478598-066-21be16ff  0.9650899767875671
9  tuning-job-1-7ca4ae7007ee478598-036-04f49e34  0.9650899767875671
10  tuning-job-1-7ca4ae7007ee478598-032-9b9945bc  0.9642199873924255


In [14]:
best_candidate = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['BestCandidate']
best_candidate_name = best_candidate['CandidateName']

print("Candidate name: " + best_candidate_name)

Candidate name: tuning-job-1-7ca4ae7007ee478598-072-0158faa6


In [15]:
for container in best_candidate['InferenceContainers']:
    print(container['Image'])
    print(container['ModelDataUrl'])
    print('-')

737474898029.dkr.ecr.sa-east-1.amazonaws.com/sagemaker-sklearn-automl:0.2-1-cpu-py3
s3://sagemaker-sa-east-1-388295382521/sagemaker/automl-dm/output/automl-dm-03-02-26-28/data-processor-models/automl-dm--dpp4-1-ec8888b84a764309ba6ac9827544b6d378b9f97de6944/output/model.tar.gz
-
737474898029.dkr.ecr.sa-east-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3
s3://sagemaker-sa-east-1-388295382521/sagemaker/automl-dm/output/automl-dm-03-02-26-28/tuning/automl-dm--dpp4-xgb/tuning-job-1-7ca4ae7007ee478598-072-0158faa6/output/model.tar.gz
-
737474898029.dkr.ecr.sa-east-1.amazonaws.com/sagemaker-sklearn-automl:0.2-1-cpu-py3
s3://sagemaker-sa-east-1-388295382521/sagemaker/automl-dm/output/automl-dm-03-02-26-28/data-processor-models/automl-dm--dpp4-1-ec8888b84a764309ba6ac9827544b6d378b9f97de6944/output/model.tar.gz
-


In [20]:
# model_name = 'automl-dm-model-' + timestamp_suffix

# model_arn = sm.create_model(Containers=best_candidate['InferenceContainers'],
#                             ModelName=model_name,
#                             ExecutionRoleArn=role)

# print('Model ARN: ', model_arn['ModelArn'])

############## use this to make iferences
from sagemaker import AutoML
model_name = 'finished-automl-dm-model-' + timestamp_suffix
aml = AutoML.attach(auto_ml_job_name='automl-dm-03-02-26-28')
aml_best_model = aml.create_model(name=model_name,
                                  candidate=best_candidate,
                                  inference_response_keys=["predicted_label", "probability", "labels", "probabilities"])

aml_transformer = aml_best_model.transformer(accept='text/csv',
                                            assemble_with='Line',
                                            instance_type='ml.m5.xlarge',
                                            instance_count=1,)




In [21]:
# Don't forget to update the bucket! It must be in the same region as SageMaker
s3_capture_path = 's3://jsimon-capture-saeast1/' + model_name + '/'

print(s3_capture_path)

s3://jsimon-capture-saeast1/finished-automl-dm-model-03-03-29-48/


In [22]:
data_capture_configuration = {
    "EnableCapture": True, # flag turns data capture on and off
    "DestinationS3Uri": s3_capture_path, # s3 location where captured data is saved
    "InitialSamplingPercentage": 100, # sampling rate to capture data. max is 100%
    "CaptureOptions": [
       {
            "CaptureMode": "Output" # The type of capture this option enables. Values can be: [Output/Input]
        },
        {
            "CaptureMode": "Input" # The type of capture this option enables. Values can be: [Output/Input]
        }
    ],
    "CaptureContentTypeHeader": {
       "CsvContentTypes": ["text/csv"], # headers which should signal to decode the payload into CSV format 
       "JsonContentTypes": ["application/json"] # headers which should signal to decode the payload into JSON format 
    }
}

In [None]:
# ############## use this to make iferences
# from sagemaker import AutoML

# aml = AutoML.attach(auto_ml_job_name='automl-dm-30-02-15-18')
# aml_best_model = aml.create_model(name='automl-dm-model-' + timestamp_suffix,
#                                   candidate=best_candidate,
#                                   inference_response_keys=["predicted_label",'probabilities', 'probability' 'labels'])

# aml_transformer = aml_best_model.transformer(accept='text/csv',
#                                             assemble_with='Line',
#                                             instance_type='ml.m5.xlarge',
#                                             instance_count=1,)

In [23]:
# Endpoint configuration name
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())
epc_name = 'automl-dm-epc-' + timestamp_suffix
print('Endpoint configuration name:', epc_name)

ep_config = sm.create_endpoint_config(EndpointConfigName = epc_name,
                                      ProductionVariants=[{'InstanceType':'ml.m4.xlarge',
                                                           'InitialInstanceCount':1,
                                                           'ModelName':model_name,
                                                           'VariantName': 'AllTraffic'}],
                                      DataCaptureConfig = data_capture_configuration)

Endpoint configuration name: automl-dm-epc-03-03-31-06


In [24]:
# Endpoint name
ep_name = 'automl-dm-ep-' + timestamp_suffix
variant_name = 'automl-dm-variant-' + timestamp_suffix
print('Endpoint name:', ep_name)
# variant_name = "automl-dm-variant-29-22-04-24"
# ep_name = "automl-dm-ep-29-22-04-24"

create_endpoint_response = sm.create_endpoint(EndpointName=ep_name,
                                              EndpointConfigName=epc_name)

Endpoint name: automl-dm-ep-03-03-31-06


In [25]:
%%time
sm.get_waiter('endpoint_in_service').wait(EndpointName=ep_name)

resp = sm.describe_endpoint(EndpointName=ep_name)
status = resp['EndpointStatus']

print("Endpoint ARN   : " + resp['EndpointArn'])
print("Endpoint status: " + status)

Endpoint ARN   : arn:aws:sagemaker:sa-east-1:388295382521:endpoint/automl-dm-ep-03-03-31-06
Endpoint status: InService
CPU times: user 168 ms, sys: 1.81 ms, total: 170 ms
Wall time: 8min 32s


In [26]:
tp = tn = fp = fn = count = 0

with open('automl-test.csv') as f:
    lines = f.readlines()
    for l in lines[1:]:   # Skip header
        l = l.split(',')  # Split CSV line into features
        label = l[-1]     # Store 'yes'/'no' label
        l = l[:-1]        # Remove label
        l = ','.join(l)   # Rebuild CSV line without label
                
        response = sm_rt.invoke_endpoint(EndpointName=ep_name, ContentType='text/csv', Accept='text/csv', Body=l)

        response = response['Body'].read().decode("utf-8")
        #print ("label %s response %s" %(label,response))

        if 'TRUE' in label:
            # Sample is positive
            if 'TRUE' in response:
                # True positive
                tp=tp+1
            else:
                # False negative
                fn=fn+1
        else:
            # Sample is negative
            if 'FAKE' in response:
                # True negative
                tn=tn+1
            else:
                # False positive
                fp=fp+1
        count = count+1
        if (count % 100 == 0):   
            sys.stdout.write(str(count)+' ')
            
print ("Done")

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from container-2 with message "Unable to evaluate payload provided: Feature size of recordio-protobuf inference data 772 is not consistent with feature size of trained model 771.". See https://sa-east-1.console.aws.amazon.com/cloudwatch/home?region=sa-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/automl-dm-ep-03-03-31-06 in account 388295382521 for more information.

In [None]:
print ("%d %d" % (tn, fp))
print ("%d %d" % (fn, tp))

accuracy  = (tp+tn)/(tp+tn+fp+fn)
precision = tp/(tp+fp)
recall    = tn/(tp+fn)
f1        = (2*precision*recall)/(precision+recall)

print ("Accuracy: %.4f, Precision: %.4f, Recall: %.4f, F1: %.4f" % (accuracy, precision, recall, f1))

In [None]:
%%sh -s "$s3_capture_path"

aws s3 ls --recursive $1

In [None]:
############## use this to make iferences
from sagemaker import AutoML

aml = AutoML.attach(auto_ml_job_name='automl-dm-30-02-15-18')
aml_best_model = aml.create_model(name='automl-dm-model-' + timestamp_suffix,
                                  candidate=None,
                                  inference_response_keys=["predicted_label",'probabilities', 'probability' 'labels'])

aml_transformer = aml_best_model.transformer(accept='text/csv',
                                            assemble_with='Line',
                                            instance_type='ml.m5.xlarge',
                                            instance_count=1,)