In [42]:
import sagemaker
import boto3
from sagemaker import get_execution_role

In [44]:
# Resolve the AWS context once: region, SageMaker session, bucket, and role.
boto_session = boto3.Session()
region = boto_session.region_name

session = sagemaker.Session()
bucket = session.default_bucket()
prefix = 'auto_pilot'  # key prefix under which this notebook stores its S3 objects
role = get_execution_role()

# Low-level SageMaker client used for the AutoML, model and transform calls below.
sm = boto_session.client(service_name='sagemaker', region_name=region)

In [45]:
import pandas as pd

# Source CSV is read straight from GitHub into a DataFrame.
DATA_URL = 'https://raw.githubusercontent.com/a-forty-two/COG_GN22CDBDS001_MARCH_22/main/breastcancer_data.csv'
data = pd.read_csv(DATA_URL)

In [46]:
# Preview the first ten rows of the freshly loaded frame.
data.head(10)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,symmetry_mean,fractal_dimension_mean
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.2419,0.07871
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.1812,0.05667
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.2069,0.05999
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.2597,0.09744
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1809,0.05883
5,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.2087,0.07613
6,844359,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.1794,0.05742
7,84458202,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.2196,0.07451
8,844981,M,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.235,0.07389
9,84501001,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.203,0.08243


In [47]:
# (rows, columns) of the frame as loaded.
data.shape

(1138, 11)

In [48]:
# Remove the `id` column. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-missing column.
data = data.drop(columns=['id'], errors='ignore')

In [49]:
# Shape check after removing the `id` column.
data.shape

(1138, 10)

In [50]:
# Preview again to confirm the `id` column is gone.
data.head(10)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,symmetry_mean,fractal_dimension_mean
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.2419,0.07871
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.1812,0.05667
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.2069,0.05999
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.2597,0.09744
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1809,0.05883
5,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.2087,0.07613
6,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.1794,0.05742
7,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.2196,0.07451
8,M,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.235,0.07389
9,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.203,0.08243


In [51]:
# 80/20 split: sample the training rows with a fixed seed and treat every
# row whose index was not sampled as the hold-out set.
train_data = data.sample(frac=0.8, random_state=200)

# The hold-out rows are later sent to the model as inference input, so the
# label column is removed from them here.
test_data = data.drop(index=train_data.index).drop(columns=['diagnosis'])

In [52]:
# (rows, columns) of the training split.
train_data.shape

(910, 10)

In [53]:
# (rows, columns) of the hold-out split.
test_data.shape

(228, 9)

In [54]:
# Write the training split (with a header row) and upload it under "<prefix>/train".
train_file = 'train_data.csv'
train_data.to_csv(train_file, header=True, index=False)
train_data_s3_path = session.upload_data(path=train_file, key_prefix=prefix + "/train")
print(f'Train data uploaded to: {train_data_s3_path}')

# Same for the hold-out split, but written without a header row.
test_file = 'test_data.csv'
test_data.to_csv(test_file, header=False, index=False)
test_data_s3_path = session.upload_data(path=test_file, key_prefix=prefix + "/test")
print(f'Test data uploaded to: {test_data_s3_path}')

Train data uploaded to: s3://sagemaker-us-east-1-313830654669/auto_pilot/train/train_data.csv
Test data uploaded to: s3://sagemaker-us-east-1-313830654669/auto_pilot/test/test_data.csv


In [62]:
# --- Autopilot job settings ---

# Where the job reads the training CSV from and where it writes its artifacts.
train_s3_uri = 's3://{}/{}/train'.format(bucket,prefix)
output_s3_uri = 's3://{}/{}/output'.format(bucket,prefix)

# Single training channel; `diagnosis` is the column to predict.
train_channel = {
    'DataSource': {
        'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': train_s3_uri},
    },
    'TargetAttributeName': 'diagnosis',
}
input_data_config = [train_channel]

# Time limits in seconds: per training job, and for the AutoML job as a whole.
completion_criteria = {
    'MaxRuntimePerTrainingJobInSeconds': 600,
    'MaxAutoMLJobRuntimeInSeconds': 3600,
}
job_config = {'CompletionCriteria': completion_criteria}

output_data_config = {'S3OutputPath': output_s3_uri}

problem_type = 'BinaryClassification'
job_objective = {'MetricName': 'Accuracy'}

In [63]:
from time import gmtime, strftime, sleep

# Day-hour-minute-second (UTC) suffix; also reused further down to name the
# model and the transform job.
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())

auto_ml_job_name = 'automodel' + timestamp_suffix
print(f'AutoMLJobName: {auto_ml_job_name}')

# Submit the Autopilot job; the API response is shown as the cell output and
# progress is polled in the following cell.
create_job_args = dict(
    AutoMLJobName=auto_ml_job_name,
    InputDataConfig=input_data_config,
    OutputDataConfig=output_data_config,
    AutoMLJobConfig=job_config,
    AutoMLJobObjective=job_objective,
    ProblemType=problem_type,
    RoleArn=role,
)
sm.create_auto_ml_job(**create_job_args)

AutoMLJobName: automodel09-09-29-19


{'AutoMLJobArn': 'arn:aws:sagemaker:us-east-1:313830654669:automl-job/automodel09-09-29-19',
 'ResponseMetadata': {'RequestId': 'e2e287da-5cc9-49aa-b61a-6fb054c9ae88',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e2e287da-5cc9-49aa-b61a-6fb054c9ae88',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '91',
   'date': 'Thu, 09 Jun 2022 09:29:20 GMT'},
  'RetryAttempts': 0}}

In [64]:
print ('JobStatus - Secondary Status')
print('------------------------------')

# Poll until the AutoML job reaches a terminal state. The status is fetched
# once per iteration and the 30 s sleep only happens while the job is still
# running, so there is no duplicate describe call at the start and no extra
# wait after the job has already finished.
terminal_states = ('Failed', 'Completed', 'Stopped')
while True:
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response['AutoMLJobStatus']
    print(job_run_status + " - " + describe_response['AutoMLJobSecondaryStatus'])
    if job_run_status in terminal_states:
        break
    sleep(30)

# Fail loudly here instead of with a KeyError on 'BestCandidate' in a later cell.
if job_run_status == 'Failed':
    raise RuntimeError(
        'AutoML job {} failed: {}'.format(
            auto_ml_job_name,
            describe_response.get('FailureReason', 'no failure reason reported'),
        )
    )

JobStatus - Secondary Status
------------------------------
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineer

In [66]:
# Fetch the job description: generated-notebook locations and the best candidate.
job = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)

# 'BestCandidate' is read unconditionally below; give a readable error if the
# job has not produced one (e.g. it is still running or it failed).
if 'BestCandidate' not in job:
    raise RuntimeError(
        'AutoML job {} has no best candidate yet (status: {})'.format(
            auto_ml_job_name, job.get('AutoMLJobStatus')
        )
    )

job_candidate_notebook = job['AutoMLJobArtifacts']['CandidateDefinitionNotebookLocation']
job_data_notebook = job['AutoMLJobArtifacts']['DataExplorationNotebookLocation']
job_best_candidate = job['BestCandidate']
job_best_candidate_name = job_best_candidate['CandidateName']

# Only the last bare expression of a cell is displayed, so the first two
# values were silently discarded before; print all three explicitly.
print('Candidate definition notebook:', job_candidate_notebook)
print('Data exploration notebook:', job_data_notebook)
print('Best candidate:', job_best_candidate_name)

'automodel09-09-29-19GgtJYQEGPSlU-016-643b6ac3'

In [67]:
%%sh -s $job_candidate_notebook $job_data_notebook

# $1 and $2 are the two S3 locations passed on the magic line above
# (candidate-definition notebook, then data-exploration notebook).
# Quoted so the values are never word-split or glob-expanded by the shell.
aws s3 cp "$1" .
aws s3 cp "$2" .

download: s3://sagemaker-us-east-1-313830654669/auto_pilot/output/automodel09-09-29-19/sagemaker-automl-candidates/automodel09-09-29-19-pr-1-8fb18d19cd874bf3936849ea8604841a7a7f8/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb to ./SageMakerAutopilotCandidateDefinitionNotebook.ipynb
download: s3://sagemaker-us-east-1-313830654669/auto_pilot/output/automodel09-09-29-19/sagemaker-automl-candidates/automodel09-09-29-19-pr-1-8fb18d19cd874bf3936849ea8604841a7a7f8/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb to ./SageMakerAutopilotDataExplorationNotebook.ipynb


In [69]:
# Register the best candidate's inference containers as a SageMaker model.
model_name = 'automl-termdepo-model-' + timestamp_suffix
best_containers = job_best_candidate['InferenceContainers']

model = sm.create_model(
    ModelName=model_name,
    Containers=best_containers,
    ExecutionRoleArn=role,
)

model_arn = model['ModelArn']
print('Model ARN corresponding to the best candidate is : {}'.format(model_arn))

Model ARN corresponding to the best candidate is : arn:aws:sagemaker:us-east-1:313830654669:model/automl-termdepo-model-09-09-29-19


In [70]:
# --- Batch transform over the uploaded test file ---
transform_job_name = 'automl-termdepo-transform-' + timestamp_suffix
inference_output_uri = 's3://{}/{}/inference-results'.format(bucket,prefix)

# Input: the test CSV uploaded earlier, fed to the model one line at a time.
s3_source = {'S3DataType': 'S3Prefix', 'S3Uri': test_data_s3_path}
transform_input = {
    'DataSource': {'S3DataSource': s3_source},
    'ContentType': 'text/csv',
    'CompressionType': 'None',
    'SplitType': 'Line',
}

transform_output = {'S3OutputPath': inference_output_uri}

transform_resources = {'InstanceType': 'ml.m4.xlarge', 'InstanceCount': 1}

# Submit the job; the API response is shown as the cell output.
sm.create_transform_job(
    TransformJobName=transform_job_name,
    ModelName=model_name,
    TransformInput=transform_input,
    TransformOutput=transform_output,
    TransformResources=transform_resources,
)

{'TransformJobArn': 'arn:aws:sagemaker:us-east-1:313830654669:transform-job/automl-termdepo-transform-09-09-29-19',
 'ResponseMetadata': {'RequestId': '704afe15-0091-4c1a-920c-bb15fd6180ba',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '704afe15-0091-4c1a-920c-bb15fd6180ba',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '114',
   'date': 'Thu, 09 Jun 2022 10:41:33 GMT'},
  'RetryAttempts': 0}}

In [71]:
print ('JobStatus')
print('----------')

# Poll until the transform job reaches a terminal state. The status is fetched
# once per iteration and the 30 s sleep only happens while the job is still
# running, so there is no duplicate describe call at the start and no extra
# wait after the job has already finished.
terminal_states = ('Failed', 'Completed', 'Stopped')
while True:
    describe_response = sm.describe_transform_job(TransformJobName = transform_job_name)
    job_run_status = describe_response['TransformJobStatus']
    print(job_run_status)
    if job_run_status in terminal_states:
        break
    sleep(30)

# Fail loudly here instead of with a missing-object error when the results
# are downloaded in the next cell.
if job_run_status == 'Failed':
    raise RuntimeError(
        'Transform job {} failed: {}'.format(
            transform_job_name,
            describe_response.get('FailureReason', 'no failure reason reported'),
        )
    )


JobStatus
----------
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed


In [72]:
# Download the batch-transform output for test_data.csv from the default bucket.
s3_output_key = '{}/inference-results/test_data.csv.out'.format(prefix)
local_inference_results_path = 'inference_results.csv'

s3 = boto3.resource('s3')
inference_results_bucket = s3.Bucket(session.default_bucket())

inference_results_bucket.download_file(s3_output_key, local_inference_results_path)

# The first line of the file is consumed as the header row, exactly as before
# (a later cell drops the first test row to stay aligned with this). What
# changes is that names=['M'] pins the column name the evaluation cells look
# up, instead of letting it be whatever label the first prediction happened
# to be -- a first prediction of 'B' would otherwise break pred_data['M'].
pred_data = pd.read_csv(
    local_inference_results_path,
    sep=';',
    header=0,
    names=['M'],
)
pred_data



Unnamed: 0,M
0,M
1,M
2,M
3,B
4,B
...,...
222,M
223,B
224,B
225,M


In [79]:
# Rebuild the hold-out rows from `data` rather than slicing `test_data` in
# place, so that (a) re-running this cell does not drop one more row each
# time and (b) the `diagnosis` labels used by the evaluation cells are present.
# The first hold-out row is skipped because its prediction was consumed as
# the header line when the inference results were read.
test_data = data.drop(train_data.index).iloc[1:]

In [84]:
from sklearn.metrics import confusion_matrix,accuracy_score

# Look the true labels up in `data` by the hold-out rows' index, so this works
# whether or not `test_data` itself still carries the `diagnosis` column
# (the split cell removes it before the test file is uploaded).
y_true = data.loc[test_data.index, 'diagnosis']

# Labels and predictions are compared positionally; refuse to score if the
# two are not the same length.
assert len(y_true) == len(pred_data), 'labels and predictions are misaligned'

cm = confusion_matrix(y_true, pred_data['M'])
cm

array([[141,   2],
       [  0,  84]])

In [85]:
# True labels are looked up in `data` by the hold-out rows' index, so this
# works whether or not `test_data` itself still carries the `diagnosis` column.
print("Accuracy Score = {}".format(accuracy_score(data.loc[test_data.index, 'diagnosis'], pred_data['M'])))

Accuracy Score = 0.9911894273127754
