In [None]:
!pip -q install --upgrade pip
!pip -q install jsonlines
import boto3
import botocore
import os
import shutil
import json
import jsonlines
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
import numpy as np
from itertools import cycle

In [None]:
# Shared S3 resource handle; clean_output_manifest() later uses it to look up objects.
s3 = boto3.resource('s3')
# Print the name of every bucket the resource enumerates.
for bucket_name in (bucket.name for bucket in s3.buckets.all()):
    print(bucket_name)

In [None]:
# Bucket that holds the images; replace the placeholder before running the notebook.
top_bucket = 'INSERT BUCKET HERE'
# Key prefixes inside top_bucket for the whole data set and its two subsets.
input_prefix = 'Dogs/GermanShepherd'
training_input_prefix = input_prefix + '/Training'
test_input_prefix = input_prefix + '/Test'

# The same bucket addressed as an HTTPS URL and as an S3 URI.
bashurl_bucket = 'https://{}.s3.amazonaws.com'.format(top_bucket)
s3uri_bucket = 's3://{}'.format(top_bucket)
bashurl_prefix_bucket = '{}/{}/'.format(bashurl_bucket, input_prefix)
s3uri_prefix_bucket = '{}/{}/'.format(s3uri_bucket, input_prefix)

# Show both forms, one per line, as a quick sanity check.
print(bashurl_prefix_bucket, s3uri_prefix_bucket, sep='\n')

In [None]:
# Recursively downloads every file in the path ending in .jpg
!aws s3 cp s3://$top_bucket/$training_input_prefix Training/ --include "*.jpg" --recursive
!aws s3 cp s3://$top_bucket/$test_input_prefix Test/ --include "*.jpg" --recursive

In [None]:
client = boto3.client('sagemaker')
labeling_job_name = 'GermanShepherd'
# This is the output of the labeling job, and 'output.manifest' is the file created with
# all the S3 locations per image with annotations and meta-data in JSON format.
s3_output = client.describe_labeling_job(LabelingJobName=labeling_job_name)['OutputConfig']['S3OutputPath'] + labeling_job_name
s3_output_manifest = f'{s3_output}/manifests/output/output.manifest'

!aws s3 cp $s3_output_manifest LabelingOutput/

In [None]:
def clean_output_manifest(augmented_manifest, label_attribute='GermanShepherd'):
    """Load a labeling output manifest and keep only the usable entries.

    An entry is dropped when:
      * it has no ``label_attribute`` key, or that value has no ``annotations`` key;
      * its ``annotations`` list is empty;
      * loading the object ``training_input_prefix/<file name>`` from ``top_bucket``
        raises a ``botocore`` ``ClientError``.

    Relies on the notebook-level names ``s3``, ``top_bucket`` and
    ``training_input_prefix``.

    Args:
        augmented_manifest: Path of the JSON Lines manifest to read.
        label_attribute: Key under which each entry stores its annotations.

    Returns:
        list of the manifest entries (dicts) that passed every check, in file order.

    Raises:
        KeyError: If an entry has no ``source-ref`` key.
    """
    with jsonlines.open(augmented_manifest, 'r') as reader:
        lines = list(reader)
    print(len(lines), 'images in original output.manifest')
    print("----------------------------------------------------")

    # Collect survivors in a new list. Calling lines.remove() while iterating over
    # lines shifts the remaining items left, so the entry right after every removed
    # one would be skipped and never validated.
    kept_lines = []
    for each_line in lines:
        img_source_ref = each_line['source-ref']
        try:
            if len(each_line[label_attribute]['annotations']) == 0:
                # Image was not annotated.
                continue
        except KeyError:
            # Entry carries no annotation data at all.
            continue
        try:
            s3.Object(top_bucket, training_input_prefix + '/' + os.path.basename(img_source_ref)).load()
        except botocore.exceptions.ClientError:
            # The image could not be loaded from the training prefix.
            continue
        kept_lines.append(each_line)
    print(len(kept_lines), 'images after removal of errors')
    return kept_lines

In [None]:
def write_to_new_manifest(new_manifest_lines, new_manifest):
    """Write manifest entries to ``new_manifest``, one JSON document per line.

    Before an entry is written, ``/Training/`` is inserted between the directory
    part of its ``source-ref`` and the file name. The entries are updated in place,
    so the caller's dicts carry the rewritten ``source-ref`` afterwards.

    Args:
        new_manifest_lines: Iterable of dicts, each with a ``source-ref`` key.
        new_manifest: Path of the file to create or overwrite.
    """
    with open(new_manifest, 'w') as manifest_file:
        for entry in new_manifest_lines:
            original_ref = entry['source-ref']
            # Everything before the last '/' is the directory part of the reference.
            parent_uri = original_ref[:original_ref.rfind('/')]
            entry['source-ref'] = parent_uri + '/Training/' + os.path.basename(original_ref)
            manifest_file.write(json.dumps(entry) + '\n')

In [None]:
augmented_manifest = 'LabelingOutput/output.manifest'
cleaned_lines = clean_output_manifest(augmented_manifest)

cleaned_manifest = 'LabelingOutput/cleaned-output.manifest'
write_to_new_manifest(cleaned_lines, cleaned_manifest)

s3_cleaned_manifest = f'{s3_output}/manifests/output/'
!aws s3 cp LabelingOutput/cleaned-output.manifest $s3_cleaned_manifest

In [None]:
def show_annotated_image(img_path, bboxes):
    """Display one image with its bounding boxes drawn on top.

    Defined at module level (rather than nested inside show_annotated_images) so
    that other cells can draw boxes for a single image as well.

    Args:
        img_path: Path of the image file to display.
        bboxes: Iterable of dicts with 'left', 'top', 'width' and 'height' keys;
            any other keys are ignored.
    """
    # Read the pixels inside a context manager so the file handle is released
    # as soon as the array has been built.
    with Image.open(img_path) as img:
        im = np.array(img, dtype=np.uint8)
    fig, ax = plt.subplots(1)
    ax.imshow(im)
    # Give consecutive boxes different edge colors, wrapping around after eight.
    colors = cycle(['r', 'g', 'b', 'y', 'c', 'm', 'k', 'w'])
    for bbox in bboxes:
        rect = patches.Rectangle(
            (bbox['left'], bbox['top']), bbox['width'], bbox['height'],
            linewidth=1, edgecolor=next(colors), facecolor='none')
        ax.add_patch(rect)
    plt.show()
    # Release the figure so that showing many images does not accumulate open figures.
    plt.close(fig)


def show_annotated_images(manifest, num_of_images):
    """Display the first ``num_of_images`` manifest entries with their annotations.

    Each image is looked up in the local ``Training/`` folder by the file name of
    its ``source-ref``; entries whose file is not present locally are reported
    with a printed message instead of being drawn. Annotations are read from the
    entry key named by the notebook-level ``labeling_job_name``.

    Args:
        manifest: Path of the JSON Lines manifest to read.
        num_of_images: Number of leading manifest entries to process.
    """
    with jsonlines.open(manifest, 'r') as reader:
        for desc in list(reader)[:num_of_images]:
            local_path = 'Training/' + os.path.basename(desc['source-ref'])
            if os.path.isfile(local_path):
                bboxes = desc[labeling_job_name]['annotations']
                show_annotated_image(local_path, bboxes)
            else:
                print(desc['source-ref'], "doesn't exist")

In [None]:
# Preview the first 10 entries of the cleaned manifest with their bounding boxes.
show_annotated_images(manifest=cleaned_manifest, num_of_images=10)

In [None]:
# Fraction of the cleaned entries that goes into the training set.
training_size = 0.7
# Fixed seed so the train/validation split is identical on every run.
split_seed = 42

# Shuffle a copy with a dedicated, seeded generator: cleaned_lines keeps its
# original order, so re-running this cell reproduces exactly the same split.
shuffled_lines = list(cleaned_lines)
np.random.RandomState(split_seed).shuffle(shuffled_lines)
dataset_size = len(shuffled_lines)
num_training_samples = round(dataset_size * training_size)
train_data = shuffled_lines[:num_training_samples]
validation_data = shuffled_lines[num_training_samples:]


def _write_manifest(manifest_lines, manifest_path):
    """Write each entry of manifest_lines to manifest_path as one JSON document per line."""
    with open(manifest_path, 'w') as f:
        for line in manifest_lines:
            f.write(json.dumps(line))
            f.write('\n')


training_manifest = 'train.manifest'
_write_manifest(train_data, training_manifest)

validation_manifest = 'validation.manifest'
_write_manifest(validation_data, validation_manifest)

print(f'Training samples: {num_training_samples}, Validation samples: {len(validation_data)}')

In [None]:
prefix_training = f'{input_prefix}/training'

s3_train_data_path = 's3://{}/{}/{}'.format(top_bucket, prefix_training, training_manifest)
s3_validation_data_path = 's3://{}/{}/{}'.format(top_bucket, prefix_training, validation_manifest)

!aws s3 cp train.manifest s3://$top_bucket/$prefix_training/
!aws s3 cp validation.manifest s3://$top_bucket/$prefix_training/

In [None]:
import time
import sagemaker

# Execution role and session of the environment the notebook runs in.
role = sagemaker.get_execution_role()
sess = sagemaker.Session()

# Container image of the built-in 'object-detection' algorithm for the current region.
training_image = sagemaker.amazon.amazon_estimator.get_image_uri(
    boto3.Session().region_name, 'object-detection', repo_version='latest')
s3_output_path = f's3://{top_bucket}/{prefix_training}/output'

# Name of the training job.
# NOTE(review): this is a fixed string, so it is presumably necessary to change it
# before submitting another job under the same account and region; confirm.
training_job_name = 'dogs-germanshepherd-detection-Cleaned-01'


def _augmented_manifest_channel(channel_name, manifest_s3_uri):
    """Build the InputDataConfig entry for one augmented-manifest channel."""
    return {
        "ChannelName": channel_name,
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "AugmentedManifestFile",  # NB. Augmented Manifest
                "S3Uri": manifest_s3_uri,
                "S3DataDistributionType": "FullyReplicated",
                # NB. These must correspond to the JSON field names in the augmented manifest.
                "AttributeNames": ['source-ref', 'GermanShepherd']
            }
        },
        "ContentType": "application/x-recordio",
        "RecordWrapperType": "RecordIO",
        "CompressionType": "None"
    }


# Hyperparameters are passed to the algorithm as strings.
hyperparameters = {
    "base_network": "resnet-50",
    "use_pretrained_model": "1",
    "num_classes": "1",
    "mini_batch_size": "1",
    "epochs": "30",
    "learning_rate": "0.002",
    "lr_scheduler_step": "",
    "lr_scheduler_factor": "0.1",
    "optimizer": "adadelta",
    "momentum": "0.9",
    "weight_decay": "0.0005",
    "overlap_threshold": "0.5",
    "nms_threshold": "0.45",
    "image_shape": "300",
    "label_width": "350",
    "num_training_samples": str(num_training_samples)
}

training_params = {
    "AlgorithmSpecification": {
        # The built-in SageMaker algorithm image to run.
        "TrainingImage": training_image,
        # How the algorithm obtains its training data: "File" downloads the
        # entire dataset first, "Pipe" streams it to the algorithm.
        "TrainingInputMode": "Pipe"
    },
    "RoleArn": role,
    "OutputDataConfig": {
        # Folder that receives the model artifacts.
        "S3OutputPath": s3_output_path
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        # For Object Detection, a machine with a GPU is required.
        "InstanceType": "ml.p2.xlarge",
        "VolumeSizeInGB": 50
    },
    "TrainingJobName": training_job_name,
    "HyperParameters": hyperparameters,
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 86400
    },
    "InputDataConfig": [
        _augmented_manifest_channel("train", s3_train_data_path),
        _augmented_manifest_channel("validation", s3_validation_data_path),
    ]
}

# Submit the training job to SageMaker.
client = boto3.client('sagemaker')
client.create_training_job(**training_params)

# Read the job back to confirm that it was accepted.
status = client.describe_training_job(TrainingJobName=training_job_name)['TrainingJobStatus']
print(f'Training job current status: {status}')

In [None]:
# Take a snapshot of the training job's description and report both status fields.
training_info = client.describe_training_job(TrainingJobName=training_job_name)

for status_label, status_field in (("Training job status: ", 'TrainingJobStatus'),
                                   ("Secondary status: ", 'SecondaryStatus')):
    print(status_label, training_info[status_field])

In [None]:
import time

# A model can only be created from a finished training job. Block until the job
# reaches a terminal state (the waiter raises if the job fails), then re-read its
# description so training_info is not a stale snapshot from while it was running.
client.get_waiter('training_job_completed_or_stopped').wait(
    TrainingJobName=training_job_name,
    # Poll once a minute for up to 24 hours, the MaxRuntimeInSeconds given to the job.
    WaiterConfig={'Delay': 60, 'MaxAttempts': 1440})
training_info = client.describe_training_job(TrainingJobName=training_job_name)

timestamp = time.strftime('-%Y-%m-%d-%H-%M-%S', time.gmtime())
model_suffix = '-model' + timestamp
# SageMaker limits model names to 63 characters. Shorten the job-name part when
# needed so that the distinguishing suffix always fits; shorter names are unchanged.
model_name = training_job_name[:63 - len(model_suffix)] + model_suffix

# Serve the model with the same container image that trained it.
training_image = training_info['AlgorithmSpecification']['TrainingImage']
model_data = training_info['ModelArtifacts']['S3ModelArtifacts']

primary_container = {
    'Image': training_image,
    'ModelDataUrl': model_data,
}

from sagemaker import get_execution_role

role = get_execution_role()

create_model_response = client.create_model(
    ModelName = model_name,
    ExecutionRoleArn = role,
    PrimaryContainer = primary_container)

print(create_model_response['ModelArn'])

In [None]:
timestamp = time.strftime('-%Y-%m-%d-%H-%M-%S', time.gmtime())
endpoint_config_suffix = '-epc' + timestamp
# SageMaker limits endpoint configuration names to 63 characters. Shorten the
# job-name part when needed so the suffix always fits; shorter names are unchanged.
endpoint_config_name = training_job_name[:63 - len(endpoint_config_suffix)] + endpoint_config_suffix
endpoint_config_response = client.create_endpoint_config(
    EndpointConfigName = endpoint_config_name,
    # A single variant that serves model_name on one instance.
    ProductionVariants=[{
        'InstanceType':'ml.t2.medium',
        'InitialInstanceCount':1,
        'ModelName':model_name,
        'VariantName':'AllTraffic'}])

print('Endpoint configuration name: {}'.format(endpoint_config_name))
print('Endpoint configuration arn:  {}'.format(endpoint_config_response['EndpointConfigArn']))

In [None]:
# Derive a timestamped endpoint name from the training job name.
timestamp = time.strftime('-%Y-%m-%d-%H-%M-%S', time.gmtime())
endpoint_name = f'{training_job_name}-ep{timestamp}'
print(f'Endpoint name: {endpoint_name}')

# Ask SageMaker to create the endpoint from the endpoint configuration.
endpoint_params = dict(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
endpoint_response = client.create_endpoint(**endpoint_params)
print(f"EndpointArn = {endpoint_response['EndpointArn']}")

In [None]:
# One-off check of the endpoint's current status.
response = client.describe_endpoint(EndpointName=endpoint_name)
status = response['EndpointStatus']
print(f'EndpointStatus = {status}')

In [None]:
import glob

# Collect the path of every entry in the local Test/ folder and list them one per line.
test_images = glob.glob('Test/*')
print('\n'.join(test_images))

In [None]:
def prediction_to_bbox_data(image_path, prediction):
    """Convert one prediction into a bounding box scaled to the image's size.

    Args:
        image_path: Path of the image the prediction belongs to; it is opened
            only to read its width and height.
        prediction: Sequence of six values, unpacked as
            (class_id, confidence, xmin, ymin, xmax, ymax). The four corner
            values are multiplied by the image width/height, i.e. they are
            treated as fractions of the image size.

    Returns:
        dict with the keys 'class_id', 'height', 'width', 'left' and 'top'.
        The confidence value is not included.
    """
    class_id, confidence, xmin, ymin, xmax, ymax = prediction
    # Open the image in a context manager so its file handle is closed as soon as
    # the size has been read, instead of staying open until garbage collection.
    with Image.open(image_path) as img:
        width, height = img.size
    bbox_data = {'class_id': class_id,
                 'height': (ymax - ymin) * height,
                 'width': (xmax - xmin) * width,
                 'left': xmin * width,
                 'top': ymin * height}
    return bbox_data

In [None]:
import matplotlib.pyplot as plt

# Client for the 'sagemaker-runtime' service, used to invoke the deployed endpoint.
runtime_client = boto3.client(service_name='sagemaker-runtime')

def get_predictions_for_img(runtime_client, endpoint_name, img_path):
    """Send one image file to an endpoint and return the decoded JSON response.

    Args:
        runtime_client: Object exposing ``invoke_endpoint`` (here a boto3
            'sagemaker-runtime' client).
        endpoint_name: Name of the endpoint to invoke.
        img_path: Path of the image file whose raw bytes form the request body.

    Returns:
        The response body parsed with ``json.loads``.
    """
    with open(img_path, 'rb') as image_file:
        image_bytes = bytearray(image_file.read())

    endpoint_reply = runtime_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='application/x-image',
        Body=image_bytes)

    return json.loads(endpoint_reply['Body'].read())

# Block until the endpoint is in service, then double-check its status.
client.get_waiter('endpoint_in_service').wait(EndpointName=endpoint_name)
endpoint_response = client.describe_endpoint(EndpointName=endpoint_name)
status = endpoint_response['EndpointStatus']
if status != 'InService':
    raise Exception('Endpoint creation failed.')

# Only predictions above this confidence are kept, and at most best_n of them are drawn.
confidence_threshold = .5
best_n = 3

for test_image in test_images:
    result = get_predictions_for_img(runtime_client, endpoint_name, test_image)
    # Index 1 of every prediction is its confidence; keep the confident ones, best first.
    predictions = sorted(
        (candidate for candidate in result['prediction'] if candidate[1] > confidence_threshold),
        key=lambda candidate: candidate[1],
        reverse=True)
    bboxes = [prediction_to_bbox_data(test_image, candidate) for candidate in predictions[:best_n]]
    # NOTE(review): show_annotated_image(img_path, bboxes) is not defined in this cell;
    # it has to exist at module level before this loop runs.
    show_annotated_image(test_image, bboxes)