Import all of the required libraries

In [None]:
%matplotlib inline
import boto3
import json
import io
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
import numpy as np
import matplotlib as mpl
from imageio import imread
import pandas as pd

import base64

# Create the AWS service clients used by the rest of the notebook:
# Amazon Comprehend Medical for PHI detection and Amazon Rekognition for text detection.
# Credentials and region are resolved by boto3's default configuration chain.
comprehendmedical = boto3.client('comprehendmedical')
rekognition = boto3.client('rekognition')

---
Load the chest x-ray image as a NumPy array and display it.

In [None]:
# Notebook configuration.
xray_file='chest-xray.png'       # local path of the image to de-identify
redacted_box_color='red'         # fill color of the redaction rectangles
dpi = 72                         # converts pixel dimensions to figure inches
phi_detection_threshold = 0.00   # an entity is redacted only if its score is strictly above this

# Read the pixels inside a context manager so the underlying file handle is released
# as soon as the data has been copied into the array.
with Image.open(xray_file) as xray_image:
    img = np.array(xray_image, dtype=np.uint8)

# NumPy image arrays are (rows, columns[, channels]).  Taking only the first two axes
# keeps this working for RGB/RGBA PNGs as well as single-channel grayscale images.
height, width = img.shape[:2]
# What size does the figure need to be in inches to fit the image?
figsize = width / float(dpi), height / float(dpi)
# Create a figure of the right size with one axes that takes up the full figure
fig = plt.figure(figsize=figsize)
ax = fig.add_axes([0, 0, 1, 1])
# Hide spines, ticks, etc.
ax.axis('off')
# Display the image in grayscale (matplotlib ignores cmap for RGB(A) data).
ax.imshow(img, cmap='gray')
plt.show()

---
Use Amazon Rekognition to detect the text in the image along with the coordinates.

In [None]:
# Use Amazon Rekognition to detect all of the text in the medical image
with open(xray_file, 'rb') as image_file:
    response=rekognition.detect_text(Image={'Bytes': image_file.read()})

textDetections=response['TextDetections']
print('Aggregating detected text...')
textblock=""
offsetarray=[]
totallength=0

# The various text detections are returned in a JSON object.  Aggregate the LINE detections into a single
# large block (each line followed by one space) and record the character offset at which each line starts.
# This allows a single call to Amazon Comprehend Medical for PHI detection, minimizing service charges.
# NOTE: the progress output below echoes the raw detected text, which may itself contain PHI; clear the
# cell output before sharing this notebook.
for text in textDetections:
    if text['Type'] == "LINE":
        detected_text=text['DetectedText']
        offsetarray.append(totallength)
        # +1 accounts for the separating space appended after every line.
        totallength+=len(detected_text)+1
        textblock=textblock+detected_text+" "
        print("'"+detected_text+"', length: "+str(len(detected_text))+", offsetarray: "+str(offsetarray))
# Sentinel entry: the end offset of the last line, so line i spans [offsetarray[i], offsetarray[i+1]).
offsetarray.append(totallength)
totaloffsets=len(offsetarray)

---
Using Comprehend Medical, detect PHI from the text detected in the image.

In [None]:
# Call Amazon Comprehend Medical and pass it the aggregated text from our medical image.
# If no text was found in the image there is nothing to analyze, so skip the service call
# (it rejects empty input) and continue with an empty entity list.
phi_boxes_list=[]
if textblock.strip():
    philist=comprehendmedical.detect_phi(Text = textblock)
else:
    philist={'Entities': []}

# offsetarray holds one entry per LINE detection (plus a trailing end sentinel), so its indices
# must be resolved against the LINE detections only, not the full list, which also contains
# WORD detections.
line_detections=[detection for detection in textDetections if detection['Type'] == "LINE"]

# Amazon Comprehend Medical will return a JSON object that contains all of the PHI detected in the text block with
# offset values that describe where the PHI begins and ends.  We can use this to determine which of the text lines
# detected by Amazon Rekognition should be redacted.  The 'phi_boxes_list' list is created to keep track of the
# bounding boxes that potentially contain PHI.
print('Finding PHI text...')
not_redacted=0
for phi in philist['Entities']:
    if phi['Score'] > phi_detection_threshold:
        for i in range(0,totaloffsets-1):
            # Line i occupies [offsetarray[i], offsetarray[i+1]).  Redact every line the entity
            # overlaps, so PHI that spans more than one detected line is fully covered.
            if phi['BeginOffset'] < offsetarray[i+1] and phi['EndOffset'] > offsetarray[i]:
                bounding_box=line_detections[i]['Geometry']['BoundingBox']
                if bounding_box not in phi_boxes_list:
                    print("'"+phi['Text']+"' was detected as type '"+phi['Type']+"' and will be redacted.")
                    phi_boxes_list.append(bounding_box)
    else:
        print("'{}' was detected as type '{}', but did not meet the confidence score threshold and will not be redacted.".format(phi['Text'], phi['Type']))
        not_redacted+=1
print("Found %d text boxes to redact." % (len(phi_boxes_list)))
print("%d additional text boxes were detected, but did not meet the confidence score threshold." % (not_redacted))

---
Based on the detected PHI, mask the sections of the image that contain PHI with red boxes.

In [None]:
#Now this list of bounding boxes will be used to draw red boxes over the PHI text.
# NumPy image arrays are (rows, columns[, channels]), i.e. (height, width[, channels]).
height, width = img.shape[:2]
# What size does the figure need to be in inches to fit the image?
figsize = width / float(dpi), height / float(dpi)
# Create a figure of the right size with one axes that takes up the full figure
fig = plt.figure(figsize=figsize)
ax = fig.add_axes([0, 0, 1, 1])
ax.imshow(img, cmap='gray')
for box in phi_boxes_list:
    #The bounding boxes are described as a ratio of the overall image dimensions, so we must multiply them
    #by the total image dimensions to get the exact pixel values for each dimension.
    #Horizontal values (Left, Width) scale with the image width; vertical values (Top, Height) with its height.
    box_x = width * box['Left']
    box_y = height * box['Top']
    box_width = width * box['Width']
    box_height = height * box['Height']
    rect = patches.Rectangle((box_x, box_y), box_width, box_height, linewidth=0,
                             edgecolor=redacted_box_color, facecolor=redacted_box_color)
    ax.add_patch(rect)
#Ensure that no axis or whitespaces is printed in the image file we want to save.
ax.axis('off')
ax.xaxis.set_major_locator(plt.NullLocator())
ax.yaxis.set_major_locator(plt.NullLocator())

#Render the redacted figure as a PNG into an in-memory buffer, with no padding around the image, and rewind
#the buffer so that it can be read from the beginning.
#NOTE(review): this step was originally described as saving the redacted image to the same Amazon S3 bucket,
#in PNG format, with 'de-id-' in front of the original filename.  No upload is visible in this section --
#confirm whether a later cell performs it.
img_data = io.BytesIO()
plt.savefig(img_data, bbox_inches='tight', pad_inches=0, format='png')
img_data.seek(0)