# Setup Amazon Comprehend Through AWS Console 

<img src="img/comprehend.png" width="80%" align="left">

## Note that Amazon Comprehend is currently only supported in a subset of regions: 

* US East (N. Virginia), US East (Ohio), US West (Oregon)
* Canada (Central)
* Europe (London), Europe (Ireland), Europe (Frankfurt)
* Asia Pacific (Mumbai), Asia Pacific (Seoul), Asia Pacific (Tokyo), Asia Pacific (Singapore), Asia Pacific (Sydney)

You can check https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/ for details and updates. 

In [1]:
import boto3
import pandas as pd
import sagemaker
from botocore.config import Config

# SageMaker session details reused throughout the notebook: the session's
# default S3 bucket, the notebook's execution role and the active AWS region.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# botocore retry settings: up to 10 attempts in "adaptive" mode.
config = Config(retries={'max_attempts': 10, 'mode': 'adaptive'})

# The IAM client uses the retry settings above; the SageMaker client is pinned
# to the detected region.
iam = boto3.client('iam', config=config)
sm = boto3.Session().client(service_name='sagemaker', region_name=region)

### Check if your current region supports Comprehend

In [2]:
# Regions this notebook treats as supporting Amazon Comprehend.
comprehend_regions = (
    'us-east-1', 'us-east-2', 'us-west-2',
    'ca-central-1',
    'eu-west-2', 'eu-west-1', 'eu-central-1',
    'ap-south-1', 'ap-northeast-2', 'ap-northeast-1', 'ap-southeast-1', 'ap-southeast-2',
)
banner = '+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'

# Pick the message body first, then print it framed by the banner line.
if region not in comprehend_regions:
    report = [
        ' !! COMPREHEND IS *NOT* SUPPORTED IN {}!! '.format(region),
        ' This is OK. Skip this notebook and continue with the next notebook.',
        ' This notebook is not required for the rest of this workshop.',
    ]
else:
    report = [
        ' SUCCESS: COMPREHEND IS SUPPORTED IN {}'.format(region),
        ' Please proceed with this notebook.',
    ]

print('\n'.join([banner] + report + [banner]))

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 SUCCESS: COMPREHEND IS SUPPORTED IN us-east-1
 Please proceed with this notebook.
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


In [3]:
# Comprehend client used by every Comprehend call below. No explicit region or
# retry config is passed, so boto3's default session settings apply.
comprehend = boto3.client('comprehend')

### Retrieve S3 location of training data

In [4]:
# Restore `noheader_train_s3_uri` from IPython's %store (it is expected to have
# been stored by the data preparation notebook).
%store -r noheader_train_s3_uri

In [5]:
# Warn when the training-data URI is unavailable. If `%store -r` restored
# nothing, the name does not exist at all, so a plain truthiness test would
# raise NameError instead of showing the banner below.
try:
    training_data_missing = not noheader_train_s3_uri
except NameError:
    training_data_missing = True

if training_data_missing:
    print('****************************************************************************************')
    print('**************** PLEASE RE-RUN THE PREVIOUS DATA PREPARATION NOTEBOOK ******************')
    print('**************** THIS NOTEBOOK WILL NOT RUN PROPERLY ***********************************')
    print('****************************************************************************************')

In [6]:
# Show the S3 URI of the header-less training CSV restored above.
print(noheader_train_s3_uri)

s3://sagemaker-us-east-1-489371866242/data/amazon_reviews_us_Digital_Software_v1_00_noheader.csv


In [7]:
# List the training object via the AWS CLI to confirm it exists in S3.
!aws s3 ls $noheader_train_s3_uri

2020-08-22 16:26:29   13646687 amazon_reviews_us_Digital_Software_v1_00_noheader.csv


## See our prepared training data which we use as input for Comprehend

In [8]:
# Copy the training CSV from S3 into the local ./data/ directory for inspection.
!aws s3 cp $noheader_train_s3_uri ./data/

download: s3://sagemaker-us-east-1-489371866242/data/amazon_reviews_us_Digital_Software_v1_00_noheader.csv to data/amazon_reviews_us_Digital_Software_v1_00_noheader.csv


In [9]:
import csv
import os

# The previous cell copied the object at `noheader_train_s3_uri` into ./data/,
# so derive the local file name from that URI rather than hard-coding it. This
# keeps the cell working if the data preparation notebook produces another file.
local_train_path = os.path.join('./data', os.path.basename(noheader_train_s3_uri))

# The file has no header row, so pandas assigns integer column names.
df = pd.read_csv(local_train_path, header=None)
df.head()

Unnamed: 0,0,1
0,1,Downloading this software onto an iPad is diff...
1,5,It made taxes very easy and I was happy to use...
2,2,The truth? I’ve used some form of Norton for s...
3,2,Save your money. While School Zone does have g...
4,5,Great update; Powerpoint is so much fun! Somew...


# Create Data Access Role for Comprehend

## Create Policy

In [10]:
# Trust policy: allows the Comprehend service principal to assume the role
# via sts:AssumeRole.
comprehend_principal = {"Service": "comprehend.amazonaws.com"}

assume_role_statement = {
    "Effect": "Allow",
    "Principal": comprehend_principal,
    "Action": "sts:AssumeRole",
}

assume_role_policy_doc = {
    "Version": "2012-10-17",
    "Statement": [assume_role_statement],
}

## Create Role and Attach Policies

In [11]:
# Name of the IAM role that is created (or reused) for Comprehend below.
iam_comprehend_role_name = 'DSOAWS_Comprehend'

In [12]:
import json
import time

from botocore.exceptions import ClientError

# Create the IAM role with the trust policy defined above. If a role with this
# name already exists, fetch it instead so `iam_role_comprehend` always holds a
# response containing the role's details.
try:
    iam_role_comprehend = iam.create_role(
        RoleName=iam_comprehend_role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
        Description='DSOAWS Comprehend Role'
    )
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        iam_role_comprehend = iam.get_role(RoleName=iam_comprehend_role_name)
        print("Role already exists")
    else:
        print("Unexpected error: %s" % e)
        # Re-raise: continuing would leave `iam_role_comprehend` undefined and
        # surface later as an unrelated NameError when the role ARN is read.
        raise

# Fixed pause before the role is used; presumably this gives IAM time to
# propagate the new role -- TODO confirm the required duration.
time.sleep(30)

In [13]:
# ARNs for the bucket itself and for every object inside it.
bucket_arn = "arn:aws:s3:::{}".format(bucket)
bucket_objects_arn = "arn:aws:s3:::{}/*".format(bucket)

# Inline policy with one Allow statement per action: read objects, list the
# bucket, and write objects.
comprehend_s3_policy_doc = {
    "Version": "2012-10-17",
    "Statement": [
        {"Action": [action], "Resource": [resource], "Effect": "Allow"}
        for action, resource in (
            ("s3:GetObject", bucket_objects_arn),
            ("s3:ListBucket", bucket_arn),
            ("s3:PutObject", bucket_objects_arn),
        )
    ],
}

print(comprehend_s3_policy_doc)

{'Version': '2012-10-17', 'Statement': [{'Action': ['s3:GetObject'], 'Resource': ['arn:aws:s3:::sagemaker-us-east-1-489371866242/*'], 'Effect': 'Allow'}, {'Action': ['s3:ListBucket'], 'Resource': ['arn:aws:s3:::sagemaker-us-east-1-489371866242'], 'Effect': 'Allow'}, {'Action': ['s3:PutObject'], 'Resource': ['arn:aws:s3:::sagemaker-us-east-1-489371866242/*'], 'Effect': 'Allow'}]}


# Attach Policy to Role

In [14]:
import time

# Attach the S3 access policy to the Comprehend role as an inline policy.
put_policy_args = dict(
    RoleName=iam_comprehend_role_name,
    PolicyName='DSOAWS_ComprehendPolicyToS3',
    PolicyDocument=json.dumps(comprehend_s3_policy_doc),
)
response = iam.put_role_policy(**put_policy_args)

print(response)

# Fixed pause after the policy change before the role is used.
time.sleep(30)

{'ResponseMetadata': {'RequestId': 'cb8af70a-0744-44bf-97a8-b1a41e1f45ad', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'cb8af70a-0744-44bf-97a8-b1a41e1f45ad', 'content-type': 'text/xml', 'content-length': '206', 'date': 'Sat, 22 Aug 2020 17:16:42 GMT'}, 'RetryAttempts': 0}}


# Train the Model

In [15]:
# S3 location under the session bucket where Comprehend writes training output.
prefix = 'models'

s3_output_job = 's3://{0}/{1}/{2}'.format(bucket, prefix, 'comprehend/output')
print(s3_output_job)

s3://sagemaker-us-east-1-489371866242/models/comprehend/output


In [16]:
# ARN of the role created or fetched above; passed to Comprehend as its data access role.
iam_role_comprehend_arn = iam_role_comprehend['Role']['Arn']

In [17]:
import datetime
import time

# Unix epoch seconds as a unique suffix. `time.time()` is portable, whereas the
# previous `strftime("%s")` relied on a platform-specific directive that is not
# part of Python's documented format codes.
timestamp = str(int(time.time()))

comprehend_training_job_name = 'Amazon-Customer-Reviews-Classifier-{}'.format(timestamp)

print(comprehend_training_job_name)

Amazon-Customer-Reviews-Classifier-1598116632


In [18]:
# Request for a custom document classifier: input is the training CSV, output
# goes to the S3 prefix chosen above, and Comprehend accesses both through the
# data access role.
classifier_request = {
    'DocumentClassifierName': comprehend_training_job_name,
    'DataAccessRoleArn': iam_role_comprehend_arn,
    'InputDataConfig': {'S3Uri': noheader_train_s3_uri},
    'OutputDataConfig': {'S3Uri': s3_output_job},
    'LanguageCode': 'en',
}
training_job = comprehend.create_document_classifier(**classifier_request)

# Fixed pause after submitting the request.
time.sleep(30)

In [19]:
# ARN of the classifier returned by the create call; used for status polling
# and console links below.
comprehend_training_job_arn = training_job['DocumentClassifierArn']

print(comprehend_training_job_arn)

arn:aws:comprehend:us-east-1:489371866242:document-classifier/Amazon-Customer-Reviews-Classifier-1598116632


In [20]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Render a link to the classifier's details page in the Comprehend console.
display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/comprehend/v2/home?region={}#classifier-details/{}">Comprehend Training Job</a></b>'.format(region, comprehend_training_job_arn)))


In [None]:
import time

# Statuses after which the classifier will not change any further, so polling
# can stop. Besides success ("TRAINED") and failure ("IN_ERROR"), a classifier
# can also end up "STOPPED" or "TRAINED_WITH_WARNING"; without these the loop
# would keep polling until the timeout.
terminal_statuses = ("TRAINED", "TRAINED_WITH_WARNING", "IN_ERROR", "STOPPED")

# Poll every 10 seconds for at most 3 hours.
max_time = time.time() + 3 * 60 * 60 # 3 hours
while time.time() < max_time:
    describe_custom_classifier = comprehend.describe_document_classifier(
        DocumentClassifierArn=comprehend_training_job_arn
    )
    status = describe_custom_classifier["DocumentClassifierProperties"]["Status"]
    print("Custom classifier: {}".format(status))

    if status in terminal_statuses:
        print('')
        print('Status {}'.format(status))
        print('')
        print(describe_custom_classifier["DocumentClassifierProperties"])
        break

    time.sleep(10)
else:
    # Runs only when the loop ended without `break`, i.e. the deadline passed.
    print('Timed out waiting for the classifier to reach a final status.')

Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom cla

# _Please Wait Until the ^^ Classifier ^^ is Trained Above._

# Show Results of the Classifier

In [None]:
# Print the classifier properties from the most recent describe call in the polling loop.
print(describe_custom_classifier["DocumentClassifierProperties"])

In [None]:
# ARN of the trained classifier; the endpoint created below is attached to it.
model_arn = describe_custom_classifier["DocumentClassifierProperties"]["DocumentClassifierArn"]
print(model_arn)

In [None]:
import os
from urllib.parse import urlparse

# S3 URI of the classifier's output, as reported by Comprehend.
job_output = describe_custom_classifier["DocumentClassifierProperties"]["OutputDataConfig"]["S3Uri"]
print(job_output)

path_prefix = 's3://{}/'.format(bucket)

# The object key is the URI's path without its leading slash. Parsing the URI
# avoids `os.path.relpath`, which treats the URI as a local file path and so
# depends on the OS path separator and the current working directory.
job_key = urlparse(job_output).path.lstrip('/')

print(job_key)

# Download Model Artifacts including Training Metrics

In [None]:
# Download the output archive identified by `job_key` from the session bucket
# to ./output.tar.gz in the working directory.
s3 = boto3.resource('s3')

s3.Bucket(bucket).download_file(job_key, './output.tar.gz')

In [None]:
#Unpack the gzip file
!tar xvzf ./output.tar.gz

In [None]:
import json

# Read the confusion-matrix JSON from the unpacked output and pretty-print it.
confusion_matrix_path = './output/confusion_matrix.json'
with open(confusion_matrix_path) as json_file:
    data = json.loads(json_file.read())

print(json.dumps(data, indent=2, default=str))

In [None]:
!pip install tabulate

In [None]:
from IPython.display import HTML, display
import tabulate

# Render the 5x5 confusion matrix as an HTML table. Rows are labelled
# "(Actual)" and columns "(Predicted)".
# NOTE(review): assumes the matrix rows and columns are ordered by labels 1-5 -- confirm.
star_labels = ['1', '2', '3', '4', '5']
matrix = data['confusion_matrix']

# Header row, then one row per label, then the axis caption row.
table = [[''] + star_labels + ['(Predicted)']]
for row_index, label in enumerate(star_labels):
    table.append([label] + [matrix[row_index][col_index] for col_index in range(len(star_labels))])
table.append(['(Actual)'])

display(HTML(tabulate.tabulate(table, tablefmt='html')))

# Deploy Endpoint

In [None]:
from time import gmtime, strftime, sleep

# Endpoint name with a day-hour-minute-second suffix taken from the current UTC time.
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())
comprehend_endpoint_name = 'comprehend-inference-ep-{}'.format(timestamp_suffix)

# Create an endpoint for the trained classifier with one inference unit.
endpoint_request = {
    'EndpointName': comprehend_endpoint_name,
    'ModelArn': model_arn,
    'DesiredInferenceUnits': 1,
}
inference_endpoint_response = comprehend.create_endpoint(**endpoint_request)

In [None]:
# ARN of the endpoint returned by the create call; used for polling and predictions below.
comprehend_endpoint_arn = inference_endpoint_response["EndpointArn"]
print(comprehend_endpoint_arn)

# Pass Variables to the Next Notebook(s)

In [None]:
# Persist the endpoint ARN with IPython's %store so later notebooks can restore it.
%store comprehend_endpoint_arn

# Predict with Endpoint

In [None]:
# Fetch and print the endpoint's current description.
describe_response = comprehend.describe_endpoint(
    EndpointArn = comprehend_endpoint_arn
)
print(describe_response)

In [None]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Render a link to the endpoint's details page in the Comprehend console.
display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/comprehend/v2/home?region={}#classifier-details/{}/endpoints/{}/details">Comprehend Model Endpoint</a></b>'.format(region, comprehend_training_job_arn, comprehend_endpoint_arn)))

In [None]:
import time

# Statuses at which polling stops. Comprehend reports a failed endpoint as
# "FAILED"; the previous check only looked for "IN_ERROR", so a failed endpoint
# kept the loop running until the timeout. "IN_ERROR" is retained so the
# earlier behaviour is a strict subset.
endpoint_final_statuses = ("IN_SERVICE", "FAILED", "IN_ERROR")

# Poll every 5 seconds for at most 3 hours.
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_response = comprehend.describe_endpoint(
        EndpointArn=comprehend_endpoint_arn
    )
    status = describe_response["EndpointProperties"]["Status"]
    print("Endpoint: {}".format(status))

    if status in endpoint_final_statuses:
        break

    time.sleep(5)
else:
    # Runs only when the loop ended without `break`, i.e. the deadline passed.
    print('Timed out waiting for the endpoint to reach a final status.')

In [None]:
import json

# Classify a clearly positive review with the deployed endpoint and
# pretty-print the raw response.
txt = "I loved it!  I will recommend this to everyone."

response = comprehend.classify_document(EndpointArn=comprehend_endpoint_arn, Text=txt)

print(json.dumps(response, indent=2, default=str))

In [None]:
import json

# Classify a lukewarm review with the deployed endpoint and pretty-print the
# raw response.
txt = "It's OK."

response = comprehend.classify_document(EndpointArn=comprehend_endpoint_arn, Text=txt)

print(json.dumps(response, indent=2, default=str))

In [None]:
import json

# Classify a clearly negative review with the deployed endpoint and
# pretty-print the raw response.
txt = "Really bad.  I hope they don't make this anymore."

response = comprehend.classify_document(EndpointArn=comprehend_endpoint_arn, Text=txt)

print(json.dumps(response, indent=2, default=str))

In [None]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Render a link to the endpoint's details page in the Comprehend console.
display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/comprehend/v2/home?region={}#classifier-details/{}/endpoints/{}/details">Comprehend Model Endpoint</a></b>'.format(region, comprehend_training_job_arn, comprehend_endpoint_arn)))

In [None]:
%%javascript
// Save a checkpoint of this notebook, then delete its session.
// NOTE(review): relies on the `Jupyter` global of the classic Notebook front end;
// presumably deleting the session shuts down the kernel -- confirm.
Jupyter.notebook.save_checkpoint();
Jupyter.notebook.session.delete();