# Classifying documents and setting up human in the loop for active learning
In this chapter we will walk you through a reference architecture that shows how to set up a custom classification model with Amazon Comprehend and add a feedback loop with Amazon A2I for active learning on your Comprehend custom model.
Here is conceptual architectural flow:
    
### First, using the architecture below, we will show you how to classify documents by training a custom classification model and then creating a real-time endpoint for testing

![alt-text](cha15train.png)

### Second, we will show you how to use this endpoint with a human in the loop to set up a model retraining and active learning workflow

![alt-text](chapter15retrain.png)

You can automate the entire end-to-end flow using AWS Step Functions and AWS Lambda for orchestration.

We will walk you through the following steps to classify documents such as pay stubs and bank statements.

### Step 1: Setup and upload  sample documents to Amazon S3
### Step 2: Extract text from sample documents using Amazon Textract
### Step 3: Create Amazon Comprehend Classification training job
### Step 4: Create Amazon Comprehend real time endpoints and test a sample document
### Step 5: Setting up active learning with comprehend realtime endpoint using human in the loop 

Let's start by executing the steps below.


## Step 1: Setup and upload  sample documents to Amazon S3

In [None]:
import boto3
import pandas as pd
import numpy as np
import os
import os.path
import time
import json
from sklearn import metrics
from datetime import datetime
from pytz import timezone
from PIL import Image, ImageDraw, ImageFont
import time
import multiprocessing as mp
import io
from pathlib import Path
import botocore
import sagemaker
import boto3
import io
import json
import uuid
import time
# Document
from pprint import pprint

from IPython.display import Image, display
from PIL import Image as PImage, ImageDraw

# Shared AWS service clients used by the cells below. No region is passed,
# so each client uses the default boto3 session configuration.
s3=boto3.client('s3')

textract = boto3.client('textract')
comprehend=boto3.client("comprehend")


Enter a bucket name to create an S3 bucket in your account

In [None]:
# Name of the bucket that will hold the training documents and all outputs.
# NOTE(review): "MMDD" looks like a placeholder to be replaced with your own
# suffix before running; S3 bucket names must be unique and lowercase.
data_bucket = "doc-processing-bucket-MMDD"
# Region configured for the default boto3 session.
region = boto3.session.Session().region_name

# Export both values so the `!aws` shell commands below can read them as
# $BUCKET and $REGION.
os.environ["BUCKET"] = data_bucket
os.environ["REGION"] = region

# Create the S3 bucket. us-east-1 is created without a LocationConstraint;
# every other region passes its own name as the constraint.
if region=='us-east-1':
    !aws s3api create-bucket --bucket $BUCKET
else:
    !aws s3api create-bucket --bucket $BUCKET --create-bucket-configuration LocationConstraint=$REGION

In [None]:
# Recursively copy the local documents/train folder to the train/ prefix of
# the bucket named in `data_bucket`.
!aws s3 cp documents/train s3://{data_bucket}/train --recursive

# Below cell defines a function to get s3 bucket items

In [None]:
def get_s3_bucket_items(bucket, prefix, start_after):
    """List the object keys under an S3 prefix and derive the folder names.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Key prefix the listing is restricted to (for example 'train').
        start_after: Key after which the listing starts (sent to S3 as
            ``StartAfter``).

    Returns:
        A tuple ``(list_items, names, images)`` where
        ``list_items`` is every key returned by the listing,
        ``names`` is the sorted list of distinct parent folders of those keys
        with ``prefix`` removed and surrounding slashes stripped, and
        ``images`` is every key that is not itself a folder entry.
        Anything under ``.ipynb_checkpoints`` is left out of ``names`` and
        ``images``. All three lists are empty when nothing matches.
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket,
                                       Prefix=prefix,
                                       StartAfter=start_after)

    # A page has no 'Contents' key when the listing matches no objects, so
    # default to an empty list instead of raising KeyError.
    list_items = [item['Key']
                  for page in page_iterator
                  for item in page.get('Contents', [])]

    # Parent "folder" of every key, with a trailing slash so it compares equal
    # to the zero-byte folder placeholder keys S3 consoles create.
    folders = {os.path.dirname(key) + '/' for key in list_items}

    images = [key for key in list_items
              if key not in folders and '.ipynb_checkpoints' not in key]
    # Sorted so the label order is reproducible between runs.
    names = sorted(folder.replace(prefix, '').strip('/')
                   for folder in folders
                   if '.ipynb_checkpoints' not in folder)
    return list_items, names, images

In [None]:

# List everything stored under the train/ prefix of the data bucket.
train_objects, names, train_images = get_s3_bucket_items(data_bucket, 'train', 'train/')

# Gather the per-split key lists, then flatten them into one list of keys.
images = [train_images]
if isinstance(images[0], list):
    images = [key for group in images for key in group]

# Preview the detected class names and the first few document keys.
names, images[:10]

# Setting up local directory structure for extracted data

In [None]:
# Local output roots (under the current working directory) for the extracted
# words and for their bounding boxes.
word_prefix = f"{os.getcwd()}/SAMPLE8/WORDS/"
box_prefix = f"{os.getcwd()}/SAMPLE8/BBOX/"

# Step 2: Extract text from sample documents using Amazon Textract

In [None]:
#### FUNCTION FOR EXTRACTING TEXT FROM EACH DOCUMENT AND STORING AS .TXT FILE FOR TRAIN LAYOUTLM USING TEXTRACT

def textract_store_train_LM(table, bucket=data_bucket):
    """Run Textract text detection on one S3 document and save the result locally.

    Every WORD block found by Textract is written, one entry per line, to two
    files: the word text to ``<word_prefix><table>.txt`` and the matching
    bounding box to ``<box_prefix><table>.txt``. Parent directories are
    created as needed.

    Args:
        table: S3 object key of the document to process.
        bucket: Bucket that holds the document. The default is the value
            ``data_bucket`` had when this function was defined.

    Returns:
        None. Any failure is printed together with the document location
        rather than raised, so a failing document does not interrupt the
        caller.
    """
    try:
        response = textract.detect_document_text(
            Document={'S3Object': {'Bucket': bucket, 'Name': table}})

        # Keep only word-level blocks; boxes and words stay index-aligned.
        word_blocks = [block for block in response["Blocks"]
                       if block["BlockType"] == "WORD"]
        boxes = [block['Geometry']['BoundingBox'] for block in word_blocks]
        words = [block["Text"] for block in word_blocks]

        print(word_prefix)
        print(os.path.dirname(table))

        # Mirror the S3 folder layout under both local output roots.
        folder = os.path.dirname(table)
        Path(word_prefix + folder).mkdir(parents=True, exist_ok=True)
        Path(box_prefix + folder).mkdir(parents=True, exist_ok=True)

        with open(word_prefix + table + '.txt', 'w', encoding="utf-8") as words_file:
            words_file.writelines(word + '\n' for word in words)
        with open(box_prefix + table + '.txt', 'w', encoding="utf-8") as boxes_file:
            boxes_file.writelines(str(box) + '\n' for box in boxes)
    except Exception as e:
        # Best effort: report which document failed and carry on.
        print(f"Failed to extract text from s3://{bucket}/{table}: {e}")

# Call the Textract function defined above

In [None]:
tic = time.time()
# The context manager releases the worker processes even if map() raises.
# map() itself blocks until every document key has been processed.
with mp.Pool(mp.cpu_count()) as pool:
    pool.map(textract_store_train_LM, images)
print("--- %s seconds for extracting ---" % (time.time() - tic))

# Step 3: Create Amazon Comprehend Classification training job

This section deals with the processing of data for training the comprehend model. 

The code block below maps extracted text file path and reads the text from each file and stores in a dataframe with the corresponding label in a different column.


In [None]:
## LOOP THROUGH THE LABEL DIRECTORIES AND COLLECT THE TEXT OF EVERY EXTRACTED DOCUMENT
def data_retriever_from_path(path, labels=None):
    """Read every extracted text file below ``path`` together with its label.

    Each label is expected to be a sub-directory of ``path``; labels whose
    directory does not exist are skipped. Files are read in sorted order and
    newlines in their content are replaced by spaces.

    Args:
        path: Directory prefix that is concatenated directly with each label,
            so it should end with a path separator.
        labels: Iterable of label (sub-directory) names. Defaults to the
            module-level ``names`` list.

    Returns:
        A tuple ``(label_compre, text_compre)`` of two equally long lists:
        the label of each document and the document text.
    """
    if labels is None:
        labels = names

    # label or class or target list
    label_compre = []
    # text file data list
    text_compre = []

    # dict.fromkeys drops duplicate labels while keeping their order.
    for label in dict.fromkeys(labels):
        label_dir = path + label
        if not os.path.isdir(label_dir):
            continue
        for file_name in sorted(os.listdir(label_dir)):
            # The with-block closes the file handle once it has been read.
            with open(label_dir + "/" + file_name, encoding="utf-8") as text_file:
                content = text_file.read()
            label_compre.append(label)
            text_compre.append(content.replace('\n', ' '))
    return label_compre, text_compre

All datasets are therefore combined into one and fed to Comprehend, regardless of your S3 bucket structure.
The text of each document is saved in one pandas row (one document per row), with the corresponding class in another column.

In [None]:
# Read the labels and texts of the extracted training documents.
path = word_prefix + 'train/'
label_compre_train, text_compre_train = data_retriever_from_path(path)

# Gather the per-split results, then flatten them into single flat lists.
label_compre = [label_compre_train]
text_compre = [text_compre_train]

if isinstance(label_compre[0], list):
    label_compre = [label for group in label_compre for label in group]
    text_compre = [text for group in text_compre for text in group]

# One row per document: its class label and its full text.
data_compre = pd.DataFrame({"label": label_compre, "document": text_compre})
data_compre

### Creating the training file from the extracted text and saving it in Amazon S3

In [None]:
# Where the training CSV is stored; adjust these three values if needed.
key = 'comprehend_train_data.csv'
input_bucket = data_bucket
output_bucket = data_bucket

# Serialise the dataframe in memory, without index or header row.
csv_compre = io.StringIO()
data_compre.to_csv(csv_compre, header=False, index=False)

# Upload the CSV text as a single S3 object.
response2 = s3.put_object(
    Bucket=input_bucket,
    Key=key,
    Body=csv_compre.getvalue(),
)

## Go to the Amazon Comprehend console https://console.aws.amazon.com/comprehend/v2/home?region=us-east-1#classification to create a custom classification job

This job takes about 30 minutes to complete. Once it has completed, move on to the next step.

# Step 4: Create Amazon Comprehend real time endpoints and test a sample document
Go to https://console.aws.amazon.com/comprehend/v2/home?region=us-east-1#endpoints to create a real-time endpoint for your trained classifier, then copy and paste the endpoint ARN below

In [None]:
# Placeholder: replace this text with the ARN of your Comprehend custom
# classification endpoint. It is passed as EndpointArn in later cells.
ENDPOINT_ARN='enter your comprehend custom classification endpoint ARN'

Test the endpoint by passing a test file

In [None]:

# Local sample image used to test the endpoint.
documentName = "paystubsample.png"

# Render the image inline. `Image` is IPython.display.Image here, because
# that import comes after the PIL one and rebinds the name.
display(Image(filename=documentName))

# Extract Text from this sample doc using Textract

In [None]:
# Separate Textract client pinned to the us-east-1 endpoint.
client = boto3.client(
    service_name='textract',
    region_name='us-east-1',
    endpoint_url='https://textract.us-east-1.amazonaws.com',
)

# Load the sample document into memory as raw bytes.
with open(documentName, 'rb') as file:
    img_test = file.read()
    bytes_test = bytearray(img_test)
    print('Image loaded', documentName)

# Run text detection directly on the image bytes.
response = client.detect_document_text(Document={'Bytes': bytes_test})

In [None]:
# Parse the Textract response and join every detected line of text into a
# single newline-terminated string.
from trp import Document

doc = Document(response)
page_string = ''.join(
    str(line.text) + "\n"
    for page in doc.pages
    for line in page.lines
)
print(page_string)

# Pass this extracted text to Comprehend classification real time endpoint to classify the document

In [None]:
# Send the extracted text to the real-time custom classification endpoint
# and print the raw response.
response = comprehend.classify_document(
    Text= page_string,
    EndpointArn=ENDPOINT_ARN
)

print(response)

# Step 5: Setting up active learning with comprehend realtime endpoint using human in the loop 
We have trained a comprehend custom model and created an endpoint for real time inferencing.
In this section we will show you how to set up a human in the loop for model retraining and active learning, using the architecture below

![alt-text](chapter15retrain.png)

### Setting up an Amazon A2I human loop

In this section, you set up a human review loop for low-confidence detection in Amazon A2I. It includes the following steps:

#### Create a Worker Task template.
#### Create a Human review workflow.
#### Creating and Starting A2I human loop
#### Check the human loop status and start labelling



# Environment Setup¶

We need to set up the following data:
WORKTEAM_ARN - To create your private workteam, follow the instructions here: https://docs.aws.amazon.com/sagemaker/latest/dg/sms-workforce-private.html After you have created your workteam, replace the WORKTEAM_ARN placeholder in the cell below with your workteam ARN.


In [None]:
# Import the SageMaker SDK under its own alias. A later cell rebinds the name
# `sagemaker` to a boto3 client, which would make
# `sagemaker.get_execution_role()` fail if this cell were run again.
import sagemaker as sagemaker_sdk

# Region used for the SageMaker and A2I runtime clients.
REGION = 'us-east-1'
# Placeholder: replace with the ARN of your private workteam.
WORKTEAM_ARN = "enter your workteam arn"
# Bucket that receives the human review output.
BUCKET = data_bucket
# IAM role of this notebook; passed as RoleArn when creating the flow definition.
role = sagemaker_sdk.get_execution_role()
region = boto3.session.Session().region_name
# A unique prefix so the task UI and flow definition names do not collide.
prefix = "custom-classify" + str(uuid.uuid1())

In [None]:
# Amazon SageMaker client
# NOTE: this rebinds the name `sagemaker`, which until now referred to the
# imported SageMaker SDK module. From here on `sagemaker` is the boto3 client.
sagemaker = boto3.client('sagemaker', REGION)
# A2I Runtime client
a2i_runtime_client = boto3.client('sagemaker-a2i-runtime', REGION)


# Create a Worker Task template

It is a two-step process:
    
    1. Select the UI template you want to use. For over 70 pre-built UIs, check: https://github.com/aws-samples/amazon-a2i-sample-task-uis
    
    2. Create the task template using the create_human_task_ui API, or do the same thing using the AWS Console.
Refer to this blog to follow the AWS Console steps: https://aws.amazon.com/blogs/machine-learning/active-learning-workflow-for-amazon-comprehend-custom-classification-part-2/



In [None]:
# 1. Worker task template for custom classification. Edit the `categories`
#    list and the instructions so they match your own labels. The text to
#    classify is filled in from the `taskObject` field of the task input.
template = """<script src="https://assets.crowd.aws/crowd-html-elements.js"></script>

<crowd-form>
    <crowd-classifier-multi-select
      name="category"
      categories="['Bank Statement', 'Pay Stubs']"
      header="Select the relevant categories"
    >
      <classification-target>
        {{ task.input.taskObject }}
      </classification-target>
      
      <full-instructions header="Text Categorization Instructions">
        <p><strong>Bank Statement</strong>Related to payments</p>
        <p><strong>Pay Stubs</strong>Related to payment</p>
      </full-instructions>

      <short-instructions>
       Choose all categories that are expressed by the text. 
      </short-instructions>
    </crowd-classifier-multi-select>
</crowd-form>
"""

# Create a worker task template using boto3 API 

https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.create_human_task_ui

In [None]:
def create_task_ui():
    """Create a human task UI from the worker task template.

    Reads the module-level ``taskUIName`` for the UI name and ``template``
    for the UI content.

    Returns:
        dict: The response of ``create_human_task_ui``; callers read its
        ``HumanTaskUiArn`` entry.
    """
    ui_template = {'Content': template}
    return sagemaker.create_human_task_ui(
        HumanTaskUiName=taskUIName,
        UiTemplate=ui_template,
    )

In [None]:
# Task UI name - this value is unique per account and region. You can also provide your own value here.
taskUIName = prefix + '-ui' 

# Create the task UI and keep its ARN; the flow definition refers to it.
humanTaskUiResponse = create_task_ui()
humanTaskUiArn = humanTaskUiResponse['HumanTaskUiArn']
print(humanTaskUiArn)

# Use an Amazon Augmented AI (Amazon A2I) human review workflow, or flow definition, to specify the following: 
  

     The workforce that your tasks will be sent to.

     The instructions that your workforce will receive, which is called a worker task template.

     The configuration of your worker tasks, including the number of workers that receive a task and time limits to complete tasks.

     Where your output data will be stored.
        
To create a flow definition using the SageMaker API, you use the CreateFlowDefinition operation

This demo is going to use the API, but you can optionally create this workflow definition in the console as well.

For more details and instructions, see: https://docs.aws.amazon.com/sagemaker/latest/dg/a2i-create-flow-definition.html.


In [None]:
# Flow definition name - must be unique per account and region; supply your own value if you prefer.
flowDefinitionName = f"{prefix}-fd-a2i"

# Register the human review workflow: who reviews (workteam), what they see
# (task UI), how many workers per task, and where the answers are written.
create_workflow_definition_response = sagemaker.create_flow_definition(
    FlowDefinitionName=flowDefinitionName,
    RoleArn=role,
    HumanLoopConfig={
        "WorkteamArn": WORKTEAM_ARN,
        "HumanTaskUiArn": humanTaskUiArn,
        "TaskCount": 1,
        "TaskDescription": "Read the instructions",
        "TaskTitle": "Classify the text",
    },
    OutputConfig={"S3OutputPath": f"s3://{BUCKET}/output"},
)
# Keep the ARN; it is needed to start human loops.
flowDefinitionArn = create_workflow_definition_response['FlowDefinitionArn']

# Sample Data to Test Comprehend Endpoint and create a request for A2I

In [None]:
# Classify the sample text with the real-time endpoint.
response = comprehend.classify_document(EndpointArn=ENDPOINT_ARN, Text=page_string)
print(response)

# Take the first class returned as the prediction.
top_class = response['Classes'][0]
p = top_class['Name']
score = top_class['Score']

# Re-use `response` for the compact record that the human-loop cell reads.
response = {
    'utterance': page_string,
    'prediction': p,
    'confidence': score,
}
print(response)

# Creating and Starting A2I human loop

For more information https://docs.aws.amazon.com/sagemaker/latest/dg/a2i-start-human-loop.html#a2i-instructions-starthumanloop

When using Amazon A2I for a custom task, a human loop starts when StartHumanLoop is called in your application.

Prerequisites

To complete this procedure, you need:

Input data formatted as a string representation of a JSON-formatted file.

The Amazon Resource Name (ARN) of your flow definition

In [None]:
import json

# Names of the human loops started by this cell.
human_loops_started = []
# Predictions scoring below this value are sent for human review.
CONFIDENCE_SCORE_THRESHOLD = .90

# Only low-confidence predictions go to reviewers, which is what the message
# printed below states. To force a review task for a confident prediction
# while trying out the workflow, raise the threshold above its score.
if response['confidence'] < CONFIDENCE_SCORE_THRESHOLD:
    humanLoopName = str(uuid.uuid4())
    # `taskObject` is the field the worker task template renders.
    human_loop_input = {'taskObject': response['utterance']}

    start_loop_response = a2i_runtime_client.start_human_loop(
        HumanLoopName=humanLoopName,
        FlowDefinitionArn=flowDefinitionArn,
        HumanLoopInput={
            # The input content must be a JSON-formatted string.
            "InputContent": json.dumps(human_loop_input)
        }
    )
    print(human_loop_input)
    human_loops_started.append(humanLoopName)
    print(f'Score is less than the threshold of {CONFIDENCE_SCORE_THRESHOLD}')
    print(f'Starting human loop with name: {humanLoopName}  \n')
else:
    print('No human loop created. \n')

# Navigate to the private worker portal and start Labelling!

Make sure you've invited yourself to your workteam!


In [None]:
# The workteam name is whatever follows the last '/' in the workteam ARN.
workteamName = WORKTEAM_ARN[WORKTEAM_ARN.rfind('/') + 1:]
print("Navigate to the private worker portal and do the tasks. Make sure you've invited yourself to your workteam!")
# `sagemaker` is the boto3 SageMaker client here; the portal URL is built
# from the workteam's SubDomain.
print('https://' + sagemaker.describe_workteam(WorkteamName=workteamName)['Workteam']['SubDomain'])

In [None]:
# Look up the status of the human loop started above. Its description is
# kept only once the status is "Completed".
completed_human_loops = []
resp = a2i_runtime_client.describe_human_loop(HumanLoopName=humanLoopName)
print(f'HumanLoop Name: {humanLoopName}')
print(f'HumanLoop Status: {resp["HumanLoopStatus"]}')
#print(f'HumanLoop Output Destination: {resp["HumanLoopOutput"]}')
print('\n')
    
if resp["HumanLoopStatus"] == "Completed":
    completed_human_loops.append(resp)

# Review the labelling results in Amazon S3


In [None]:
import re  # no longer needed by this cell; kept in case other cells rely on it
import pprint

pp = pprint.PrettyPrinter(indent=4)

# Prefix that separates the bucket from the object key in the output URI.
output_uri_prefix = 's3://' + data_bucket + '/'

for resp in completed_human_loops:
    # Split on the literal prefix. The bucket name must not be treated as a
    # regular expression, since characters such as '.' would act as wildcards.
    splitted_string = resp['HumanLoopOutput']['OutputS3Uri'].split(output_uri_prefix, 1)
    output_bucket_key = splitted_string[1]
    print(output_bucket_key)

    # Download and decode the JSON document holding the reviewers' answers.
    response = s3.get_object(Bucket=data_bucket, Key=output_bucket_key)
    print(data_bucket)
    content = response["Body"].read()
    json_output = json.loads(content)
    pp.pprint(json_output)
    print('\n')

# Combining this augmented data for retraining with original training data

In [None]:
# Print the content of every human answer in the last output file loaded
# above. `json_output` only exists if at least one completed human loop
# was reviewed.
for i in json_output['humanAnswers']:
    x = i['answerContent']
    print(x)


# Clean Up

# Deleting the model endpoint and Comprehend training jobs in your account.
# Run the code below to delete the endpoint

In [None]:
# Delete the real-time classification endpoint identified by ENDPOINT_ARN
# and print the service response.
response = comprehend.delete_endpoint(
    EndpointArn=ENDPOINT_ARN
)


print(response)

# Go to the AWS Console and delete the S3 bucket, the Comprehend model training jobs and the workflow definition

https://docs.aws.amazon.com/AmazonS3/latest/userguide/delete-bucket.html
    
https://docs.aws.amazon.com/sagemaker/latest/dg/a2i-delete-flow-definition.html