In this demo we will use Amazon Comprehend to create a custom document classifier.

---
Load the required libraries

In [None]:
import boto3, os
import json
import numpy as np
import pandas as pd
import time
import feedparser
from datetime import datetime
from io import BytesIO
import io
import gzip

# Shared AWS handles used by the cells below:
# `s3` is the boto3 S3 resource used for file uploads and for fetching the job output,
# `comprehend` is the boto3 Comprehend client used for training and classification calls.
s3 = boto3.resource("s3")
comprehend = boto3.client('comprehend')

---
Update the following two values

1. `bucket` - Enter S3 bucket which will be used to stage the files.
2. `object_prefix` - Enter the prefix in S3 where the object should be uploaded

In [None]:
# S3 staging location: change these two values to a bucket/prefix you own.
bucket = 'ai-data-eu-west-1'
object_prefix = 'comprehend'

# Local path of the test documents file that is written, uploaded and later read back.
test_data_file = 'ag_news_csv/testdata.txt'

---
We are using [AG's corpus of news articles](http://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html) data set to train our Comprehend Custom Model.

Load the data set in CSV format as pandas data frame.

In [None]:
# Load the raw AG news CSV; column names are supplied explicitly via `names`.
train_data = pd.read_csv(
    'ag_news_csv/aggregates.csv',
    names=['ID', 'Title', 'Description'],
    engine='python',
)

---
Drop the *Title* column since we need only class name (ID) and Description.

In [None]:
# Keep only the class label (ID) and the text (Description).
train_data = train_data.drop(columns=['Title'])
# Last expression of the cell: shows three random rows as a sanity check.
train_data.sample(n=3)

---
Write the data frame to a CSV file and upload it to S3

In [None]:
# Write the two remaining columns without header or index column.
train_data.to_csv("ag_news_csv/processed.csv", header=False, index=False)

# Stage the training file under the configured prefix in S3.
s3.Bucket(bucket).upload_file(
    Filename='ag_news_csv/processed.csv',
    Key=object_prefix + '/news-classification.csv',
)

Now create a document classifier job in Comprehend by providing the S3 location where we uploaded the file.

You also need an IAM role whose permission policy grants access to the S3 location and whose trust policy allows Comprehend to assume the role.

Update the variable `data_acess_iam_role` and enter the ARN of the IAM role with the above mentioned policies.

In [None]:
# Timestamp suffix gives each run a distinct classifier name.
timestamp = datetime.now().strftime('%Y-%m-%d-%H%M%S')
input_data = 's3://{}/{}/news-classification.csv'.format(bucket, object_prefix)
# Placeholder ARN: replace with a role Comprehend can assume to read the bucket.
data_acess_iam_role = 'arn:aws:iam::0123345678901:role/service-role/AmazonComprehendServiceRoleS3Access'

# Assemble the training request first, then submit it in one call.
training_request = {
    'DocumentClassifierName': 'docclassifier-techsummit-demo-' + timestamp,
    'DataAccessRoleArn': data_acess_iam_role,
    'InputDataConfig': {'S3Uri': input_data},
    'LanguageCode': 'en',
}
classifier = comprehend.create_document_classifier(**training_request)

# The ARN identifies the classifier in every later call.
classifierArn = classifier['DocumentClassifierArn']
print(classifierArn)

Let's describe the document classifier job to validate it. Status should be either 'SUBMITTED' or 'TRAINING'.

In [None]:
# Fetch the current description of the classifier we just submitted.
classifier_status = comprehend.describe_document_classifier(DocumentClassifierArn=classifierArn)

# default=str stringifies any value json cannot serialise natively
# (presumably the timestamp fields in the response).
print(json.dumps(classifier_status, indent=4, sort_keys=True, default=str))

---
Now let's prepare our test data set. We are going to use the BBC RSS feeds of categories World, Business, Technology and Sports.

In [None]:
# BBC RSS feed URLs, one per news category used for the test set.
world_feed = 'http://feeds.bbci.co.uk/news/world/rss.xml'
business_feed = 'http://feeds.bbci.co.uk/news/business/rss.xml'
tech_feed = 'http://feeds.bbci.co.uk/news/technology/rss.xml'
sports_feed = 'http://feeds.bbci.co.uk/sport/rss.xml'

In [None]:
# Download and parse each feed (in the same order as before) and keep only
# its list of entries.
world_items, business_items, tech_items, sports_items = (
    feedparser.parse(feed_url)['entries']
    for feed_url in (world_feed, business_feed, tech_feed, sports_feed)
)

---
Get the top 4 articles from each feed, extract the summary of each article and store the summaries in a file, one per line.

Upload that file to S3; it will be used to test the trained model.

In [None]:
# Build the test set: up to four summaries per feed, one document per line.
with open(test_data_file, 'w') as testfile:
    for feed in [world_items, business_items, tech_items, sports_items]:
        # Slicing (instead of indexing 0..3) tolerates a feed that returned
        # fewer than four entries, e.g. after a failed download.
        for entry in feed[:4]:
            # Collapse all whitespace, including embedded newlines, so every
            # summary occupies exactly one line. The classification job reads
            # the file as ONE_DOC_PER_LINE and results are later matched back
            # to this file by line number.
            testfile.write(' '.join(entry['summary'].split()) + '\n')

# Stage the test documents next to the training data in S3.
s3.Bucket(bucket).upload_file(test_data_file, object_prefix + '/testdata.txt')

---
Let's make sure that the training job has completed successfully.

In [None]:
# Poll the classifier every 5 seconds until training reaches a terminal state.
#
# A document classifier reports failure as 'IN_ERROR' (not 'FAILED'), and can
# also end as 'STOPPED' or 'TRAINED_WITH_WARNING'. All of them must end the
# loop, otherwise a failed training run would be polled forever. 'FAILED' is
# kept in the set only for backward compatibility with the previous check.
terminal_statuses = ('TRAINED', 'TRAINED_WITH_WARNING', 'IN_ERROR', 'STOPPED', 'FAILED')

print('Document Classifier job is in progress ', end='')
while True:
    status = comprehend.describe_document_classifier(DocumentClassifierArn=classifierArn)
    training_status = status['DocumentClassifierProperties']['Status']
    if training_status in terminal_statuses:
        print('')
        print('Document Classifier job completed with status %s\n' % training_status)
        # Surface the service's explanation, when one is provided, for
        # anything other than a clean success.
        if training_status != 'TRAINED':
            message = status['DocumentClassifierProperties'].get('Message')
            if message:
                print('Message: %s' % message)
        break
    print('.', end='')
    time.sleep(5)


---
Now we have a trained classifier. Create a document classification job and provide the test data set as input.

In [None]:
# Fresh timestamp so the job name is distinct for every run.
timestamp = datetime.now().strftime('%Y-%m-%d-%H%M%S')

# Describe the job first, then submit it in one call.
job_request = {
    'JobName': 'classify-techsummit-demo-' + timestamp,
    'DocumentClassifierArn': classifierArn,
    # Each line of the uploaded test file is treated as a separate document.
    'InputDataConfig': {
        'S3Uri': 's3://{}/{}/testdata.txt'.format(bucket, object_prefix),
        'InputFormat': 'ONE_DOC_PER_LINE',
    },
    'OutputDataConfig': {
        'S3Uri': 's3://{}/{}/output/custom-classifier/'.format(bucket, object_prefix),
    },
    'DataAccessRoleArn': data_acess_iam_role,
}
classify_job = comprehend.start_document_classification_job(**job_request)
print('Job with Id %s is in status %s' % (classify_job['JobId'], classify_job['JobStatus']))

---
Wait for the classification job to complete.

In [None]:
# Poll the classification job every 5 seconds until it reaches a terminal
# state. 'STOPPED' is terminal as well; without it a stopped job would be
# polled forever.
print('Custom Classification job is in progress ', end='')
while True:
    status = comprehend.describe_document_classification_job(JobId=classify_job['JobId'])
    job_properties = status['DocumentClassificationJobProperties']
    job_status = job_properties['JobStatus']
    if job_status in ('COMPLETED', 'FAILED', 'STOPPED'):
        print('')
        print('Custom Classification completed with status %s\n' % job_status)
        break
    print('.', end='')
    time.sleep(5)

# Only a completed job has output to download; fail with the service's own
# explanation instead of an opaque S3 "no such key" error further down.
if job_status != 'COMPLETED':
    raise RuntimeError('Classification job ended with status %s: %s'
                       % (job_status, job_properties.get('Message', 'no message provided')))

# Split the reported output URI "s3://<bucket>/<key...>" into bucket and key.
output_uri_parts = job_properties['OutputDataConfig']['S3Uri'].split('/')
o_bucket = output_uri_parts[2]
o_key = '/'.join(output_uri_parts[3:])

# Download the whole output object into memory and open it as a gzip text stream.
output_gzip = gzip.open(BytesIO(s3.Object(o_bucket, o_key).get()['Body'].read()), 'rt')

---
Now let's look at the result of classification job for our test data

In [None]:
import tarfile

# Class labels as they appear in the training CSV, mapped to readable names.
class_list = {'1': 'World', '2': 'Sports', '3': 'Business', '4': 'Science_Technology'}

# The job output is a gzip-compressed *tar* archive holding JSON-lines text.
# Read it with tarfile instead of stripping the tar header by hand and
# stopping at a hard-coded line count, so any number of documents works.
# `output_gzip` is a text wrapper; `.buffer` is the underlying decompressed
# binary stream, which must not have been read from yet.
classify_result = []
with tarfile.open(fileobj=output_gzip.buffer, mode='r|') as archive:
    for member in archive:
        if not member.isfile():
            continue
        # Assumes every regular file in the archive is JSON-lines output.
        for raw_line in archive.extractfile(member):
            text = raw_line.decode('utf-8').strip()
            if text:
                classify_result.append(json.loads(text))

# The original documents, so each result can be shown next to its text.
with open(test_data_file, 'r') as f:
    lines = f.readlines()

for res in classify_result:
    # 'Line' is used directly as an index into the test file's lines.
    line_num = int(res['Line'])
    # Pick the highest-scoring class explicitly rather than relying on order.
    best_class = max(res['Classes'], key=lambda candidate: candidate['Score'])
    class_detected = best_class['Name']
    class_score = best_class['Score']
    print('Article with description "%s" is classified as "%s" with score "%s".' %
          (lines[line_num].strip(),
           # Fall back to the raw label if it is not one of the four known classes.
           class_list.get(class_detected, class_detected),
           class_score))
    print()