## Setup

In [None]:
%pip install -r requirements.txt > /dev/null

In [None]:
!python -m pip install amazon-textract-response-parser

In [None]:
%matplotlib inline
import time
import json
import requests
import uuid

import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import smart_open

from time import sleep
from matplotlib import cm, colors
from spacy import displacy
from collections import Counter
from pyvis.network import Network
from trp import Document

In [None]:
# AWS service clients used throughout the notebook.
# Explicit boto3 session, used to build the session-scoped S3 client.
session = boto3.Session()

# Amazon S3: one client from the explicit session, one from the default session.
s3_client = session.client(service_name="s3")
s3 = boto3.client('s3')

# Amazon Textract
textract = boto3.client('textract')

# Amazon Comprehend
comprehend_client = boto3.client('comprehend')

# Enter your Amazon S3 bucket name 

In [None]:
# Constants for S3 bucket and input data file
bucket = "<enter your s3 bucket name>"

# Download sample financial documents to S3

We've included a set of Amazon press releases as example documents. Here we upload them as a single file `sample_financial_news_doc.pdf` to an S3 bucket for processing. The same bucket will be used to return service output.

In [None]:
filename = "sample_financial_news_doc.pdf"
# Upload the local file to S3
s3_client.upload_file(filename, bucket, filename)


In [None]:
# Document name in Amazon S3 Bucket
documentName = bucket + '/'+filename

# Convert pdf documents to text using Amazon Textract

In [None]:
def startJob(s3BucketName, objectName):
    """Start an asynchronous Textract text-detection job for an S3 object.

    Args:
        s3BucketName: Name of the S3 bucket that holds the document.
        objectName: Key of the document inside that bucket.

    Returns:
        The "JobId" value of the Textract response.
    """
    s3_object = {'Bucket': s3BucketName, 'Name': objectName}
    started = textract.start_document_text_detection(
        DocumentLocation={'S3Object': s3_object}
    )
    return started["JobId"]

def isJobComplete(jobId):
    """Block until the Textract job is no longer "IN_PROGRESS".

    Polls the job every 5 seconds and prints each status it sees.

    Args:
        jobId: Identifier of the Textract text-detection job.

    Returns:
        The final "JobStatus" string. Note this is a status string, not a
        boolean, so it is truthy whatever the outcome of the job.
    """
    while True:
        current = textract.get_document_text_detection(JobId=jobId)["JobStatus"]
        print("Job status: {}".format(current))
        if current != "IN_PROGRESS":
            return current
        time.sleep(5)

def getJobResults(jobId):
    """Collect every result page of a Textract text-detection job.

    Keeps requesting pages, passing along each response's "NextToken", until
    a response arrives without one.

    Args:
        jobId: Identifier of the Textract text-detection job.

    Returns:
        List of the raw response dicts, in the order they were received.
    """
    pages = []
    request = {'JobId': jobId}
    while True:
        page = textract.get_document_text_detection(**request)
        pages.append(page)
        print("Resultset page recieved: {}".format(len(pages)))

        token = page.get('NextToken')
        if not token:
            return pages
        # Ask for the following page on the next iteration.
        request['NextToken'] = token

In [None]:
jobId = startJob(bucket, filename)
print("Started job with id: {}".format(jobId))

# isJobComplete returns the final status string, which is truthy even when the
# job failed, so compare it explicitly before fetching any results.
textract_status = isJobComplete(jobId)
if textract_status not in ("SUCCEEDED", "PARTIAL_SUCCESS"):
    raise RuntimeError(
        "Textract job {} finished with status {}".format(jobId, textract_status)
    )
response = getJobResults(jobId)


# Convert the extracted data from Amazon Textract into a UTF-8 text file

In [None]:
# Write the Textract output to a UTF-8 text file, one document page per line.
text_filename = 'sample_finance_data.txt'
doc = Document(response)
with open(text_filename, 'w', encoding='utf-8') as f:
    for page in doc.pages:
        # Join the detected lines with a space so the last word of one line
        # and the first word of the next are not fused together.
        page_string = ' '.join(str(line.text) for line in page.lines)
        f.write(page_string + "\n")
    

In [None]:
# Load the documents locally for later analysis.
# Read with the same encoding the file was written with; the platform default
# is not guaranteed to be UTF-8.
with open(text_filename, "r", encoding="utf-8") as fi:
    raw_texts = [line.strip() for line in fi]


# Upload this text file to Amazon S3 for Comprehend events analysis jobs

In [None]:
# Upload the local file to S3
s3_client.upload_file(text_filename, bucket, text_filename)

# Metadata Extraction
With Comprehend entity detection
and with Comprehend Events

# Let's extract some metadata using Amazon Comprehend Events

# Two choices here
1. Create Comprehend events analysis job through console OR
2. Start an asynchronous job with the Python SDK by running the notebook cells below

If you want to follow the steps using the AWS Console, click here: https://console.aws.amazon.com/comprehend/v2/home?region=us-east-1#home
and follow the instructions in Chapter 9.

Note: If you are creating the events job using the Amazon Comprehend console, skip the "Start an asynchronous job with the SDK" section and move on to the "Collect the results from S3" section.

### Start an asynchronous job with the SDK

The first task is to kick off the inference job. We'll do this with the `start_events_detection_job` endpoint. Note that the API requires an IAM role with List, Read, and Write access to the bucket specified above.

In [None]:

# S3 locations for the Comprehend Events job: the uploaded text file is the
# input, and the bucket root receives the output.
output_data_s3_path = f's3://{bucket}/'
input_data_s3_path = f's3://{bucket}/{text_filename}'



## Create an IAM role with access to Comprehend and the specified S3 bucket

In [None]:

# IAM role with access to Comprehend and specified S3 buckets
job_data_access_role = '<enter iam role or refer to code in action video>'


In [None]:
# Remaining parameters of the Comprehend Events job.
# Each line of the input file is treated as one document.
input_data_format = 'ONE_DOC_PER_LINE'

# A fresh UUID keeps the job name unique on every run.
job_uuid = uuid.uuid1()
job_name = "events-job-{}".format(job_uuid)

# Event types the job is asked to detect.
event_types = [
    "BANKRUPTCY",
    "EMPLOYMENT",
    "CORPORATE_ACQUISITION",
    "INVESTMENT_GENERAL",
    "CORPORATE_MERGER",
    "IPO",
    "RIGHTS_ISSUE",
    "SECONDARY_OFFERING",
    "SHELF_OFFERING",
    "TENDER_OFFERING",
    "STOCK_SPLIT",
]

In [None]:
# Submit the asynchronous Comprehend Events detection job.
response = comprehend_client.start_events_detection_job(
    JobName=job_name,
    LanguageCode='en',
    DataAccessRoleArn=job_data_access_role,
    TargetEventTypes=event_types,
    InputDataConfig=dict(S3Uri=input_data_s3_path, InputFormat=input_data_format),
    OutputDataConfig=dict(S3Uri=output_data_s3_path),
)

# Keep the job id so the job can be polled later.
events_job_id = response['JobId']

# The code above submits a Comprehend analysis job.
Once the job has completed, you can look up its Job Id in the Amazon Comprehend console:
https://console.aws.amazon.com/comprehend/v2/home?region=us-east-1#analysis

 Note that, as an asynchronous inference job, the task will take several minutes to complete. 

# If you have created events job using Comprehend console, go to the analysis job and copy the job id and paste it below else continue.

In [None]:
# Only needed when the events job was created in the Comprehend console:
# uncomment the line below and paste the id of the completed analysis job.
# It stays commented out by default so that it does not overwrite the job id
# returned by start_events_detection_job with a placeholder.
# events_job_id = "<enter completed analysis job id>"

In [None]:
# Poll the Comprehend Events job until it completes.
timeout_minutes = 30
poll_seconds = 60
waited = 0

job = comprehend_client.describe_events_detection_job(JobId=events_job_id)
status = job['EventsDetectionJobProperties']['JobStatus']
while status != 'COMPLETED':
    # A failed or stopped job never reaches COMPLETED, so fail fast instead of
    # polling until the timeout expires.
    if status in ('FAILED', 'STOP_REQUESTED', 'STOPPED'):
        raise RuntimeError(
            "Events job {} ended with status {}: {}".format(
                events_job_id,
                status,
                job['EventsDetectionJobProperties'].get('Message', ''),
            )
        )
    # Raise explicitly: an assert would be stripped when Python runs with -O.
    if waited >= timeout_minutes * 60:
        raise TimeoutError("Job timed out after %d seconds." % waited)

    sleep(poll_seconds)
    waited += poll_seconds
    job = comprehend_client.describe_events_detection_job(JobId=events_job_id)
    status = job['EventsDetectionJobProperties']['JobStatus']
    print("Job Status {}".format(status))
    

In [None]:
# The output filename is the input filename + ".out"
output_data_s3_file = job['EventsDetectionJobProperties']['OutputDataConfig']['S3Uri'] + text_filename + '.out'
print(output_data_s3_file)

# Load the output: one JSON document per line. Lines read from a file keep
# their trailing newline, so test the stripped line to skip blank ones
# instead of passing them to json.loads.
with smart_open.open(output_data_s3_file) as fi:
    results = [json.loads(line) for line in fi if line.strip()]

## Analyzing Comprehend Events output

The remainder of this notebook provides examples of different ways to analyze a given document. For our example document, we'll use the kind of online posting that a Financial analyst might consume when projecting market trends, a [2017 press release about Amazon's acquisition of Whole Foods Market, Inc.](https://press.aboutamazon.com/news-releases/news-release-details/amazoncom-announces-third-quarter-sales-34-437-billion). It's the first document in the data set we submitted to the Comprehend Events API.

### Understanding Comprehend Events system output

The system returns JSON output for each submitted document. The structure of a response is shown below. Note:

* Events system output contains separate objects for `Entities` and `Events`, each organized into groups of coreferential objects.  
* Two additional fields, `File` and `Line` will be present as well to track document provenance.

In [None]:
# Use the first result document for analysis
result = results[0]

In [None]:
result

#### Events are groups of Triggers

* The API output includes the text, character offset, and type of each trigger.  

* Confidence scores for classification tasks are given as `Score`. Confidence of event group membership is given with `GroupScore`.  

In [None]:
result['Events'][1]['Triggers']

#### Arguments are linked to Entities by EntityIndex

* The API also returns the classification confidence of the role assignment.
The role describes how the entity is related to the event.

In [None]:
result['Events'][1]['Arguments']

#### Entities are groups of Mentions

* The API output includes the text, character offset, and type of each mention.  

* Confidence scores for classification tasks are given as `Score`. Confidence of entity group membership is given with `GroupScore`.  

In [None]:
result['Entities'][5]['Mentions']

### Visualizing the Events and Entities

In the remainder of the notebook, we'll give a number of tabulations and visualizations to help understand what the API is returning.

First we'll consider visualization of spans, both triggers and entity mentions. One of the most essential visualization tasks for sequence labeling tasks is highlighting of tagged text in documents. For demo purposes, we'll do this with [displaCy](https://spacy.io/usage/visualizers).

In [None]:
# Convert Events output to displaCy format.
def _to_displacy_span(item):
    """Map a mention or trigger dict onto displaCy's start/end/label keys."""
    return {'start': item['BeginOffset'], 'end': item['EndOffset'], 'label': item['Type']}


entities = [
    _to_displacy_span(mention)
    for group in result['Entities']
    for mention in group['Mentions']
]

triggers = [
    _to_displacy_span(trigger)
    for event in result['Events']
    for trigger in event['Triggers']
]

# Spans need to be sorted for displaCy to process them correctly
spans = sorted(entities + triggers, key=lambda span: span['start'])
tags = [span['label'] for span in spans]

output = [{"text": raw_texts[0], "ents": spans, "title": None, "settings": {}}]

In [None]:
# Misc. objects for presentation purposes: one colour per distinct label.
# `tags` holds one label per span, so de-duplicate it (keeping first-seen
# order) before sampling colours; otherwise colours are spread over spans
# rather than labels and different labels can end up nearly identical.
unique_tags = list(dict.fromkeys(tags))

# matplotlib.cm.get_cmap was deprecated and then removed in Matplotlib 3.9;
# pyplot.get_cmap accepts the same (name, lut) arguments.
spectral = plt.get_cmap("Spectral", max(len(unique_tags), 1))
tag_colors = [colors.rgb2hex(spectral(i)) for i in range(len(unique_tags))]
color_map = dict(zip(unique_tags, tag_colors))

In [None]:
# Note that only Entities participating in Events are shown.
displacy.render(output, style="ent", options={"colors": color_map}, manual=True)

### Rendering as tabular data

Many users will use Events to create structured data from unstructured text. Here we'll demonstrate how to do this with `pandas`. First, we flatten hierarchical JSON to pandas dataframe. 

In [None]:
# Entity dataframe: one row per mention, tagged with the index of the entity
# group it belongs to (the index is not in the output, so it is added here).
entity_rows = []
for ent_idx, ent_group in enumerate(result['Entities']):
    for mention_row in ent_group['Mentions']:
        entity_rows.append({"EntityIndex": ent_idx, **mention_row})
entities_df = pd.DataFrame(entity_rows)

# Events dataframe: one row per (argument, trigger) pair of each event,
# tagged with the index of the event.
event_rows = []
for evt_idx, evt_group in enumerate(result['Events']):
    for argument_row in evt_group['Arguments']:
        for trigger_row in evt_group['Triggers']:
            event_rows.append({"EventIndex": evt_idx, **argument_row, **trigger_row})
events_df = pd.DataFrame(event_rows)

# Join the two tables into one flat data structure.
events_df = events_df.merge(
    entities_df,
    on="EntityIndex",
    suffixes=('Event', 'Entity'),
)

In [None]:
events_df

### A more succinct representation

We're primarily interested in the *event structure*, so let's make that more transparent by creating a new table with Roles as column headers, grouped by Event.

In [None]:
def format_compact_events(x):
    """Collapse groups of mentions and triggers into a single set.

    Args:
        x: DataFrame with the rows of one event group. Reads the `TypeEvent`,
            `TextEvent`, `Role` and `TextEntity` columns.

    Returns:
        dict with "EventType" (the most frequent `TypeEvent` value),
        "Triggers" (set of `TextEvent` values) and one key per distinct role
        holding the set of `TextEntity` values filling that role.
    """
    # Take the most commonly occurring EventType and the set of triggers.
    d = {"EventType": Counter(x['TypeEvent']).most_common(1)[0][0],
         "Triggers": set(x['TextEvent'])}
    # For each distinct argument Role, collect the set of mentions in the
    # group. unique() keeps first-seen order and avoids recomputing the same
    # filter once per duplicate row.
    for role in x['Role'].unique():
        d[role] = set(x.loc[x['Role'] == role, 'TextEntity'])
    return d

# Group data by EventIndex and format.
# Iterating the groups directly yields the same per-event dicts, in the same
# sorted EventIndex order, without relying on GroupBy.apply (whose handling
# of the grouping column is deprecated) and without failing on an empty frame.
event_analysis_df = pd.DataFrame(
    [format_compact_events(group) for _, group in events_df.groupby("EventIndex")]
).fillna('')

In [None]:
event_analysis_df

### Graphing event semantics

The most striking representation of Comprehend Events output is found in a semantic graph, a network of the entities and events referenced in a document or documents. The code below uses two open source libraries, `networkx` and `pyvis`, to render events system output. In the resulting graph, nodes are entity mentions and triggers, while edges are the argument roles held by the entities in relation to the triggers.
In the graph, vertices are entity mentions and triggers; edges are the argument roles held by the entities in relation to the triggers

#### Formatting the data

System output must first be conformed to the node (i.e., vertex) and edge list format required by `networkx`. This requires iterating over triggers, entities, and argument structural relations. Note that we can use the `GroupScore` and `Score` keys on various objects to prune nodes and edges in which the model has less confidence. We can also use various strategies to pick a 'canonical' mention from each mention group to appear in the graph; here we chose the mention with the string-wise longest extent.

In [None]:
# Entities are associated with events by group, not individual mention; for simplicity, 
# assume the canonical mention is the longest one.
def get_canonical_mention(mentions):
    """Pick the mention with the longest 'Text' from a mention group.

    Args:
        mentions: Iterable of mention dicts, each with a 'Text' key.

    Returns:
        A one-element list holding the longest mention (the last one wins on
        ties), or an empty list when `mentions` is empty.
    """
    longest = None
    for mention in mentions:
        # `>=` keeps the last of several equally long mentions.
        if longest is None or len(mention['Text']) >= len(longest['Text']):
            longest = mention
    return [] if longest is None else [longest]

# Set a global confidence threshold
thr = 0.5

# Nodes are (id, type, tag, score, mention_type) tuples.
# Only the first trigger of each event is used to represent that event.
trigger_nodes = [
    ("tr%d" % i, t['Type'], t['Text'], t['Score'], "trigger")
    for i, e in enumerate(result['Events'])
    for t in e['Triggers'][:1]
    if t['GroupScore'] > thr
]
entity_nodes = [
    ("en%d" % i, m['Type'], m['Text'], m['Score'], "entity")
    for i, e in enumerate(result['Entities'])
    for m in get_canonical_mention(e['Mentions'])
    if m['GroupScore'] > thr
]

# Edges are (trigger_id, node_id, role, score) tuples.
# Keep an edge only when both of its endpoints survived the threshold above;
# otherwise adding the edge to a networkx graph would silently re-create the
# pruned node without label, colour or size attributes.
kept_node_ids = {node[0] for node in trigger_nodes + entity_nodes}
argument_edges = []
for event_idx, event_group in enumerate(result['Events']):
    trigger_id = "tr%d" % event_idx
    if trigger_id not in kept_node_ids:
        continue
    for arg in event_group['Arguments']:
        entity_id = "en%d" % arg['EntityIndex']
        if arg['Score'] > thr and entity_id in kept_node_ids:
            argument_edges.append((trigger_id, entity_id, arg['Role'], arg['Score']))

#### Create a compact graph

Once the nodes and edges are defined, we can create and visualize the graph.

In [None]:
G = nx.Graph()

# Add one node per trigger and per entity mention. Entities are captioned
# with their text, triggers with their type.
for node_id, node_tag, node_text, confidence, kind in trigger_nodes + entity_nodes:
    if kind.startswith("entity"):
        caption = node_text
    else:
        caption = node_tag
    G.add_node(
        node_id,
        label=caption,
        size=confidence*10,
        color=color_map[node_tag],
        tag=node_tag,
        group=kind,
    )

# Add one edge per argument role assignment.
for trigger_id, ent_id, role_name, confidence in argument_edges:
    G.add_edge(
        trigger_id,
        ent_id,
        label=role_name,
        weight=confidence*100,
        color="grey",
    )

# Drop mentions that don't participate in events
unconnected = list(nx.isolates(G))
G.remove_nodes_from(unconnected)

In [None]:
nt = Network("600px", "800px", notebook=True, heading="")
nt.from_nx(G)
nt.show("compact_nx.html")

#### A more complete graph

The graph above is compact, only relaying essential event type and argument role information. We can use a slightly more complicated set of functions to graph all of the information returned by the API.

In [None]:
# This convenience function in `events_graph.py` plots a complete graph of the document,
# showing all events, triggers, entities, and their groups.

import events_graph as evg

evg.plot(result, node_types=['event', 'trigger', 'entity_group', 'entity'], thr=0.5)

# Clean up

Delete Amazon S3 Bucket and objects in the bucket