In [1]:
import spacy
import neuralcoref
from spacy.symbols import nsubj, nsubjpass, det, dobj, pobj, prep, root, neg, agent, cc, conj, acl, xcomp, punct, VERB
import io
import datetime
import json
import codecs
import csv
import re
import uuid
import pandas as pd
import numpy as np
import csv
import time
from ipywidgets import IntProgress
from keras.preprocessing.sequence import pad_sequences
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from collections import defaultdict
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import BertForSequenceClassification, AdamW, BertConfig
# Widen pandas' display limits so large frames are rendered without truncation.
for _option_name, _option_value in (('display.max_rows', 500),
                                    ('display.max_columns', 500),
                                    ('display.width', 1000)):
    pd.set_option(_option_name, _option_value)

# Select the compute device once: CUDA when available, CPU otherwise.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print(device)

# Number of CUDA devices visible to torch.
n_gpu = torch.cuda.device_count()

#------------------------------------- Load data ---------------------------------------#

def load_csv_to_dict(csvfile, language, language_position):
    """
    Load a ';'-delimited CSV file into a nested dictionary.

    The first row is treated as the header and the first column of every
    other row as the key. Keys, header names and values are lower-cased.

    Args:
        csvfile: Path of the CSV file (read as UTF-8, a leading BOM is ignored).
        language: Language code to filter on; a row is kept when this string
            occurs in the row's language column. "all" keeps every row.
        language_position: Column index (counted on the full row, key column
            included) that holds the row's language.

    Returns:
        dict mapping key -> {header name: value}. An empty file yields {}.
        Rows that only contain a key produce no entry.

    Raises:
        IndexError: if a kept row is shorter than `language_position` (when
            filtering) or has more columns than the header.
    """
    dictionary = dict()
    # `with` guarantees the handle is closed; newline='' lets the csv module
    # do its own line-ending handling, as its documentation recommends.
    with open('%s' % csvfile, encoding='utf-8-sig', newline='') as handle:
        data = list(csv.reader(handle, delimiter=";"))
    if not data:
        return dictionary      # empty file: nothing to load

    header = [name.lower() for name in data[0][1:]]   # drop the key column's header
    for row in data[1:]:
        if not row:
            continue           # blank line in the file
        # "all" is tested first so unfiltered loads do not depend on the language column.
        if (language == "all") or (language in row[language_position].lower()):
            key = row[0].lower()
            for index, element in enumerate(row[1:]):
                dictionary.setdefault(key, {})[header[index]] = element.lower()
    return dictionary


def setup_spacy(user_text, nlp):
    """
     Lower-cases the user text and runs it through the given pipeline.
     Returns whatever `nlp` returns for the lower-cased text.
    """
    lowered_text = user_text.lower()
    parsed = nlp(lowered_text)
    return parsed

### Matching

In [3]:
#------------------------------------- Matcher Algorithm ---------------------------------------#

def find_match_dict(user_text, dicts, user_highlights, reinforced_highlights, nlp):
    """
     Looks up every token of the user text in a set of dictionaries.

     A token matches a dictionary when its text or its lemma is one of the
     dictionary's keys. Matches the user has already highlighted (according to
     check_user_highlight) are written into `reinforced_highlights`; all other
     matches are returned.

     Args:
        user_text: Raw text; it is lower-cased before parsing.
        dicts: Mapping datatype -> lookup dictionary (e.g. "roles").
        user_highlights: Existing user annotations, passed to check_user_highlight.
        reinforced_highlights: Dict that is filled in place with confirmed matches.
        nlp: spaCy pipeline used to parse the text.

     Returns:
        dict mapping datatype -> {uuid string: {"text", "index", "sent"}} where
        "index" is the (start, end) character span and "sent" the sentence number.
        For "roles", a match that is the root of a longer noun chunk is reported
        with the whole chunk's text and span.
    """
    doc = setup_spacy(user_text, nlp)
    sentences = list(doc.sents)
    noun_chunks = None          # computed once, on the first "roles" match
    results = dict()
    for key in dicts.keys():    # iterates over the datatypes.
        lookup = dicts[key]
        part_dict = dict()
        for sent_counter, sent in enumerate(sentences):
            for token in sent:
                if not (token.text in lookup or token.lemma_ in lookup):
                    continue
                match_id = str(uuid.uuid4())
                span = (token.idx, token.idx + len(token.text))
                if check_user_highlight(user_highlights,
                                        key,
                                        target={"text": token.text, "index": span}) == True:
                    reinforced_highlights[match_id] = {"text": token.text, "index": span}
                    continue

                entry = {"text": token.text, "index": span, "sent": sent_counter}
                if key == "roles":
                    if noun_chunks is None:
                        # The chunks do not change per token, so enumerate them only once.
                        noun_chunks = list(doc.noun_chunks)
                    for chunk in noun_chunks:
                        # Widen the match to the noun chunk this token is the root of.
                        if chunk.root.idx == token.idx and chunk.text != token.text:
                            entry = {"text": chunk.text,
                                     "index": (chunk.start_char, chunk.end_char),
                                     "sent": sent_counter}
                part_dict[match_id] = entry
        results[key] = part_dict
    return results

### Highlight Intersection

In [2]:
#------------------------------------- Intersection Checker ---------------------------------------#

def check_user_highlight(user_highlights, datatype, target):
    """
     Checks whether a target annotation already exists in the user highlights.

     A highlight matches when its (start, end) index pair equals the target's,
     or when the two inclusive index intervals share at least one position.

     Args:
        user_highlights: Mapping datatype -> {id: highlight}. For 'activities'
            the span is read from highlight['label'], otherwise from the
            highlight itself.
        datatype: Which group of highlights to compare against.
        target: Candidate annotation. For 'activities' the span is read from
            target['verb'], otherwise from the target itself.

     Returns:
        True if any highlight of that datatype matches, else False (also when
        the datatype is absent from `user_highlights`).

     Raises:
        AssertionError: failsafe, only checked when the target carries a 'text'.
            Raised if an exact index match has different texts, or if
            overlapping spans have texts where neither contains the other.
    """
    match = False
    if datatype not in user_highlights:  # Check if the datatype exists.
        return match

    for value in user_highlights[datatype].values():
        if datatype == 'activities':  # Activities are nested in the dict.
            value_part, target_part = value['label'], target['verb']
        else:  # All other datatypes.
            value_part, target_part = value, target

        val_start, val_end = int(value_part['index'][0]), int(value_part['index'][1])
        tar_start, tar_end = int(target_part['index'][0]), int(target_part['index'][1])

        # Reset for every highlight: without this the names are unbound when the
        # target has no 'text', or keep a value from an earlier iteration.
        target_text = value_text = None
        if 'text' in target_part:
            target_text = target_part['text']
            value_text = value_part['text']

        if (val_start, val_end) == (tar_start, tar_end):  # Exact match.
            match = True
            if target_text is not None:  # Failsafe: the strings must be identical.
                error = 'Match Error: Index match, but strings are different: {}'.format((target_text, value_text))
                assert (target_text == value_text), error

        # Inclusive intervals overlap exactly when the larger start does not exceed the smaller end.
        elif max(tar_start, val_start) <= min(tar_end, val_end):
            match = True
            if target_text is not None:  # Failsafe: one string must contain the other.
                error = 'Match Error: Index match, but strings are different: {}'.format((target_text, value_text))
                assert (target_text in value_text or value_text in target_text), error

    return match

In [4]:
def merge_two_dicts(x, y):
    """
    Return a shallow copy of `x` updated with the entries of `y`.
    On duplicate keys the value from `y` wins; neither input is modified.
    """
    merged = x.copy()
    merged.update(y)    # update() works in place on the copy
    return merged

### Rule-based

In [5]:
#------------------------------------- Rule-based Algorithm ---------------------------------------#


def voice_detector(token):
    """
    Classify the voice of a verb token.

    Returns:
        'passive' for a past participle (tag VBN) whose previous token has the
        lemma 'be' or 'have', or whose second-previous token has the lemma 'be'.
        'active' for the sentence ROOT, a VBG that is neither a 'prep' nor an
        'acl' dependent, any VBD, or a VBZ that is not an auxiliary.
        None otherwise.
    """
    doc = token.doc
    idx = token.i

    def _lemma_before(offset):
        # Negative positions would wrap around to the END of the document, so
        # the first tokens must not look "before" the start.
        position = idx - offset
        return doc[position].lemma_ if position >= 0 else None

    # passive voice:
    if token.tag_ == 'VBN' and (_lemma_before(1) == 'be' or
                                _lemma_before(2) == 'be' or
                                _lemma_before(1) == 'have'):
        return 'passive'

    # active voice (dependency labels are compared as strings throughout):
    if (token.dep_ == "ROOT"
            or (token.tag_ == 'VBG' and token.dep_ not in ('prep', 'acl'))
            or token.tag_ == 'VBD'
            or (token.tag_ == 'VBZ' and token.dep_ != 'aux')):
        return 'active'

    return None
    
def instance_check(idx):
    """
    Return `idx` unchanged when it already is an int, otherwise `int(idx)`.
    """
    return idx if isinstance(idx, int) else int(idx)
    
def dependencies_check(children_deps):
    """
    Count how many of the given dependency ids are nsubj, nsubjpass or dobj.
    """
    tracked_deps = (nsubj, nsubjpass, dobj)
    return sum(1 for child_dep in children_deps if child_dep in tracked_deps)

def extract_indexes(token, voice):
    """
    Extract the token positions that delimit the activity text around a verb.

    Walks the direct children of `token` in order and collects boundary
    positions (based on each child's document position `child.i`) depending on
    the child's dependency label and the given voice.

    Args:
        token: The verb token whose children are inspected.
        voice: 'active' or 'passive'; for any other value only the
            last-child handling at the bottom of the loop can add a position.

    Returns:
        A list of int positions in the order they were found (not sorted; it
        may be empty). The caller decides which entries are start and end.
    """
    indexes = list()
    children = [child for child in token.children]
    children_deps = [child.dep for child in token.children]
    length = len(children)
    
    for idx, child in enumerate(children):
        
        # Unless the verb itself is a conjunct, it needs at least one
        # nsubj/nsubjpass/dobj child; otherwise no positions are collected.
        if token.dep != conj:
            count = dependencies_check(children_deps)
            if count == 0: # none of the children is a subject or direct object: stop
                break
        
        if voice == 'active':
            if child.dep == nsubj:
                start_idx = instance_check(child.i + 1) # position after the subject: subject is excluded from the span
                indexes.append(start_idx)
            elif child.dep == dobj:
                end_idx = instance_check(child.i + 1) # position after the object: object is included when used as a slice end
                indexes.append(end_idx)
                break
        elif voice == 'passive':
            if child.dep == nsubj:
                start_idx = instance_check(child.i + 1)
                indexes.append(start_idx) # position after the subject: subject is excluded from the span
            elif child.dep == nsubjpass:
                start_idx = child.i # include the nsubjpass child itself (in passive voice it is the logical object)
                indexes.append(start_idx)
            elif child.dep == agent: # the agent child's position closes the span (the agent introduces the actual subject)
                end_idx = child.i
                indexes.append(end_idx)
                break
            elif child.dep == dobj:
                end_idx = instance_check(child.i + 1)
                indexes.append(end_idx)
                break
        
        # Last-child handling; only reached when no `break` fired above.
        # NOTE(review): for idx == 0, children[idx-1] is children[-1], i.e. the last child — confirm this is intended.
        if idx == length - 1 and (child.dep == conj and children[idx-1].dep != cc):
            # a trailing conjunct is handled separately (as its own activity)
            # when the child before it is not a coordinating conjunction (e.g., or, and)
            pass
        elif idx == length - 1 and (child.dep == dobj or (child.dep == conj and children[idx-1].dep == cc) or 
            child.dep == xcomp):
            # trailing object, coordinated conjunct or xcomp: add the position after it
            end_idx = instance_check(child.i + 1)
            indexes.append(end_idx)
        elif idx == length - 1 and child.dep != dobj:
            # any other trailing child: add the child's own position
            end_idx = child.i
            indexes.append(end_idx)
    
    return indexes 


def create_act_desc(token, voice):
    """
    Build the activity label for a verb token.

    Args:
        token: The verb token.
        voice: Voice of the verb ('passive' or 'active'), forwarded to
            extract_indexes.

    Returns:
        dict with "text" (the activity text) and "index" (character start and
        end of that text). When extract_indexes yields no usable span, the
        label falls back to the verb token alone instead of raising on an
        empty result.
    """
    act_text = None
    doc = token.doc
    idx = token.i
    indexes = extract_indexes(token, voice)

    if len(indexes) == 1:
        # A single boundary: the span runs between it and the verb.
        if indexes[0] < idx:
            act_text = doc[indexes[0]:idx + 1]
        elif indexes[0] > idx:
            act_text = doc[idx:indexes[0]]

    elif len(indexes) >= 2:
        indexes = sorted(indexes, reverse=False)  # ascending order
        start_idx = indexes[0]
        end_idx = indexes[1]
        if end_idx <= idx:
            # Both boundaries lie before the verb: extend the span to include it.
            act_text = doc[start_idx:idx + 1]
        else:
            act_text = doc[start_idx:end_idx]

    if act_text is None or len(act_text) == 0:
        # No boundary, a boundary equal to the verb position, or an empty slice.
        act_text = doc[idx:idx + 1]

    return {"text": str(act_text), "index": (act_text[0].idx, act_text[0].idx + len(str(act_text)))}
    
    
def process_token(token):
    """
    Return the attributes of a token that are kept in the results.

    Returns:
        dict with "text" and "index" (character start and end of the token).
        A "negated": True entry is added only when one of the token's direct
        children has the dependency label "neg".
    """
    is_negated = any(child.dep_ == "neg" for child in token.children)

    attributes = {"text": token.text}
    if is_negated:
        attributes["negated"] = True
    attributes["index"] = (token.idx, token.idx + len(token.text))
    return attributes


def identify_act(token, sent):
    """
    Build an activity record for a verb token.

    The record holds the detected voice, the verb's attributes and the given
    `sent` value. A child with dependency nsubj becomes the "subject"; a dobj
    child (and, in passive voice, an nsubjpass child) becomes the "object".

    Returns:
        The record, extended with "activity_label", when the voice is known and
        at least one subject or object child was found; otherwise None.
    """
    voice = voice_detector(token)
    act = {"voice": voice, "verb": process_token(token), "sent": sent}

    # Which child dependencies count as the object depends on the voice.
    if voice == 'passive':
        object_deps = (nsubjpass, dobj)
    elif voice == 'active':
        object_deps = (dobj,)
    else:
        return None

    found_argument = False
    for child in token.children:
        if child.dep == nsubj:
            act["subject"] = process_token(child)
            found_argument = True
        elif child.dep in object_deps:
            act["object"] = process_token(child)
            found_argument = True

    if not found_argument:
        return None

    act["activity_label"] = create_act_desc(token, voice)
    return act


def process_sent(sent, idx, user_highlights, reinforced_highlights):
    """
    Collect the activities found in one sentence.

    Every token tagged as VERB is passed to identify_act. Activities the user
    has already highlighted (according to check_user_highlight) are recorded in
    `reinforced_highlights`; the remaining ones are returned.

    Args:
        sent: The sentence (iterable of tokens).
        idx: Sentence number, stored in each activity under "sent".
        user_highlights: Existing user annotations.
        reinforced_highlights: Dict that is filled in place with confirmed verbs.

    Returns:
        dict mapping uuid string -> activity, or None when no new activity was found.
    """
    acts = {}
    for token in sent:
        if token.pos_ != 'VERB':
            continue
        # Build the activity once and reuse the result for the None check.
        activity = identify_act(token, idx)
        if activity is None:
            continue
        act_id = str(uuid.uuid4())
        if check_user_highlight(user_highlights, 'activities', activity) == True:
            reinforced_highlights[act_id] = {"text": token.text,
                                             "index": (token.idx, token.idx + len(token.text))}
        else:
            acts[act_id] = activity
    return acts if acts else None


def activity_recognition(desc, user_highlights, reinforced_highlights, nlp):
    """
    Parse a text and collect the activities of all its sentences.

    Each sentence is handed to process_sent together with its sentence number.
    Returns one flat dictionary (activity id -> activity) holding the
    activities that process_sent did not match to a user highlight.
    """
    doc = setup_spacy(desc, nlp)
    sent_acts = {}

    for sent_number, sent in enumerate(doc.sents):
        found = process_sent(sent, sent_number, user_highlights, reinforced_highlights)
        if found is None:
            continue
        sent_acts = merge_two_dicts(sent_acts, found)

    return sent_acts

### Rule-based Handler

In [6]:
#------------------------------------- API & Data Input ---------------------------------------#

# Load the lookup dictionaries from CSV, once per supported language.
languages = ["all", "en", "da", "pt"]  # "all" = no language filter
datatypes = [["roles", 3], ["relations", 3], ["alias", 1]]  # [type, language_position]
datasets = {}
for language in languages:
    temp = {}
    for datatype in datatypes:
        type_name, language_column = datatype
        # Each datatype is read from "<type>.csv".
        temp[type_name] = load_csv_to_dict("%s.csv" % type_name, language, language_column)
    datasets[language] = temp
    
    
_SPACY_MODEL_CACHE = {}


def _get_spacy_model(name):
    """Load the spaCy model `name` on first use and reuse it afterwards."""
    if name not in _SPACY_MODEL_CACHE:
        _SPACY_MODEL_CACHE[name] = spacy.load(name)
    return _SPACY_MODEL_CACHE[name]


def postJsonHandler(event, context):
    """
    Handles the API call created by AWS API Gateway.

    Args:
        event: Request dict with "plain_text", "language" (e.g. "en-US"; only
            the leading word characters are used) and "highlights".
        context: Unused.

    Returns:
        dict with
          "reinforced_highlights": matches that confirm an existing user highlight,
          "entity_recognition": dictionary matches per datatype,
          "activity_recognition": found activities (only for "en" and "all").

    Raises:
        ValueError: if the language prefix is not one of `languages`.
    """
    # Loading a model is expensive; reuse it across calls.
    nlp = _get_spacy_model('en_core_web_sm')

    plain_text = event["plain_text"]
    user_language = event["language"]
    user_highlights = event["highlights"]

    # Language handling: "en-US" -> "en".
    reg = re.findall(r"^\w+", user_language)
    if not reg or reg[0] not in languages:
        raise ValueError("Unsupported language: {!r}".format(user_language))
    reg_language = reg[0]

    # The same dict is handed to the recognisers and returned, so entries
    # they add show up in the response.
    reinforced_highlights = dict()
    dict_results = {}
    dict_results["reinforced_highlights"] = reinforced_highlights
    dict_results["entity_recognition"] = find_match_dict(plain_text,
                                                         datasets[reg_language],
                                                         user_highlights,
                                                         reinforced_highlights,
                                                         nlp)
    # Decide on the normalised prefix, not the raw request string.
    if reg_language in ("en", "all"):
        activities = activity_recognition(plain_text, user_highlights, reinforced_highlights, nlp)
        dict_results["activity_recognition"] = activities

    return dict_results

## Preprocessing

In [None]:
# Tokenizer for the lower-cased BERT vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# NOTE(review): `text` is not defined in the visible part of the notebook — it must
# be assigned before this cell can run on a fresh kernel.
# Split after every "." that is followed by a single space; the period stays with
# the preceding sentence.
test = re.split(r'(?<=\.) ', text)

# One row per sentence; `sentences` is the plain array of sentence strings.
df = pd.DataFrame(test, columns=['Text'])
sentences = df.Text.values

### Coreference

In [None]:
# Large English pipeline with the neuralcoref component added for coreference resolution.
nlp = spacy.load('en_core_web_lg')
neuralcoref.add_to_pipe(nlp)

# NOTE(review): `text` is not defined in the visible part of the notebook.
doc = nlp(text)

# Collect every mention of every coreference cluster with its start/end position.
# The lists are created here so the cell runs on a fresh kernel and does not
# accumulate duplicates when it is re-run.
references = []
clusters_idx_start = []
clusters_idx_end = []
for cluster in doc._.coref_clusters:
    for reference in cluster:
        references.append(reference)
        clusters_idx_start.append(reference.start)
        clusters_idx_end.append(reference.end)

# From here on `references` holds the mention texts.
references = [i.text for i in references]
df = pd.DataFrame(references, columns=['Mentions'])
df_1 = pd.DataFrame(clusters_idx_start, columns=["Start_Idx"])
df_2 = pd.DataFrame(clusters_idx_end, columns=["End_Idx"])

In [None]:
# Add the start/end positions as columns of the mentions frame (pandas aligns the rows on the index).
df["Start_Idx"] = df_1["Start_Idx"]
df["End_Idx"] = df_2["End_Idx"]

# Relations

### Transformation

In [None]:
# Encode every sentence into BERT token ids.
input_ids = []

for sent in sentences:
    encoded_sent = tokenizer.encode(sent) # Add '[CLS]' and '[SEP]'
    input_ids.append(encoded_sent)

# Pad or cut every sequence to 30 ids, both at the end ("post").
# NOTE(review): sequences longer than 30 ids lose their tail; no attention mask is
# built for the padded positions — confirm both are intended.
input_ids = pad_sequences(input_ids, maxlen=30, 
                          dtype="long", truncating="post", padding="post")
input_ids = torch.tensor(input_ids)

### Prediction

In [None]:
# Load the fine-tuned relation / non-relation classifier from disk.
config_1 = BertConfig.from_pretrained('Models/Relations/config.json')
model_1 = BertForSequenceClassification.from_pretrained('Models/Relations/pytorch_model.bin', config=config_1)

# Inference only: evaluation mode, no gradient tracking.
model_1.eval()
with torch.no_grad():
    output = model_1(input_ids)

# The first element of the model output holds the logits.
logits = output[0]
logits = logits.detach().cpu().numpy()

# Predicted class id per sentence = position of the highest logit.
predictions = np.argmax(logits, axis=1)

sent_pred = zip(sentences, predictions)
df_pred = pd.DataFrame(sent_pred, columns=["Sentence", "Tag"])
# Assign the mapped column back: an inplace replace on `df_pred['Tag']` acts on
# a temporary Series and is not guaranteed to change the frame.
df_pred['Tag'] = df_pred['Tag'].replace({0: 'Non-Relation', 1: 'Relation'})

In [None]:
# Keep only the sentences tagged "Relation" for the second (relation type) model.
# NOTE(review): this rebinds `df_2`, which the coreference cell used for the end positions.
df_2 = df_pred[(df_pred.Tag == "Relation")]
sentences_2 = df_2.Sentence.values

# Encode the selected sentences into BERT token ids.
input_ids_2 = []

for sent in sentences_2:
    encoded_sent_2 = tokenizer.encode(sent) # Add '[CLS]' and '[SEP]'
    input_ids_2.append(encoded_sent_2)

# Pad or cut every sequence to 30 ids, both at the end ("post").
input_ids_2 = pad_sequences(input_ids_2, maxlen=30, 
                          dtype="long", truncating="post", padding="post")
input_ids_2 = torch.tensor(input_ids_2)

%%capture
# Load the fine-tuned relation-type classifier from disk.
config_2 = BertConfig.from_pretrained('Models/Relations_Type/config.json')
model_2 = BertForSequenceClassification.from_pretrained('Models/Relations_Type/pytorch_model.bin', config=config_2)

In [None]:
# Inference only: evaluation mode, no gradient tracking.
model_2.eval()
with torch.no_grad():
    output_2 = model_2(input_ids_2)

# The first element of the model output holds the logits.
logits_2 = output_2[0]
logits_2 = logits_2.detach().cpu().numpy()
print('Predictions are DONE.')

# Predicted class id per sentence = position of the highest logit.
predictions_2 = np.argmax(logits_2, axis=1)

sent_pred_2 = zip(sentences_2, predictions_2)
df_pred_2 = pd.DataFrame(sent_pred_2, columns=["Sentence", "Tag"])
# Assign the mapped column back: an inplace replace on `df_pred_2['Tag']` acts
# on a temporary Series and is not guaranteed to change the frame.
df_pred_2['Tag'] = df_pred_2['Tag'].replace({0: 'Conditions', 1: 'Non-Relation', 2: 'Response'})

In [None]:
def relation_enrichment(text, rel_pred, type_pred, model, method="intersection"):
    """
    Runs the rule-based handler on every sentence, using the BERT relation
    predictions as user highlights.

    Args:
        text: Full text; it is lower-cased and split into sentences with `model`.
        rel_pred: DataFrame whose first column holds the sentence text
            (row i is used for sentence i).
        type_pred: DataFrame whose second column holds the relation type
            (row i is used for sentence i).
        model: spaCy pipeline used to split `text` into sentences.
        method: Currently unused.

    Returns:
        List with one entry per sentence: [True, relation type, reinforced highlights].

    NOTE(review): both frames are indexed by the spaCy sentence number — confirm
    they contain one row per spaCy sentence, otherwise rows are misaligned or
    the lookup raises IndexError.
    """
    doc = setup_spacy(text, model)
    highlight_nlp = None    # loaded once, on the first sentence
    results = []
    for idx, sentence in enumerate(doc.sents):
        hl_txt = rel_pred.values[idx][0]
        hl_type = type_pred.values[idx][1]
        if highlight_nlp is None:
            # Loading a model is expensive; do it once instead of per sentence.
            highlight_nlp = spacy.load('en_core_web_sm')
        hl_sent = setup_spacy(hl_txt, highlight_nlp)
        # Highlight span: from 0 to the start offset of the last token.
        hl_input = {'text': hl_sent.text, 'index': (0, hl_sent[len(hl_sent)-1].idx)}
        request = {
            "graphid": "0",
            "userid": "0",
            "organizationid": "0",
            "language": "en-US",
            "plain_text": sentence.text,
            "highlights": {'roles': {}, 'relations': {0: hl_input}, "activities": {}}}
        output = postJsonHandler(request, ())
        results.append([True, hl_type, output['reinforced_highlights']])
        print(output['reinforced_highlights'])
    return results

# Compare the BERT relation predictions with the rule-based handler, sentence by sentence.
# NOTE(review): `text` is not defined in the visible part of the notebook.
stats = (relation_enrichment(text, df_pred, df_pred_2, spacy.load('en_core_web_sm')))
    
# One row per sentence: prediction flag, relation type and the reinforced highlights.
stats_df = pd.DataFrame((stats), columns =['Prediction', 'Type', 'Reinforced'])
stats_df

# Roles & Activities

In [None]:
# NOTE(review): `test_sentence` is only assigned in a later cell, and `model` and
# `tag_values` are not defined in the visible part of the notebook — this cell
# cannot run on a fresh kernel as it stands.
tokenized_sentence = tokenizer.encode(test_sentence)
# Use the device selected at the top of the notebook; a hard-coded .cuda()
# fails on machines without a GPU.
input_ids = torch.tensor([tokenized_sentence]).to(device)

with torch.no_grad():
    output = model(input_ids)
# Predicted label id per word piece = position of the highest logit.
label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)

# Join word pieces: a "##" piece is glued to the previous token, and the label
# of the first piece is kept for the whole word.
tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
new_tokens, new_labels = [], []
for token, label_idx in zip(tokens, label_indices[0]):
    if token.startswith("##"):
        new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
        new_labels.append(tag_values[label_idx])
        new_tokens.append(token)

In [None]:
# One row per joined token with its predicted tag (the token column is named "Sentence").
# NOTE(review): this rebinds `sent_pred` and `df_pred` from the relation prediction cells.
sent_pred = zip(new_tokens, new_labels)
df_pred = pd.DataFrame(sent_pred, columns=["Sentence", "Tag"]) 

# Character offsets of the spaCy tokens of the test sentence.
test_sentence = "HI, this is is a test"
test_sentence_res = nlp(test_sentence)
doc = test_sentence_res
sent_index = [(token.idx, token.idx + len(token.text)) for token in doc]
df_sent = pd.DataFrame(sent_index, columns=["Start_Idx", "End_Idx"]) 

# Attach the offsets to the predictions by row position.
# NOTE(review): `new_tokens` comes from the BERT tokenizer (the encoding comments
# above say '[CLS]' and '[SEP]' are added), while the offsets come from spaCy
# tokens — the rows probably do not line up one-to-one; verify the alignment.
df_pred["Start_Idx"] = df_sent["Start_Idx"]
df_pred["End_Idx"] = df_sent["End_Idx"]