In [3]:
import spacy
import pandas as pd
from spacy.pipeline import merge_entities
import dill as pickle
%load_ext line_profiler

In [4]:
# Entity labels a token must carry to be accepted as an SVO subject.
sub_tags = [
    'PERSON',
    'NORP',
    'FAC',
    'ORG',
    'GPE',
    'LOC',
    'PRODUCT',
    'EVENT',
    'WORK_OF_ART',
    'LAW',
]

In [5]:
# Entity labels a token must carry to be accepted as an SVO object.
obj_tags = [
    'PERSON',
    'NORP',
    'FAC',
    'ORG',
    'GPE',
    'LOC',
    'PRODUCT',
    'EVENT',
    'WORK_OF_ART',
    'LAW',
]

In [None]:
# NOTE(review): `nate` is not imported in any visible cell and this cell has no
# execution count -- confirm it is still needed before relying on `sl`.
sl = nate.import_csv('../data/sl2.csv', text='content', columns_to_keep=["title", "case"])

In [None]:
# Times the `svo` method of `sl`; requires the cell that defines `sl` to have run.
# Binds the name `df`, which a later cell rebinds with pd.read_csv.
%time df = sl.svo(sub_tags, obj_tags, to_df=True)

In [6]:
# dependency labels treated as grammatical subjects
SUBJECTS = {
    "nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl",
}
# dependency labels treated as grammatical objects
OBJECTS = {
    "dobj", "dative", "attr", "oprd",
}
# POS tags that stop a phrase from being expanded over adjoining tokens
BREAKER_POS = {
    "CCONJ", "VERB",
}
# lower-cased words that count as negations
NEGATIONS = {
    "no", "not", "n't", "never", "none",
}

# Entity-label filters for subjects / objects. findSVOs overwrites both on
# every call; a falsy value switches entity filtering off.
sub_ner_tags = False
obj_ner_tags = False
# Entity labels of the subjects / objects gathered by the most recent
# _get_all_subs / _get_all_objs call.
sub_ent_types = []
obj_ent_types = []


def contains_conj(depSet):
    """Return True when `depSet` contains at least one coordinating conjunction.

    `depSet` is any container supporting `in`; callers in this file pass a set
    of lower-cased token texts.
    """
    conjunctions = ("and", "or", "nor", "but", "yet", "so", "for")
    return any(word in depSet for word in conjunctions)


def _get_subs_from_conjunctions(subs):
    """Collect further subjects that are coordinated with the given ones.

    For every subject whose right children include a coordinating conjunction,
    the right children that qualify as subjects are gathered, and the search
    then recurses on everything gathered so far.

    With `sub_ner_tags` set, a child qualifies when its dependency label is in
    SUBJECTS and its entity label is in `sub_ner_tags`; otherwise when its
    dependency label is in SUBJECTS or it is a NOUN.

    Returns only the additional subjects, not the ones passed in.
    """
    gathered = []
    for subject in subs:
        # rights is a generator; materialise it because it is scanned twice
        right_children = list(subject.rights)
        right_words = {child.lower_ for child in right_children}
        if not contains_conj(right_words):
            continue
        if sub_ner_tags:
            matches = [child for child in right_children
                       if child.dep_ in SUBJECTS and child.ent_type_ in sub_ner_tags]
        else:
            matches = [child for child in right_children
                       if child.dep_ in SUBJECTS or child.pos_ == "NOUN"]
        gathered.extend(matches)
        if gathered:
            gathered.extend(_get_subs_from_conjunctions(gathered))
    return gathered


def _get_objs_from_conjunctions(objs):
    """Collect further objects that are coordinated with the given ones.

    For every object whose right children include a coordinating conjunction,
    the right children that are NOUNs or carry an OBJECTS dependency label are
    gathered (additionally restricted to entity labels in `obj_ner_tags` when
    that filter is set), and the search recurses on everything gathered so far.

    Returns only the additional objects, not the ones passed in.
    """
    gathered = []
    for target in objs:
        # rights is a generator; materialise it because it is scanned twice
        right_children = list(target.rights)
        right_words = {child.lower_ for child in right_children}
        if not contains_conj(right_words):
            continue
        if obj_ner_tags:
            matches = [child for child in right_children
                       if (child.dep_ in OBJECTS or child.pos_ == "NOUN")
                       and child.ent_type_ in obj_ner_tags]
        else:
            matches = [child for child in right_children
                       if child.dep_ in OBJECTS or child.pos_ == "NOUN"]
        gathered.extend(matches)
        if gathered:
            gathered.extend(_get_objs_from_conjunctions(gathered))
    return gathered


def _find_subs(tok):
    """Find subjects for `tok` by looking at its ancestors.

    Walks up the head chain until a VERB, a NOUN or the root (a token that is
    its own head) is reached.

    * VERB: the subjects on that verb's left are used, together with any
      subjects coordinated with them. If the verb has none and is not the
      root, the search continues from that verb.
    * otherwise, with `sub_ner_tags` set: the reached token itself is the
      subject if its entity label is in `sub_ner_tags`.
    * otherwise, without `sub_ner_tags`: the reached token itself is the
      subject if it is a NOUN.

    Returns:
        (subjects, negated): a list of tokens and a flag telling whether the
        governing verb (VERB case) or `tok` itself (other cases) is negated.
        `([], False)` when nothing was found.
    """
    head = tok.head
    while head.pos_ != "VERB" and head.pos_ != "NOUN" and head.head != head:
        head = head.head
    if head.pos_ == "VERB":
        # Select subjects with the same labels _get_all_subs uses; the literal
        # "SUB" is not one of the labels in SUBJECTS.
        if sub_ner_tags:
            subs = [child for child in head.lefts
                    if child.dep_ in SUBJECTS and child.ent_type_ in sub_ner_tags]
        else:
            subs = [child for child in head.lefts if child.dep_ in SUBJECTS]
        if len(subs) > 0:
            verb_negated = _is_negated(head)
            subs.extend(_get_subs_from_conjunctions(subs))
            return subs, verb_negated
        elif head.head != head:
            # this verb has no subject of its own: keep climbing
            return _find_subs(head)
    elif sub_ner_tags and head.ent_type_ in sub_ner_tags:
        return [head], _is_negated(tok)
    elif not sub_ner_tags and head.pos_ == "NOUN":
        return [head], _is_negated(tok)
    return [], False


def _is_negated(tok):
    """Return True if any left or right child of `tok` is a word in NEGATIONS."""
    children = [*tok.lefts, *tok.rights]
    return any(child.lower_ in NEGATIONS for child in children)


def _find_svs(tokens):
    """Return (subject text, verb text) pairs for every VERB in `tokens`.

    The verb text is prefixed with "!" when `_get_all_subs` reports the verb
    as negated.
    """
    pairs = []
    for tok in tokens:
        if tok.pos_ != "VERB":
            continue
        subjects, negated = _get_all_subs(tok)
        for subject in subjects:
            verb_text = "!" + tok.orth_ if negated else tok.orth_
            pairs.append((subject.orth_, verb_text))
    return pairs


def _get_objs_from_prepositions(deps, is_pas):
    """Collect objects hanging off prepositions among `deps`.

    A token in `deps` is considered when it is an ADP whose dependency label
    is "prep" (or "agent" in a passive sentence). From its right children:

    * with `obj_ner_tags` set, children with an OBJECTS label and an entity
      label in `obj_ner_tags` are taken (the passive `pobj` case is currently
      disabled for this mode);
    * otherwise, children with an OBJECTS label, the pronoun "me", and -- in a
      passive sentence -- `pobj` children are taken.
    """
    found = []
    for dep in deps:
        introduces_object = dep.dep_ == "prep" or (is_pas and dep.dep_ == "agent")
        if dep.pos_ != "ADP" or not introduces_object:
            continue
        if obj_ner_tags:
            found.extend(child for child in dep.rights
                         if child.dep_ in OBJECTS and child.ent_type_ in obj_ner_tags)
        else:
            found.extend(child for child in dep.rights
                         if child.dep_ in OBJECTS
                         or (child.pos_ == "PRON" and child.lower_ == "me")
                         or (is_pas and child.dep_ == 'pobj'))
    return found


def _get_objs_from_attrs(deps, is_pas):
    """Find a verb and its objects below an `attr` NOUN among `deps`.

    For each NOUN with dependency label "attr", the VERBs among its right
    children are inspected in order; the first verb that has objects (direct
    ones or ones reached through prepositions) is returned with them.

    Returns `(verb, objects)`, or `(None, None)` when nothing qualifies.

    NOTE: the call to this helper in _get_all_objs is commented out, and this
    helper has no entity-label (NER) filtering; that would need adding before
    it is enabled again.
    """
    for dep in deps:
        if not (dep.pos_ == "NOUN" and dep.dep_ == "attr"):
            continue
        for verb in [tok for tok in dep.rights if tok.pos_ == "VERB"]:
            verb_rights = list(verb.rights)
            objs = [tok for tok in verb_rights if tok.dep_ in OBJECTS]
            objs.extend(_get_objs_from_prepositions(verb_rights, is_pas))
            if objs:
                return verb, objs
    return None, None


def _get_obj_from_xcomp(deps, is_pas):
    """Find objects of an open-complement (xcomp) verb among `deps`.

    The first VERB with dependency label "xcomp" that has objects -- direct
    ones (filtered by `obj_ner_tags` when set) or ones reached through
    prepositions -- is returned together with those objects.

    Returns `(verb, objects)`, or `(None, None)` when nothing qualifies.
    """
    for dep in deps:
        if dep.pos_ != "VERB" or dep.dep_ != "xcomp":
            continue
        right_children = list(dep.rights)
        objs = [tok for tok in right_children
                if tok.dep_ in OBJECTS
                and (not obj_ner_tags or tok.ent_type_ in obj_ner_tags)]
        objs.extend(_get_objs_from_prepositions(right_children, is_pas))
        if objs:
            return dep, objs
    return None, None


def _get_all_subs(v):
    """Return the subjects of verb `v` and whether the verb is negated.

    Subjects are the non-DET left children of `v` with a SUBJECTS label
    (restricted to entity labels in `sub_ner_tags` when set), plus anything
    coordinated with them. When `v` has no such child, `_find_subs` is used
    instead and its negation flag replaces the one computed for `v`.

    Side effect: stores the entity labels of the returned subjects in the
    module-level `sub_ent_types`.
    """
    global sub_ent_types

    verb_negated = _is_negated(v)
    subs = [tok for tok in v.lefts
            if tok.dep_ in SUBJECTS
            and tok.pos_ != "DET"
            and (not sub_ner_tags or tok.ent_type_ in sub_ner_tags)]
    if subs:
        subs.extend(_get_subs_from_conjunctions(subs))
    else:
        inherited, verb_negated = _find_subs(v)
        subs = list(inherited)

    sub_ent_types = [sub.ent_type_ for sub in subs]
    return subs, verb_negated


# is the token a verb?  (excluding auxiliary verbs)
def _is_non_aux_verb(tok):
    return tok.pos_ == "VERB" and (tok.dep_ != "aux" and tok.dep_ != "auxpass")


def _right_of_verb_is_conj_verb(v):
    """Detect a VERB CCONJ VERB pattern to the right of `v` (e.g. "he beat and hurt me").

    Returns `(True, other_verb)` when the first right child of `v` is a CCONJ
    and a non-auxiliary verb follows among the remaining right children;
    otherwise `(False, v)`.
    """
    # rights is a generator
    right_children = list(v.rights)
    if len(right_children) < 2 or right_children[0].pos_ != 'CCONJ':
        return False, v

    partner = next((tok for tok in right_children[1:] if _is_non_aux_verb(tok)), None)
    if partner is None:
        return False, v
    return True, partner


def _get_all_objs(v, is_pas):
    """Return the verb to report and all objects of verb `v`.

    Objects come from four places, in this order: right children of `v` with
    an OBJECTS label (or `pobj` in a passive sentence), objects reached through
    prepositions, objects of an xcomp verb, and anything coordinated with the
    objects found so far. When `obj_ner_tags` is set, the direct objects must
    carry one of those entity labels.

    If an xcomp verb contributed objects, that verb is returned in place of `v`.
    Lookup through `attr` nouns (_get_objs_from_attrs) is not used here.

    Side effect: stores the entity labels of the returned objects in the
    module-level `obj_ent_types`.
    """
    global obj_ent_types

    # rights is a generator
    right_children = list(v.rights)
    objs = [tok for tok in right_children
            if (tok.dep_ in OBJECTS or (is_pas and tok.dep_ == 'pobj'))
            and (not obj_ner_tags or tok.ent_type_ in obj_ner_tags)]
    objs.extend(_get_objs_from_prepositions(right_children, is_pas))

    xcomp_verb, xcomp_objs = _get_obj_from_xcomp(right_children, is_pas)
    if xcomp_verb is not None and xcomp_objs:
        objs.extend(xcomp_objs)
        v = xcomp_verb

    if objs:
        objs.extend(_get_objs_from_conjunctions(objs))

    obj_ent_types = [obj.ent_type_ for obj in objs]
    return v, objs


# return true if the sentence is passive - at he moment a sentence is assumed passive if it has an auxpass verb
def _is_passive(tokens):
    for tok in tokens:
        if tok.dep_ == "auxpass":
            return True
    return False


# resolve a 'that' where/if appropriate
def _get_that_resolution(toks):
    for tok in toks:
        if 'that' in [t.orth_ for t in tok.lefts]:
            return tok.head
    return toks


def _get_lemma(word: str):
    """Simple stemmer: return the lemma of `word`.

    `word` is run through `nlp`; if that yields exactly one token its lemma is
    returned, otherwise `word` is returned unchanged.

    NOTE(review): relies on a module-level `nlp`; none is defined in the
    visible cells (process_svo keeps its pipeline in a local) -- confirm where
    it is expected to come from.
    """
    parsed = nlp(word)
    return parsed[0].lemma_ if len(parsed) == 1 else word


def printDeps(toks):
    """Debug helper: print one line per token describing its place in the parse tree.

    Each line holds the token text, dependency label, POS tag, head text, and
    the texts of its left and right children.
    """
    for tok in toks:
        left_words = [child.orth_ for child in tok.lefts]
        right_words = [child.orth_ for child in tok.rights]
        print(tok.orth_, tok.dep_, tok.pos_, tok.head.orth_, left_words, right_words)


def expand(item, tokens, visited):
    """Expand a subject / object token into the list of parts that make up its phrase.

    * An item whose lower-cased text is 'that' is first replaced by the result
      of `_get_that_resolution(tokens)`.
    * Left children, the item itself and right children are collected in that
      order. On each side collection stops at the first child whose POS is in
      BREAKER_POS, and children that are negation words are skipped.
    * Finally the first right child of the last collected part is examined: if
      it is a DET or NOUN whose index is not yet in `visited`, it is recorded
      there and expanded recursively.

    `visited` is a set of token indices shared between calls; it is mutated
    here to guard against expanding the same token twice.
    """
    if item.lower_ == 'that':
        item = _get_that_resolution(tokens)

    def _until_breaker(children):
        # keep children up to the first breaker, dropping negation words
        kept = []
        for child in children:
            if child.pos_ in BREAKER_POS:
                break
            if child.lower_ not in NEGATIONS:
                kept.append(child)
        return kept

    # the resolved item may be something without lefts / rights attributes
    parts = _until_breaker(item.lefts) if hasattr(item, 'lefts') else []
    parts.append(item)
    if hasattr(item, 'rights'):
        parts.extend(_until_breaker(item.rights))

    tail = parts[-1]
    if hasattr(tail, 'rights'):
        # only the first right child of the last part is ever considered
        first_right = next(iter(tail.rights), None)
        if (first_right is not None
                and first_right.pos_ in ("DET", "NOUN")
                and first_right.i not in visited):
            visited.add(first_right.i)
            parts.extend(expand(first_right, tokens, visited))

    return parts


def to_str(tokens):
    """Join the `.text` of every item in `tokens` with single spaces."""
    return ' '.join(piece.text for piece in tokens)


def findSVOs(tokens, sub_tags=False, obj_tags=False):
    """Extract subject-verb-object triples from `tokens`.

    Args:
        tokens: the tokens of one sentence.
        sub_tags: entity labels a subject must carry, or a falsy value for no
            entity filtering.
        obj_tags: same, for objects.

    Returns:
        `(svos, sub_ent_types, obj_ent_types)` -- three parallel lists. Each
        entry of `svos` is a `(left text, verb text, right text)` tuple.

    Behaviour worth knowing:
        * `sub_tags` / `obj_tags` are published through the module-level
          `sub_ner_tags` / `obj_ner_tags`, which the helper functions read.
        * Every non-auxiliary verb that has at least one subject is combined
          with each of its (subject, object) pairs. When the verb is followed
          by a coordinated verb ("he beat and hurt me"), the objects are taken
          from that second verb and one triple is emitted for each of the two
          verbs.
        * In a passive sentence the object text comes first, the subject text
          last, and the verb is reported by lemma; otherwise the verb is
          reported lower-cased.
        * The verb text is prefixed with "!" when the verb or the object is
          negated.
        * The entity-label lists always hold the subject's label in
          `sub_ent_types` and the object's in `obj_ent_types`, also for
          passive sentences.
    """
    global sub_ner_tags, obj_ner_tags
    sub_ner_tags = sub_tags
    obj_ner_tags = obj_tags

    is_pas = _is_passive(tokens)
    visited = set()  # recursion detection, shared by every expand() call
    svos = []
    sub_ent_types = []
    obj_ent_types = []

    for verb in [tok for tok in tokens if _is_non_aux_verb(tok)]:
        subs, verb_negated = _get_all_subs(verb)
        # hopefully there are subs, if not, don't examine this verb any longer
        if not subs:
            continue

        has_conj_verb, conj_verb = _right_of_verb_is_conj_verb(verb)
        if has_conj_verb:
            second_verb, objs = _get_all_objs(conj_verb, is_pas)
            reported_verbs = (verb, second_verb)
        else:
            resolved_verb, objs = _get_all_objs(verb, is_pas)
            reported_verbs = (resolved_verb,)

        for sub in subs:
            for obj in objs:
                obj_negated = _is_negated(obj)
                negated = verb_negated or obj_negated
                for reported in reported_verbs:
                    if is_pas:  # reverse object / subject for passive
                        first, verb_text, last = obj, reported.lemma_, sub
                    else:
                        first, verb_text, last = sub, reported.lower_, obj
                    # expand() mutates `visited`, so the call order matters
                    first_text = to_str(expand(first, tokens, visited))
                    last_text = to_str(expand(last, tokens, visited))
                    svos.append((first_text,
                                 "!" + verb_text if negated else verb_text,
                                 last_text))
                    sub_ent_types.append(sub.ent_type_)
                    obj_ent_types.append(obj.ent_type_)

    return (svos, sub_ent_types, obj_ent_types)

In [7]:
from joblib import dump, load, Parallel, delayed, cpu_count
from toolz import partition_all
import itertools

In [None]:
import itertools
# NOTE(review): in the visible cells `temp` is only ever assigned inside `mp`,
# so this scratch cell presumably fails on a fresh kernel -- confirm and remove.
results = list(itertools.chain(*temp))

In [None]:
from functools import partial
from spacy.util import minibatch

In [8]:
def spacy_process(texts, nlp):
    """Run `texts` through `nlp.pipe` and return the results as a list."""
    return list(nlp.pipe(texts))

In [9]:
def mp(items, function, cpu, *args):
    """Apply `function` to batches of `items` in parallel and merge the results.

    Args:
        items: sized sequence of work items.
        function: callable invoked as `function(batch, *args)` for every batch.
        cpu: number of joblib workers; also used to derive the batch size.
        *args: extra positional arguments passed to `function` after the batch.

    Returns:
        * `[]` when `items` is empty;
        * a dict mapping each key to the concatenation of the per-batch lists,
          when the batches return dicts;
        * a flat list of all per-batch elements, when the batches return lists
          or tuples.

    Raises:
        TypeError: if the batches return anything else.
    """
    if len(items) == 0:
        return []

    # round() yields 0 when there are far fewer items than workers; a batch
    # size of 0 would leave nothing to run, so use at least one item per batch
    batch_size = max(1, round(len(items) / cpu))
    partitions = partition_all(batch_size, items)
    temp = Parallel(n_jobs=cpu, max_nbytes=None)(delayed(function)(v, *args) for v in partitions)

    if isinstance(temp[0], dict):
        results = {}
        for batch in temp:
            for key, value in batch.items():
                results.setdefault(key, []).extend(value)
        return results
    if isinstance(temp[0], (list, tuple)):
        return list(itertools.chain.from_iterable(temp))
    raise TypeError(
        "mp() can only merge dict, list or tuple batch results, got "
        + type(temp[0]).__name__
    )

In [None]:
def mp(items, function, cpu, *args):
    """Apply `function` to batches of `items` in parallel and merge the results.

    Variant built on `minibatch` and joblib's multiprocessing backend. It
    redefines `mp`; an earlier cell holds a `partition_all`-based definition
    with the same signature.

    Args:
        items: sized sequence of work items.
        function: callable invoked as `function(batch, *args)` for every batch.
        cpu: number of joblib workers; also used to derive the batch size.
        *args: extra positional arguments passed to `function` after the batch.

    Returns:
        * `[]` when `items` is empty;
        * a dict mapping each key to the concatenation of the per-batch lists,
          when the batches return dicts;
        * a flat list of all per-batch elements, when the batches return lists
          or tuples.

    Raises:
        TypeError: if the batches return anything else.
    """
    if len(items) == 0:
        return []

    # round() yields 0 when there are far fewer items than workers; use at
    # least one item per batch
    batch_size = max(1, round(len(items) / cpu))
    partitions = minibatch(items, size=batch_size)
    executor = Parallel(n_jobs=cpu, backend='multiprocessing', prefer='processes')
    # The batch must be the FIRST positional argument (callers pass e.g.
    # spacy_process(texts, nlp)); binding *args up front with partial() would
    # place them before the batch.
    tasks = (delayed(function)(batch, *args) for batch in partitions)
    temp = executor(tasks)

    if isinstance(temp[0], dict):
        results = {}
        for batch in temp:
            for key, value in batch.items():
                results.setdefault(key, []).extend(value)
        return results
    if isinstance(temp[0], (list, tuple)):
        return list(itertools.chain.from_iterable(temp))
    raise TypeError(
        "mp() can only merge dict, list or tuple batch results, got "
        + type(temp[0]).__name__
    )

In [10]:
# Number of joblib workers: capped at 10 so a large shared machine (Brad) keeps
# some cores free, and never more than the cores that actually exist.
cpu = min(cpu_count(), 10)

In [11]:
def process_svo(text_list, sub_tags=False, obj_tags=False):
    """Parse `text_list` and extract SVO triples sentence by sentence.

    Loads the `en_core_web_sm` pipeline, appends `merge_entities` to it, parses
    the texts through `mp` / `spacy_process` using the module-level `cpu`
    worker count, and runs `findSVOs` on every sentence.

    Returns:
        `(sentences, svo_items)` -- two lists with one entry per document:
        the stripped sentence texts, and the `findSVOs` result for each
        sentence.

    NOTE(review): the pipeline object is handed to `mp` as an extra argument,
    so it is presumably serialised for the worker processes; the notebook's
    saved output ends in a pickling TypeError -- confirm whether this call is
    the source.
    """
    pipeline = spacy.load('en_core_web_sm')
    pipeline.add_pipe(merge_entities)
    docs = mp(text_list, spacy_process, cpu, pipeline)

    sentences = []
    for doc in docs:
        sentences.append([sent.string.strip() for sent in doc.sents])

    svo_items = []
    for doc in docs:
        svo_items.append([findSVOs(sent, sub_tags, obj_tags) for sent in doc.sents])

    return sentences, svo_items
    
    
def svo_to_df(sentences, svo_items):
    """Flatten per-sentence SVO results into a DataFrame with one row per triple.

    Args:
        sentences: list (per document) of lists of sentence strings.
        svo_items: list (per document) of lists (per sentence) of
            `(triples, subject entity labels, object entity labels)` as
            returned by findSVOs.

    Returns:
        DataFrame with the columns doc_id, sent_id, sentence, svo, subject,
        sub_type, verb, object, obj_type (in that order). Sentences without
        triples contribute no rows.
    """
    column_order = ('doc_id', 'sent_id', 'sentence', 'svo',
                    'subject', 'sub_type', 'verb', 'object', 'obj_type')
    columns = {name: [] for name in column_order}

    for doc_idx, doc_sentences in enumerate(sentences):
        for sent_idx, sentence in enumerate(doc_sentences):
            entry = svo_items[doc_idx][sent_idx]
            for triple_idx, triple in enumerate(entry[0]):
                columns['doc_id'].append(doc_idx)
                columns['sent_id'].append(sent_idx)
                columns['sentence'].append(sentence)
                columns['svo'].append(triple)
                columns['subject'].append(triple[0])
                columns['verb'].append(triple[1])
                columns['object'].append(triple[2])
                columns['sub_type'].append(entry[1][triple_idx])
                columns['obj_type'].append(entry[2][triple_idx])

    df = pd.DataFrame()
    for name in column_order:
        df[name] = columns[name]

    return df

In [12]:
# Corpus used by svo(), which reads the `content` column through this module-level name.
df = pd.read_csv('../data/sl2.csv')

In [13]:
def svo(sub_tags=False, obj_tags=False):
    """Extract SVO triples from the `content` column of the module-level `df`.

    Args:
        sub_tags: entity labels a subject must carry, or a falsy value for no
            entity filtering.
        obj_tags: same, for objects.

    Returns:
        The DataFrame built by `svo_to_df` from the `process_svo` results.
    """
    text_list = df.content.tolist()
    return svo_to_df(*process_svo(text_list, sub_tags, obj_tags))


In [14]:
# Line-profile process_svo while running the full extraction (needs the
# line_profiler extension loaded in the first cell). The output saved with this
# cell is a pickling TypeError rather than a profile.
%lprun -f process_svo svo(sub_tags, obj_tags)

TypeError: self.c cannot be converted to a Python object for pickling