In [6]:
import spacy

# Noun chunks

Basic usage of spaCy from its web page. https://spacy.io/usage/linguistic-features#dependency-parse

> Noun chunks are “base noun phrases” – flat phrases that have a noun as their head. You can think of noun chunks as a noun plus the words describing the noun – for example, “the lavish green grass” or “the world’s largest tech fund”. To get the noun chunks in a document, simply iterate over Doc.noun_chunks.

In [7]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
            chunk.root.head.text)

Autonomous cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward


# Navigating the Parse Tree

> spaCy uses the terms head and child to describe the words connected by a single arc in the dependency tree. The term dep is used for the arc label, which describes the type of syntactic relation that connects the child to the head. As with other attributes, the value of .dep is a hash value. You can get the string value with .dep_.

In [11]:
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

Autonomous amod cars NOUN []
cars nsubj shift VERB [Autonomous]
shift ROOT shift VERB [cars, liability, toward]
insurance compound liability NOUN []
liability dobj shift VERB [insurance]
toward prep shift VERB [manufacturers]
manufacturers pobj toward ADP []


# Visualizing Dependencies

In [42]:
from spacy import displacy
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
# Since this is an interactive Jupyter environment, we can use displacy.render here
displacy.render(doc, style='dep')

# 40. Read the Parsed Result as Words

In [79]:
# https://stackoverflow.com/questions/1535327/how-to-print-instances-of-a-class-using-print

class Word(object):
    def __init__(self, text, lemma, pos):
        self.text = text
        self.lemma = lemma
        self.pos = pos
    
    def __str__(self):
        return self.text
    
    def __repr__(self):
        return self.__str__()

In [80]:
with open('./ai.en.txt') as f:
    text = f.read()
    text = text.replace('."', '".')
    text = text.replace('e.g.', 'eg')
    text = text.replace('U.S.', 'US')
    text = text.replace('U.S', 'US')
    text = text.replace('R.U.R.', 'RUR')
    text = text.replace('c.', 'c')
    text = text.replace('No.', 'No')
    text = text.split('\n')
    text = list(filter(lambda x: x, text))

In [81]:
sentences = []

for t in text:
    for sentence in t.split('.'):
        s = sentence.strip()
        if s:
            sentences.append(s)

In [82]:
for s in sentences:
    doc = nlp(s)
    words = [Word(token.text, token.lemma_, token.pos_) for token in doc]
    
    print(words)
    break

[In, computer, science, ,, artificial, intelligence, (, AI, ), ,, sometimes, called, machine, intelligence, ,, is, intelligence, demonstrated, by, machines, ,, in, contrast, to, the, natural, intelligence, displayed, by, humans, and, animals]


# 41. Depencency Tree

In [95]:
class Word(object):
    def __init__(self, text, lemma, pos, head, dep, children):
        self.token = token
        self.text = text
        self.lemma = lemma
        self.pos = pos
        self.head = head
        self.dep = dep
        self.children = children
    
    def __str__(self):
        return self.text
    
    def __repr__(self):
        return self.__str__()

In [99]:
my_sentences = []
for s in sentences:
    doc = nlp(s)
    
    words = [Word(token.text, token.lemma_, token.pos_, token.head, token.dep_, list(token.children)) for token in doc]
    
    my_sentences.append(words)

# 42. Root Words

In [112]:
docs = [nlp(s) for s in sentences]


In [116]:
for doc in docs:
    print(doc)
    displacy.render(doc, style='dep')
    
    for token in doc:
        if token.dep_ == 'ROOT':
            print(token, token.dep_)
    break

In computer science, artificial intelligence (AI), sometimes called machine intelligence, is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals


called ROOT
in ROOT


# 43. Verbs and Dependents

In [164]:
for doc in docs:
    for token in doc:
        if token.pos_ == "VERB":
            print(token, "\n")
            for child in token.children:
                # print(list(token.children))
                if child.pos_ == "NOUN":
                    print("  ", child)
    
    break

called 

   intelligence
   intelligence
is 

   intelligence
demonstrated 

displayed 



# 45. Subject, Verb, Object

In [166]:
doc = nlp("Frank Rosenblatt invented the perceptron")

In [167]:
displacy.render(doc, style='dep')

In [207]:
class Triplet(object):
    def __init__(self, token):
        self.token = token
        self.subject = None
        self.object = None
        
        for child in self.token.children:
            if child.dep_ == "nsubj":
                self.subject = child
            elif child.dep_ == "dobj":
                self.object = child
        
        self.subject_chunk = None
        self.object_chunk = None
        return
    
    def __str__(self):
        subject = str(self.subject) if self.subject_chunk is None else str(self.subject_chunk)
        predicate = str(self.token)
        object_ = str(self.object) if self.object_chunk is None else str(self.object_chunk)
        return "(subject, predicate, object) = (%s, %s, %s)" % (subject, predicate, object_)
    
    def __repr__(self):
        return self.__str__()
     

In [208]:
for token in doc:
    if token.pos_ == "VERB":
        triplet = Triplet(token)
        print(triplet)

(subject, predicate, object) = (Rosenblatt, invented, perceptron)


# 46 Expanding Subjects and Objects

In [209]:
doc = nlp("Frank Rosenblatt invented the perceptron")

In [210]:
for token in doc:
    if token.pos_ == "VERB":
        triplet = Triplet(token)
        print(triplet)

(subject, predicate, object) = (Rosenblatt, invented, perceptron)


In [211]:
for chunk in doc.noun_chunks:
    for token in chunk:
        # print(token, triplet.subject)
        if token == triplet.subject:
            triplet.subject_chunk = chunk

print(triplet)

(subject, predicate, object) = (Frank Rosenblatt, invented, perceptron)


# 47. Triple from a Passive Voice Sentence

In [212]:
doc = nlp("Artificial intelligence was founded as an academic discipline in 1955")

In [214]:
for token in doc:
    print(token, token.pos_)

Artificial ADJ
intelligence NOUN
was AUX
founded VERB
as ADP
an DET
academic ADJ
discipline NOUN
in ADP
1955 NUM


In [215]:
displacy.render(doc, style='dep')