In [1]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP dobj X.X. False False
startup startup NOUN NN dep xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [3]:
import spacy


In [5]:
for chunk in doc.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers


In [7]:
for sentence in doc.sents:
    print(sentence.root)

shift


In [8]:
for token in doc:
    print(token, token.subtree)

Autonomous <generator object at 0x7a31691419e0>
cars <generator object at 0x7a31691419e0>
shift <generator object at 0x7a31691419e0>
insurance <generator object at 0x7a31691419e0>
liability <generator object at 0x7a31691419e0>
toward <generator object at 0x7a31691419e0>
manufacturers <generator object at 0x7a31691419e0>


In [9]:
def skip_and_print(*args):
    """ Act like print(), but skip a line before printing. """
    print('\n' + str(args[0]), *args[1:])

In [10]:
def print_table(rows, padding=0):
    """ Print `rows` with content-based column widths. """
    col_widths = [
        max(len(str(value)) for value in col) + padding
        for col in zip(*rows)
    ]
    total_width = sum(col_widths) + len(col_widths) - 1
    fmt = ' '.join('%%-%ds' % width for width in col_widths)
    print(fmt % tuple(rows[0]))
    print('~' * total_width)
    for row in rows[1:]:
        print(fmt % tuple(row))

In [11]:
document_string = """
The Waystone Inn lay in silence,
and it was a silence of three parts.
"""

In [12]:
document_string = ' '.join(document_string.split())

skip_and_print('Working with string: "%s"' % document_string)
doc = nlp(document_string)


Working with string: "The Waystone Inn lay in silence, and it was a silence of three parts."


In [13]:
skip_and_print('All the found noun chunks & some properties:')

rows = [['Chunk', '.root', 'root.dep_', '.root.head']]
for chunk in doc.noun_chunks:
    rows.append([
        chunk,            # A Span object with the full phrase.
        chunk.root,       # The key Token within this phrase.
        chunk.root.dep_,  # The grammatical role of this phrase.
        chunk.root.head   # The grammatical parent Token.
    ])
print_table(rows, padding=4)


All the found noun chunks & some properties:
Chunk                .root       root.dep_     .root.head    
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The Waystone Inn     Inn         nsubj         lay           
silence              silence     pobj          in            
it                   it          nsubj         was           
a silence            silence     attr          was           
three parts          parts       pobj          of            


In [15]:
document_string = '''
It's the questions we can't answer that teach us the most.
They teach us how to think.'''

In [16]:
document_string = ' '.join(document_string.split())


In [17]:
skip_and_print('Working with string: "%s"' % document_string)
doc = nlp(document_string)


Working with string: "It's the questions we can't answer that teach us the most. They teach us how to think."


In [18]:
skip_and_print('Root word of each sentence:')
rows = [['Root', '|', 'Sentence']]
for sentence in doc.sents:
    rows.append([sentence.root, '|', sentence.text])
print_table(rows)


Root word of each sentence:
Root  | Sentence                                                  
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
's    | It's the questions we can't answer that teach us the most.
teach | They teach us how to think.                               


In [19]:
skip_and_print('Dependent words (aka subtree) of some tokens:')
rows = [['Token', '|', 'Subtree']]


Dependent words (aka subtree) of some tokens:


In [20]:
for token in [doc[9], doc[12], doc[15]]:
    subtree = [
        ('((%s))' if t is token else '%s') % t.text
        for t in token.subtree
    ]
    rows.append([token.text, '|', ' '.join(subtree)])
print_table(rows)

Token | Subtree                         
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
teach | that ((teach)) us the most      
most  | the ((most))                    
teach | They ((teach)) us how to think .


In [26]:
# Import required libraries
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag, word_tokenize, RegexpParser

# Example text
sample_text = "The quick brown fox jumps over the lazy dog"

# Find all parts of speech in the above sentence
tagged = pos_tag(word_tokenize(sample_text))

# Extract all parts of speech from any text
chunker = RegexpParser('''
    NP: {<DT>?<JJ>*<NN>} # To extract Noun Phrases
    P: {<IN>}           # To extract Prepositions
    V: {<V.*>}           # To extract Verbs
    PP: {<P> <NP>}       # To extract Prepositional Phrases
    VP: {<V> <NP|PP>*}   # To extract Verb Phrases
''')

# Print all parts of speech in the above sentence
output = chunker.parse(tagged)
print("After Extracting\n", output)


After Extracting
 (S
  (NP The/DT quick/JJ brown/NN)
  (NP fox/NN)
  (VP (V jumps/VBZ) (PP (P over/IN) (NP the/DT lazy/JJ dog/NN))))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [29]:
# To draw the parse tree
output.draw()


TclError: ignored