In [26]:
import spacy
from spacy import displacy
from spacy.tokens import Doc
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

In [6]:
for token in doc:
    print(token.text, token.pos_, token.dep_)

Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN dobj
startup NOUN dep
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


In [7]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1f4ae09df10>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x1f4ae09dc10>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1f4ae07de00>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x1f4ae369ad0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x1f4ae371190>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1f4ae07df50>)]

In [8]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [11]:
doc2 = nlp(u"This is a sentence. This is another sentence.")

In [14]:
for token in doc2:
    print(token.text, token.pos_, token.dep_)

This PRON nsubj
is AUX ROOT
a DET det
sentence NOUN attr
. PUNCT punct
This PRON nsubj
is AUX ROOT
another DET det
sentence NOUN attr
. PUNCT punct


In [12]:

for sent in doc2.sents:
    print(sent)

This is a sentence.
This is another sentence.


In [16]:

# 1. Create a simple Doc object
words = ["Hello", "world", "!"]
spaces = [True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

Hello world!


In [17]:

# 2. Create a Doc from the words and spaces
doc = nlp.make_doc("Hello world!")
print(doc.text)


Hello world!


In [18]:

# 3. Create a Doc from the words and spaces
doc = Doc(nlp.vocab, words=["Hello", "world", "!"], spaces=[True, False, False])
print(doc.text)


Hello world!


In [19]:

# 4. Create a Doc from the words and spaces
doc = nlp("Hello world!")
print(doc.text)

Hello world!


In [23]:
doc2[5].is_sent_start
print(doc2[5])

This


Tokenization

In [27]:
mystring = '"We\'re moving to L.A.!"'

In [28]:
doc = nlp(mystring)

In [None]:
for token in doc:
    print(token.text)

In [30]:
doc2 = nlp(u"We're here to help! Send snail-mail, email support@outsite.com or visit us at http://www.outsite.com!")

In [None]:

for t in doc2:
    print(t)

In [32]:
doc3 = nlp(u"A 5km NYC cab ride costs $10.30")

In [None]:

for t in doc3:
    print(t)

In [34]:

doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

In [None]:

for t in doc4:
    print(t)

In [36]:

len(doc4)

11

In [37]:

len(doc4.vocab)

796

In [40]:
doc5 = nlp(u"It is better to give than to receive.")

doc5[0]

doc5[2:5]

better to give

In [49]:
doc8 = nlp(u"Apple to build a Hong Kong factory for $6 million")

In [57]:
for token in doc8:
    print(token.text, end=' | ')


Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [63]:
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

In [64]:
for chunk in doc9.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers


In [65]:
for ent in doc9.ents:
    print(ent)

In [68]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text, 
          ent.label_,
          spacy.explain(ent.label_))

Apple ORG Companies, agencies, institutions, etc.
U.K. GPE Countries, cities, states
$1 billion MONEY Monetary values, including unit


In [69]:

doc = nlp(u"San Francisco considers banning sidewalk delivery robots")

In [74]:

displacy.render(doc, style='dep', jupyter=True, options={'distance': 200})

In [75]:

doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.")

In [76]:
displacy.render(doc, style='ent', jupyter=True)