In [8]:
import spacy

In [9]:
# Load the small English pipeline; `nlp` is the callable used to build every Doc below.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Run the pipeline over a sample sentence; the resulting Doc carries the token annotations.
doc = nlp("Tesla is looking at buying U.S. startup for $6 Million")

In [4]:
# Echo the Doc; its notebook repr is the original text.
doc

Tesla is looking at buying U.S. startup for $6 Million

In [8]:
# One line per token: text, coarse part-of-speech tag, dependency label.
for tok in doc:
    print(" ".join((tok.text, tok.pos_, tok.dep_)))

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
Million NUM pobj


In [9]:
# List the pipeline's (name, component) pairs in order.
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x17c8f75f908>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x17c8f763168>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x17c8f763708>)]

In [12]:
# Contraction example: the tokenizer splits "isn't" into "is" + "n't".
doc2 = nlp("Tesla isn't looking into startups anymore.")

In [13]:
# One line per token of doc2: text, coarse part-of-speech tag, dependency label.
for tok in doc2:
    print("{} {} {}".format(tok.text, tok.pos_, tok.dep_))

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [14]:
# Coarse part-of-speech tag of the first token ("Tesla").
doc2[0].pos_

'PROPN'

In [15]:
# Dependency label of the first token.
doc2[0].dep_

'nsubj'

In [16]:
# Lemma (base form) of the first token.
doc2[0].lemma_

'Tesla'

In [17]:
# Longer sample text used to demonstrate Span slicing.
# Adjacent string literals are concatenated into one string with no line breaks,
# so token indices depend only on the words below.
doc3 = nlp(
    'Although commonly attributed to John Lennon from his song "Beautiful Boy", '
    'the phrase "Life is what happens to us while we are making other plans" was written by '
    "cartoonist Allen Saunders and published in Reader's Digest in 1957, when Lennon was 17."
)

In [18]:
# Slice tokens 16 through 29 — the quoted sentence including both quote marks — out of doc3.
life_quote = doc3[16:30]

In [19]:
# Echo the slice; it displays as the text it covers.
life_quote

"Life is what happens to us while we are making other plans"

In [20]:
# A slice of a Doc is a Span, not a new Doc.
type(life_quote)

spacy.tokens.span.Span

In [21]:
# For comparison: the object returned by nlp() is a Doc.
type(doc3)

spacy.tokens.doc.Doc

In [27]:
# Three short sentences for the sentence-segmentation demo.
doc4 = nlp("This is the first sentence. This is another sentence. This is the last sentence.")

In [28]:
# Walk the sentence spans detected in doc4, printing one per line.
for sent in doc4.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [29]:
# Token 6 is the "This" that opens the second sentence.
doc4[6].is_sent_start

True

In [31]:
# Token 7 ("is") sits mid-sentence, for contrast with token 6.
doc4[7].is_sent_start

# Tokenization

In [14]:
# Sample text whose content includes the surrounding double quotes.
mystring = "\"We're moving to L.A.!\""

In [15]:
# Echo the raw string (repr shows the escaped apostrophe).
mystring

'"We\'re moving to L.A.!"'

In [16]:
# Rebinds `doc` to the parse of mystring.
doc = nlp(mystring)

In [17]:
# Print each token's text on its own line.
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A.
!
"


In [20]:
# Text mixing a hyphenated word, an e-mail address and a URL (rebinds doc2).
doc2 = nlp("We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")

In [21]:
# One token per line.
for token in doc2:
    print(token)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [22]:
# Text with a unit attached to a number and a currency amount (rebinds doc3).
doc3 = nlp("A 5km NYC cab ride costs $10.30")

In [23]:
# One token per line.
for token in doc3:
    print(token)

A
5
km
NYC
cab
ride
costs
$
10.30


In [24]:
# Text containing abbreviations that end in a period (rebinds doc4).
doc4 = nlp("Let's visit St. Louis in the U.S. next year.")

In [26]:
# One token per line.
for token in doc4:
    print(token)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [27]:
# Token count of doc4.
len(doc4)

11

In [29]:
# Number of entries in the vocabulary object attached to doc4.
len(doc4.vocab)

512

In [32]:
# Short sentence used to demonstrate slicing a Doc by token index.
doc5 = nlp("It is better to give than to receive.")

In [33]:
# Tokens 2, 3 and 4 of doc5.
doc5[2:5]

better to give

In [39]:
# Sentence containing an organisation, a place and a money amount for the entity demo.
doc6 = nlp("Apple to build a Hong Kong factory for $6 million")

In [40]:
# Show the tokens on a single line, each followed by " | ".
for token in doc6:
    print(token.text, end=" | ")

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [43]:
# For every named entity: its text, its label, the label's description,
# then blank lines as a separator (the trailing "\n" argument plus print's own newline).
for entity in doc6.ents:
    print(
        entity,
        entity.label_,
        str(spacy.explain(entity.label_)),
        "\n",
        sep="\n",
    )

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [44]:
# Sentence with several noun phrases for the noun-chunk demo.
doc9 = nlp("Autonomous cars shift insurance liability toward manufacturers")

In [45]:
# Print each noun chunk found in doc9 on its own line.
for noun_chunk in doc9.noun_chunks:
    print(noun_chunk)

Autonomous cars
insurance liability
manufacturers


In [46]:
from spacy import displacy

In [47]:
# Sample sentence for the dependency visualisation (rebinds `doc`).
doc = nlp("Apple is going to build a U.K. factory for $6 million.")

In [48]:
# Draw the dependency parse of `doc` inline in the notebook.
displacy.render(
    doc,
    style="dep",
    jupyter=True,
    options={"distance": 110},
)

In [55]:
# Sample sentence for the entity visualisation (rebinds `doc`).
doc = nlp("Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million")

In [56]:
# Render `doc` with the "ent" style inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [57]:
# Sentence with several inflections of "run" for the lemmatization demo (rebinds `doc`).
doc = nlp("I am a runner running in a race because I love to run since I ran today")

In [59]:
# Per token: text, coarse POS tag, lemma hash (integer) and lemma string,
# separated by a tab padded with one space on each side.
for token in doc:
    print(token.text, token.pos_, token.lemma, token.lemma_, sep=" \t ")

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [60]:
 print(nlp.Defaults.stop_words)

{'being', 'several', 'show', 'only', 'fifty', 'whereas', 'perhaps', 'elsewhere', 'hence', '‘ll', 'sixty', 'on', 'either', 'say', 'next', 'least', 'since', 'when', 'whenever', 'before', 'six', 'whereafter', 'within', 'moreover', 'over', 'therein', 'off', 'i', 'call', 'whither', 'used', 'across', 'even', 'two', 'and', 'seem', 'between', 'whose', 'itself', 'us', 'thereafter', "'ve", 'must', 'together', 'becoming', 'my', 'rather', 'take', 'move', 'anything', 'our', 'toward', '’ll', 'be', 'does', 'front', 'for', 'hereby', 'why', 'anyone', 'whatever', 'who', 'one', 'latterly', 'whether', 'wherever', 'everything', 'see', 'but', 'please', 'due', 'forty', '’ve', 'as', 'made', 'or', 'into', 'first', 'without', 'five', 'any', 'no', 'former', 'thus', 'did', 'have', 'we', 'third', 'while', 'none', 'other', 'thru', 'now', 'was', 'how', 'eleven', 'various', 'than', 'upon', 'doing', 'more', 'they', 'whole', 'thereby', '’re', 'full', 'nor', 'whereupon', 'do', 'indeed', 'yourself', 'formerly', 'four', '

In [61]:
 len(nlp.Defaults.stop_words)

326

# Phrase matching and Vocabulary

In [62]:
from spacy.matcher import Matcher

In [63]:
# Token-pattern matcher built on the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [64]:
# Three token patterns covering the spellings of "solar power"; LOWER compares lower-cased token text.
# 1) a single token: "solarpower"
pattern1 = [dict(LOWER="solarpower")]
# 2) "solar", one punctuation token, "power"
pattern2 = [
    dict(LOWER="solar"),
    dict(IS_PUNCT=True),
    dict(LOWER="power"),
]
# 3) "solar" immediately followed by "power"
pattern3 = [
    dict(LOWER="solar"),
    dict(LOWER="power"),
]

In [65]:
# Register all three patterns under one match key.
# Uses the list-based add() signature: accepted since spaCy 2.2.2 and the only
# form in spaCy 3, where the positional `None` callback argument no longer exists.
matcher.add("SolarPower", [pattern1, pattern2, pattern3])

In [66]:
# Text containing all three spellings the patterns target (rebinds `doc`).
doc = nlp("The Solar Power industry continues to grow as solarpower increases. Solar-power is amazing")

In [67]:
# Run the matcher over the Doc; each hit is a (match_id, start, end) tuple of token indices.
matches = matcher(doc)

In [68]:
# Inspect the raw match tuples.
matches

[(8656102463236116519, 1, 3),
 (8656102463236116519, 8, 9),
 (8656102463236116519, 11, 14)]

In [69]:
# Turn each (match_id, start, end) hit back into the matched text.
for match_id, start_idx, end_idx in matches:
    print(doc[start_idx:end_idx])

Solar Power
solarpower
Solar-power


In [70]:
from spacy.matcher import PhraseMatcher
# Rebinds `matcher`: from here on it is a PhraseMatcher, not the token-pattern Matcher created earlier.
matcher = PhraseMatcher(nlp.vocab)

In [71]:
# Read the whole text file and run it through the pipeline (rebinds doc3).
# NOTE(review): no encoding is passed to open(), so the platform default applies — confirm it matches the file.
with open("reaganomics.txt") as text_file:
    doc3 = nlp(text_file.read())

In [72]:
# Token count of the loaded document.
len(doc3)

5933

In [74]:
# Phrases to search for in the loaded document.
phrase_list = [
    "voodoo economics",
    "supply-side economics",
    "trickle-down economics",
    "free-market economics",
]

In [75]:
# Convert each phrase into a Doc to use as a PhraseMatcher pattern.
# nlp.make_doc only tokenizes; the matcher compares token text, so running the
# tagger/parser/NER on every pattern (as nlp(text) does) is wasted work.
phrase_patterns = [nlp.make_doc(phrase) for phrase in phrase_list]

In [76]:
# Register every phrase Doc under one match key.
# Uses the list-based add() signature: accepted since spaCy 2.2.2 and the only
# form in spaCy 3, where the positional `None` callback argument no longer exists.
matcher.add("EconMatcher", phrase_patterns)

In [77]:
# Run the phrase matcher over the loaded document (rebinds `matches`).
matches = matcher(doc3)

In [78]:
# Inspect the raw (match_id, start, end) tuples.
matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2987, 2991)]

In [82]:
# Show the text covered by each phrase match in doc3.
for match_id, start_idx, end_idx in matches:
    print(doc3[start_idx:end_idx])

supply-side economics
trickle-down economics
voodoo economics
free-market economics
supply-side economics
trickle-down economics
