In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [7]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [15]:
# Print each token's text, coarse part-of-speech tag, and dependency label.
for tok in doc:
    print(" ".join((tok.text, tok.pos_, tok.dep_)))

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [16]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x2b1ea415308>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x2b1eab341c8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x2b1eab34288>)]

In [17]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [18]:
doc2 = nlp(u"Tesla isn't looking into startups anymore")

In [19]:
# Per-token report for doc2: text, part-of-speech tag, dependency label.
for word in doc2:
    print(f"{word.text} {word.pos_} {word.dep_}")

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod


In [20]:
doc2[0]

Tesla

In [21]:
doc2[0].pos_

'PROPN'

In [22]:
doc2[4].text

'into'

In [23]:
doc2[4].lemma_

'into'

In [24]:
doc3 = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [25]:
# doc3.sents yields the detected sentences; print them one per line.
for sent_span in doc3.sents:
    print(str(sent_span))

This is the first sentence.
This is another sentence.
This is the last sentence.


In [29]:
doc3[6]

This

In [28]:
doc3[6].is_sent_start

True

In [30]:
doc4 = nlp(u"A 5km NYC cab ride costs $10.30")

In [31]:
# Iterating a Doc yields its tokens; print the text of each on its own line.
for token in doc4:
    print(token.text)

A
5
km
NYC
cab
ride
costs
$
10.30


In [32]:
doc4[0]

A

In [33]:
doc4[2:5]

km NYC cab

In [34]:
type(doc4)

spacy.tokens.doc.Doc

In [35]:
doc5 = nlp(u'Apple to build a Hong Kong factory for $6 million')

In [36]:
# Print all tokens on a single line, each followed by a ' | ' separator.
for tok in doc5:
    print(f"{tok.text} | ", end="")

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [39]:
# For each named entity: its text and label, the label's description from
# spacy.explain, then blank lines as a visual separator.
for ent in doc5.ents:
    label = ent.label_
    print(ent, label)
    print(spacy.explain(label))
    print("\n")

Apple ORG
Companies, agencies, institutions, etc.


Hong Kong GPE
Countries, cities, states


$6 million MONEY
Monetary values, including unit




In [40]:
doc6 = nlp(u'Autonomous cars shift insurance liability toward manufacturers.')

In [41]:
# doc6.noun_chunks yields the noun phrases found in the sentence; print each one.
for noun_phrase in doc6.noun_chunks:
    print(noun_phrase.text)

Autonomous cars
insurance liability
manufacturers


In [21]:
from spacy import displacy

In [46]:
doc = nlp(u"Apple is going to build a U.K. factory for $6 million")

In [22]:
displacy.render(doc, style='dep', jupyter=True, options={'distance':110})

In [51]:
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Lemmatization

In [5]:
doc = nlp(u"I am a runner running in a race because I love to run since I ran today")

In [8]:
# Tab-separated table per token: text, part-of-speech tag, lemma hash ID
# (token.lemma) and the readable lemma string (token.lemma_).
for tok in doc:
    print(f"{tok.text} \t {tok.pos_} \t {tok.lemma} \t {tok.lemma_}")

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [9]:
## Stopwords

In [11]:
print(nlp.Defaults.stop_words)

{'they', 'a', 'rather', 'throughout', 'above', 'unless', 'put', 'down', 'noone', 'no', 'after', 'be', 'via', 'out', 'ca', 'up', 'seem', 'make', 'which', 'through', 'three', 'whither', 'hereafter', 'one', 'yourselves', 'whoever', 'keep', 'so', 'may', 'see', 'that', "'ll", '‘d', 'most', 'further', "'s", 'just', 'much', 'using', '‘m', 'not', 'together', 'whereby', 'few', 'below', 'whence', 'an', 'itself', 'too', 'beside', "'ve", 'n‘t', 'latterly', 'us', 'part', 'somewhere', 'twelve', 'whatever', 'five', 'sixty', 'somehow', 'during', 'herself', 'whereafter', 'along', 'eleven', 'when', 'own', 'show', 'almost', 'thus', 'give', 'then', 'meanwhile', 'forty', 'made', 'i', 'more', 'back', 'on', 'beyond', 'hers', 'always', 'before', 'whole', 'really', 'your', 'third', 'what', 'those', 'are', 'themselves', 'still', 'serious', "n't", 'them', 'never', 'ten', 'amount', 'must', 'alone', 'about', 'ever', 'over', 're', 'hereby', 'take', 'bottom', 'move', 'call', 'under', 'since', 'side', 'used', 'this',

In [13]:
nlp.Defaults.stop_words.add('btw')

In [14]:
nlp.vocab['btw'].is_stop=True

In [15]:
nlp.vocab['btw'].is_stop

True

In [23]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back")

In [24]:
print(doc.text)

The quick brown fox jumped over the lazy dog's back


In [27]:
doc[4].tag_

'VBD'

In [28]:
doc[4].pos_

'VERB'

In [30]:
# Aligned table per token: text, fine-grained tag, coarse POS tag (each
# left-justified to 10 characters) and the tag's description from spacy.explain.
for tok in doc:
    print(tok.text.ljust(10), tok.tag_.ljust(10), tok.pos_.ljust(10), spacy.explain(tok.tag_))

The        DT         DET        determiner
quick      JJ         ADJ        adjective
brown      JJ         ADJ        adjective
fox        NNP        PROPN      noun, proper singular
jumped     VBD        VERB       verb, past tense
over       IN         ADP        conjunction, subordinating or preposition
the        DT         DET        determiner
lazy       JJ         ADJ        adjective
dog        NN         NOUN       noun, singular or mass
's         POS        PART       possessive ending
back       NN         NOUN       noun, singular or mass


In [36]:
POS_Counts = doc.count_by(spacy.attrs.POS)

In [37]:
POS_Counts

{90: 2, 84: 3, 96: 1, 100: 1, 85: 1, 92: 2, 94: 1}

In [38]:
displacy.render(doc,style='dep',jupyter=True)

In [40]:
# Settings for the displaCy dependency visualizer. The spacing key is
# 'distance' (same spelling as the earlier render call in this notebook), and
# 'compact' is a boolean flag rather than a string.
options = {'distance': 110, 'compact': True, 'color': 'yellow', 'bg': '#09a3d5', 'font': 'times'}

In [41]:
displacy.render(doc,style='dep',jupyter=True,options=options)

In [42]:
## Named Entity Recognition (NER)

In [46]:
def show_ents(doc):
    """Print one "text - label - description" line per entity in ``doc``.

    The description is whatever ``spacy.explain`` returns for the entity's
    label, converted to a string. When ``doc.ents`` is empty, prints
    "No entities" instead.
    """
    if not doc.ents:
        print("No entities")
        return
    for span in doc.ents:
        print(" - ".join((span.text, span.label_, str(spacy.explain(span.label_)))))

In [47]:
doc = nlp(u"Hi! How are you")

In [48]:
show_ents(doc)

No entities


In [49]:
doc = nlp(u"Apple is going to build a U.K. factory for $6 million")

In [50]:
show_ents(doc)

Apple - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [61]:
# Entity-visualizer settings: only show PRODUCT / ORG / GPE entities and draw
# ORG labels with a yellow-to-red radial gradient.
colors = dict(ORG="radial-gradient(yellow,red)")
options = dict(ents=['PRODUCT', 'ORG', 'GPE'], colors=colors)

In [62]:
displacy.render(doc, style='ent', jupyter=True, options=options)

In [16]:
doc = nlp(u'"Management is doing the right things; Leadership is doing the right things."-PeterDrucker')

In [17]:
doc.text

'"Management is doing the right things; Leadership is doing the right things."-PeterDrucker'

In [18]:
# Print each detected sentence followed by two blank lines so the sentence
# boundaries chosen by the pipeline are easy to see.
for sentence in doc.sents:
    print(sentence, end="\n\n\n")

"Management is doing the right things; Leadership is doing the right things.


"-PeterDrucker


