Tokenisation using spaCy

In [None]:
# Import spaCy and load the language library.
# The 'en_core_web_sm' model is loaded once here; the resulting `nlp`
# callable is what every later cell uses to turn text into a Doc.
import spacy
nlp = spacy.load('en_core_web_sm')

# Create a Doc object

In [None]:
# Build a string whose content itself starts and ends with a double
# quotation mark, so the quotes become part of the text to tokenise.
mystring = "\"We're moving to L.A.!\""
print(mystring)

"We're moving to L.A.!"


In [None]:
# Run the pipeline over the string, then show every token on a single
# line, each one followed by a ' | ' separator.
doc = nlp(mystring)
print(''.join(token.text + ' | ' for token in doc), end='')

" | We | 're | moving | to | L.A. | ! | " | 

Prefixes, Suffixes and Infixes

We're here to help! Send snail-email, email support@oursite.com or visit us at http://www.oursite.com!

In [None]:
# The e-mail address and the URL stay whole, while the hyphen in
# "snail-email" and the trailing "!" are split off as separate tokens.
doc2 = nlp(u"We're here to help! Send snail-email, email support@oursite.com or visit us at http://www.oursite.com!")
print(*doc2, sep='\n')

We
're
here
to
help
!
Send
snail
-
email
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [None]:
# The unit in "5km" and the currency symbol in "$10.30" are separated
# from their numbers; the decimal amount itself stays in one piece.
doc3 = nlp(u'A 5km NYC cab ride costs $10.30')
print(''.join(str(tok) + '\n' for tok in doc3), end='')

A
5
km
NYC
cab
ride
costs
$
10.30


Exceptions

In [None]:
# Abbreviations such as "St." and "U.S." keep their periods, whereas the
# sentence-final period becomes a token of its own.
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

for position in range(len(doc4)):
    print(doc4[position])

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


Counting Tokens

In [None]:
# Number of tokens in `doc` (the Doc built from `mystring`).
# NOTE(review): later cells rebind `doc`, so this count only holds when
# the notebook is executed top to bottom.
len(doc)

8

Counting Vocab Entries

In [None]:
# Number of entries in the vocabulary object attached to `doc`.
# NOTE(review): the figure presumably depends on the loaded model and the
# spaCy version — confirm before relying on the exact value.
len(doc.vocab)

794

Tokens can be retrieved by index position and slice

In [None]:
# Parse a short sentence to demonstrate indexing into a Doc.
doc5 = nlp(u'It is better to give than to receive.')

# Retrieve the third token (indices are zero-based, so position 2):
doc5[2]

better

In [None]:
# Retrieve three tokens from the middle: positions 2, 3 and 4
# (the end index 5 is excluded, as with any Python slice).
doc5[2:5]

better to give

In [None]:
# Retrieve a single token counting from the end: a negative index of -4
# selects the fourth token from the last.
doc5[-4]

than

In [None]:
# Retrieve the last four tokens as a slice; the final period counts as
# one of those four tokens.
doc5[-4:]

than to receive.

Tokens can't be reassigned

In [None]:
# Two parallel sentences that differ in their first and fourth words;
# the next cell tries to swap a token from one into the other.
doc6, doc7 = (
    nlp(u"My dinner was horrible."),
    nlp(u"Your dinner was delicious."),
)

In [None]:
# Try to change "My dinner was horrible" to "My dinner was delicious".
# A Doc does not support item assignment, so the statement below raises a
# TypeError. The error is the point of this cell, so it is caught and its
# message printed; an uncaught exception would stop "Restart & Run All"
# here and none of the following cells would execute.
try:
    doc6[3] = doc7[3]
except TypeError as err:
    print("TypeError:", err)

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

Named Entities

In [None]:
# Tokenise a sentence containing a company, a place and an amount of
# money, print the tokens on one line, then a divider on the next.
doc8 = nlp(u"Apple to build a Hong Kong factory for $6 million")

print(''.join(tok.text + ' | ' for tok in doc8), end='')
print("\n----")

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 
----


In [None]:
# One line per named entity: its text, its label, and the human-readable
# description of that label returned by spacy.explain.
for entity in doc8.ents:
    fields = [entity.text, entity.label_, str(spacy.explain(entity.label_))]
    print(' - '.join(fields))

Apple - ORG - Companies, agencies, institutions, etc.
Hong Kong - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [None]:
# Number of named-entity spans detected in doc8.
len(doc8.ents)

3

Noun Chunks

In [None]:
# List the noun chunks of the sentence, one chunk per line.
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

print(''.join(chunk.text + '\n' for chunk in doc9.noun_chunks), end='')

Autonomous cars
insurance liability
manufacturers


In [None]:
# Same listing for a negated sentence. The input text (including its
# spelling of "insurence") is kept exactly as written.
doc10 = nlp(u"Red cars do not carry higher insurence rates.")
for noun_phrase in doc10.noun_chunks:
    print(noun_phrase.text)

Red cars
higher insurence rates


In [None]:
# A long run of comma-separated modifiers: print each noun chunk's text
# on its own line.
doc11 = nlp(u"He was a one-eyed, one-horned, flying, purple people-eater.")
for chunk_text in (chunk.text for chunk in doc11.noun_chunks):
    print(chunk_text)

He
a one-eyed, one-horned, flying, purple people-eater


Built-in visualiser

Visualising the dependency parse

In [None]:
from spacy import displacy

# Parse a sentence and draw its dependency parse inline in the notebook;
# the 'distance' option is set to 80.
doc = nlp(u"Apple is going to build a U.K. factory for $6 million.")
displacy.render(
    doc,
    style='dep',
    jupyter=True,
    options={'distance': 80},
)

In [None]:
# Switch the visualiser to entity mode and render the result inline in
# the notebook.
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.")
displacy.render(
    doc,
    style='ent',
    jupyter=True,
)


In [None]:
# Same sentence and entity style, but with jupyter=False the call hands
# back the generated HTML markup as a string, which the notebook then
# shows as the cell's value instead of a rendered visualisation.
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.")
displacy.render(
    doc,
    style='ent',
    jupyter=False,
)


'<div class="entities" style="line-height: 2.5; direction: ltr">Over \n<mark class="entity" style="background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    the last quarter\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">DATE</span>\n</mark>\n \n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Apple\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n sold \n<mark class="entity" style="background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    nearly 20 thousand\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">CARDINAL