In [1]:
import spacy
nlp = spacy.load("en_core_web_md")
doc = nlp("I went there")

In [2]:
for token in doc:
    print(token, type(token), token.text, type(token.text))

I <class 'spacy.tokens.token.Token'> I <class 'str'>
went <class 'spacy.tokens.token.Token'> went <class 'str'>
there <class 'spacy.tokens.token.Token'> there <class 'str'>


In [3]:
doc = nlp("I own a pretty cat.")

print([token.text for token in doc], type([token.text for token in doc]))

['I', 'own', 'a', 'pretty', 'cat', '.'] <class 'list'>


In [4]:
doc = nlp("It's been a crazy week!!!")

print([token for token in doc])

[It, 's, been, a, crazy, week, !, !, !]


In [16]:
from spacy.symbols import ORTH

nlp = spacy.load("en_core_web_md")
doc = nlp("lemme that")
print([token.text for token in doc])

special_case = [ {ORTH: "lem"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("lemme", special_case)
print([token.text for token in doc])

doc = nlp("Let's try again! Lemme that, lemme")
print([token.text for token in doc])

['lemme', 'that']
['lemme', 'that']
['Let', "'s", 'try', 'again', '!', 'Lemme', 'that', ',', 'lem', 'me']


In [17]:
special_case1 = [ {ORTH: "lem"}, {ORTH:"me"}]
special_case2 = [ {ORTH: "Lem"}, {ORTH:"me"}]
nlp.tokenizer.add_special_case("lemme", special_case1)
nlp.tokenizer.add_special_case("Lemme", special_case2)

doc = nlp("Let's try again! Lemme that, lemme")
print([token.text for token in doc])

['Let', "'s", 'try', 'again', '!', 'Lem', 'me', 'that', ',', 'lem', 'me']


In [18]:

special_case = [ {ORTH: "...lemme...?"}]
nlp.tokenizer.add_special_case("...lemme...?", special_case)


doc = nlp("...lemme...?")
print([token.text for token in doc])

['...lemme...?']


In [21]:
nlp = spacy.load("en_core_web_md")

text = "Let's go!"
doc = nlp(text)
print([token.text for token in doc])

detail_tokens = nlp.tokenizer.explain(text)
for detail_token in detail_tokens:
    print(detail_token, type(detail_token))
    print(detail_token[1], "\t", detail_token[0])

['Let', "'s", 'go', '!']
('SPECIAL-1', 'Let') <class 'tuple'>
Let 	 SPECIAL-1
('SPECIAL-2', "'s") <class 'tuple'>
's 	 SPECIAL-2
('TOKEN', 'go') <class 'tuple'>
go 	 TOKEN
('SUFFIX', '!') <class 'tuple'>
! 	 SUFFIX


In [23]:
import spacy
nlp = spacy.load("en_core_web_md")

text = "I flied to N.Y yesterday. It was around 5 pm."
doc = nlp(text)

for sentence in doc.sents:
    print(sentence)

print([token.text for token in doc])

I flied to N.Y yesterday.
It was around 5 pm.
['I', 'flied', 'to', 'N.Y', 'yesterday', '.', 'It', 'was', 'around', '5', 'pm', '.']


In [24]:
text = "According to the vaccine unit of SK Group, the trials will be conducted by comparing the candidate with AstraZeneca vaccines. In late May, the Ministry of Food and Drug Safety designed new comparative-style clinical trials, which compare the immunogenicity of an already authorized vaccine with a candidate under development in order to prove its efficacy. That was supposed to help local companies speed up development of Covid-19 vaccines without having to go through clinical trials that require large control groups."
doc = nlp(text)

for sentence in doc.sents:
    print(sentence)

According to the vaccine unit of SK Group, the trials will be conducted by comparing the candidate with AstraZeneca vaccines.
In late May, the Ministry of Food and Drug Safety designed new comparative-style clinical trials, which compare the immunogenicity of an already authorized vaccine with a candidate under development in order to prove its efficacy.
That was supposed to help local companies speed up development of Covid-19 vaccines without having to go through clinical trials that require large control groups.


In [25]:
text = "부산 해운대해수욕장에서 중학생 3명이 물놀이를 하던 중 1명이 실종되고 1명이 숨지는 사고가 발생했다. 25일 경찰과 소방당국에 따르면 이날 오전 3시 41분께 부산 해운대해수욕장에서 중학생 3명이 물놀이 하던 중 실종됐다는 신고가 접수됐다."
doc = nlp(text)

for sentence in doc.sents:
    print(sentence)

부산 해운대해수욕장에서 중학생 3명이 물놀이를 하던 중 1명이 실종되고 1명이 숨지는 사고가 발생했다.
25일 경찰과 소방당국에 따르면 이날 오전 3시 41분께 부산 해운대해수욕장에서 중학생 3명이 물놀이 하던 중
실종됐다는 신고가 접수됐다.


In [28]:
import spacy
nlp = spacy.load("en_core_web_md")

text = "I went there for working and worked for 3 years."
doc = nlp(text)

for token in doc:
    print(token.text, "\t", token.lemma_)

I 	 I
went 	 go
there 	 there
for 	 for
working 	 working
and 	 and
worked 	 work
for 	 for
3 	 3
years 	 year
. 	 .


In [29]:
import spacy
from spacy.symbols import ORTH, LEMMA
nlp = spacy.load("en_core_web_md")

special_case = [ {ORTH: "Angeltown", LEMMA: "Los Angeles"}]
nlp.tokenizer.add_special_case("Angeltown", special_case)

doc = nlp("I am flying to Angeltown")
for token in doc:
    print(token.text, token.lemma_)

ValueError: [E1005] Unable to set attribute 'LEMMA' in tokenizer exception for 'Angeltown'. Tokenizer exceptions are only allowed to specify ORTH and NORM.

In [30]:
import spacy
nlp = spacy.load("en_core_web_md")

doc = nlp("I know that you have been to Korea.")

for token in doc:
    print(token)

print(doc[2:4])
print(doc[4:])
print(doc[3:-1])
print(doc[6:])

I
know
that
you
have
been
to
Korea
.
that you
have been to Korea.
you have been to Korea
to Korea.


In [31]:
import spacy
nlp = spacy.load("en_core_web_md")

doc = nlp("I know that you have been to Korea.")
span = doc[2:4]
for token in span:
    print(token.text)

that
you


In [32]:
import spacy
nlp = spacy.load("en_core_web_md")

doc = nlp("Hello, hi!")

In [33]:
doc[0]

Hello

In [34]:
doc[0].lower_

'hello'

In [36]:
doc = nlp("HELLO, Hello, hello, hEllo")
for token in doc:
    print(token. text)

HELLO
,
Hello
,
hello
,
hEllo


In [38]:
print(doc[0].is_upper)
print(doc[0].is_lower)

True
False


In [45]:
doc = nlp("Cat and Cat123")
print(doc[0].is_alpha)
print(doc[2].is_alpha)

True
False


In [43]:
doc = nlp("ENglish and 한글!")
print(doc[0].is_ascii)
print(doc[2].is_ascii)
print(doc[3].is_ascii)

True
False
True


In [46]:
doc = nlp("Cat Cat123 123")
print(doc[0].is_digit)
print(doc[1].is_digit)
print(doc[2].is_digit)

False
False
True


In [48]:
doc = nlp("Hey, You and me!")
print(doc[0].is_punct)
print(doc[1].is_punct)
print(doc[2].is_punct)
print(doc[3].is_punct)
print(doc[4].is_punct)
print(doc[5].is_punct)

False
True
False
False
False
True


In [50]:
a = []

for d in doc:
    if d.is_punct:
        a.append(d)
    else:
        pass

print(a, len(a))

[,, !] 2


In [56]:
doc = nlp("( [ He said yes. ] )")
print(doc[0], doc[0].is_left_punct)
print(doc[1], doc[1].is_left_punct)
print(doc[-2], doc[-2].is_left_punct)
print(doc[-1], doc[-1].is_left_punct)
print(doc[-1], doc[-1].is_right_punct)

( True
[ True
] False
) False
) True


In [58]:
doc = nlp(" ")
print(doc[0], len(doc[0]), doc[0].is_space)

  1 True


In [59]:
doc = nlp("   ")
print(doc[0], len(doc[0]), doc[0].is_space)

    3 True


In [62]:
doc = nlp("( you said [1] and {2} is not applicable.)")
print(doc[0], doc[0].is_bracket, doc[-1], doc[-1].is_bracket)
print(doc[3], doc[3].is_bracket, doc[5], doc[5].is_bracket)

( True ) True
[ True ] True


In [64]:
doc = nlp("( you said '1\" is not applicable.)")
print(doc[3], doc[3].is_quote)
print(doc[5], doc[5].is_quote)

' True
" True


In [67]:
doc = nlp("I paid $12 for the shirt")
print(doc[2], doc[2].is_currency)

$ True


In [68]:
doc = nlp("I emailed ou at least 100 times")
print(doc[-2], doc[-2].like_num)

100 True


In [69]:
doc = nlp("I emailed ou at least hundred times")
print(doc[-2], doc[-2].like_num)

hundred True


In [73]:
doc = nlp("My email is lilybad@naver.com and lilya!#b@naver.com you can visit me under http://www.naver.com any time you want")

print([token.text for token in doc])
print(doc[3], doc[3].like_email)
print(doc[5], doc[5].like_email)
print(doc[-5], doc[-5].like_email)

['My', 'email', 'is', 'lilybad@naver.com', 'and', 'lilya!#b@naver.com', 'you', 'can', 'visit', 'me', 'under', 'http://www.naver.com', 'any', 'time', 'you', 'want']
lilybad@naver.com True
lilya!#b@naver.com False
http://www.naver.com False


In [74]:
doc = nlp("Girl called Kathy has a nickname Cat123.")
for token in doc:
    print(token.text, token.lemma_, token.shape_)


Girl girl Xxxx
called call xxxx
Kathy Kathy Xxxxx
has have xxx
a a x
nickname nickname xxxx
Cat123 cat123 Xxxddd
. . .


In [79]:
doc = nlp("I visited Jenny at Korean Restorant Busan Myoinbong Seoungeun")
for token in doc:
    print(token, token.is_oov)

I False
visited False
Jenny False
at False
Korean False
Restorant True
Busan False
Myoinbong True
Seoungeun True


In [78]:
doc = nlp("I just want to inform you that I was with the principle.")
for token in doc:
    print(token, token.is_stop)

I True
just True
want False
to True
inform False
you True
that True
I True
was True
with True
the True
principle False
. False


In [1]:
import spacy
nlp = spacy.load("en_core_web_md")

In [2]:
doc = nlp("Alicia and me went to the school by bus")

In [7]:
for token in doc:
    print(token.text, "\t", token.pos_,"\t", token.tag_, "\t", spacy.explain(token.pos_), "\t\t\t", spacy.explain(token.tag_))

Alicia 	 PROPN 	 NNP 	 proper noun 			 noun, proper singular
and 	 CCONJ 	 CC 	 coordinating conjunction 			 conjunction, coordinating
me 	 PRON 	 PRP 	 pronoun 			 pronoun, personal
went 	 VERB 	 VBD 	 verb 			 verb, past tense
to 	 ADP 	 IN 	 adposition 			 conjunction, subordinating or preposition
the 	 DET 	 DT 	 determiner 			 determiner
school 	 NOUN 	 NN 	 noun 			 noun, singular or mass
by 	 ADP 	 IN 	 adposition 			 conjunction, subordinating or preposition
bus 	 NOUN 	 NN 	 noun 			 noun, singular or mass


In [8]:
doc = nlp("My friend will fly to New York fast and she is staying there for 3 days.")

for token in doc:
    print(token.text, "\t", token.pos_,"\t", token.tag_, "\t", spacy.explain(token.pos_), "\t\t\t", spacy.explain(token.tag_))

My 	 PRON 	 PRP$ 	 pronoun 			 pronoun, possessive
friend 	 NOUN 	 NN 	 noun 			 noun, singular or mass
will 	 AUX 	 MD 	 auxiliary 			 verb, modal auxiliary
fly 	 VERB 	 VB 	 verb 			 verb, base form
to 	 ADP 	 IN 	 adposition 			 conjunction, subordinating or preposition
New 	 PROPN 	 NNP 	 proper noun 			 noun, proper singular
York 	 PROPN 	 NNP 	 proper noun 			 noun, proper singular
fast 	 ADV 	 RB 	 adverb 			 adverb
and 	 CCONJ 	 CC 	 coordinating conjunction 			 conjunction, coordinating
she 	 PRON 	 PRP 	 pronoun 			 pronoun, personal
is 	 AUX 	 VBZ 	 auxiliary 			 verb, 3rd person singular present
staying 	 VERB 	 VBG 	 verb 			 verb, gerund or present participle
there 	 ADV 	 RB 	 adverb 			 adverb
for 	 ADP 	 IN 	 adposition 			 conjunction, subordinating or preposition
3 	 NUM 	 CD 	 numeral 			 cardinal number
days 	 NOUN 	 NNS 	 noun 			 noun, plural
. 	 PUNCT 	 . 	 punctuation 			 punctuation mark, sentence closer


In [11]:
doc = nlp("I will ship the package tomorrow. ---- I saw a red ship")

for token in doc:
    print(token.text, "\t", token.pos_,"\t", token.tag_, "\t", spacy.explain(token.pos_), "\t\t\t", spacy.explain(token.tag_))

I 	 PRON 	 PRP 	 pronoun 			 pronoun, personal
will 	 AUX 	 MD 	 auxiliary 			 verb, modal auxiliary
ship 	 VERB 	 VB 	 verb 			 verb, base form
the 	 DET 	 DT 	 determiner 			 determiner
package 	 NOUN 	 NN 	 noun 			 noun, singular or mass
tomorrow 	 NOUN 	 NN 	 noun 			 noun, singular or mass
. 	 PUNCT 	 . 	 punctuation 			 punctuation mark, sentence closer
---- 	 PUNCT 	 NFP 	 punctuation 			 superfluous punctuation
I 	 PRON 	 PRP 	 pronoun 			 pronoun, personal
saw 	 VERB 	 VBD 	 verb 			 verb, past tense
a 	 DET 	 DT 	 determiner 			 determiner
red 	 ADJ 	 JJ 	 adjective 			 adjective (English), other noun-modifier (Chinese)
ship 	 NOUN 	 NN 	 noun 			 noun, singular or mass


In [12]:
doc = nlp("My cat will fish for a fish tomorrow in a fishy way.")

for token in doc:
    print(token.text, "\t", token.pos_,"\t", token.tag_, "\t", spacy.explain(token.pos_), "\t\t\t", spacy.explain(token.tag_))

My 	 PRON 	 PRP$ 	 pronoun 			 pronoun, possessive
cat 	 NOUN 	 NN 	 noun 			 noun, singular or mass
will 	 AUX 	 MD 	 auxiliary 			 verb, modal auxiliary
fish 	 VERB 	 VB 	 verb 			 verb, base form
for 	 ADP 	 IN 	 adposition 			 conjunction, subordinating or preposition
a 	 DET 	 DT 	 determiner 			 determiner
fish 	 NOUN 	 NN 	 noun 			 noun, singular or mass
tomorrow 	 NOUN 	 NN 	 noun 			 noun, singular or mass
in 	 ADP 	 IN 	 adposition 			 conjunction, subordinating or preposition
a 	 DET 	 DT 	 determiner 			 determiner
fishy 	 ADJ 	 JJ 	 adjective 			 adjective (English), other noun-modifier (Chinese)
way 	 NOUN 	 NN 	 noun 			 noun, singular or mass
. 	 PUNCT 	 . 	 punctuation 			 punctuation mark, sentence closer


In [13]:
doc = nlp("고기를 잡으러 바다에 간다.")

for token in doc:
    print(token.text, "\t", token.pos_,"\t", token.tag_, "\t", spacy.explain(token.pos_), "\t\t\t", spacy.explain(token.tag_))

고기를 	 INTJ 	 UH 	 interjection 			 interjection
잡으러 	 PROPN 	 NNP 	 proper noun 			 noun, proper singular
바다에 	 PROPN 	 NNP 	 proper noun 			 noun, proper singular
간다 	 ADV 	 RB 	 adverb 			 adverb
. 	 PUNCT 	 . 	 punctuation 			 punctuation mark, sentence closer


In [18]:
sentence1 = "I flew to Rome."
sentence2 = "I'm flying to Rome."
sentence3 = "I will fly to Rome."

doc1 = nlp(sentence1)
doc2 = nlp(sentence2)
doc3 = nlp(sentence3)

for doc in [doc1, doc2, doc3]:
    print( [(w.text, w.lemma_) for w in doc if w.tag_ == "VBG" or w.tag_ == "VB"])

[]
[('flying', 'fly')]
[('fly', 'fly')]


In [21]:
for doc in [doc1, doc2, doc3]:
    for w in doc:
        if w.tag_ =="VBG" or w.tag_ =="VB":
            print([w.text, w.lemma_])
        else:
            print("None")

None
None
None
None
None
None
None
['flying', 'fly']
None
None
None
None
None
['fly', 'fly']
None
None
None


In [23]:
doc = nlp("He earned $5.5 million in 2020 and paid %35 max.")

for token in doc:
    print(token.text, "\t", token.tag_, "\t",spacy.explain(token.tag_))

He 	 PRP 	 pronoun, personal
earned 	 VBD 	 verb, past tense
$ 	 $ 	 symbol, currency
5.5 	 CD 	 cardinal number
million 	 CD 	 cardinal number
in 	 IN 	 conjunction, subordinating or preposition
2020 	 CD 	 cardinal number
and 	 CC 	 conjunction, coordinating
paid 	 VBD 	 verb, past tense
% 	 NN 	 noun, singular or mass
35 	 CD 	 cardinal number
max 	 NN 	 noun, singular or mass
. 	 . 	 punctuation mark, sentence closer


In [25]:
doc = nlp("blue flower")

for token in doc:
    print(token.text, token.dep_, spacy.explain(token.dep_))

blue amod adjectival modifier
flower ROOT None


In [37]:
doc = nlp("The president Donald Trump visited France.")
print(doc.ents)
print(doc.ents[1])
print(spacy.explain(doc[-2].ent_type_))

(Donald Trump, France)
France
Countries, cities, states


In [27]:
spacy.explain("ORG")

'Companies, agencies, institutions, etc.'

In [30]:
doc = nlp("He worked for NASA")
print(doc.ents)
print(doc[3])
print(doc[3].ent_type_, "//", spacy.explain(doc[3].ent_type_))

(NASA,)
NASA
ORG // Companies, agencies, institutions, etc.


In [38]:
doc = nlp("Albert Einstein was born in Ulm on 1879. He studied electronical engineering at ETH Zurich.")
doc.ents

(Albert Einstein, Ulm, 1879, ETH Zurich)

In [47]:
print("ents: ", doc.ents, "\n")
for token in doc:
    print(token.text, "\t", token.ent_type_, "\t\t\t", spacy.explain(token.ent_type_))

ents:  (Albert Einstein, Ulm, 1879, ETH Zurich) 

Albert 	 PERSON 			 People, including fictional
Einstein 	 PERSON 			 People, including fictional
was 	  			 None
born 	  			 None
in 	  			 None
Ulm 	 GPE 			 Countries, cities, states
on 	  			 None
1879 	 DATE 			 Absolute or relative dates or periods
. 	  			 None
He 	  			 None
studied 	  			 None
electronical 	  			 None
engineering 	  			 None
at 	  			 None
ETH 	 ORG 			 Companies, agencies, institutions, etc.
Zurich 	 ORG 			 Companies, agencies, institutions, etc.
. 	  			 None


In [51]:
doc = nlp("Jean-Michel Basquiat was an American artist of Haitian and Puerto Rican descent who gained fame with his graffiti and street art work")

print("ents: ", doc.ents, "\n")
for token in doc:
    print(token.text, "\t", token.ent_type_, "\t\t\t\t", spacy.explain(token.ent_type_))

ents:  (Jean-Michel Basquiat, American, Haitian, Puerto Rican) 

Jean 	 PERSON 				 People, including fictional
- 	 PERSON 				 People, including fictional
Michel 	 PERSON 				 People, including fictional
Basquiat 	 PERSON 				 People, including fictional
was 	  				 None
an 	  				 None
American 	 NORP 				 Nationalities or religious or political groups
artist 	  				 None
of 	  				 None
Haitian 	 NORP 				 Nationalities or religious or political groups
and 	  				 None
Puerto 	 NORP 				 Nationalities or religious or political groups
Rican 	 NORP 				 Nationalities or religious or political groups
descent 	  				 None
who 	  				 None
gained 	  				 None
fame 	  				 None
with 	  				 None
his 	  				 None
graffiti 	  				 None
and 	  				 None
street 	  				 None
art 	  				 None
work 	  				 None


In [53]:
for ent in doc.ents:
    print(ent, type(ent), ent.label_, spacy.explain(ent.label_))

Jean-Michel Basquiat <class 'spacy.tokens.span.Span'> PERSON People, including fictional
American <class 'spacy.tokens.span.Span'> NORP Nationalities or religious or political groups
Haitian <class 'spacy.tokens.span.Span'> NORP Nationalities or religious or political groups
Puerto Rican <class 'spacy.tokens.span.Span'> NORP Nationalities or religious or political groups


In [54]:
doc = nlp("Saturn is the sixth planet from the Sun and the second-largest in the Solar System, after Jupiter. It is a gas giant with an average radius about nine times that of Earth.")
doc.ents

(Saturn, sixth, Sun, second, the Solar System, Jupiter, about nine, Earth)

In [56]:
for ent in doc.ents:
    print(ent, "\t", ent.label_,"\t", spacy.explain(ent.label_))

Saturn 	 PRODUCT 	 Objects, vehicles, foods, etc. (not services)
sixth 	 ORDINAL 	 "first", "second", etc.
Sun 	 PERSON 	 People, including fictional
second 	 ORDINAL 	 "first", "second", etc.
the Solar System 	 ORG 	 Companies, agencies, institutions, etc.
Jupiter 	 LOC 	 Non-GPE locations, mountain ranges, bodies of water
about nine 	 CARDINAL 	 Numerals that do not fall under another type
Earth 	 LOC 	 Non-GPE locations, mountain ranges, bodies of water


In [1]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_md")
doc = nlp("Bill Gates is the CEO of Microsoft.")
displacy.render(doc, style="ent")

In [2]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_md")
doc = nlp("Bill Gates is the CEO of Microsoft.")
displacy.render(doc, style="dep")

In [3]:
import spacy
from spacy import displacy
from pathlib import Path

nlp = spacy.load("en_core_web_md")
doc = nlp("I am a butterfly.")
svg = displacy.render(doc, style="dep", jupyter = False)
filename = "butterfly.svg"
output_path = Path(filename)
output_path.open("w", encoding="utf-8").write(svg)

3022

In [7]:
import spacy
from spacy import displacy
from pathlib import Path

nlp = spacy.load("en_core_web_md")
doc = nlp("Bill Gates is the CEO of Microsoft.")
svg = displacy.render(doc, style="ent", jupyter = False)
filename = "Bill_Gates.svg"
output_path = Path(filename)
output_path.open("w", encoding="utf-8").write(svg)

695