In [1]:
import spacy
nlp = spacy.load("en_core_web_md")
doc = nlp("I went there")

In [2]:
for token in doc:
    print(token, type(token), token.text, type(token.text))

I <class 'spacy.tokens.token.Token'> I <class 'str'>
went <class 'spacy.tokens.token.Token'> went <class 'str'>
there <class 'spacy.tokens.token.Token'> there <class 'str'>


In [3]:
doc = nlp("I own a pretty cat.")

print([token.text for token in doc], type([token.text for token in doc]))

['I', 'own', 'a', 'pretty', 'cat', '.'] <class 'list'>


In [4]:
doc = nlp("It's been a crazy week!!!")

print([token for token in doc])

[It, 's, been, a, crazy, week, !, !, !]


In [16]:
from spacy.symbols import ORTH

nlp = spacy.load("en_core_web_md")
doc = nlp("lemme that")
print([token.text for token in doc])

special_case = [ {ORTH: "lem"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("lemme", special_case)
print([token.text for token in doc])

doc = nlp("Let's try again! Lemme that, lemme")
print([token.text for token in doc])

['lemme', 'that']
['lemme', 'that']
['Let', "'s", 'try', 'again', '!', 'Lemme', 'that', ',', 'lem', 'me']


In [17]:
special_case1 = [ {ORTH: "lem"}, {ORTH:"me"}]
special_case2 = [ {ORTH: "Lem"}, {ORTH:"me"}]
nlp.tokenizer.add_special_case("lemme", special_case1)
nlp.tokenizer.add_special_case("Lemme", special_case2)

doc = nlp("Let's try again! Lemme that, lemme")
print([token.text for token in doc])

['Let', "'s", 'try', 'again', '!', 'Lem', 'me', 'that', ',', 'lem', 'me']


In [18]:

special_case = [ {ORTH: "...lemme...?"}]
nlp.tokenizer.add_special_case("...lemme...?", special_case)


doc = nlp("...lemme...?")
print([token.text for token in doc])

['...lemme...?']


In [21]:
nlp = spacy.load("en_core_web_md")

text = "Let's go!"
doc = nlp(text)
print([token.text for token in doc])

detail_tokens = nlp.tokenizer.explain(text)
for detail_token in detail_tokens:
    print(detail_token, type(detail_token))
    print(detail_token[1], "\t", detail_token[0])

['Let', "'s", 'go', '!']
('SPECIAL-1', 'Let') <class 'tuple'>
Let 	 SPECIAL-1
('SPECIAL-2', "'s") <class 'tuple'>
's 	 SPECIAL-2
('TOKEN', 'go') <class 'tuple'>
go 	 TOKEN
('SUFFIX', '!') <class 'tuple'>
! 	 SUFFIX


In [23]:
import spacy
nlp = spacy.load("en_core_web_md")

text = "I flied to N.Y yesterday. It was around 5 pm."
doc = nlp(text)

for sentence in doc.sents:
    print(sentence)

print([token.text for token in doc])

I flied to N.Y yesterday.
It was around 5 pm.
['I', 'flied', 'to', 'N.Y', 'yesterday', '.', 'It', 'was', 'around', '5', 'pm', '.']


In [24]:
text = "According to the vaccine unit of SK Group, the trials will be conducted by comparing the candidate with AstraZeneca vaccines. In late May, the Ministry of Food and Drug Safety designed new comparative-style clinical trials, which compare the immunogenicity of an already authorized vaccine with a candidate under development in order to prove its efficacy. That was supposed to help local companies speed up development of Covid-19 vaccines without having to go through clinical trials that require large control groups."
doc = nlp(text)

for sentence in doc.sents:
    print(sentence)

According to the vaccine unit of SK Group, the trials will be conducted by comparing the candidate with AstraZeneca vaccines.
In late May, the Ministry of Food and Drug Safety designed new comparative-style clinical trials, which compare the immunogenicity of an already authorized vaccine with a candidate under development in order to prove its efficacy.
That was supposed to help local companies speed up development of Covid-19 vaccines without having to go through clinical trials that require large control groups.


In [25]:
text = "부산 해운대해수욕장에서 중학생 3명이 물놀이를 하던 중 1명이 실종되고 1명이 숨지는 사고가 발생했다. 25일 경찰과 소방당국에 따르면 이날 오전 3시 41분께 부산 해운대해수욕장에서 중학생 3명이 물놀이 하던 중 실종됐다는 신고가 접수됐다."
doc = nlp(text)

for sentence in doc.sents:
    print(sentence)

부산 해운대해수욕장에서 중학생 3명이 물놀이를 하던 중 1명이 실종되고 1명이 숨지는 사고가 발생했다.
25일 경찰과 소방당국에 따르면 이날 오전 3시 41분께 부산 해운대해수욕장에서 중학생 3명이 물놀이 하던 중
실종됐다는 신고가 접수됐다.


In [28]:
import spacy
nlp = spacy.load("en_core_web_md")

text = "I went there for working and worked for 3 years."
doc = nlp(text)

for token in doc:
    print(token.text, "\t", token.lemma_)

I 	 I
went 	 go
there 	 there
for 	 for
working 	 working
and 	 and
worked 	 work
for 	 for
3 	 3
years 	 year
. 	 .


In [29]:
import spacy
from spacy.symbols import ORTH, LEMMA
nlp = spacy.load("en_core_web_md")

special_case = [ {ORTH: "Angeltown", LEMMA: "Los Angeles"}]
nlp.tokenizer.add_special_case("Angeltown", special_case)

doc = nlp("I am flying to Angeltown")
for token in doc:
    print(token.text, token.lemma_)

ValueError: [E1005] Unable to set attribute 'LEMMA' in tokenizer exception for 'Angeltown'. Tokenizer exceptions are only allowed to specify ORTH and NORM.

In [30]:
import spacy
nlp = spacy.load("en_core_web_md")

doc = nlp("I know that you have been to Korea.")

for token in doc:
    print(token)

print(doc[2:4])
print(doc[4:])
print(doc[3:-1])
print(doc[6:])

I
know
that
you
have
been
to
Korea
.
that you
have been to Korea.
you have been to Korea
to Korea.


In [31]:
import spacy
nlp = spacy.load("en_core_web_md")

doc = nlp("I know that you have been to Korea.")
span = doc[2:4]
for token in span:
    print(token.text)

that
you


In [32]:
import spacy
nlp = spacy.load("en_core_web_md")

doc = nlp("Hello, hi!")

In [33]:
doc[0]

Hello

In [34]:
doc[0].lower_

'hello'

In [36]:
doc = nlp("HELLO, Hello, hello, hEllo")
for token in doc:
    print(token. text)

HELLO
,
Hello
,
hello
,
hEllo


In [38]:
print(doc[0].is_upper)
print(doc[0].is_lower)

True
False


In [45]:
doc = nlp("Cat and Cat123")
print(doc[0].is_alpha)
print(doc[2].is_alpha)

True
False


In [43]:
doc = nlp("ENglish and 한글!")
print(doc[0].is_ascii)
print(doc[2].is_ascii)
print(doc[3].is_ascii)

True
False
True


In [46]:
doc = nlp("Cat Cat123 123")
print(doc[0].is_digit)
print(doc[1].is_digit)
print(doc[2].is_digit)

False
False
True


In [48]:
doc = nlp("Hey, You and me!")
print(doc[0].is_punct)
print(doc[1].is_punct)
print(doc[2].is_punct)
print(doc[3].is_punct)
print(doc[4].is_punct)
print(doc[5].is_punct)

False
True
False
False
False
True


In [50]:
a = []

for d in doc:
    if d.is_punct:
        a.append(d)
    else:
        pass

print(a, len(a))

[,, !] 2


In [56]:
doc = nlp("( [ He said yes. ] )")
print(doc[0], doc[0].is_left_punct)
print(doc[1], doc[1].is_left_punct)
print(doc[-2], doc[-2].is_left_punct)
print(doc[-1], doc[-1].is_left_punct)
print(doc[-1], doc[-1].is_right_punct)

( True
[ True
] False
) False
) True


In [58]:
doc = nlp(" ")
print(doc[0], len(doc[0]), doc[0].is_space)

  1 True


In [59]:
doc = nlp("   ")
print(doc[0], len(doc[0]), doc[0].is_space)

    3 True


In [62]:
doc = nlp("( you said [1] and {2} is not applicable.)")
print(doc[0], doc[0].is_bracket, doc[-1], doc[-1].is_bracket)
print(doc[3], doc[3].is_bracket, doc[5], doc[5].is_bracket)

( True ) True
[ True ] True


In [64]:
doc = nlp("( you said '1\" is not applicable.)")
print(doc[3], doc[3].is_quote)
print(doc[5], doc[5].is_quote)

' True
" True


In [67]:
doc = nlp("I paid $12 for the shirt")
print(doc[2], doc[2].is_currency)

$ True


In [68]:
doc = nlp("I emailed ou at least 100 times")
print(doc[-2], doc[-2].like_num)

100 True


In [69]:
doc = nlp("I emailed ou at least hundred times")
print(doc[-2], doc[-2].like_num)

hundred True


In [73]:
doc = nlp("My email is lilybad@naver.com and lilya!#b@naver.com you can visit me under http://www.naver.com any time you want")

print([token.text for token in doc])
print(doc[3], doc[3].like_email)
print(doc[5], doc[5].like_email)
print(doc[-5], doc[-5].like_email)

['My', 'email', 'is', 'lilybad@naver.com', 'and', 'lilya!#b@naver.com', 'you', 'can', 'visit', 'me', 'under', 'http://www.naver.com', 'any', 'time', 'you', 'want']
lilybad@naver.com True
lilya!#b@naver.com False
http://www.naver.com False


In [74]:
doc = nlp("Girl called Kathy has a nickname Cat123.")
for token in doc:
    print(token.text, token.lemma_, token.shape_)


Girl girl Xxxx
called call xxxx
Kathy Kathy Xxxxx
has have xxx
a a x
nickname nickname xxxx
Cat123 cat123 Xxxddd
. . .


In [79]:
doc = nlp("I visited Jenny at Korean Restorant Busan Myoinbong Seoungeun")
for token in doc:
    print(token, token.is_oov)

I False
visited False
Jenny False
at False
Korean False
Restorant True
Busan False
Myoinbong True
Seoungeun True


In [78]:
doc = nlp("I just want to inform you that I was with the principle.")
for token in doc:
    print(token, token.is_stop)

I True
just True
want False
to True
inform False
you True
that True
I True
was True
with True
the True
principle False
. False
