In [1]:
import spacy

# Build the pipeline from the "en_core_web_sm" package and keep the parsed
# sample sentence in `doc`.
sample_sentence = "I went there"
nlp = spacy.load("en_core_web_sm")
doc = nlp(sample_sentence)

In [1]:
import spacy
from spacy.symbols import ORTH

# Baseline: with the stock tokenizer, "lemme" comes out as a single token.
nlp = spacy.load("en_core_web_sm")
doc = nlp("lemme that")
token_texts = [piece.text for piece in doc]
print(token_texts)


['lemme', 'that']


In [2]:
# Register a tokenizer special case so the exact string "lemme" is split
# into the two pieces "lem" and "me".
special_case = [{ORTH: part} for part in ("lem", "me")]
nlp.tokenizer.add_special_case("lemme", special_case)
retokenized = nlp("lemme that")
print([piece.text for piece in retokenized])

['lem', 'me', 'that']


In [3]:
# The special case still applies when punctuation directly follows the word.
with_bang = nlp("lemme!")
print([piece.text for piece in with_bang])

['lem', 'me', '!']


In [4]:
# A special case can also cover a string wrapped in punctuation; the whole
# string is then kept as one token instead of being split up.
wrapped = "...lemme...?"
nlp.tokenizer.add_special_case(wrapped, [{"ORTH": wrapped}])
print([piece.text for piece in nlp(wrapped)])

['...lemme...?']


In [5]:
import spacy

nlp = spacy.load("en_core_web_sm")
text = "Let's go!"

# tokenizer.explain() is called on the raw string, so the full pipeline does
# not have to be run for this cell. Each item it returns is a pair of
# (rule that produced the token, token text); print the text first.
tok_exp = nlp.tokenizer.explain(text)
for rule, piece in tok_exp:
    print(piece, "\t", rule)

Let 	 SPECIAL-1
's 	 SPECIAL-2
go 	 TOKEN
! 	 SUFFIX


In [6]:
import spacy

nlp = spacy.load("en_core_web_sm")
text = "I flied to N.Y yesterday. It was around 5 pm."
doc = nlp(text)
# Print one sentence per line; the period inside "N.Y" does not end a sentence.
for sent in doc.sents:
    sentence_text = sent.text
    print(sentence_text)

I flied to N.Y yesterday.
It was around 5 pm.


In [1]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("I went there for working and worked for 3 years.")
# Show each token next to its lemma (e.g. "went" -> "go").
for token in doc:
    surface, lemma = token.text, token.lemma_
    print(surface, lemma)

I I
went go
there there
for for
working work
and and
worked work
for for
3 3
years year
. .


In [2]:
import spacy

nlp = spacy.load("en_core_web_sm")

# Custom rule: a token whose text is exactly "Angeltown" gets the lemma
# "Los Angeles".
ruler = nlp.get_pipe("attribute_ruler")
angeltown_patterns = [[{"TEXT": "Angeltown"}]]
ruler.add(angeltown_patterns, {"LEMMA": "Los Angeles"})

doc = nlp("I am flying to Angeltown")
for token in doc:
    surface, lemma = token.text, token.lemma_
    print(surface, lemma)

I I
am be
flying fly
to to
Angeltown Los Angeles


In [3]:
# doc.text gives back the string the Doc was created from.
doc = nlp("I like cats.")
original_text = doc.text
print(original_text)

I like cats.


In [4]:
# Iterating over a Doc yields its tokens in order.
for token in doc:
    piece = token.text
    print(piece)

I
like
cats
.


In [5]:
# Integer indexing picks a single token (here the second one).
second_token = doc[1]
print(second_token)

like


In [6]:
# len() of a Doc is its token count; the final "." counts as a token.
token_count = len(doc)
print(token_count)

4


In [7]:
doc = nlp("This is a sentence. This is the second sentence")
# Collect the sentences into a list so they can be printed together.
sentences = [sentence for sentence in doc.sents]
print(sentences)

[This is a sentence., This is the second sentence]


In [8]:
doc = nlp("I flied to New York with Ashley.")
# doc.ents holds the named entities recognised in the text.
entities = doc.ents
print(entities)

(New York, Ashley)


In [9]:
doc = nlp("Sweet brown fox jumped over the fence.")
# Gather the noun chunks into a list for display.
chunks = [chunk for chunk in doc.noun_chunks]
print(chunks)

[Sweet brown fox, the fence]


In [10]:
# lang_ is the language code of the Doc ("en" for this pipeline).
language_code = doc.lang_
print(language_code)

en


In [12]:
from pprint import pprint

# to_json() yields a dict holding the text, the entity and sentence offsets,
# and one annotation record per token; pprint keeps it readable.
doc = nlp("Hi")
json_doc = doc.to_json()
pprint(json_doc)

{'ents': [],
 'sents': [{'end': 2, 'start': 0}],
 'text': 'Hi',
 'tokens': [{'dep': 'ROOT',
             'end': 2,
             'head': 0,
             'id': 0,
             'lemma': 'hi',
             'morph': '',
             'pos': 'INTJ',
             'start': 0,
             'tag': 'UH'}]}


In [None]:
# Token attributes demonstrated by the cells that follow (this cell itself
# was never executed; it relies on a `token` left over from an earlier cell).
token.text  # the token's string, without trailing whitespace
token.text_with_ws  # the string plus any whitespace that follows the token
token.i  # index of the token within its Doc
token.idx  # character offset of the token within the Doc text
token.doc  # the Doc the token belongs to
token.sent  # the sentence containing the token
token.is_sent_start  # whether the token begins a sentence
token.ent_type  # NOTE(review): presumably the entity type as an ID; later cells print the string form `ent_type_` — confirm

In [13]:
doc = nlp("Hello Madam!")
# Printing a token shows its text.
first_token = doc[0]
print(first_token)

Hello


In [16]:
# .text is the token's string.
first_text = doc[0].text
print(first_text)

Hello


In [17]:
# text_with_ws keeps the whitespace after a token: "Hello " carries its
# trailing space, while the final "!" has nothing after it.
for position in (0, 2):
    print(doc[position].text_with_ws)

Hello 
!


In [18]:
# Left as the cell's last expression so the repr makes the trailing space visible.
hello_token = doc[0]
hello_token.text_with_ws

'Hello '

In [19]:
# The last token has no whitespace after it, so the repr shows just "!".
bang_token = doc[2]
bang_token.text_with_ws

'!'

In [20]:
# len() of a token is its number of characters ("Hello" -> 5).
hello_length = len(doc[0])
print(hello_length)

5


In [21]:
# token.i is the token's index inside the Doc.
token = doc[2]
token_position = token.i
print(token_position)

2


In [22]:
# idx is the character offset where a token starts: "Hello" at 0, "Madam" at 6.
for position in (0, 1):
    print(doc[position].idx)

0
6


In [23]:
# A token keeps a reference to the Doc it came from.
token = doc[0]
parent_doc = token.doc
print(parent_doc)

Hello Madam!


In [25]:
# token.sent is the sentence the token sits in.
token = doc[1]
containing_sentence = token.sent
print(containing_sentence)

Hello Madam!


In [27]:
doc = nlp("He entered the room. Then he nodded.")
# "He" (index 0) and "Then" (index 5) open sentences; "he" (index 6) does not.
for position in (0, 5, 6):
    print(doc[position].is_sent_start)

True
True
False


In [28]:
doc = nlp("I went there.")
# Lemma of the second token, "went".
went = doc[1]
print(went.lemma_)

go


In [35]:
doc = nlp("The Brazilian president visited Beijing")
print(doc.ents)
# ent_type_ is an empty string for tokens outside any entity. For the entity
# token "Brazilian" the label is shown with spacy.explain()'s description.
print(doc[0].ent_type_)
brazilian_label = doc[1].ent_type_
print(brazilian_label, spacy.explain(brazilian_label))
for position in (2, 3, 4):
    print(doc[position].ent_type_)

(Brazilian, Beijing)

NORP Nationalities or religious or political groups


GPE


In [5]:
import spacy

nlp = spacy.load("en_core_web_sm")

doc = nlp("All my moves are coldly calculated.")
# Slicing a Doc selects a run of tokens; indices 4 and 5 are "coldly calculated".
coldly_calculated = doc[4:6]
print(coldly_calculated)

coldly calculated


In [10]:
doc = nlp("All my moves are coldly calculated.")
# Leaving out the end index takes everything through the end of the Doc.
tail = doc[5:]
print(tail)
# Negative indices are supported as well; -1 stops before the final token.
middle = doc[4:-1]
print(middle)

calculated.
coldly calculated



In [48]:
doc = nlp("Recife has a thousand charms, it's a little piece of Brazil.")
# char_span() selects by character offsets instead of token indices.
start_offset, end_offset = 37, 59
print(doc.char_span(start_offset, end_offset))

little piece of Brazil


In [54]:
# Tokens 2 and 3 ("there", "after") form the span; it can be walked token by
# token in a for loop.
doc = nlp("You went there after you saw me")
span = doc[2:4]
for token in span:
    current = token
    print(current)

there
after


In [57]:
# The span covers the first two tokens; len() reports that token count.
# Kept as the last expression so the notebook displays the value.
doc = nlp("Hi Lorena!!")
span = doc[:2]
len(span)

2

In [59]:
doc = nlp("You went there after you saw me")
span = doc[2:6]
print(span)
# Slicing a span uses indices relative to the span, not to the Doc.
inner = span[1:3]
print(inner)

there after you saw
after you


In [60]:
doc = nlp("You went there after you saw me")
span = doc[2:6]
# Both the parent Doc and the enclosing sentence are reachable from a span.
for related in (span.doc, span.sent):
    print(related)

You went there after you saw me
You went there after you saw me


In [61]:
doc = nlp("You went there after you saw me")
span = doc[2:6]
# Token-level bounds first (index of the first token, exclusive end index),
# then the character-level bounds of the same span.
print(span.start, span.end, span.start_char, span.end_char, sep="\n")

2
6
9
28


In [63]:
doc = nlp("You went there after you saw me")
span = doc[2:6]
# as_doc() turns the span into a Doc object of its own.
small_doc = span.as_doc()
small_doc_type = type(small_doc)
print(small_doc_type)

<class 'spacy.tokens.doc.Doc'>


In [64]:
doc = nlp("Hello, hi!")
# lower_ is the lowercased token text.
first_lowered = doc[0].lower_
print(first_lowered)

hello


In [65]:
doc = nlp("Cat and Cat123")
# is_alpha: "Cat" -> True, "Cat123" (contains digits) -> False.
for position in (0, 2):
    print(doc[position].is_alpha)

True
False


In [66]:
doc = nlp("UR7 and Várzea")
# is_ascii: "UR7" -> True, "Várzea" (contains "á") -> False.
for position in (0, 2):
    print(doc[position].is_ascii)

True
False


In [67]:
doc = nlp("Cat Cat123 123")
# is_digit is True only for the all-digit token "123".
for position in (0, 1, 2):
    print(doc[position].is_digit)

False
False
True


In [68]:
doc = nlp("You, him and Sally")
# The comma is a token of its own and is flagged as punctuation.
comma = doc[1]
print(comma)
print(comma.is_punct)

,
True


In [69]:
doc = nlp("( [ He said yes. ] )")
# "(" is left punctuation; "]" (second-to-last token) is right punctuation.
opening, closing = doc[0], doc[-2]
print(opening)
print(opening.is_left_punct)
print(closing)
print(closing.is_right_punct)

(
True
]
True


In [76]:
# like_num is True for the digit form "100" ...
doc = nlp("I emailed you at least 100 times")
quantity = doc[-2]
print(quantity)
print(quantity.like_num)

# ... and also for the spelled-out form "hundred".
doc = nlp("I emailed you at least hundred times")
quantity = doc[-2]
print(quantity)
print(quantity.like_num)

# The e-mail address and the URL each stay a single token and are flagged by
# like_email / like_url respectively.
doc = nlp("His email is hello@hello.com and his website is https://nicewebsite.com")
email_token, url_token = doc[3], doc[8]
print(email_token)
print(email_token.like_email)
print(url_token)
print(url_token.like_url)

100
True
hundred
True
hello@hello.com
True
https://nicewebsite.com
True


In [77]:
doc = nlp("Girl called Kathy has a nickname Cat123.")
# shape_ abstracts the characters: uppercase -> X, lowercase -> x, digit -> d.
# Long runs are capped (e.g. "nickname" -> "xxxx"); punctuation is kept as is.
for token in doc:
    surface, shape = token.text, token.shape_
    print(surface, shape)

Girl Xxxx
called xxxx
Kathy Xxxxx
has xxx
a x
nickname xxxx
Cat123 Xxxddd
. .


In [78]:
doc = nlp("I visited Jenny at Mynks Resort")
# NOTE(review): every token reports True in the recorded output, even common
# words — presumably because en_core_web_sm ships without word vectors; confirm.
for token in doc:
    out_of_vocab = token.is_oov
    print(token, out_of_vocab)

I True
visited True
Jenny True
at True
Mynks True
Resort True


In [80]:
doc = nlp("One step forward, and you're no longer in the same place.")
# is_stop flags stop words such as "and", "the", "in"; content words and
# punctuation come out False.
for token in doc:
    stop_flag = token.is_stop
    print(token, stop_flag)

One True
step False
forward False
, False
and True
you True
're True
no True
longer False
in True
the True
same True
place False
. False
