In [3]:
import spacy
from spacy.lang.en import English
from spacy.symbols import ORTH
# Load the "en_core_web_sm" pipeline once; the cells below call it as `nlp(...)`
# and customise its tokenizer through `nlp.tokenizer`.
nlp = spacy.load("en_core_web_sm")


In [10]:
# Tokenize a sample sentence and show one token per output line.
# NOTE(review): "you choice" looks like a typo for "your choice"; the string is
# left untouched so the recorded output below still matches.
doc = nlp("I love GFG, what's you choice?")
print("\n".join(token.text for token in doc))

I
love
GFG
,
what
's
you
choice
?


## Adding special case tokenization rules

In [5]:
# Baseline: tokenize the phrase before any custom rule is registered.
doc = nlp("gimme that")
print([token.text for token in doc])  # ['gimme', 'that']

# Build the rule: the exact string "gimme" should come out as "gim" + "me".
special_case = [{ORTH: piece} for piece in ("gim", "me")]
nlp.tokenizer.add_special_case("gimme", special_case)

# Tokenize the same phrase again to see the rule take effect.
print([token.text for token in nlp("gimme that")])  # ['gim', 'me', 'that']


['gimme', 'that']
['gim', 'me', 'that']


## Debugging the tokenizer


In [6]:
# Use a fresh `English()` language object for the tokenizer-debugging cells.
# NOTE: this rebinds `nlp`. The pipeline created earlier with `spacy.load(...)`,
# and the "gimme" special case registered on that pipeline's tokenizer, are no
# longer reachable through this name from here on.
nlp = English()


In [7]:
# Sample input for the debugging cells: the sentence is wrapped in double
# quotes and contains the contraction "Let's" and a trailing "!".
text = '''"Let's go!"'''
# Tokenize it with the current `nlp` object.
doc = nlp(text)


In [8]:
# Ask the tokenizer to explain how it splits `text`. Each entry is indexed as
# t[0] = label (e.g. PREFIX, SPECIAL-1, TOKEN, SUFFIX) and t[1] = token text.
tok_exp = nlp.tokenizer.explain(text)

# Sanity check: the explanation must list exactly the non-whitespace tokens
# that `nlp(text)` produced, in the same order.
assert [t.text for t in doc if not t.is_space] == [t[1] for t in tok_exp]

# Print "token <TAB> label". The separator is a real tab character ("\t");
# the previous "\\t" printed a literal backslash followed by "t".
for t in tok_exp:
    print(t[1], "\t", t[0])

" \t PREFIX
Let \t SPECIAL-1
's \t SPECIAL-2
go \t TOKEN
! \t SUFFIX
" \t SUFFIX
