## Tokenization in NLTK

In [1]:
from nltk.tokenize import word_tokenize
text1 = "The chicken danced because she loved disco."
tokens = word_tokenize(text1)
print(tokens)

['The', 'chicken', 'danced', 'because', 'she', 'loved', 'disco', '.']


In [2]:
# notice the 'Mr.' is one token but 'tacology.' is two tokens.
text2 = "Mr. Smith loves tacos. He has a Ph.D. in tacology."
tokens = word_tokenize(text2)
print(tokens)

['Mr.', 'Smith', 'loves', 'tacos', '.', 'He', 'has', 'a', 'Ph.D.', 'in', 'tacology', '.']


## Tokenization in spaCy

In [3]:
from spacy.en import English
parser = English()
tokens = parser(text1)
tokens = [token.orth_ for token in tokens if not token.orth_.isspace()]
print(tokens)

['The', 'chicken', 'danced', 'because', 'she', 'loved', 'disco', '.']


In [4]:
# Here we see why spaCy made the sentence segmentation error in the previous lesson
# ** Seems to be fixed in upgrades to spaCy
tokens = parser(text2)
tokens = [token.orth_ for token in tokens if not token.orth_.isspace()]
print(tokens)

['Mr.', 'Smith', 'loves', 'tacos', '.', 'He', 'has', 'a', 'Ph.D.', 'in', 'tacology', '.']


In [5]:
# It can be fixed with a little effort https://github.com/explosion/spaCy/issues/592
import spacy
text = "He has a Ph.D. in tacology."
spacy.en.English.Defaults.tokenizer_exceptions["Ph.D."] = [{"F": "Ph.D."}]
parser = English()
tokens = parser(text2)
tokens = [token.orth_ for token in tokens if not token.orth_.isspace()]
print(tokens)

['Mr.', 'Smith', 'loves', 'tacos', '.', 'He', 'has', 'a', 'Ph.D.', 'in', 'tacology', '.']


In [6]:
# tokens in spaCy have a lot of information
dir(tokens[0])

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmod__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'capitalize',
 'casefold',
 'center',
 'count',
 'encode',
 'endswith',
 'expandtabs',
 'find',
 'format',
 'format_map',
 'index',
 'isalnum',
 'isalpha',
 'isdecimal',
 'isdigit',
 'isidentifier',
 'islower',
 'isnumeric',
 'isprintable',
 'isspace',
 'istitle',
 'isupper',
 'join',
 'ljust',
 'lower',
 'lstrip',
 'maketrans',
 'partition',
 'replace',
 'rfind',
 'rindex',
 'rjust',
 'rpartition',
 'rsplit',
 'rstrip',
 'split',
 'splitlines',
 'startswith',
 'strip',
 'swapcase',
 'title',
 'translate',
 'upper',
 'zfill']

In [7]:
def print_token(token):
    print("===================================")
    print("value:", token.orth_)
    print("lemma:", token.lemma_)
    print("shape:", token.shape_)

In [8]:
# Note the lemma for "ran" and "was"
text3 = "He ran to the store because he was king of the apes."
tokens = parser(text3)
for token in tokens:
    print_token(token)

value: He
lemma: -PRON-
shape: Xx
value: ran
lemma: run
shape: xxx
value: to
lemma: to
shape: xx
value: the
lemma: the
shape: xxx
value: store
lemma: store
shape: xxxx
value: because
lemma: because
shape: xxxx
value: he
lemma: -PRON-
shape: xx
value: was
lemma: be
shape: xxx
value: king
lemma: king
shape: xxxx
value: of
lemma: of
shape: xx
value: the
lemma: the
shape: xxx
value: apes
lemma: ape
shape: xxxx
value: .
lemma: .
shape: .
