In [10]:
import nltk
from nltk.tokenize import (TreebankWordTokenizer,
                          word_tokenize,
                          wordpunct_tokenize,
                          TweetTokenizer,
                          MWETokenizer)
# Sample text reused by the tokenizer demos below; it contains a contraction ("It's"),
# an abbreviation ending in a period ("Ms."), trailing punctuation and a hashtag.
sentence = "It's true, Ms. Martha Wayne! #Truth"

## There are three types of tokenization — sentence, word, and sub-word.

### Whitespace tokenization

In [11]:
# Whitespace tokenization is the simplest and most commonly used approach:
# the text is split wherever whitespace characters occur.
# It is rarely the best option, because punctuation stays glued to the
# neighbouring word (e.g. 'true,' and 'Wayne!').
whitespace_tokens = sentence.split()

print(f'whitespace tokenization: {whitespace_tokens}')

whitespace tokenization: ["It's", 'true,', 'Ms.', 'Martha', 'Wayne!', '#Truth']


### Punctuation-based tokenization

In [12]:
punct_tokens = wordpunct_tokenize(sentence)
print(f'punctuation-based tokenization: {punct_tokens}')

# Punctuation is now separated from the words, which looks far better than
# whitespace tokenization, but the abbreviation 'Ms.' is broken into 'Ms' and '.'
# even though the two should stay attached as a single token.

punctuation-based tokenization: ['It', "'", 's', 'true', ',', 'Ms', '.', 'Martha', 'Wayne', '!', '#', 'Truth']


### Default/Treebank word tokenizer

In [13]:
tokenizer = TreebankWordTokenizer()
treebank_tokens = tokenizer.tokenize(sentence)
print(f'Treebank tokenization: {treebank_tokens}')

# We can clearly see that 'Ms.' is now kept as one token and the contraction
# "It's" is split into 'It' and "'s"; the hashtag is still split into '#' and 'Truth'.

Treebank tokenization: ['It', "'s", 'true', ',', 'Ms.', 'Martha', 'Wayne', '!', '#', 'Truth']


### MWE Tokenizer

In [14]:
names = 'My name is Shakil Targeryan. King of the Seven Kingdoms and 3 Dragons, Protector of the realm, '

# Register the multi-word expression up front: once the text has been split into
# words, the listed word sequence is merged back into a single token.
mwe = MWETokenizer([('Shakil', 'Targeryan')])

# MWETokenizer works on an already-tokenized list, so split into words first.
word_tokens = word_tokenize(names)
print(f'Multi-word expression: {mwe.tokenize(word_tokens)}')


Multi-word expression: ['My', 'name', 'is', 'Shakil_Targeryan', '.', 'King', 'of', 'the', 'Seven', 'Kingdoms', 'and', '3', 'Dragons', ',', 'Protector', 'of', 'the', 'realm', ',']


### Tweet Tokenizer

In [15]:
# NOTE: despite its name, `token` holds the tokenizer object, not a single token.
# On this sample the tweet-oriented rules keep the hashtag and the contraction "It's"
# intact, while 'Ms.' is still split into 'Ms' and '.'.
token = TweetTokenizer()
tweet_tokens = token.tokenize(sentence)
print(f'Tweet-rules based: {tweet_tokens}')

Tweet-rules based: ["It's", 'true', ',', 'Ms', '.', 'Martha', 'Wayne', '!', '#Truth']
