In [None]:
import nltk
from nltk.tokenize import *
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

# Fetch every NLTK resource the cells below need, so the notebook also runs on a
# fresh environment (Restart Kernel -> Run All):
#   omw-1.4, wordnet  -> data used by WordNetLemmatizer
#   punkt, punkt_tab  -> Punkt models behind nltk's word_tokenize
#                        (newer NLTK releases look for punkt_tab instead of punkt)
for resource in ('omw-1.4', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# input text
text = "The quick brown fox jumps over the lazy dog."

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Whitespace tokenization: split on whitespace only, so punctuation stays
# attached to the neighbouring word (the last token is 'dog.').
whitespace_tokenizer = WhitespaceTokenizer()
whitespace_tokens = whitespace_tokenizer.tokenize(text)
print("Whitespace        Tokenization :", whitespace_tokens)

Whitespace        Tokenization : ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.']


In [None]:
# punctuation-based tokenization
# WordPunctTokenizer splits the text into runs of word characters and runs of
# punctuation, which is what this cell's label describes. word_tokenize is
# NLTK's Treebank-style tokenizer and additionally requires the Punkt models,
# so it is not used here.
# The resulting tokens are reused by the stemming and lemmatization cells.
punct_tokens = WordPunctTokenizer().tokenize(text)
print("Punctuation-based Tokenization :", punct_tokens)

Punctuation-based Tokenization : ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']


In [None]:
# Treebank tokenization: run NLTK's TreebankWordTokenizer over the sentence.
# For this input the trailing period comes out as a separate token.
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(text)
print("Treebank          Tokenization :", treebank_tokens)

Treebank          Tokenization : ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']


In [None]:
# Tweet tokenization: run NLTK's TweetTokenizer with its default settings.
# For this plain sentence the trailing period comes out as a separate token.
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(text)
print("Tweet             Tokenization :", tweet_tokens)

Tweet             Tokenization : ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']


In [None]:
# MWE (multi-word expression) tokenization.
# MWETokenizer works on an already tokenized sequence, so the sentence is first
# split on whitespace. No multi-word expressions are registered here, so the
# tokens come back unmerged.
pre_split_words = text.split()
mwe_tokenizer = MWETokenizer()
mwe_tokens = mwe_tokenizer.tokenize(pre_split_words)
print("MWE               Tokenization :", mwe_tokens)

MWE               Tokenization : ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.']


In [None]:
# Porter stemming: reduce every token from the punctuation-based tokenization
# to its Porter stem (e.g. 'jumps' -> 'jump', 'lazy' -> 'lazi').
porter = PorterStemmer()
porter_stems = list(map(porter.stem, punct_tokens))
print("Porter   Stemmer :", porter_stems)

Porter   Stemmer : ['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog', '.']


In [None]:
# Snowball stemming: same tokens as the Porter cell, stemmed with the English
# Snowball stemmer so the two results can be compared side by side.
snowball = SnowballStemmer("english")
snowball_stems = list(map(snowball.stem, punct_tokens))
print("Snowball Stemmer :", snowball_stems)

Snowball Stemmer : ['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog', '.']


In [None]:
# Lemmatization with WordNetLemmatizer.
# lemmatize() is called without a part-of-speech argument, so its default
# (noun in NLTK) applies to every token; tokens are not lowercased first.
lemmatizer = WordNetLemmatizer()
lemmas = list(map(lemmatizer.lemmatize, punct_tokens))
print("Lemmatization :", lemmas)

Lemmatization : ['The', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '.']
