In [1]:
import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.tokenize import WhitespaceTokenizer, WordPunctTokenizer
from nltk.tokenize import TreebankWordTokenizer, MWETokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [15]:
# Sample sentence used by every tokenizer, stemmer and lemmatizer cell below.
# It ends with two adjacent symbols (an emoji and a check mark, no space
# between them) so the tokenizers can be compared on non-alphabetic input.
sentence = (
    "NLTK is a powerful library for natural language processing "
    "😊✔"
)

In [16]:
# Tokenization
# Whitespace tokenizer: splits the sentence on whitespace, so the two adjacent
# trailing symbols ('😊✔') end up together in a single token.
whitespace_tokenizer = WhitespaceTokenizer()
whitespace_tokens = whitespace_tokenizer.tokenize(sentence)

In [17]:
# Punctuation-based tokenizer (WordPunctTokenizer)
# Rule: word characters and punctuation/symbol characters are placed in
#       separate tokens. Adjacent symbols are not split from each other: in the
#       recorded output of this notebook the trailing '😊✔' stays one token.
# punct_tokens is the input for the stemming and lemmatization cells below.

punct_tokenizer = WordPunctTokenizer()
punct_tokens = punct_tokenizer.tokenize(sentence)

In [18]:
# Treebank tokenizer (TreebankWordTokenizer)
# Rule: tokenizes the text with regular expressions, i.e. patterns that
#       describe which sets of strings should match.
# NOTE(review): the rules presumably follow Penn Treebank conventions, as the
#       class name suggests -- confirm against the NLTK documentation.
# For this sample sentence the recorded output equals the whitespace split.

treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(sentence)

In [19]:
# Tweet tokenizer (TweetTokenizer)
# Rule: emoji / Unicode symbols are treated as tokens of their own. In the
#       recorded output '😊' and '✔' are separated, whereas every other
#       tokenizer in this notebook keeps them together as '😊✔'.

tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(sentence)

In [20]:
# Multi-Word Expression (MWE) tokenizer
# Rule: word sequences registered up front are merged into one token when they
#       occur in an already-tokenized input (here: the whitespace-split sentence).
# Of the two expressions registered below only ('natural', 'language') occurs
# in the sample sentence; ('processing', '.') never matches because the
# sentence contains no '.' token.

mwe_tokenizer = MWETokenizer(
    mwes=[('natural', 'language'), ('processing', '.')],
    separator='_',  # the default, spelled out: merged token is 'natural_language'
)
mwe_tokens = mwe_tokenizer.tokenize(sentence.split())

In [21]:
# Stemming -- Porter algorithm
# Every token from the punctuation-based tokenizer is reduced to its stem.
# As the recorded output shows, stems are lower-cased ('NLTK' -> 'nltk') and
# need not be dictionary words ('library' -> 'librari').
porter_stemmer = PorterStemmer()
porter_stems = list(map(porter_stemmer.stem, punct_tokens))

In [22]:
# Stemming -- Snowball algorithm with the English rule set, applied to the same
# tokens as the Porter stemmer so both results can be compared side by side.
snowball_stemmer = SnowballStemmer("english")
snowball_stems = list(map(snowball_stemmer.stem, punct_tokens))


In [23]:
# Lemmatization
#
# Lemmatization identifies a word's base (dictionary) form and transforms the
# word into it. Common approaches:
#
# 1. Rule-based
#    Word: "walked"
#    Rule applied: remove "-ed"
#    Result: "walk"
#
# 2. Dictionary-based
#    'running' -> 'run'
#    'better'  -> 'good'
#    'went'    -> 'go'
#
# 3. Machine-learning-based
#    Computational models (e.g. neural networks or statistical models) are
#    trained on large text datasets and learn the relationship between words
#    and their base forms, instead of relying on hand-written rules or a fixed
#    dictionary.

# WordNetLemmatizer reads the WordNet corpus, which is a separate download and
# not part of the nltk package itself. Without it the first lemmatize() call
# raises LookupError, so fetch it on demand to keep "Restart & Run All" working
# on a fresh environment.
# NOTE(review): some NLTK releases additionally require the 'omw-1.4' corpus --
# confirm against the NLTK version pinned for this notebook.
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet', quiet=True)

wordnet_lemmatizer = WordNetLemmatizer()

# No part-of-speech tag is passed to lemmatize(), so the lemmatizer's default
# is used; in the recorded output every token of the sample sentence comes back
# unchanged (e.g. 'processing' stays 'processing').
lemmas = [wordnet_lemmatizer.lemmatize(token) for token in punct_tokens]

In [24]:
# Print results
# Each row pairs a label with a value computed in an earlier cell. Rows are
# printed one per line, in the order the steps were run above.
results = [
    ("Original Sentence:", sentence),
    ("Whitespace Tokenizer:", whitespace_tokens),
    ("Punctuation-based Tokenizer:", punct_tokens),
    ("Treebank Tokenizer:", treebank_tokens),
    ("Tweet Tokenizer:", tweet_tokens),
    ("MWE Tokenizer:", mwe_tokens),
    ("Porter Stemmer:", porter_stems),
    ("Snowball Stemmer:", snowball_stems),
    ("Lemmatization:", lemmas),
]

for label, value in results:
    print(label, value)

Original Sentence: NLTK is a powerful library for natural language processing 😊✔
Whitespace Tokenizer: ['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'natural', 'language', 'processing', '😊✔']
Punctuation-based Tokenizer: ['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'natural', 'language', 'processing', '😊✔']
Treebank Tokenizer: ['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'natural', 'language', 'processing', '😊✔']
Tweet Tokenizer: ['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'natural', 'language', 'processing', '😊', '✔']
MWE Tokenizer: ['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'natural_language', 'processing', '😊✔']
Porter Stemmer: ['nltk', 'is', 'a', 'power', 'librari', 'for', 'natur', 'languag', 'process', '😊✔']
Snowball Stemmer: ['nltk', 'is', 'a', 'power', 'librari', 'for', 'natur', 'languag', 'process', '😊✔']
Lemmatization: ['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'natural', 'language', 'processing', '😊✔']
