In [None]:
import nltk
from nltk.tokenize import WhitespaceTokenizer, word_tokenize, TreebankWordTokenizer, TweetTokenizer
from nltk.tokenize import MWETokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [None]:
# Fetch the NLTK data packages this notebook relies on (presumably 'punkt' for
# word_tokenize and 'wordnet' / 'omw-1.4' for WordNetLemmatizer — confirm against
# the installed NLTK version).
for resource_name in ('punkt', 'wordnet'):
    nltk.download(resource_name)

# Kept as the cell's final bare expression so its return value is still displayed.
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Also fetch 'punkt_tab' — presumably the Punkt data format that newer NLTK
# releases load for word_tokenize; TODO confirm against the installed NLTK version.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
# Sample passage shared by every tokenizer, stemmer and lemmatizer cell. It mixes
# plain prose, a contraction ("Don't"), acronyms, a #hashtag and an @mention.
# The leading and trailing newlines are part of the value.
text = (
    "\n"
    "The quick brown fox jumps over the lazy dog! Don't stop learning. AI and ML are\n"
    "transforming the future. #ArtificialIntelligence @OpenAI.\n"
)

In [None]:
# Whitespace Tokenizer
# Splits the sample on whitespace only; per the recorded output, punctuation stays
# glued to the neighbouring word (e.g. 'dog!', 'learning.', '@OpenAI.').
whitespace_tokenizer = WhitespaceTokenizer()
whitespace_tokens = whitespace_tokenizer.tokenize(text)


In [None]:
# word_tokenize: NLTK's general-purpose word tokenizer. Per the recorded output it
# separates punctuation marks into their own tokens and splits the contraction
# "Don't" into 'Do' + "n't". These tokens feed the stemming and lemmatization cells.
punctuation_tokens = word_tokenize(text)

In [None]:
# Treebank Tokenizer
# Applied here to the whole multi-sentence string. In the recorded output the
# sentence-internal periods stay attached ('learning.', 'future.') and only the
# final '.' is split off — presumably because this tokenizer expects input that
# has already been split into sentences (confirm against the NLTK docs).
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(text)


In [None]:
# Tweet Tokenizer
# Constructed with default options. The recorded output shows it keeps the
# contraction "Don't" as a single token; presumably it also keeps '#hashtags' and
# '@mentions' intact — the recorded output is truncated here, so confirm on re-run.
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(text)

# Multi-Word Expression Tokenizer
# MWETokenizer post-processes an already-tokenized list, merging any run of tokens
# that exactly matches a registered expression into one token (joined with '_' by
# default). The two original expressions never match this sample: word_tokenize
# yields the single token 'ArtificialIntelligence' (from the hashtag), and
# "machine learning" does not occur. They are kept, and expressions that do occur
# in the sample are registered too so the cell visibly demonstrates merging
# (expected: 'quick_brown_fox' and 'lazy_dog').
mwe_tokenizer = MWETokenizer([
    ('Artificial', 'Intelligence'),
    ('machine', 'learning'),
    ('quick', 'brown', 'fox'),
    ('lazy', 'dog'),
])
mwe_tokens = mwe_tokenizer.tokenize(word_tokenize(text))


In [None]:
# Stemming: run every token produced by word_tokenize through two stemmers so
# their outputs can be compared side by side in the report cell.

# Porter Stemmer
porter_stemmer = PorterStemmer()
porter_stemmed = list(map(porter_stemmer.stem, punctuation_tokens))

# Snowball Stemmer (English rules)
snowball_stemmer = SnowballStemmer('english')
snowball_stemmed = list(map(snowball_stemmer.stem, punctuation_tokens))


In [None]:
# Lemmatizer
# Each token is lemmatized without a part-of-speech argument, so NLTK's default
# POS applies (noun, per the NLTK docs — verb forms such as 'transforming' are
# therefore presumably left unchanged; confirm on re-run).
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, punctuation_tokens))


In [None]:
# Final report: echo the sample text, then each tokenizer / stemmer / lemmatizer
# result under its own heading. Labels and ordering match the original output.
print("Original Text:", text, sep="\n")

print("\nTokenization:")
for tokenizer_label, tokenizer_output in (
    ("Whitespace Tokenizer:", whitespace_tokens),
    ("Punctuation-based Tokenizer:", punctuation_tokens),
    ("Treebank Tokenizer:", treebank_tokens),
    ("Tweet Tokenizer:", tweet_tokens),
    ("MWE Tokenizer:", mwe_tokens),
):
    print(tokenizer_label, tokenizer_output)

print("\nStemming:")
for stemmer_label, stemmer_output in (
    ("Porter Stemmer:", porter_stemmed),
    ("Snowball Stemmer:", snowball_stemmed),
):
    print(stemmer_label, stemmer_output)

print("\nLemmatization:")
print("Lemmatized:", lemmatized)

Original Text:

The quick brown fox jumps over the lazy dog! Don't stop learning. AI and ML are
transforming the future. #ArtificialIntelligence @OpenAI.


Tokenization:
Whitespace Tokenizer: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog!', "Don't", 'stop', 'learning.', 'AI', 'and', 'ML', 'are', 'transforming', 'the', 'future.', '#ArtificialIntelligence', '@OpenAI.']
Punctuation-based Tokenizer: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '!', 'Do', "n't", 'stop', 'learning', '.', 'AI', 'and', 'ML', 'are', 'transforming', 'the', 'future', '.', '#', 'ArtificialIntelligence', '@', 'OpenAI', '.']
Treebank Tokenizer: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '!', 'Do', "n't", 'stop', 'learning.', 'AI', 'and', 'ML', 'are', 'transforming', 'the', 'future.', '#', 'ArtificialIntelligence', '@', 'OpenAI', '.']
Tweet Tokenizer: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '!', "Don't", 's