In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, WhitespaceTokenizer, WordPunctTokenizer, TreebankWordTokenizer, TweetTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')

# Sample text
text = "NLTK is an amazing library! It's used for NLP tasks. Tokenization, stemming & lemmatization are essential."

# Tokenization
whitespace_tokens = WhitespaceTokenizer().tokenize(text)
punctuation_tokens = WordPunctTokenizer().tokenize(text)
treebank_tokens = TreebankWordTokenizer().tokenize(text)
tweet_tokens = TweetTokenizer().tokenize(text)

# Stemming
porter = PorterStemmer()
snowball = SnowballStemmer("english")
porter_stemmed = [porter.stem(word) for word in whitespace_tokens]
snowball_stemmed = [snowball.stem(word) for word in whitespace_tokens]

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word) for word in whitespace_tokens]

# Output
print(f"Whitespace Tokens: {whitespace_tokens}")
print(f"Punctuation Tokens: {punctuation_tokens}")
print(f"Treebank Tokens: {treebank_tokens}")
print(f"Tweet Tokens: {tweet_tokens}")
print(f"Porter Stemmed: {porter_stemmed}")
print(f"Snowball Stemmed: {snowball_stemmed}")
print(f"Lemmatized: {lemmatized}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Whitespace Tokens: ['NLTK', 'is', 'an', 'amazing', 'library!', "It's", 'used', 'for', 'NLP', 'tasks.', 'Tokenization,', 'stemming', '&', 'lemmatization', 'are', 'essential.']
Punctuation Tokens: ['NLTK', 'is', 'an', 'amazing', 'library', '!', 'It', "'", 's', 'used', 'for', 'NLP', 'tasks', '.', 'Tokenization', ',', 'stemming', '&', 'lemmatization', 'are', 'essential', '.']
Treebank Tokens: ['NLTK', 'is', 'an', 'amazing', 'library', '!', 'It', "'s", 'used', 'for', 'NLP', 'tasks.', 'Tokenization', ',', 'stemming', '&', 'lemmatization', 'are', 'essential', '.']
Tweet Tokens: ['NLTK', 'is', 'an', 'amazing', 'library', '!', "It's", 'used', 'for', 'NLP', 'tasks', '.', 'Tokenization', ',', 'stemming', '&', 'lemmatization', 'are', 'essential', '.']
Porter Stemmed: ['nltk', 'is', 'an', 'amaz', 'library!', "it'", 'use', 'for', 'nlp', 'tasks.', 'tokenization,', 'stem', '&', 'lemmat', 'are', 'essential.']
Snowball Stemmed: ['nltk', 'is', 'an', 'amaz', 'library!', 'it', 'use', 'for', 'nlp', 'tasks.'