In [1]:
import nltk
from nltk.tokenize import (
    WhitespaceTokenizer,
    WordPunctTokenizer,
    TreebankWordTokenizer,
    TweetTokenizer,
    MWETokenizer
)
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

# Fetch the NLTK data packages this notebook relies on.
# The first two are the corpora used for lemmatization.
for corpus_name in ('wordnet', 'omw-1.4'):
    nltk.download(corpus_name)

# The punkt tokenizer package is fetched last and left as the cell's final
# expression, so the notebook echoes its success flag.
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Sample sentence with a contraction, punctuation, a hashtag and a mention,
# so the tokenizers below visibly disagree on how to split it.
text = "The quick brown foxes don't jump over the lazy dog! #nature @user"

# Whitespace Tokenizer: splits on whitespace only
ws_tokens = WhitespaceTokenizer().tokenize(text)

# Punctuation-based (WordPunct): separates runs of word characters from punctuation
wp_tokens = WordPunctTokenizer().tokenize(text)

# Treebank Tokenizer: Penn Treebank conventions (e.g. contractions are split)
tb_tokens = TreebankWordTokenizer().tokenize(text)

# Tweet Tokenizer: aware of social-media tokens such as hashtags and mentions
tweet_tokens = TweetTokenizer().tokenize(text)

# MWE Tokenizer (requires pre-defined expressions).
# It works on an already-tokenized list, so the Treebank tokens are fed in and
# the ('brown', 'foxes') pair is merged into a single token.
mwe_tokenizer = MWETokenizer([('brown', 'foxes')])
mwe_tokens = mwe_tokenizer.tokenize(tb_tokens)

# Printing the results so the tokenizers can be compared side by side
print(f"Whitespace: {ws_tokens}")
print(f"WordPunct:  {wp_tokens}")
print(f"Treebank:   {tb_tokens}")
print(f"Tweet:      {tweet_tokens}")
print(f"MWE:        {mwe_tokens}")

In [5]:
import nltk
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

# Make sure the WordNet data used by the lemmatizer is present
nltk.download('wordnet')

# Words to normalise with each technique
words = ["running", "flies", "happily", "better"]

# Build the three normalisers up front
porter = PorterStemmer()
snowball = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

# 1. Porter Stemmer
p_stems = list(map(porter.stem, words))

# 2. Snowball Stemmer
s_stems = list(map(snowball.stem, words))

# 3. Lemmatization: every word is looked up as a verb (pos='v')
lemmas = list(map(lambda word: lemmatizer.lemmatize(word, pos='v'), words))

# Report the original words next to each normalised form
print(f"Original: {words}")
print(f"Porter:   {p_stems}")
print(f"Snowball: {s_stems}")
print(f"Lemmas:   {lemmas}")

Original: ['running', 'flies', 'happily', 'better']
Porter:   ['run', 'fli', 'happili', 'better']
Snowball: ['run', 'fli', 'happili', 'better']
Lemmas:   ['run', 'fly', 'happily', 'better']


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
