**Perform Tokenization, Stemming, and Lemmatization**

In [1]:
pip install nltk



In [5]:
import nltk
# Importing the Class-based tokenizers for better compatibility
from nltk.tokenize import (
    word_tokenize,
    TreebankWordTokenizer,
    TweetTokenizer,
    MWETokenizer,
    WhitespaceTokenizer  # Use the class instead of the function
)
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages this notebook relies on, in a fixed order.
for resource_name in ('punkt', 'wordnet', 'omw-1.4'):
    nltk.download(resource_name)

# Required for newer NLTK versions. Left as the final bare expression so the
# cell still displays the call's return value.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

**Data**

In [6]:
# Sample sentence reused by the tokenizer cells below. It contains a
# possessive ("dogs'"), a hashtag ("#NLP") and an emoticon (":)"), which are
# the places where the tokenizers' outputs differ.
text = "The quick brown foxes are jumping over the lazy dogs' heads. #NLP is cool! :)"

# --- TOKENIZATION ---
# Banner line marking the start of the tokenization output.
print("--- Tokenization ---")

--- Tokenization ---


**Whitespace Tokenization**

In [7]:
# 1. Whitespace tokenization (class-based API).
# Splitting happens on whitespace only, so punctuation stays attached to the
# neighbouring word (e.g. "heads." and "cool!").
ws_tokenizer = WhitespaceTokenizer()
whitespace_tokens = ws_tokenizer.tokenize(text)
print("Whitespace:", whitespace_tokens)

Whitespace: ['The', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'the', 'lazy', "dogs'", 'heads.', '#NLP', 'is', 'cool!', ':)']


In [8]:
# 2. "Punctuation-based" tokenization via NLTK's standard word_tokenize.
# NOTE(review): NLTK also ships wordpunct_tokenize / WordPunctTokenizer, a
# purely regex-based punctuation splitter -- confirm which tokenizer the
# "punctuation-based" label is meant to demonstrate.
punct_tokens = word_tokenize(text)
print("Punctuation-based:", punct_tokens)

# 3. Treebank tokenizer, applied to the whole string at once.
# Unlike word_tokenize above, it leaves the mid-text "heads." intact.
tb_tokenizer = TreebankWordTokenizer()
treebank_tokens = tb_tokenizer.tokenize(text)
print("Treebank:", treebank_tokens)

Punctuation-based: ['The', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'the', 'lazy', 'dogs', "'", 'heads', '.', '#', 'NLP', 'is', 'cool', '!', ':', ')']
Treebank: ['The', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'the', 'lazy', 'dogs', "'", 'heads.', '#', 'NLP', 'is', 'cool', '!', ':', ')']


In [9]:
# 4. Tweet tokenizer: keeps the hashtag "#NLP" and the emoticon ":)" as
# single tokens instead of splitting them apart.
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(text)
print("Tweet:", tweet_tokens)

# 5. MWE (Multi-Word Expression) tokenizer: merges a registered sequence of
# tokens into one token (the parts are joined with "_" in the output).
mwe_text = "The United States of America is a large country."
mwe_tokenizer = MWETokenizer([('United', 'States', 'of', 'America')])
# The MWE tokenizer works on an already-split token list, not on a raw
# string, so the sentence is run through word_tokenize first.
mwe_input_tokens = word_tokenize(mwe_text)
print("MWE:", mwe_tokenizer.tokenize(mwe_input_tokens))

Tweet: ['The', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'the', 'lazy', 'dogs', "'", 'heads', '.', '#NLP', 'is', 'cool', '!', ':)']
MWE: ['The', 'United_States_of_America', 'is', 'a', 'large', 'country', '.']


**Stemming & Lemmatization**

In [10]:
# --- STEMMING & LEMMATIZATION ---
# Small word list: three inflections of "jump", a plural noun and a comparative.
words = ["jumping", "jumps", "jumped", "foxes", "better"]

print("\n--- Stemming ---")
# Both stemmers are applied to the same list so their outputs can be compared.
ps = PorterStemmer()
ss = SnowballStemmer("english")
print("Porter:", list(map(ps.stem, words)))
print("Snowball:", list(map(ss.stem, words)))

print("\n--- Lemmatization ---")
# lemmatize() is called without a part-of-speech argument, so every word is
# treated as a noun (the default pos). That is why the verb forms "jumping"
# and "jumped" come back unchanged; pass pos="v" to lemmatize them as verbs.
lemmatizer = WordNetLemmatizer()
print("Lemmatizer:", list(map(lemmatizer.lemmatize, words)))


--- Stemming ---
Porter: ['jump', 'jump', 'jump', 'fox', 'better']
Snowball: ['jump', 'jump', 'jump', 'fox', 'better']

--- Lemmatization ---
Lemmatizer: ['jumping', 'jump', 'jumped', 'fox', 'better']
