In [3]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [4]:
import nltk
from nltk.tokenize import (
    WordPunctTokenizer,
    TreebankWordTokenizer,
    TweetTokenizer,
    MWETokenizer
)
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

# Sample texts for demonstration
# Text 1: Standard sentence with contraction
text_std = "It's distinct from the other ones. Doesn't it look nice?"
# Text 2: Social media text with hashtag and handle
text_tweet = "Just loving the weather! #SunnyDay @WeatherChannel :)"
# Text 3: Text with Multi-Word Expression
text_mwe = "The United States of America is a large country."

print("--- 1. TOKENIZATION ---")

# A. Whitespace Tokenization
# (Simple split by space, often leaves punctuation attached to words)
whitespace_tokens = text_std.split()
print(f"Whitespace:      {whitespace_tokens}")

# B. Punctuation-based Tokenization
# (Splits on whitespace and punctuation distinctively)
punct_tokenizer = WordPunctTokenizer()
punct_tokens = punct_tokenizer.tokenize(text_std)
print(f"Punctuation:     {punct_tokens}")

# C. Treebank Tokenization (Standard NLTK method)
# (Uses Penn Treebank conventions, e.g., separating contractions like "n't")
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(text_std)
print(f"Treebank:        {treebank_tokens}")

# D. Tweet Tokenization
# (Preserves emojis, hashtags, and handles)
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(text_tweet)
print(f"Tweet:           {tweet_tokens}")

# E. MWE (Multi-Word Expression) Tokenization
# (Merges specific words into a single token)
mwe_tokenizer = MWETokenizer([('United', 'States'), ('United', 'States', 'of', 'America')])
# Note: MWE tokenizer usually runs on an already tokenized list
base_tokens = text_mwe.split()
mwe_tokens = mwe_tokenizer.tokenize(base_tokens)
print(f"MWE:             {mwe_tokens}")


print("\n--- 2. STEMMING ---")
# Reducing words to their root/base form (often by chopping off ends).

words_to_stem = ["running", "generously", "happily", "organization", "wolves"]
porter = PorterStemmer()
snowball = SnowballStemmer(language='english')

print(f"{'Original':<15} | {'Porter':<15} | {'Snowball':<15}")
print("-" * 50)

for word in words_to_stem:
    p_stem = porter.stem(word)
    s_stem = snowball.stem(word)
    print(f"{word:<15} | {p_stem:<15} | {s_stem:<15}")


print("\n--- 3. LEMMATIZATION ---")
# Reducing words to their meaningful base form (lemma) using vocabulary analysis.

lemmatizer = WordNetLemmatizer()
words_to_lemmatize = ["better", "running", "wolves", "are", "corpora"]

print(f"{'Original':<15} | {'Lemma (Noun)':<15} | {'Lemma (Verb)':<15}")
print("-" * 50)

for word in words_to_lemmatize:
    # Default is noun (n)
    lemma_noun = lemmatizer.lemmatize(word)
    # Specifying 'v' handles the word as a verb
    lemma_verb = lemmatizer.lemmatize(word, pos='v')

    print(f"{word:<15} | {lemma_noun:<15} | {lemma_verb:<15}")

--- 1. TOKENIZATION ---
Whitespace:      ["It's", 'distinct', 'from', 'the', 'other', 'ones.', "Doesn't", 'it', 'look', 'nice?']
Punctuation:     ['It', "'", 's', 'distinct', 'from', 'the', 'other', 'ones', '.', 'Doesn', "'", 't', 'it', 'look', 'nice', '?']
Treebank:        ['It', "'s", 'distinct', 'from', 'the', 'other', 'ones.', 'Does', "n't", 'it', 'look', 'nice', '?']
Tweet:           ['Just', 'loving', 'the', 'weather', '!', '#SunnyDay', '@WeatherChannel', ':)']
MWE:             ['The', 'United_States_of_America', 'is', 'a', 'large', 'country.']

--- 2. STEMMING ---
Original        | Porter          | Snowball       
--------------------------------------------------
running         | run             | run            
generously      | gener           | generous       
happily         | happili         | happili        
organization    | organ           | organ          
wolves          | wolv            | wolv           

--- 3. LEMMATIZATION ---
Original        | Lemma (Noun)   