Perform tokenization (Whitespace, Punctuation-based, Treebank, Tweet, MWE) using NLTK 
library. Use porter stemmer and snowball stemmer for stemming. Use any technique for 
lemmatization.  

1 Install & Import Required Libraries

In [1]:
!pip install nltk





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import nltk
from nltk.tokenize import (
    WhitespaceTokenizer,
    wordpunct_tokenize,
    TreebankWordTokenizer,
    TweetTokenizer,
    MWETokenizer
)
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


In [3]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\samik\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\samik\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

2 Sample Text

In [4]:
text = "NLTK is amazing! I'm learning tokenization, stemming, and lemmatization ðŸ˜Š #AI"

3 Tokenization Techniques

In [None]:
# Whitespace Tokenization
whitespace_tokenizer = WhitespaceTokenizer()
print("Whitespace Tokenization:")
print(whitespace_tokenizer.tokenize(text))


Whitespace Tokenization:
['NLTK', 'is', 'amazing!', "I'm", 'learning', 'tokenization,', 'stemming,', 'and', 'lemmatization', 'ðŸ˜Š', '#AI']


In [None]:
# Punctuation-based Tokenization
print("\nPunctuation-based Tokenization:")
print(wordpunct_tokenize(text))



Punctuation-based Tokenization:
['NLTK', 'is', 'amazing', '!', 'I', "'", 'm', 'learning', 'tokenization', ',', 'stemming', ',', 'and', 'lemmatization', 'ðŸ˜Š', '#', 'AI']


In [None]:
#Treebank Tokenization
treebank_tokenizer = TreebankWordTokenizer()
print("\nTreebank Tokenization:")
print(treebank_tokenizer.tokenize(text))



Treebank Tokenization:
['NLTK', 'is', 'amazing', '!', 'I', "'m", 'learning', 'tokenization', ',', 'stemming', ',', 'and', 'lemmatization', 'ðŸ˜Š', '#', 'AI']


In [None]:
# Multi-Word Expression (MWE) Tokenization
mwe = MWETokenizer([('machine', 'learning'), ('artificial', 'intelligence')], separator='_')
sample_mwe_text = "I love machine learning and artificial intelligence"
print("\nMWE Tokenization:")
print(mwe.tokenize(sample_mwe_text.split()))



MWE Tokenization:
['I', 'love', 'machine_learning', 'and', 'artificial_intelligence']


4 Stemming

In [None]:
# Porter Stemmer
porter = PorterStemmer()
tokens = wordpunct_tokenize(text)

print("\nPorter Stemmer:")
print([porter.stem(word) for word in tokens])



Porter Stemmer:
['nltk', 'is', 'amaz', '!', 'i', "'", 'm', 'learn', 'token', ',', 'stem', ',', 'and', 'lemmat', 'ðŸ˜Š', '#', 'ai']


In [None]:
# Snowball Stemmer

snowball = SnowballStemmer("english")

print("\nSnowball Stemmer:")
print([snowball.stem(word) for word in tokens])



Snowball Stemmer:
['nltk', 'is', 'amaz', '!', 'i', "'", 'm', 'learn', 'token', ',', 'stem', ',', 'and', 'lemmat', 'ðŸ˜Š', '#', 'ai']


5 Lemmatization (WordNet Lemmatizer)

In [11]:
lemmatizer = WordNetLemmatizer()

print("\nLemmatization:")
print([lemmatizer.lemmatize(word) for word in tokens])



Lemmatization:


['NLTK', 'is', 'amazing', '!', 'I', "'", 'm', 'learning', 'tokenization', ',', 'stemming', ',', 'and', 'lemmatization', 'ðŸ˜Š', '#', 'AI']
