In [19]:
# Step 1: Import necessary libraries
import nltk
from nltk.tokenize import word_tokenize, TreebankWordTokenizer, TweetTokenizer, MWETokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
import re

In [20]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Samruddhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Samruddhi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Samruddhi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [21]:
# Sample Text
text = "NLTK is a leading platform for building Python programs to work with human language data. It's awesome, isn't it?"

In [22]:
# 1. TOKENIZATION
print("Original Text:")
print(text)

Original Text:
NLTK is a leading platform for building Python programs to work with human language data. It's awesome, isn't it?


In [23]:
# Whitespace Tokenization
whitespace_tokens = text.split()
print("\nWhitespace Tokenization:")
print(whitespace_tokens)


Whitespace Tokenization:
['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', 'to', 'work', 'with', 'human', 'language', 'data.', "It's", 'awesome,', "isn't", 'it?']


In [24]:
# Punctuation-based Tokenization (using regex)
punct_tokens = re.findall(r'\w+|[^\w\s]', text)
print("\nPunctuation-based Tokenization:")
print(punct_tokens)


Punctuation-based Tokenization:
['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', 'to', 'work', 'with', 'human', 'language', 'data', '.', 'It', "'", 's', 'awesome', ',', 'isn', "'", 't', 'it', '?']


In [25]:
# Treebank Tokenizer
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(text)
print("\nTreebank Tokenizer:")
print(treebank_tokens)


Treebank Tokenizer:
['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', 'to', 'work', 'with', 'human', 'language', 'data.', 'It', "'s", 'awesome', ',', 'is', "n't", 'it', '?']


In [26]:
# Tweet Tokenizer
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(text)
print("\nTweet Tokenizer:")
print(tweet_tokens)


Tweet Tokenizer:
['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', 'to', 'work', 'with', 'human', 'language', 'data', '.', "It's", 'awesome', ',', "isn't", 'it', '?']


In [27]:
# MWE Tokenizer
mwe_tokenizer = MWETokenizer([('human', 'language'), ('NLTK', 'is')])
mwe_tokens = mwe_tokenizer.tokenize(word_tokenize(text))
print("\nMulti-Word Expression (MWE) Tokenizer:")
print(mwe_tokens)


Multi-Word Expression (MWE) Tokenizer:
['NLTK_is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', 'to', 'work', 'with', 'human_language', 'data', '.', 'It', "'s", 'awesome', ',', 'is', "n't", 'it', '?']


In [28]:
# 2. STEMMING

# Porter Stemmer
porter = PorterStemmer()
porter_stems = [porter.stem(word) for word in word_tokenize(text)]
print("\nPorter Stemmer:")
print(porter_stems)


Porter Stemmer:
['nltk', 'is', 'a', 'lead', 'platform', 'for', 'build', 'python', 'program', 'to', 'work', 'with', 'human', 'languag', 'data', '.', 'it', "'s", 'awesom', ',', 'is', "n't", 'it', '?']


In [29]:
# Snowball Stemmer
snowball = SnowballStemmer("english")
snowball_stems = [snowball.stem(word) for word in word_tokenize(text)]
print("\nSnowball Stemmer:")
print(snowball_stems)


Snowball Stemmer:
['nltk', 'is', 'a', 'lead', 'platform', 'for', 'build', 'python', 'program', 'to', 'work', 'with', 'human', 'languag', 'data', '.', 'it', "'s", 'awesom', ',', 'is', "n't", 'it', '?']


In [30]:
# 3. LEMMATIZATION

lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in word_tokenize(text)]
print("\nLemmatized Words:")
print(lemmatized_words)


Lemmatized Words:
['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'program', 'to', 'work', 'with', 'human', 'language', 'data', '.', 'It', "'s", 'awesome', ',', 'is', "n't", 'it', '?']
