In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')

# Sample text
text = "Morphological analysis involves breaking down words into their constituent morphemes."

# Tokenization: Split the text into words
tokens = word_tokenize(text)

# Initialize a stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Stemming: Reduce words to their root form using Porter Stemmer
stemmed_tokens = [stemmer.stem(token) for token in tokens]

# Lemmatization: Reduce words to their base or dictionary form
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

# Display the results
print("Original Text:")
print(text)
print("\nTokenization:")
print(tokens)
print("\nStemming:")
print(stemmed_tokens)
print("\nLemmatization:")
print(lemmatized_tokens)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\choud\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\choud\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original Text:
Morphological analysis involves breaking down words into their constituent morphemes.

Tokenization:
['Morphological', 'analysis', 'involves', 'breaking', 'down', 'words', 'into', 'their', 'constituent', 'morphemes', '.']

Stemming:
['morpholog', 'analysi', 'involv', 'break', 'down', 'word', 'into', 'their', 'constitu', 'morphem', '.']

Lemmatization:
['Morphological', 'analysis', 'involves', 'breaking', 'down', 'word', 'into', 'their', 'constituent', 'morpheme', '.']


In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
text = 'This is assumed to be a list of tokens that have been previously processed and cleaned, likely by removing non-alphanumeric characters and converting the tokens to lowercase.'

tokens = word_tokenize(text)

filtered_tokens = [re.sub(r'[^a-zA-Z0-9]', '', token) for token in tokens]

stop_words = stopwords.words('english')
filtered_tokens_no_stop_words = [token for token in filtered_tokens if token not in stop_words]


# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens_no_stop_words]

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens_no_stop_words]

print(f'Filtered tokens - \n{filtered_tokens}\n')
print(f'Stop words removal - \n{filtered_tokens_no_stop_words}\n')
print(f'Stemmed tokens - \n{stemmed_tokens}\n')
print(f'Lemmatized tokens - \n{lemmatized_tokens}\n')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nidhi\AppData\Roaming\nltk_data...


Filtered tokens - 
['This', 'is', 'assumed', 'to', 'be', 'a', 'list', 'of', 'tokens', 'that', 'have', 'been', 'previously', 'processed', 'and', 'cleaned', '', 'likely', 'by', 'removing', 'nonalphanumeric', 'characters', 'and', 'converting', 'the', 'tokens', 'to', 'lowercase', '']

Stop words removal - 
['This', 'assumed', 'list', 'tokens', 'previously', 'processed', 'cleaned', '', 'likely', 'removing', 'nonalphanumeric', 'characters', 'converting', 'tokens', 'lowercase', '']

Stemmed tokens - 
['thi', 'assum', 'list', 'token', 'previous', 'process', 'clean', '', 'like', 'remov', 'nonalphanumer', 'charact', 'convert', 'token', 'lowercas', '']

Lemmatized tokens - 
['This', 'assumed', 'list', 'token', 'previously', 'processed', 'cleaned', '', 'likely', 'removing', 'nonalphanumeric', 'character', 'converting', 'token', 'lowercase', '']



Stemming and lemmatization are both text normalization techniques used in natural language processing (NLP) and information retrieval to reduce words to their base or root forms. While they serve the same fundamental purpose, they achieve this goal using different approaches and have distinct characteristics:

1. Stemming:

Approach: Stemming algorithms reduce words to their stem by removing common prefixes or suffixes. Stemmers use heuristic rules to cut off prefixes or suffixes to obtain the base form of a word.
Result: The resulting stems may not be valid words; they are simply substrings that have linguistic meaning. For example, "running" and "ran" would both be stemmed to "run."
Speed: Stemming is generally faster because it uses simple rules and string manipulation techniques.
Example: For the word "running," a stemming algorithm might remove the "-ing" suffix, resulting in the stem "run."

2. Lemmatization:

Approach: Lemmatization, on the other hand, reduces words to their lemma or dictionary form. Lemmatizers use vocabulary and morphological analysis to find the lemma that represents the word's base form.
Result: The resulting lemmas are actual words found in the dictionary and have valid linguistic meanings. Lemmatization ensures that words are reduced to their proper dictionary form.
Accuracy: Lemmatization is more accurate than stemming because it takes into account the word's meaning and context.
Example: For the word "running," a lemmatization algorithm would map it to the lemma "run," considering its correct grammatical and semantic form.

Key Differences:

Stemming is faster but may not always produce valid words, whereas lemmatization is slower but ensures valid words are obtained.
Lemmatization considers the context and meaning of words, making it more accurate than stemming.
Stemming is useful when speed is a priority, and slight inaccuracies or non-words are acceptable. Lemmatization is preferable when accuracy and meaningful results are crucial, such as in language understanding applications.