<a href="https://colab.research.google.com/github/Rakshithbodakuntla/segmentation_tokenization/blob/main/Lemmatization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# --- Download required resources ---
nltk.download('punkt')
nltk.download('punkt_tab')       # fixes LookupError
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng') # Download the missing resource

# --- Input text ---
text = "John enjoys playing football while Mary loves reading books in the library."

# Segment into tokens
tokens = word_tokenize(text)
print("Step 1 - Tokens:", tokens)

# Remove stopwords & keep only alphabetic tokens
stop_words = set(stopwords.words('english'))
filtered_tokens = [w for w in tokens if w.lower() not in stop_words and w.isalpha()]
print("\nStep 2 - After removing stopwords:", filtered_tokens)

# Apply lemmatization (not stemming)
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(w.lower()) for w in filtered_tokens]
print("\nStep 3 - After lemmatization:", lemmatized_tokens)

# Keep only verbs and nouns (using POS tags)
tagged_tokens = pos_tag(lemmatized_tokens)
final_tokens = [word for word, tag in tagged_tokens if tag.startswith('N') or tag.startswith('V')]
print("\nStep 4 - Final Tokens (only Nouns & Verbs):", final_tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...


Step 1 - Tokens: ['John', 'enjoys', 'playing', 'football', 'while', 'Mary', 'loves', 'reading', 'books', 'in', 'the', 'library', '.']

Step 2 - After removing stopwords: ['John', 'enjoys', 'playing', 'football', 'Mary', 'loves', 'reading', 'books', 'library']

Step 3 - After lemmatization: ['john', 'enjoys', 'playing', 'football', 'mary', 'love', 'reading', 'book', 'library']

Step 4 - Final Tokens (only Nouns & Verbs): ['john', 'enjoys', 'playing', 'football', 'love', 'reading', 'book', 'library']


[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
