<a href="https://colab.research.google.com/github/Randoot/NLP-2/blob/main/Stemming_Lemmatization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing necessary libraries


In [4]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag, word_tokenize


# Downloading necessary NLTK resources


In [1]:
import nltk

# Resource names used by older NLTK releases.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Newer NLTK releases (3.8.2+/3.9) load the tokenizer and the tagger from
# differently named packages; without them word_tokenize and pos_tag raise
# LookupError on a fresh kernel, so fetch those as well.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Sample text

In [2]:
# Two short sentences used as the running example for tokenizing, stemming
# and lemmatizing.
text = (
    "The leaves on the trees are falling. "
    "The children are playing with leaves in the park."
)


# Tokenizing the text into words


In [5]:
# Tokenize the sample text with NLTK's word_tokenize; the resulting list of
# tokens is what the stemming and lemmatization cells iterate over.
words = word_tokenize(text)


# 1. Stemming


In [6]:
# Run every token through the Porter stemmer and show the resulting stems.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, words))
print("Stemmed Words: ", stemmed_words)

Stemmed Words:  ['the', 'leav', 'on', 'the', 'tree', 'are', 'fall', '.', 'the', 'children', 'are', 'play', 'with', 'leav', 'in', 'the', 'park', '.']


# 2. Lemmatization


In [7]:
# Create the WordNet-based lemmatizer that the lemmatization cell calls
# once per token.
lemmatizer = WordNetLemmatizer()

# Function to get part-of-speech tags compatible with WordNet


In [10]:
# NLTK's pos_tag labels words with Penn Treebank tags (e.g. "VBG"), while the
# WordNet lemmatizer expects WordNet's own part-of-speech constants.
# This helper converts between the two tagsets so the lemmatizer can be given
# a compatible tag.
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own (as a one-item list) with pos_tag, and only
    the first letter of the resulting tag is used to pick the WordNet category.

    Args:
        word: The token to tag.

    Returns:
        wordnet.ADJ for tags starting with "J", wordnet.VERB for "V",
        wordnet.ADV for "R", and wordnet.NOUN for "N" and for every other tag.
    """
    # pos_tag returns a list of (word, tag) tuples; take the tag of the only
    # entry, e.g. "VBG".
    _, treebank_tag = pos_tag([word])[0]
    # The leading letter identifies the broad category ("VBG" -> "V").
    initial = treebank_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # Nouns, plus the fallback for any tag that is not mapped above.
    return wordnet.NOUN

In [9]:
# Lemmatize each token, passing along the WordNet PoS tag derived for it.
lemmatized_words = []
for word in words:
    wordnet_tag = get_wordnet_pos(word)
    lemmatized_words.append(lemmatizer.lemmatize(word, wordnet_tag))
print("Lemmatized Words: ", lemmatized_words)

Lemmatized Words:  ['The', 'leaf', 'on', 'the', 'tree', 'be', 'fall', '.', 'The', 'child', 'be', 'play', 'with', 'leaf', 'in', 'the', 'park', '.']


# Compare original, stemmed, and lemmatized words


In [11]:
# Line up each original token with its stem and its lemma, then print the
# three forms side by side in 15-character-wide columns.
comparison = [*zip(words, stemmed_words, lemmatized_words)]
print("\nComparison (Original, Stemmed, Lemmatized):")
for token, stem, lemma in comparison:
    print(f"{token:15} -> {stem:15} -> {lemma:15}")



Comparison (Original, Stemmed, Lemmatized):
The             -> the             -> The            
leaves          -> leav            -> leaf           
on              -> on              -> on             
the             -> the             -> the            
trees           -> tree            -> tree           
are             -> are             -> be             
falling         -> fall            -> fall           
.               -> .               -> .              
The             -> the             -> The            
children        -> children        -> child          
are             -> are             -> be             
playing         -> play            -> play           
with            -> with            -> with           
leaves          -> leav            -> leaf           
in              -> in              -> in             
the             -> the             -> the            
park            -> park            -> park           
.               -> .               ->

# Import PorterStemmer


In [12]:
from nltk.stem import PorterStemmer


# List of words to stem


In [13]:
# Input words for the stemming exercise.
words = [
    "running",
    "jumps",
    "easily",
    "flying",
]


# Initialize the stemmer

In [None]:
# Create a Porter stemmer instance for the stemming exercise.
stemmer = PorterStemmer()


# TODO: Stem each word and print the result


In [None]:
# Exercise: replace the blank with the expression that stems `word`.
# NOTE: the run of underscores is a placeholder, not a name defined in this
# notebook, so this cell is expected to fail until the blank is filled in.
stemmed_words = [___________ for word in words]

print("Stemmed Words:", stemmed_words)


# Import WordNetLemmatizer and other necessary modules


In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag


# List of words to lemmatize


In [None]:
# Input words for the lemmatization exercise.
words = [
    "running",
    "jumps",
    "easily",
    "flying",
]


# Initialize the lemmatizer


In [None]:
# Create a WordNet lemmatizer instance for the lemmatization exercise.
lemmatizer = WordNetLemmatizer()


# Helper function to get WordNet PoS tag


In [None]:
def get_wordnet_pos(word):
    """Map the pos_tag result for a single word to a WordNet PoS constant.

    Args:
        word: The token to tag; it is tagged on its own as a one-item list.

    Returns:
        wordnet.ADJ, wordnet.VERB or wordnet.ADV when the tag starts with
        "J", "V" or "R" respectively; wordnet.NOUN for "N" and any other tag.
    """
    # pos_tag yields [(word, tag)]; only the tag's first letter is needed.
    tagged_pair = pos_tag([word])[0]
    first_letter = tagged_pair[1][0].upper()

    if first_letter == "J":
        wordnet_tag = wordnet.ADJ
    elif first_letter == "V":
        wordnet_tag = wordnet.VERB
    elif first_letter == "R":
        wordnet_tag = wordnet.ADV
    else:
        # Covers "N" as well as every tag without a dedicated mapping.
        wordnet_tag = wordnet.NOUN
    return wordnet_tag

# TODO: Lemmatize each word with PoS tag and print the result


In [None]:
# Exercise: replace the blank with the expression that lemmatizes `word`
# using its WordNet PoS tag.
# NOTE: the run of underscores is a placeholder, not a name defined in this
# notebook, so this cell is expected to fail until the blank is filled in.
lemmatized_words = [____________ for word in words]

print("Lemmatized Words:", lemmatized_words)
