## Week 2

## 2.1 NLP Preprocessing: Tokenization, Stemming, Lemmatization

In [2]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
# Tokenize a short sentence, then reduce every token to its Porter stem.
text = "Running runners run in race"
stemmers = PorterStemmer()
tokens = word_tokenize(text)
stemmed = list(map(stemmers.stem, tokens))
print("Stemmed words:", stemmed)

Stemmed words: ['run', 'runner', 'run', 'in', 'race']


## 2.2 Advanced Lemmatization

In [12]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag,word_tokenize

In [13]:
# Sample sentence, its word tokens, and the lemmatizer instance used for it.
lemmatizer = WordNetLemmatizer()
text = 'The striped bats are hanging on their feet'
tokens = word_tokenize(text)
def get_wordnet_pos(treebank_tag):
    """Map a POS tag string to the corresponding WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively. Any other tag falls back to
    wordnet.NOUN.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
# POS-tag the tokens so each word is lemmatized under the WordNet category
# that get_wordnet_pos selects for its tag.
lemmatized = [
    lemmatizer.lemmatize(word, get_wordnet_pos(tag))
    for word, tag in pos_tag(tokens)
]
print("Lemmatized Words:", lemmatized)

Lemmatized Words: ['The', 'striped', 'bat', 'be', 'hang', 'on', 'their', 'foot']


##  1. Porter's Stemmer 

#### It simplifies words by reducing them to their root forms, a process known as "stemming."

In [14]:
from nltk.stem import PorterStemmer

# Build the stemmer once and reuse it for every word.
porter_stemmer = PorterStemmer()

# Sample words; the list deliberately keeps its repeated entries as given.
words = ["running", "jumps", "happily", "running", "happily"]

# Stem each word in order.
stemmed_words = list(map(porter_stemmer.stem, words))

# Show the input next to the stems.
print("Original words:", words)
print("Stemmed words:", stemmed_words)

Original words: ['running', 'jumps', 'happily', 'running', 'happily']
Stemmed words: ['run', 'jump', 'happili', 'run', 'happili']


## 1. Snowball Stemmer

#### The Snowball Stemmer, compared to the Porter Stemmer, is multi-lingual as it can handle non-English words. It supports various languages and is based on the 'Snowball' programming language, known for efficient processing of small strings.

In [18]:
from nltk.stem import SnowballStemmer

# Sample words to run through the stemmer.
words_to_stem = ['running', 'jumped', 'happily', 'quickly', 'foxes']

# The Snowball stemmer is configured per language; English is used here.
stemmer = SnowballStemmer(language='english')

# Stem each word in order.
stemmed_words = [stemmer.stem(candidate) for candidate in words_to_stem]

# Show the input next to the stems.
print("Original words:", words_to_stem)
print("Stemmed words:", stemmed_words)

Original words: ['running', 'jumped', 'happily', 'quickly', 'foxes']
Stemmed words: ['run', 'jump', 'happili', 'quick', 'fox']


## 2.  Lancaster Stemmer

#### Rule-based, very aggressive iterative suffix stripping; repeatedly applies short rules to strip suffixes aggressively

In [19]:
from nltk.stem import LancasterStemmer

# Sample words to run through the stemmer.
words_to_stem = ['running', 'jumped', 'happily', 'quickly', 'foxes']

# Lancaster stemmer instance, created with its default settings.
stemmer = LancasterStemmer()

# Stem each word in order.
stemmed_words = list(map(stemmer.stem, words_to_stem))

# Show the input next to the stems.
print("Original words:", words_to_stem)
print("Stemmed words:", stemmed_words)

Original words: ['running', 'jumped', 'happily', 'quickly', 'foxes']
Stemmed words: ['run', 'jump', 'happy', 'quick', 'fox']


## 3.  Regexp Stemmer

#### Uses custom regular expressions to strip suffixes or prefixes; depends entirely on the regex patterns the user defines

In [25]:
from nltk.stem import RegexpStemmer

# Word to stem.
word = 'running'

# The rule matches a trailing "ing"; the stemmer is built from that single pattern.
custom_rule = r'ing$'
regexp_stemmer = RegexpStemmer(custom_rule)

# Stem the word with the custom rule.
stemmed_word = regexp_stemmer.stem(word)

# Print the original and the stemmed word on separate lines.
print(f'Original Word: {word}', f'Stemmed Word: {stemmed_word}', sep='\n')

Original Word: running
Stemmed Word: runn


## 4. Lovins Stemmer 

In [36]:
def lovins_stem(word, min_stem_len=2):
    """Strip the longest matching suffix from ``word`` (simplified suffix stripper).

    The word is lower-cased, then the candidate suffixes are tried from
    longest to shortest. The first suffix that matches AND leaves at least
    ``min_stem_len`` characters is removed. At most one suffix is removed.

    Args:
        word: The word to stem.
        min_stem_len: Minimum number of characters that must remain after a
            suffix is removed. The default of 2 keeps very short words such
            as "is", "s" or "ing" from collapsing into a one-letter or empty
            stem. Pass 0 to strip a matching suffix unconditionally.

    Returns:
        The lower-cased word with one suffix removed, or the lower-cased word
        unchanged when no suffix can be removed.
    """
    suffixes = ['ization', 'tion', 'ing', 'ly', 'ed', 'es', 's']
    word = word.lower()

    # Try longer suffixes first so that e.g. "ization" wins over "tion".
    for suffix in sorted(suffixes, key=len, reverse=True):
        remaining = len(word) - len(suffix)
        # Skip suffixes whose removal would leave too short a stem; a shorter
        # suffix later in the list may still qualify.
        if word.endswith(suffix) and remaining >= min_stem_len:
            return word[:remaining]
    return word

# Examples: print the stem of each sample word, one per line.
# Expected output: runn, fox, jump, happi, quick
for sample in ("running", "foxes", "jumped", "happily", "quickly"):
    print(lovins_stem(sample))


runn
fox
jump
happi
quick


## 5. N-gram Stemmer

In [34]:
import nltk
from nltk.util import ngrams

# Window size: 2 yields bigrams; set n to any other size as needed.
n = 2

# Split the sentence into word tokens, then collect every run of n consecutive tokens.
sentence = "The striped bats are hanging on their feet"
tokens = nltk.word_tokenize(sentence)
result = [*ngrams(tokens, n)]

print(result)


[('The', 'striped'), ('striped', 'bats'), ('bats', 'are'), ('are', 'hanging'), ('hanging', 'on'), ('on', 'their'), ('their', 'feet')]
