
<center><u><H1>Stemming and Lemmatization</H1></u></center>

## Stemming:

In [1]:
import nltk

In [2]:
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer

In [3]:
def words_stemmer(words, type="PorterStemmer", lang="english", encoding="utf8"):
    """Stem every word and return the stems joined into one bytes string.

    Parameters
    ----------
    words : iterable of str
        Tokens to stem.
    type : str
        Name of the stemmer to use: "PorterStemmer", "LancasterStemmer" or
        "SnowballStemmer". Any other value (including ``False``) disables
        stemming.
    lang : str
        Language handed to ``SnowballStemmer``; the other stemmers ignore it.
    encoding : str
        Encoding applied to each stem before joining.

    Returns
    -------
    bytes
        The encoded stems separated by single spaces, when ``type`` names a
        supported stemmer.
    object
        ``words`` itself, untouched, when ``type`` is not a supported name.
    """
    # `type` shadows the builtin, but the parameter name is part of the
    # public signature (callers may pass it by keyword), so it is kept.
    if type not in ("PorterStemmer", "LancasterStemmer", "SnowballStemmer"):
        return words

    # Pick the stemmer once; the stemming loop is identical for all of them.
    if type == "PorterStemmer":
        stemmer = PorterStemmer()
    elif type == "LancasterStemmer":
        stemmer = LancasterStemmer()
    else:
        stemmer = SnowballStemmer(lang)

    return b" ".join(stemmer.stem(w).encode(encoding) for w in words)

In [4]:
# Sample sentence: four surface forms of "care", reused as the input for the
# stemming and lemmatization demos.
words = "caring cares carefully cared"

In [5]:
# Split the sample sentence into word tokens; the bare `wt` on the last line
# makes the notebook display the resulting list.
wt = nltk.word_tokenize(words)
wt

['caring', 'cares', 'carefully', 'cared']

In [6]:
# Run the same token list through each supported stemmer and print the
# results side by side with the original sentence.
print("Original:", words)
for label, stemmer_name in (
    ("Porter: ", "PorterStemmer"),
    ("Lancaster: ", "LancasterStemmer"),
    ("Snowball: ", "SnowballStemmer"),
):
    print(label, words_stemmer(wt, stemmer_name))

Original: caring cares carefully cared
Porter:  b'care care care care'
Lancaster:  b'car car car car'
Snowball:  b'care care care care'


## Lemmatization: using WordNetLemmatizer
### WordNet is a large lexical database of English words that are linked together by their semantic relationships

In [7]:
from nltk.stem import WordNetLemmatizer

In [8]:
# Module-level lemmatizer instance.
# NOTE(review): nothing visible in this notebook reads `wlem`
# (words_lemmatizer builds its own instance) — confirm before removing.
wlem = WordNetLemmatizer()

In [9]:
def words_lemmatizer(text, encoding="utf8"):
    """Tokenize ``text``, lemmatize each token and return the result as bytes.

    Each token is lemmatized with the part-of-speech code that ``find_pos``
    reports for it, encoded with ``encoding``, and the encoded lemmas are
    joined with single spaces.

    Parameters
    ----------
    text : str
        Raw text; it is tokenized here with ``nltk.word_tokenize``.
    encoding : str
        Encoding applied to every lemma before joining.

    Returns
    -------
    bytes
        Space-separated encoded lemmas (``b""`` when there are no tokens).
    """
    tokens = nltk.word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    encoded_lemmas = [
        lemmatizer.lemmatize(token, find_pos(token)).encode(encoding)
        for token in tokens
    ]
    return b" ".join(encoded_lemmas)

In [10]:
# WordNet part-of-speech codes (find_pos below returns 'n', 'v', 'a' or 'r'):
# n    NOUN
# v    VERB
# a    ADJECTIVE
# s    ADJECTIVE SATELLITE
# r    ADVERB

In [11]:
def find_pos(word):
    """Return a WordNet part-of-speech code for ``word``.

    The word is tokenized and tagged with ``nltk.pos_tag``; only the tag of
    the first token is used, and its first letter selects the code.

    Parameters
    ----------
    word : str
        The word to classify.

    Returns
    -------
    str
        ``'a'`` for tags starting with J (adjectives: JJ, JJR, JJS),
        ``'r'`` for tags starting with R (adverbs: RB, RBR, RBS),
        ``'v'`` for tags starting with V (verbs: VB, VBD, VBG, VBN, VBP, VBZ),
        and ``'n'`` for everything else, including input that yields no
        tokens.
    """
    tokens = nltk.word_tokenize(word)
    if not tokens:
        # Empty or whitespace-only input gives nothing to tag; fall back to
        # the noun default instead of raising IndexError on the [0] lookup.
        return 'n'

    tag = nltk.pos_tag(tokens)[0][1]
    # The tag's first letter decides the code; anything unmapped is a noun.
    return {'j': 'a', 'r': 'r', 'v': 'v'}.get(tag[:1].lower(), 'n')

In [12]:
# Lemmatize the same sample sentence that was used for the stemming demo.
print("Lemmatized: ", words_lemmatizer(words))

Lemmatized:  b'care care carefully care'


### Getting synonyms and antonyms for a given word with wordnet

In [13]:
# WordNet is a large lexical database for English words that are linked together
# by their semantic relationships.
# It groups words together based on their meanings.

In [14]:
from nltk.corpus import wordnet

In [15]:
# Look up the synsets for "suitable" and print the definition and the usage
# examples of the first one returned.
s = wordnet.synsets("suitable")
print("Definition: ", s[0].definition())
print("Example: ", s[0].examples())

Definition:  meant or adapted for an occasion or use
Example:  ['a tractor suitable (or fit) for heavy duty', 'not an appropriate (or fit) time for flippancy']


In [16]:
# Gather every lemma name across all synsets of "better" as a synonym, plus
# the first antonym of each lemma that has one.
synonyms = []
antonyms = []
# Descriptive loop names: the original `s` overwrote the `s` bound by the
# previous cell, and `l` is easily confused with `1`.
for synset in wordnet.synsets("better"):
    for lemma in synset.lemmas():
        synonyms.append(lemma.name())
        lemma_antonyms = lemma.antonyms()  # looked up once per lemma
        if lemma_antonyms:
            antonyms.append(lemma_antonyms[0].name())

# Sets are used for display only, to drop duplicates; the lists keep every
# occurrence.
print("synonyms: \n", set(synonyms))
print("antonyms: \n", set(antonyms))

synonyms: 
 {'break', 'advantageously', 'dear', 'effective', 'considerably', 'just', 'unspoiled', 'better', 'respectable', 'secure', 'ripe', 'in_force', 'well', 'punter', 'substantially', 'wagerer', 'honorable', 'meliorate', 'easily', 'sound', 'proficient', 'best', 'bettor', 'near', 'in_effect', 'upright', 'safe', 'undecomposed', 'amend', 'intimately', 'estimable', 'serious', 'ameliorate', 'improve', 'skilful', 'unspoilt', 'beneficial', 'expert', 'full', 'skillful', 'salutary', 'comfortably', 'good', 'dependable', 'adept', 'honest', 'practiced', 'right'}
antonyms: 
 {'ill', 'evil', 'worse', 'badly', 'worsen', 'disadvantageously', 'bad'}
