## WordNet Lemmatizer with NLTK

In [15]:
import nltk
from nltk.stem import WordNetLemmatizer

In [21]:
# Single WordNetLemmatizer instance, created once and used by lemma_method1 below.
lemmatizer = WordNetLemmatizer()
def lemma_method1(sentence):
    """Lemmatize every token of ``sentence`` without supplying POS tags.

    The sentence is split with ``nltk.word_tokenize`` and each token is
    handed to ``lemmatizer.lemmatize`` with no part-of-speech argument.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    lemmas = []
    for token in nltk.word_tokenize(sentence):
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

In [22]:
# Sample sentence reused by the later cells to compare the lemmatizers.
sentence = "The striped bats are hanging on their feet for best"
# Lemmatize it with lemma_method1, i.e. without passing any POS tag.
out1 = lemma_method1(sentence)

In [23]:
# Display the result produced by lemma_method1.
out1

'The striped bat are hanging on their foot for best'

Notice it didn’t do a good job: ‘are’ is not converted to ‘be’ and ‘hanging’ is not converted to ‘hang’ as expected. This happens because lemmatize() treats every word as a noun unless told otherwise. It can be corrected by passing the correct part-of-speech (POS) tag as the second argument to lemmatize().

In [24]:
# Passing 'v' as the second argument asks the lemmatizer to treat "stripes" as a verb.
print(lemmatizer.lemmatize("stripes", 'v'))  

strip


## WordNet Lemmatizer with appropriate POS tag

In [25]:
# Tag a single token; pos_tag is given a list of tokens, here a one-element list.
print(nltk.pos_tag(['feet']))

[('feet', 'NNS')]


In [26]:
# Tokenize the sample sentence and tag all of its tokens in one call.
print(nltk.pos_tag(nltk.word_tokenize(sentence)))

[('The', 'DT'), ('striped', 'JJ'), ('bats', 'NNS'), ('are', 'VBP'), ('hanging', 'VBG'), ('on', 'IN'), ('their', 'PRP$'), ('feet', 'NNS'), ('for', 'IN'), ('best', 'JJS')]


In [35]:
from nltk.corpus import wordnet
# First letter of a POS tag -> WordNet POS constant; consulted by get_wordnet_pos.
tag_dict = dict(
    J=wordnet.ADJ,
    N=wordnet.NOUN,
    V=wordnet.VERB,
    R=wordnet.ADV,
)

# Lemmatizer instance used by lemma_method2 below (rebinds the name created in an earlier cell).
lemmatizer = WordNetLemmatizer()
def get_wordnet_pos(word):
    """Return the WordNet POS constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``nltk.pos_tag([word])``, so the
    tagger sees no surrounding sentence. Only the upper-cased first letter
    of the resulting tag is looked up in ``tag_dict``; a letter that is not
    a key there falls back to ``wordnet.NOUN``.
    """
    tagged = nltk.pos_tag([word])
    full_tag = tagged[0][1]
    initial = full_tag[0].upper()
    if initial in tag_dict:
        return tag_dict[initial]
    return wordnet.NOUN

def lemma_method2(s):
    """Lemmatize each token of ``s`` with a POS tag looked up per token.

    Every token from ``nltk.word_tokenize(s)`` is passed to
    ``get_wordnet_pos`` one at a time, and the returned POS is given to
    ``lemmatizer.lemmatize`` as its second argument.

    Returns:
        A list with one lemma per token, in token order (not a joined string).
    """
    lemmas = []
    for token in nltk.word_tokenize(s):
        pos = get_wordnet_pos(token)
        lemmas.append(lemmatizer.lemmatize(token, pos))
    return lemmas

In [36]:
# NOTE(review): get_wordnet_pos tags each token on its own, which is presumably
# why 'striped' comes out as 'strip' below even though tagging the whole
# sentence earlier marked it as an adjective (JJ).
lemma_method2(sentence)

['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']

## spaCy Lemmatization

In [44]:
import spacy
import sys

In [51]:
# Load spaCy's small English pipeline by its full package name. The bare 'en'
# shortcut link is not supported from spaCy v3 onward and raises an error there;
# the full name also loads on v2 when the package is installed
# (python -m spacy download en_core_web_sm).
nlp = spacy.load('en_core_web_sm')

In [52]:
def lemma_method3(s):
    """Lemmatize ``s`` with the spaCy pipeline bound to ``nlp``.

    Returns:
        The ``lemma_`` attribute of every token in the parsed document,
        joined by single spaces.
    """
    lemmas = (token.lemma_ for token in nlp(s))
    return ' '.join(lemmas)

In [53]:
# Run the spaCy-based lemmatizer on the same sample sentence.
lemma_method3(sentence)

'the stripe bat be hang on -PRON- foot for good'

## TextBlob Lemmatizer

In [54]:
from textblob import TextBlob, Word
def lemma_method4(s):
    """Lemmatize ``s`` with TextBlob, passing no POS tag.

    Each entry of ``TextBlob(s).words`` is lemmatized by calling its
    ``lemmatize()`` method with no arguments.

    Returns:
        The lemmatized words joined by single spaces.
    """
    blob = TextBlob(s)
    lemmas = []
    for word in blob.words:
        lemmas.append(word.lemmatize())
    return ' '.join(lemmas)

In [55]:
# TextBlob without POS tags; the output below is identical to lemma_method1's result.
lemma_method4(sentence)

'The striped bat are hanging on their foot for best'

## TextBlob Lemmatizer with appropriate POS tag

In [56]:
def lemma_method5(sentence):
    """Lemmatize ``sentence`` with TextBlob, passing a POS tag for each word.

    For every ``(word, tag)`` pair in ``TextBlob(sentence).tags`` the first
    character of the tag is mapped to one of 'a', 'n', 'v' or 'r' (falling
    back to 'n' when the character is not in the mapping), and that value is
    passed to the word's ``lemmatize`` method.

    Returns:
        The lemmatized words joined by single spaces.
    """
    pos_by_initial = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
    lemmas = []
    for word, tag in TextBlob(sentence).tags:
        wordnet_pos = pos_by_initial.get(tag[0], 'n')
        lemmas.append(word.lemmatize(wordnet_pos))
    return " ".join(lemmas)

# Run the POS-aware TextBlob lemmatizer on the same sample sentence.
lemma_method5(sentence)

'The striped bat be hang on their foot for best'