In [1]:
# Character-level tokenization: every character of the string is one token

s = 'some string'

# Build one message per character and print them one per line
print(*(f"{char} is a token" for char in s), sep="\n")

s is a token
o is a token
m is a token
e is a token
  is a token
s is a token
t is a token
r is a token
i is a token
n is a token
g is a token


In [3]:
# Word-based tokenization using Python
# str.split() with no arguments splits on runs of whitespace and already
# returns a list, so no list comprehension is needed around it

text = 'my sample text'

split_text = text.split()

split_text

['my', 'sample', 'text']

**Import NLTK library**

In [2]:
import nltk

# Stemming

**Stemming refers to chopping off the ends of words in an effort to remove variations of a word, e.g. reducing "running" to "run" (this should happen after tokenization).**

In [4]:
from nltk.stem import PorterStemmer

In [5]:
# Initialize stemmer object

porter = PorterStemmer()

In [6]:
porter.stem("walking")

'walk'

In [7]:
porter.stem("walked")

'walk'

In [8]:
porter.stem("walks")

'walk'

In [9]:
# There is no stemming rule for this sequence of letters

porter.stem("ran")

'ran'

In [10]:
porter.stem("running")

'run'

In [11]:
porter.stem("bosses")

'boss'

In [12]:
# Not a word!

porter.stem("replacement")

'replac'

In [13]:
sentence = "Lemmatization is more sophisticated than stemming".split()

In [14]:
sentence

['Lemmatization', 'is', 'more', 'sophisticated', 'than', 'stemming']

In [15]:
# Stem every token of the sentence (it was split into a list beforehand)
# end=" " keeps all stems on one line, separated by spaces

for word in sentence:
    stemmed = porter.stem(word)
    print(stemmed, end=" ")

lemmat is more sophist than stem 

In [16]:
# Porter rule changes a final 'y' to 'i', which looks odd (the stem is not a real word), but it lets e.g. 'berry' and 'berries' reduce to the same stem

porter.stem("unnecessary")

'unnecessari'

In [17]:
porter.stem("berry")

'berri'

In [36]:
# The stemmer treats this as a plural form, so it removes the 's'

porter.stem('was')

'wa'

In [39]:
porter.stem('is')

'is'

In [18]:
# As you can see, stemming is quite a crude pre-processing step

# Lemmatization

**Lemmatization is considered a more sophisticated approach, since it uses a lexical database as a reference before reducing the word to its root.**

In [19]:
from nltk.stem import WordNetLemmatizer

In [20]:
# Download the WordNet corpus required by WordNetLemmatizer.
# The data only has to be fetched once per machine; on later runs NLTK
# reports the package as already up-to-date, so the call is safe to keep
# active and lets the notebook run from a fresh kernel or environment.

nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shmel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
# NLTK told me to download this!

nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\shmel\AppData\Roaming\nltk_data...


True

In [25]:
from nltk.corpus import wordnet

In [26]:
# Initialize lemmatizer object

lemmatizer = WordNetLemmatizer()

In [27]:
# lemmatize() defaults to pos='n' (noun); looked up as a noun, 'walking' is returned unchanged

lemmatizer.lemmatize('walking')

'walking'

In [28]:
# Changing the lemmatizer focus to verbs finds the root of 'walking'

lemmatizer.lemmatize('walking', pos='v')

'walk'

In [29]:
lemmatizer.lemmatize('going')

'going'

In [30]:
lemmatizer.lemmatize('going', pos='v')

'go'

In [31]:
lemmatizer.lemmatize('ran')

'ran'

In [33]:
lemmatizer.lemmatize('ran', pos='v')

'run'

In [34]:
lemmatizer.lemmatize('mice')

'mouse'

In [35]:
# Changing the lemmatizer focus to adjectives keeps the plural of mouse the same

lemmatizer.lemmatize('mice', pos='a')

'mice'

In [37]:
# Same as with the PorterStemmer object: 'was' is treated as a plural noun, so the 's' is removed

lemmatizer.lemmatize('was')

'wa'

In [38]:
lemmatizer.lemmatize('was', pos='v')

'be'

In [40]:
lemmatizer.lemmatize('is')

'is'

In [41]:
lemmatizer.lemmatize('is', pos='v')

'be'

In [42]:
lemmatizer.lemmatize('better')

'better'

In [43]:
lemmatizer.lemmatize('better', pos='a')

'good'

**Lemmatization has its disadvantages, since you need to input the correct part-of-speech tag for each word!**

There is part-of-speech tagging functionality available in NLTK: `nltk.pos_tag()`

In [44]:
# Custom function that maps a part-of-speech tag (as returned by nltk.pos_tag)
# to the WordNet part-of-speech constant expected by the lemmatizer's pos argument

def get_pos_tag(treebank_tag):
    """Translate a part-of-speech tag into a WordNet POS constant.

    Only the first letter of the tag is inspected:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV.

    Args:
        treebank_tag: tag string such as 'NNP', 'VBZ' or 'JJ'.

    Returns:
        The matching wordnet constant; any tag with a different first
        letter (e.g. 'DT', 'IN') falls back to wordnet.NOUN.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # No known prefix matched: treat the word as a noun
    return wordnet.NOUN

In [45]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\shmel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [47]:
sentence = 'Donald Trump has a devoted nutbag following'.split()

sentence

['Donald', 'Trump', 'has', 'a', 'devoted', 'nutbag', 'following']

In [48]:
words_and_tags = nltk.pos_tag(sentence)

words_and_tags

[('Donald', 'NNP'),
 ('Trump', 'NNP'),
 ('has', 'VBZ'),
 ('a', 'DT'),
 ('devoted', 'VBN'),
 ('nutbag', 'NN'),
 ('following', 'VBG')]

In [49]:
# NNP is a proper noun (singular)
# VBZ is a verb (3rd person singular present)
# DT is a determiner (word that introduces a noun)
# VBN is a past-participle verb
# NN is a noun (singular)
# VBG is a verb in the gerund / present-participle form

# The full list of tags can be found in documentation online

In [50]:
# Each item is a (token, tag) tuple; unpack it directly in the loop header
for token, treebank_tag in words_and_tags:
    print(lemmatizer.lemmatize(token, pos=get_pos_tag(treebank_tag)), end=" ")

Donald Trump have a devote nutbag follow 

In [51]:
text = 'The cat was following the bird as it flew by'.split()

In [52]:
ws_and_ts = nltk.pos_tag(text)

ws_and_ts

[('The', 'DT'),
 ('cat', 'NN'),
 ('was', 'VBD'),
 ('following', 'VBG'),
 ('the', 'DT'),
 ('bird', 'NN'),
 ('as', 'IN'),
 ('it', 'PRP'),
 ('flew', 'VBD'),
 ('by', 'IN')]

In [53]:
for token, treebank_tag in ws_and_ts:
    # Convert the tagger's tag to a WordNet POS before lemmatizing
    wordnet_pos = get_pos_tag(treebank_tag)
    print(lemmatizer.lemmatize(token, pos=wordnet_pos), end=" ")

The cat be follow the bird a it fly by 

In [54]:
# Note that 'as' has been reduced to 'a' - it was treated as a plural noun, so the 's' was removed
# The pos_tag function labelled 'as' with the IN tag, which means preposition
# Workaround: update the custom function rule - if the tag starts with 'I', return the wordnet adjective constant so the word is no longer lemmatized as a noun

def get_wordnet_pos(speech_tag):
    """Translate a part-of-speech tag into a WordNet POS constant.

    Only the first letter of the tag is inspected. Tags starting with 'I'
    (such as 'IN') are deliberately grouped with adjectives so that the
    lemmatizer no longer handles those words as nouns.

    Args:
        speech_tag: tag string such as 'NN', 'VBD', 'JJ' or 'IN'.

    Returns:
        wordnet.ADJ for 'J'/'I' tags, wordnet.VERB for 'V' tags,
        wordnet.ADV for 'R' tags, and wordnet.NOUN for everything else
        (including 'N' tags).
    """
    if speech_tag.startswith(('J', 'I')):
        return wordnet.ADJ
    if speech_tag.startswith('V'):
        return wordnet.VERB
    if speech_tag.startswith('R'):
        return wordnet.ADV
    # 'N' tags and every unrecognised tag are treated as nouns
    return wordnet.NOUN

In [55]:
for token, speech_tag in ws_and_ts:
    # Same loop as before, now using the updated tag-mapping function
    wordnet_pos = get_wordnet_pos(speech_tag)
    print(lemmatizer.lemmatize(token, pos=wordnet_pos), end=" ")

The cat be follow the bird as it fly by 

In [56]:
# It works! Re-run the first sentence through the updated mapping function
# to confirm its lemmas are unchanged

for token, speech_tag in words_and_tags:
    print(lemmatizer.lemmatize(token, pos=get_wordnet_pos(speech_tag)), end=" ")

Donald Trump have a devote nutbag follow 

**Perfect!**