# NLTK Basics

In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
# Adjacent string literals inside parentheses are concatenated, so the
# paragraph can span several source lines without the indentation of the
# continuation lines leaking into the string (which is what happens with a
# backslash continuation placed inside the quotes).
text = (
    "Hello Mr. Smith, how are you doing today? "
    "The weather is great, and Python is awesome! "
    "The sky is pinkish-blue. "
    "You shouldn't eat cardboard."
)

In [3]:
# Two short sentences containing contractions, used for word tokenizing.
sentence = (
    "At eight o'clock on Thursday morning Arthur felt very good. "
    "But he didn't go to play"
)

### Sentence tokenizing

In [4]:
# Split the paragraph into sentences; the result is a list of strings.
sent_tokenize(text)

['Hello Mr. Smith, how are you doing today?',
 'The weather is great, and Python is awesome!',
 'The sky is pinkish-blue.',
 "You shouldn't eat cardboard."]

In [5]:
# Show every detected sentence on its own line.
for sentence_text in sent_tokenize(text):
    print(sentence_text)

Hello Mr. Smith, how are you doing today?
The weather is great, and Python is awesome!
The sky is pinkish-blue.
You shouldn't eat cardboard.


**`sent_tokenize` returns a list of sentences.**

### Word tokenizing

In [6]:
# Split the sentence into word-level tokens; punctuation and the clitic "n't"
# come out as tokens of their own.
word_tokenize(sentence)

['At',
 'eight',
 "o'clock",
 'on',
 'Thursday',
 'morning',
 'Arthur',
 'felt',
 'very',
 'good',
 '.',
 'But',
 'he',
 'did',
 "n't",
 'go',
 'to',
 'play']

In [7]:
# Print the tokens of `sentence`, one per line.
for token in word_tokenize(sentence):
    print(token)

At
eight
o'clock
on
Thursday
morning
Arthur
felt
very
good
.
But
he
did
n't
go
to
play


In [8]:
# Print the tokens of the whole paragraph, one per line.
for token in word_tokenize(text):
    print(token)

Hello
Mr.
Smith
,
how
are
you
doing
today
?
The
weather
is
great
,
and
Python
is
awesome
!
The
sky
is
pinkish-blue
.
You
should
n't
eat
cardboard
.


**Observations:**
- Punctuation marks are treated as separate tokens.
- Notice the separation of the word "shouldn't" into "should" and "n't".
- Notice that "pinkish-blue" is treated as one word.
- Some words seem trivial (e.g. "is", "the", "and"); these are known as stop words.

In [9]:
# Short sample with apostrophes and punctuation, used to compare tokenizers.
text1 = "this is Ram's text, is'nt it?"

In [10]:
# WhitespaceTokenizer splits on whitespace only, so punctuation stays attached
# to the neighbouring word (e.g. 'text,' and 'it?').
tokenizer = nltk.tokenize.WhitespaceTokenizer()
tokenizer.tokenize(text1)

['this', 'is', "Ram's", 'text,', "is'nt", 'it?']

In [11]:
# TreebankWordTokenizer separates punctuation and the possessive 's into
# tokens of their own, while the misspelled "is'nt" is kept as one token.
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokenizer.tokenize(text1)

['this', 'is', 'Ram', "'s", 'text', ',', "is'nt", 'it', '?']

In [12]:
# WordPunctTokenizer splits punctuation away from the surrounding letters,
# so every apostrophe becomes a separate token.
tokenizer = nltk.tokenize.WordPunctTokenizer()
tokenizer.tokenize(text1)

['this', 'is', 'Ram', "'", 's', 'text', ',', 'is', "'", 'nt', 'it', '?']

### Stop Words

In [13]:
from nltk.corpus import stopwords

In [14]:
# Show NLTK's English stop-word list (as a set, so the display order is arbitrary).
print(set(stopwords.words('english')))

{'and', 'me', 'by', 'same', 'hers', 'doing', 'them', 'those', 'being', 'off', "hadn't", 'am', 'here', 'few', 'than', 'of', "weren't", 'is', 'a', 'should', "should've", "doesn't", 'other', 'did', 'below', 'how', 'has', 'd', 'both', 'our', "shouldn't", 'had', 'too', 'only', 'm', 'you', 'against', 'shouldn', 'most', 'my', 'yourselves', 'again', "don't", 'don', 'mustn', 'y', 'ain', 'needn', 'on', 'between', 'themselves', 'mightn', "you'd", 'they', "she's", 'in', "you'll", "didn't", 'why', 'was', 'for', 'who', "aren't", 'this', 'i', 'any', 'until', 'about', 'further', 'from', 'under', 'nor', 'to', 'o', 'each', 'been', 'very', 'can', 'does', 'didn', 'isn', 'do', "mightn't", 'yourself', 'the', 'theirs', "mustn't", "won't", 'hasn', 'your', "hasn't", 'haven', 'her', 'where', 'not', "you've", 'himself', 'be', 'wouldn', 'which', 'he', 'whom', 'weren', 'so', 'hadn', 'some', 't', "wasn't", "isn't", 'she', 'yours', 'an', 'its', 'if', 'after', 'aren', 'now', "you're", 'his', 'more', 'just', 'll', 'wh

In [15]:
# Keep the English stop words in a set for fast membership tests when filtering.
stop_words = set(stopwords.words('english'))

In [16]:
# Example sentence mixing stop words ("is", "a", "off", "the") with content words.
ex_sentence = "This is a sample sentence, showing off the stop words filteration."

In [17]:
# Tokenize the example so that stop words can be filtered token by token.
word_tokens = word_tokenize(ex_sentence)

In [18]:
# The stop-word list is lower-case, so each token is lower-cased before the
# membership test; otherwise a capitalised stop word such as "This" would
# slip through the filter. The original token text is what gets kept.

# option 1: list comprehension
filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]

# option 2: explicit loop (builds exactly the same list as option 1)
filtered_sentence = []

for w in word_tokens:
    if w.lower() not in stop_words:
        filtered_sentence.append(w)

print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filteration', '.']
['sample', 'sentence', ',', 'showing', 'stop', 'words', 'filteration', '.']


### Stemming words

Stemming is a normalization method. Many variations of a word carry the same meaning, other than when tense is involved.

The reason why we stem is to shorten the lookup, and normalize sentences.

In [19]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

In [20]:
# One instance of each stemmer, reused by the comparison tables that follow.
pstem, lstem, sstem = (
    PorterStemmer(),
    LancasterStemmer(),
    SnowballStemmer('english'),
)

In [21]:
word_list = ['connected', 'connecting', 'connection', 'connections']

# A single row template is shared by the header and every data row.
row_template = "{0:20} {1:20} {2:20} {3:20}"
header = ("Word", "PorterStemmer", "LancasterStemmer", "SnowballStemmer")

print(row_template.format(*header))
for word in word_list:
    # Stem the same word with all three stemmers, in column order.
    stems = (pstem.stem(word), lstem.stem(word), sstem.stem(word))
    print(row_template.format(word, *stems))

Word                 PorterStemmer        LancasterStemmer     SnowballStemmer     
connected            connect              connect              connect             
connecting           connect              connect              connect             
connection           connect              connect              connect             
connections          connect              connect              connect             


In [22]:
word_list = ['run', 'running', 'runs', 'runner', 'monthly']

# A single row template is shared by the header and every data row.
row_template = "{0:15} {1:15} {2:15} {3:15}"
header = ('Word', 'PorterStemmer', 'LancasterStemmer', 'SnowballStemmer')

print(row_template.format(*header))
for word in word_list:
    # Stem the same word with all three stemmers, in column order.
    stems = (pstem.stem(word), lstem.stem(word), sstem.stem(word))
    print(row_template.format(word, *stems))

Word            PorterStemmer   LancasterStemmer SnowballStemmer
run             run             run             run            
running         run             run             run            
runs            run             run             run            
runner          runner          run             runner         
monthly         monthli         month           month          


In [23]:
# NOTE(review): 'touble' looks like a typo for 'trouble'; it is kept as-is so
# that the recorded output of this cell still matches.
word_list = ['cats', 'touble', 'troubling', 'troubled', 'troublesome']

# A single row template is shared by the header and every data row.
row_template = "{0:15} {1:15} {2:15} {3:15}"
header = ('Word', 'PorterStemmer', 'LancasterStemmer', 'SnowballStemmer')

print(row_template.format(*header))
for word in word_list:
    # Stem the same word with all three stemmers, in column order.
    stems = (pstem.stem(word), lstem.stem(word), sstem.stem(word))
    print(row_template.format(word, *stems))

Word            PorterStemmer   LancasterStemmer SnowballStemmer
cats            cat             cat             cat            
touble          toubl           toubl           toubl          
troubling       troubl          troubl          troubl         
troubled        troubl          troubl          troubl         
troublesome     troublesom      troublesom      troublesom     


In [24]:
word_list = ['argue', 'argued', 'argues', 'arguing', 'argus']

# A single row template is shared by the header and every data row.
row_template = "{0:15} {1:15} {2:15} {3:15}"
header = ('Word', 'PorterStemmer', 'LancasterStemmer', 'SnowballStemmer')

print(row_template.format(*header))
for word in word_list:
    # Stem the same word with all three stemmers, in column order.
    stems = (pstem.stem(word), lstem.stem(word), sstem.stem(word))
    print(row_template.format(word, *stems))

Word            PorterStemmer   LancasterStemmer SnowballStemmer
argue           argu            argu            argu           
argued          argu            argu            argu           
argues          argu            argu            argu           
arguing         argu            argu            argu           
argus           argu            arg             argus          


In [25]:
word_list = [
    'friend', 'friendship', 'friends', 'friendships', 'stabil',
    'destablise', 'misunderstanding', 'railroad', 'moonlight',
    'football', 'cricket', 'cycle', 'rained',
]

# A single row template is shared by the header and every data row.
row_template = '{0:15} {1:15} {2:15} {3:15}'
header = ('Word', 'PorterStemmer', 'LancasterStemmer', 'SnowballStemmer')

print(row_template.format(*header))
for word in word_list:
    # Stem the same word with all three stemmers, in column order.
    stems = (pstem.stem(word), lstem.stem(word), sstem.stem(word))
    print(row_template.format(word, *stems))

Word            PorterStemmer   LancasterStemmer SnowballStemmer
friend          friend          friend          friend         
friendship      friendship      friend          friendship     
friends         friend          friend          friend         
friendships     friendship      friend          friendship     
stabil          stabil          stabl           stabil         
destablise      destablis       dest            destablis      
misunderstanding misunderstand   misunderstand   misunderstand  
railroad        railroad        railroad        railroad       
moonlight       moonlight       moonlight       moonlight      
football        footbal         footbal         footbal        
cricket         cricket         cricket         cricket        
cycle           cycl            cyc             cycl           
rained          rain            rain            rain           


### Lemmatization

Lemmatization is the process of converting a word to its base form.

The difference between stemming and lemmatization is
> Lemmatization considers the context and converts a word to its meaningful base form, whereas stemming just removes the last few characters, often leading to incorrect meanings and spelling errors.

There are multiple tools for lemmatization, such as:
> - Wordnet Lemmatizer
> - Spacy Lemmatizer
> - TextBlob
> - CLiPs Pattern
> - Stanford CoreNLP
> - Gensim Lemmatizer
> - TreeTagger

In [26]:
from nltk.stem import WordNetLemmatizer

# Create the WordNet-based lemmatizer instance that the following cells reuse.
wnl = WordNetLemmatizer()

In [27]:
word_list = [
    'friend', 'friendship', 'eaten', 'bicycle', 'fatten',
    'player', 'introducing', 'datum', 'data', 'processing',
]

# Two-column layout: the original word next to its lemma.
pair_template = '{0:20} {1:20}'

print(pair_template.format('Word', 'WordNetLemmatizer'))
for word in word_list:
    lemma = wnl.lemmatize(word)
    print(pair_template.format(word, lemma))

Word                 WordNetLemmatizer   
friend               friend              
friendship           friendship          
eaten                eaten               
bicycle              bicycle             
fatten               fatten              
player               player              
introducing          introducing         
datum                datum               
data                 data                
processing           processing          


In [28]:
# Example with plural nouns ("bats", "feet") and verb forms ("are", "hanging").
sentence = 'The striped bats are hanging on their feet for best.'

# Tokenize the sentence into word-level tokens and show them.
word_list = word_tokenize(sentence)
print(word_list)

['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best', '.']


In [29]:
# Print each token next to the lemma produced without a part-of-speech hint.
pair_template = '{0:15} {1:15}'
for token in word_list:
    print(pair_template.format(token, wnl.lemmatize(token)))

The             The            
striped         striped        
bats            bat            
are             are            
hanging         hanging        
on              on             
their           their          
feet            foot           
for             for            
best            best           
.               .              


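**Notice**: `WordNetLemmatizer` leaves some words unchanged (e.g. "hanging", "are") because `lemmatize` treats every word as a noun unless told otherwise. To fix that, pass each word's part-of-speech (PoS) tag.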

In [30]:
# Tag the word as a verb so the lemmatizer can reduce it to its base form.
print(wnl.lemmatize('hanging', pos='v'))

hang


In [31]:
# Tag the word as a verb so the lemmatizer can reduce it to its base form.
print(wnl.lemmatize('eating', pos='v'))

eat


In [32]:
# Tag the word as a verb so the lemmatizer can reduce it to its base form.
print(wnl.lemmatize('walking', pos='v'))

walk


### Generate the N-grams for the given sentence

An essential concept in text mining is the n-gram: a co-occurring, contiguous sequence of n items taken from a larger text or sentence. The items can be words, letters, or syllables. A 1-gram (unigram) is a single word, a bigram (2-gram) is a sequence of 2 consecutive words, a trigram (3-gram) is a sequence of 3 consecutive words, and so on.

In [33]:
from nltk.util import ngrams

In [34]:
# Example sentence for the n-gram demonstrations.
text = 'Data science is an interesting field of study, includes ML and DL as sub field'

In [35]:
# Size of each n-gram; 2 produces bigrams.
grams = 2

# `ngrams` returns a generator, so the n-grams are produced lazily.
n_grams = ngrams(word_tokenize(text), grams)

In [36]:
# Displaying the object shows that `ngrams` returned a generator, not a list.
n_grams

<generator object ngrams at 0x0000027E16BB77C8>

In [37]:
# `nltk.bigrams` yields the 2-grams; list() materialises them for display.
list(nltk.bigrams(word_tokenize(text)))

[('Data', 'science'),
 ('science', 'is'),
 ('is', 'an'),
 ('an', 'interesting'),
 ('interesting', 'field'),
 ('field', 'of'),
 ('of', 'study'),
 ('study', ','),
 (',', 'includes'),
 ('includes', 'ML'),
 ('ML', 'and'),
 ('and', 'DL'),
 ('DL', 'as'),
 ('as', 'sub'),
 ('sub', 'field')]

In [38]:
# Build the n-grams afresh instead of draining the one-shot `n_grams`
# generator, so re-running this cell always returns the same list (a drained
# generator would yield an empty list the second time). The loop variable is
# named `gram` so it no longer shadows the integer `grams`.
[' '.join(gram) for gram in ngrams(word_tokenize(text), grams)]

['Data science',
 'science is',
 'is an',
 'an interesting',
 'interesting field',
 'field of',
 'of study',
 'study ,',
 ', includes',
 'includes ML',
 'ML and',
 'and DL',
 'DL as',
 'as sub',
 'sub field']

In [39]:
# A text with a repeated phrase, so that some bigrams occur more than once.
text = 'Data science is a wonderful program, Data science is a land of opportunitites, Data science is about machine learning'
text

'Data science is a wonderful program, Data science is a land of opportunitites, Data science is about machine learning'

In [40]:
# `nltk.bigrams` returns a one-shot generator; materialise it as a list so
# that cells consuming `bigrams` give the same result when they are re-run.
bigrams = list(nltk.bigrams(word_tokenize(text)))

In [41]:
from collections import Counter

In [42]:
# Count how many times each bigram occurs.
Counter(bigrams)

Counter({('Data', 'science'): 3,
         ('science', 'is'): 3,
         ('is', 'a'): 2,
         ('a', 'wonderful'): 1,
         ('wonderful', 'program'): 1,
         ('program', ','): 1,
         (',', 'Data'): 2,
         ('a', 'land'): 1,
         ('land', 'of'): 1,
         ('of', 'opportunitites'): 1,
         ('opportunitites', ','): 1,
         ('is', 'about'): 1,
         ('about', 'machine'): 1,
         ('machine', 'learning'): 1})

In [43]:
def extract_ngrams(data, num):
    """Return the word-level n-grams of a text as space-joined strings.

    Args:
        data: Text to tokenize with ``word_tokenize``.
        num: Number of consecutive tokens in each n-gram.

    Returns:
        A list with one string per n-gram, its tokens joined by single spaces.
    """
    tokens = word_tokenize(data)
    return [' '.join(gram) for gram in ngrams(tokens, num)]

data = 'A class is a blueprint for the object.'

# Print the 1- to 4-grams of the same sentence, one size per line.
for size in (1, 2, 3, 4):
    print("{}-gram: ".format(size), extract_ngrams(data, size))

1-gram:  ['A', 'class', 'is', 'a', 'blueprint', 'for', 'the', 'object', '.']
2-gram:  ['A class', 'class is', 'is a', 'a blueprint', 'blueprint for', 'for the', 'the object', 'object .']
3-gram:  ['A class is', 'class is a', 'is a blueprint', 'a blueprint for', 'blueprint for the', 'for the object', 'the object .']
4-gram:  ['A class is a', 'class is a blueprint', 'is a blueprint for', 'a blueprint for the', 'blueprint for the object', 'for the object .']


In [44]:
# Same text as used for the bigram counts earlier in the notebook.
text = 'Data science is a wonderful program, Data science is a land of opportunitites, Data science is about machine learning'

# `nltk.trigrams` yields the 3-grams; list() materialises them for display.
list(nltk.trigrams(word_tokenize(text)))

[('Data', 'science', 'is'),
 ('science', 'is', 'a'),
 ('is', 'a', 'wonderful'),
 ('a', 'wonderful', 'program'),
 ('wonderful', 'program', ','),
 ('program', ',', 'Data'),
 (',', 'Data', 'science'),
 ('Data', 'science', 'is'),
 ('science', 'is', 'a'),
 ('is', 'a', 'land'),
 ('a', 'land', 'of'),
 ('land', 'of', 'opportunitites'),
 ('of', 'opportunitites', ','),
 ('opportunitites', ',', 'Data'),
 (',', 'Data', 'science'),
 ('Data', 'science', 'is'),
 ('science', 'is', 'about'),
 ('is', 'about', 'machine'),
 ('about', 'machine', 'learning')]