# NLTK - basics 

In [5]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

In [6]:
EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing today? \
               The weather is great, and Python is awesome !\
               The sky is pinkish-blue. \
               You shouldn\'t eat cardboard."

In [7]:
sentence = """At eight o'clock on Thursday morning Arthur felt very good. But he didn't go to play"""

### sentence tokenizing

In [8]:
sent_tokenize(EXAMPLE_TEXT)

['Hello Mr. Smith, how are you doing today?',
 'The weather is great, and Python is awesome !',
 'The sky is pinkish-blue.',
 "You shouldn't eat cardboard."]

In [9]:
for sent in sent_tokenize(EXAMPLE_TEXT):
    print(sent)

Hello Mr. Smith, how are you doing today?
The weather is great, and Python is awesome !
The sky is pinkish-blue.
You shouldn't eat cardboard.


So there, we have created tokens, which are sentences. 

### word tokenizing

In [10]:
nltk.word_tokenize(sentence)

['At',
 'eight',
 "o'clock",
 'on',
 'Thursday',
 'morning',
 'Arthur',
 'felt',
 'very',
 'good',
 '.',
 'But',
 'he',
 'did',
 "n't",
 'go',
 'to',
 'play']

In [11]:
word_tokenize(EXAMPLE_TEXT)

['Hello',
 'Mr.',
 'Smith',
 ',',
 'how',
 'are',
 'you',
 'doing',
 'today',
 '?',
 'The',
 'weather',
 'is',
 'great',
 ',',
 'and',
 'Python',
 'is',
 'awesome',
 '!',
 'The',
 'sky',
 'is',
 'pinkish-blue',
 '.',
 'You',
 'should',
 "n't",
 'eat',
 'cardboard',
 '.']

#### Observation. 
- First, notice that punctuation is treated as a separate token. 
- Also, notice the separation of the word "shouldn't" into "should" and "n't." 
- Finally, notice that "pinkish-blue" is indeed treated like the "one word" it was meant to be turned into

- Some words seem trivial - these are a form of "stop words"

In [12]:
text = "this is Ram's text, is'nt it?"

In [14]:
tokenizer = nltk.tokenize.WhitespaceTokenizer()
tokenizer.tokenize(text)

['this', 'is', "Ram's", 'text,', "is'nt", 'it?']

In [15]:
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokenizer.tokenize(text)

['this', 'is', 'Ram', "'s", 'text', ',', "is'nt", 'it', '?']

In [16]:
tokenizer = nltk.tokenize.WordPunctTokenizer()
tokenizer.tokenize(text)

['this', 'is', 'Ram', "'", 's', 'text', ',', 'is', "'", 'nt', 'it', '?']

### stop words

In [8]:
from nltk.corpus import stopwords

In [9]:
print(set(stopwords.words('english')))

{'about', 'up', "wasn't", "you'd", 'above', 'her', 'doesn', "weren't", 't', 'isn', 'i', 'had', 'further', 'hers', 'their', "you've", 'once', 'that', 'and', 'by', 'what', 'this', 'down', 'more', 'at', "you're", 'my', 'for', 'be', 'before', 'to', 'do', 'if', 'not', 'having', 'because', 'who', 'now', 'any', 'where', 'should', 'those', 'how', 'nor', 'very', 'is', 'while', "needn't", 'into', 'ain', 'll', 'they', 'no', 'shan', 'wasn', 'yours', 'whom', "she's", 'own', 'me', 'd', 'needn', 'them', 'other', 'doing', 'but', 'myself', 'when', 'theirs', 'was', 'of', 'didn', 'aren', "shan't", 'than', "you'll", 'herself', 'did', 'few', 's', 'some', 'too', "mustn't", "wouldn't", 'am', 'haven', 'ourselves', 'shouldn', 'you', 'mightn', 'why', 'will', 'against', 'o', 'have', "aren't", 'so', 'below', 'then', 'she', "haven't", 'each', 'don', 'mustn', 'we', 'with', 'it', 'a', 'yourselves', 'again', 'our', 'won', 'off', 'ma', 'your', "don't", "it's", 'has', 'or', 'in', "hadn't", 'been', 'such', 'itself', "di

In [10]:
example_sent = "This is a sample sentence, showing off the stop words filtration."

In [11]:
stop_words = set(stopwords.words('english'))

In [12]:
word_tokens = word_tokenize(example_sent)

In [13]:
# option 1
filtered_sentence = [w for w in word_tokens if not w in stop_words]

# option 2
filtered_sentence = []

for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


### Stemming words

The idea of stemming is a sort of normalizing method. Many variations of words carry the same meaning, other than when tense is involved.

The reason why we stem is to shorten the lookup, and normalize sentences.

Consider:

I was taking a ride in the car.
I was riding in the car.

One of the most popular stemming algorithms is the __Porter stemmer__, which has been around since 1979.

In [14]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer


In [15]:
porter   = PorterStemmer()
lancaster=LancasterStemmer()
sno      = nltk.stem.SnowballStemmer('english')

In [16]:
word_list = ["connected", "connecting", "connection", "connections"]

print("{0:20} {1:20} {2:20} {2:20}".format("Word","Porter Stemmer", "lancaster Stemmer", "Snowball Stemmer"))
for word in word_list:
    print("{0:20} {1:20} {2:20} {2:20}".format(word, porter.stem(word), lancaster.stem(word), sno.stem(word)))

     

Word                 Porter Stemmer       lancaster Stemmer    lancaster Stemmer   
connected            connect              connect              connect             
connecting           connect              connect              connect             
connection           connect              connect              connect             
connections          connect              connect              connect             


In [17]:
word_list = ["run", "running", "runs", "runner", "monthly"]

print("{0:20} {1:20} {2:20} {2:20}".format("Word","Porter Stemmer", "lancaster Stemmer", "Snowball Stemmer"))
for word in word_list:
    print("{0:20} {1:20} {2:20} {2:20}".format(word, porter.stem(word), lancaster.stem(word), sno.stem(word)))

Word                 Porter Stemmer       lancaster Stemmer    lancaster Stemmer   
run                  run                  run                  run                 
running              run                  run                  run                 
runs                 run                  run                  run                 
runner               runner               run                  run                 
monthly              monthli              month                month               


In [18]:
word_list = ["cats", "trouble", "troubling", "troubled", "troublesome"]

print("{0:20} {1:20} {2:20} {2:20}".format("Word","Porter Stemmer", "lancaster Stemmer", "Snowball Stemmer"))
for word in word_list:
    print("{0:20} {1:20} {2:20} {2:20}".format(word, porter.stem(word), lancaster.stem(word), sno.stem(word))) 

Word                 Porter Stemmer       lancaster Stemmer    lancaster Stemmer   
cats                 cat                  cat                  cat                 
trouble              troubl               troubl               troubl              
troubling            troubl               troubl               troubl              
troubled             troubl               troubl               troubl              
troublesome          troublesom           troublesom           troublesom          


Notice how the PorterStemmer is 
- giving the root (stem) of the word "cats" by simply removing the 's' after cat. This is a suffix added to cat to make it plural. 
- But if we look at 'trouble', 'troubling' and 'troubled' they are stemmed to 'trouble' because **PorterStemmer algorithm does not follow linguistics rather a set of 05 rules for different cases that are applied in phases (step by step) to generate stems**


In [19]:
word_list = ["argue", "argued", "argues", "arguing", "argus"]

print("{0:20} {1:20} {2:20} {2:20}".format("Word","Porter Stemmer", "lancaster Stemmer", "Snowball Stemmer"))
for word in word_list:
    print("{0:20} {1:20} {2:20} {2:20}".format(word, porter.stem(word), lancaster.stem(word), sno.stem(word)))

Word                 Porter Stemmer       lancaster Stemmer    lancaster Stemmer   
argue                argu                 argu                 argu                
argued               argu                 argu                 argu                
argues               argu                 argu                 argu                
arguing              argu                 argu                 argu                
argus                argu                 arg                  arg                 


In [21]:
#A list of words to be stemmed
word_list = ["friend", "friendship", "friends", "friendships","stabil","destabilize","misunderstanding","railroad","moonlight","football"]

print("{0:20} {1:20} {2:20} {2:20}".format("Word","Porter Stemmer", "lancaster Stemmer", "Snowball Stemmer"))
for word in word_list:
    print("{0:20} {1:20} {2:20} {2:20}".format(word, porter.stem(word), lancaster.stem(word), sno.stem(word)))

Word                 Porter Stemmer       lancaster Stemmer    lancaster Stemmer   
friend               friend               friend               friend              
friendship           friendship           friend               friend              
friends              friend               friend               friend              
friendships          friendship           friend               friend              
stabil               stabil               stabl                stabl               
destabilize          destabil             dest                 dest                
misunderstanding     misunderstand        misunderstand        misunderstand       
railroad             railroad             railroad             railroad            
moonlight            moonlight            moonlight            moonlight           
football             footbal              footbal              footbal             


In [26]:
sentence="Pythoners are very intelligent and work very pythonly and now they are pythoning their way to success."
porter.stem(sentence)

'pythoners are very intelligent and work very pythonly and now they are pythoning their way to success.'

stemmer sees the entire sentence as a word, so it returns it as it is.

In [27]:
text = "My system keeps crashing his crashed yesterday, ours crashes daily"

print(' '.join([porter.stem(word) for word in text.split()]))
print(' '.join([lancaster.stem(word) for word in text.split()]))

My system keep crash hi crash yesterday, our crash daili
my system keep crash his crash yesterday, our crash dai


In [28]:
simple_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

NameError: name 'simple_stemmer' is not defined

### limitations of porter stemmer

In [25]:
text = ['business', 'busy', 'PROBE', 'PROBATE', 'clip', 'clippings']

for w in text:
    
    print(w, ' --> ', ps.stem(w))

NameError: name 'ps' is not defined

### lemmatization

Lemmatization is the process of converting a word to its base form. 

The difference between stemming and lemmatization is, 

> lemmatization considers the context and converts the word to its meaningful base form, whereas stemming just removes the last few characters, often leading to incorrect meanings and spelling errors.

For example, lemmatization would correctly identify the base form of ‘caring’ to ‘care’, whereas, stemming would cutoff the ‘ing’ part and convert it to car.

    ‘Caring’ -> Lemmatization -> ‘Care’
    ‘Caring’ -> Stemming -> ‘Car’
    
ways to lemmatize:-

    Wordnet Lemmatizer
    Spacy Lemmatizer
    TextBlob
    CLiPS Pattern
    Stanford CoreNLP
    Gensim Lemmatizer
    TreeTagger

In [29]:
from nltk.stem import WordNetLemmatizer

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()

In [30]:
word_list = ["friend", "friendship", "friends", "friendships","stabilize","destabilize","misunderstanding","railroad","moonlight","football"]

print("{0:20} {1:20}".format("Word","WordNetLemmatizer"))

for word in word_list:
    print("{0:20} {1:20} ".format(word, lemmatizer.lemmatize(word)))

Word                 WordNetLemmatizer   
friend               friend               
friendship           friendship           
friends              friend               
friendships          friendship           
stabilize            stabilize            
destabilize          destabilize          
misunderstanding     misunderstanding     
railroad             railroad             
moonlight            moonlight            
football             football             


In [31]:
# Lemmatize Single Word
print(lemmatizer.lemmatize("bats"))

print(lemmatizer.lemmatize("are"))

print(lemmatizer.lemmatize("feet"))

bat
are
foot


In [32]:
# Define the sentence to be lemmatized
sentence = "The striped bats are hanging on their feet for best"

# Tokenize: Split the sentence into words
word_list = nltk.word_tokenize(sentence)
print(word_list)

['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']


In [33]:
for w in word_list:
    print(w, '-->', lemmatizer.lemmatize(w) )

The --> The
striped --> striped
bats --> bat
are --> are
hanging --> hanging
on --> on
their --> their
feet --> foot
for --> for
best --> best


Notice it didn’t do a good job. Because, ‘are’ is not converted to ‘be’ and ‘hanging’ is not converted to ‘hang’ as expected. 

This can be corrected if we provide the correct ‘part-of-speech’ tag (POS tag) as the second argument to lemmatize().

In [34]:
print(lemmatizer.lemmatize("stripes", 'v')) 
print(lemmatizer.lemmatize("stripes", 'n'))  

strip
stripe


## Generate the N-grams for the given sentence

The essential concepts in text mining is n-grams, which are a set of co-occurring or continuous sequence of n items from a sequence of large text or sentence. The item here could be words, letters, and syllables. 1-gram is also called as unigrams are the unique words present in the sentence. Bigram(2-gram) is the combination of 2 words. Trigram(3-gram) is 3 words and so on.

In [20]:
from nltk.util import ngrams

In [24]:
text = 'Data science is an interesting field of study, includes ML and DL as sub field'

In [29]:
grams = 2

n_grams = ngrams(nltk.word_tokenize(text), grams)

In [30]:
[' '.join(grams) for grams in n_grams]

['Data science',
 'science is',
 'is an',
 'an interesting',
 'interesting field',
 'field of',
 'of study',
 'study ,',
 ', includes',
 'includes ML',
 'ML and',
 'and DL',
 'DL as',
 'as sub',
 'sub field']

OR equivalenty ...

In [31]:
# Function to generate n-grams from sentences.
def extract_ngrams(data, num):
    n_grams = ngrams(nltk.word_tokenize(data), num)
    return [ ' '.join(grams) for grams in n_grams]
 
data = 'A class is a blueprint for the object.'
 
print("1-gram: ", extract_ngrams(data, 1))
print("2-gram: ", extract_ngrams(data, 2))
print("3-gram: ", extract_ngrams(data, 3))
print("4-gram: ", extract_ngrams(data, 4))

1-gram:  ['A', 'class', 'is', 'a', 'blueprint', 'for', 'the', 'object', '.']
2-gram:  ['A class', 'class is', 'is a', 'a blueprint', 'blueprint for', 'for the', 'the object', 'object .']
3-gram:  ['A class is', 'class is a', 'is a blueprint', 'a blueprint for', 'blueprint for the', 'for the object', 'the object .']
4-gram:  ['A class is a', 'class is a blueprint', 'is a blueprint for', 'a blueprint for the', 'blueprint for the object', 'for the object .']
