# Tokenization

In [1]:
import nltk

In [2]:
from nltk import word_tokenize

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sohom.ghosh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
text = word_tokenize("The sky is blue")

In [5]:
text

['The', 'sky', 'is', 'blue']

# POS tagging

In [6]:
words = word_tokenize("I am reading Introduction to NLP")
print(words)

['I', 'am', 'reading', 'Introduction', 'to', 'NLP']


In [7]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sohom.ghosh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [8]:
nltk.pos_tag(text)

[('The', 'DT'), ('sky', 'NN'), ('is', 'VBZ'), ('blue', 'JJ')]

In [9]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sohom.ghosh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [10]:
nltk.pos_tag(words)

[('I', 'PRP'),
 ('am', 'VBP'),
 ('reading', 'VBG'),
 ('Introduction', 'NN'),
 ('to', 'TO'),
 ('NLP', 'VB')]

# Lemmatization

In [11]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sohom.ghosh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('products')

'product'

In [13]:
lemmatizer.lemmatize('production')

'production'

# Stemming

In [14]:
stemmer = nltk.stem.PorterStemmer()
stemmer.stem('production')

'product'

In [15]:
stemmer.stem('firing')

'fire'

In [16]:
lemmatizer.lemmatize('firing')

'firing'

In [17]:
stemmer.stem('battling')

'battl'

In [18]:
lemmatizer.lemmatize('battle')

'battle'

# Stop words

In [19]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sohom.ghosh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Exercise: Spelling correction

In [21]:
!pip install autocorrect



In [22]:
from autocorrect import spell

In [23]:
spell('Natureal')

'Natural'

In [24]:
import nltk
from nltk import word_tokenize
from autocorrect import spell

In [25]:
sentence1 = "I am laerning Python which is one of the mostt populer programmming  langages"
sentence2 = "Ntural Luanguage Processin deals with the art of extracting insightes from Natural Languaes"
sentence3 = "We are learning how to crroect spellinges autametically"
sentence1_words = word_tokenize(sentence1)
sentence2_words = word_tokenize(sentence2)
sentence3_words = word_tokenize(sentence3)

In [26]:
sentence1_corrected = ' '.join([spell(word) for word in sentence1_words])
print(sentence1_corrected)
sentence2_corrected = ' '.join([spell(word) for word in sentence2_words])
print(sentence2_corrected)
sentence3_corrected = ' '.join([spell(word) for word in sentence3_words])
print(sentence3_corrected)

I am learning Python which is one of the most popular programming languages
Natural Language Procession deals with the art of extracting insights from Natural Languages
We are learning how to correct spellings automatically


# Activity : Stop word removal

In [27]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [28]:
sentence1 = "I am learning Python. It is one of the most popular programming  languages"
sentence2 = "Natural Language Processing deals with the art of extracting insights from Natural Languages"
sentence3 = "We are learning how to remove stop words automatically"
sentence1_words = word_tokenize(sentence1)
sentence2_words = word_tokenize(sentence2)
sentence3_words = word_tokenize(sentence3)

In [29]:
sentence1_no_stops = ' '.join([word for word in sentence1_words if word not in stop_words])
print(sentence1_no_stops)
sentence2_no_stops = ' '.join([word for word in sentence2_words if word not in stop_words])
print(sentence2_no_stops)
sentence3_no_stops = ' '.join([word for word in sentence3_words if word not in stop_words])
print(sentence3_no_stops)

I learning Python . It one popular programming languages
Natural Language Processing deals art extracting insights Natural Languages
We learning remove stop words automatically


# Named Entity Recognition (NER)

In [30]:
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')
sentence = "We are reading a book published by Packt which is based out of Birmingham."

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/sohom.ghosh/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/sohom.ghosh/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [31]:
i = nltk.ne_chunk(nltk.pos_tag(word_tokenize(sentence)), binary=True)
[a for a in i if len(a)==1]

[Tree('NE', [('Packt', 'NNP')]), Tree('NE', [('Birmingham', 'NNP')])]

# Word Sense Disambiguation

In [32]:
from nltk.wsd import lesk
sentence1 = "Keep your savings in the bank"
print(lesk(word_tokenize(sentence1), 'bank'))
sentence = "It's so risky to drive over the banks of the road"
print(lesk(word_tokenize(sentence), 'bank'))

Synset('savings_bank.n.02')
Synset('bank.v.07')


Synset('savings_bank.n.02') refers to a container for keeping money safely at home <br>
Synset('bank.v.07') refers to a slope in the turn of a road

# Sentence Boundary Detection

In [34]:
from nltk.tokenize import sent_tokenize
sent_tokenize("We are reading a book. Do you know who is the publisher? It is Packt. Packt is based out of Birmingham.")

['We are reading a book.',
 'Do you know who is the publisher?',
 'It is Packt.',
 'Packt is based out of Birmingham.']

# Exercise

## Do the following tasks:
Tokenization,
Spelling correction,
Pos tagging,
Stop words removal,
Stemming,
Lemmatization,
Named Entity Recognition,
Text normalisation,
Word Sense Disambiguation,
Sentence Boundary Detection,

On the following text corpus:

In this book, we shall lerning how to pracess Natueral Language and extract insights from it. The first four chapter will introduce you to the basics of NLP. Later chapters will describe how to deal with complex NLP prajects. If you want to get early access of it, you should book your order now.


In [35]:
import nltk
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from autocorrect import spell
from nltk.wsd import lesk
from nltk.tokenize import sent_tokenize
import string

In [36]:
sentence = "In this book authored by Sohom Ghosh and Dwight Gunning, we shall learnning how to pracess \
Natueral Language and extract insights from it. The first four chapter will introduce you to the basics of NLP. \
Later chapters will describe how to deal with complex NLP prajects. If you want to get early access of it, you \
should book your order now."

In [37]:
words = word_tokenize(sentence)
words[0:20]

['In',
 'this',
 'book',
 'authored',
 'by',
 'Sohom',
 'Ghosh',
 'and',
 'Dwight',
 'Gunning',
 ',',
 'we',
 'shall',
 'learnning',
 'how',
 'to',
 'pracess',
 'Natueral',
 'Language',
 'and']

In [38]:
corrected_sentence = ""
corrected_word_list = []
for wd in words:
    if wd not in string.punctuation:
        wd_c = spell(wd)
        if wd_c != wd:
            print(wd + " has been corrected to: " + wd_c)
            corrected_sentence = corrected_sentence + ' ' +wd_c
            corrected_word_list.append(wd_c)
        else:
            corrected_sentence = corrected_sentence + ' ' + wd
            corrected_word_list.append(wd)
    else:
        corrected_sentence = corrected_sentence + wd
        corrected_word_list.append(wd)

Sohom has been corrected to: Soho
Ghosh has been corrected to: Ghost
learnning has been corrected to: learning
pracess has been corrected to: process
Natueral has been corrected to: Natural
prajects has been corrected to: projects


In [39]:
corrected_sentence

' In this book authored by Soho Ghost and Dwight Gunning, we shall learning how to process Natural Language and extract insights from it. The first four chapter will introduce you to the basics of NLP. Later chapters will describe how to deal with complex NLP projects. If you want to get early access of it, you should book your order now.'

In [40]:
corrected_word_list[:20]

['In',
 'this',
 'book',
 'authored',
 'by',
 'Soho',
 'Ghost',
 'and',
 'Dwight',
 'Gunning',
 ',',
 'we',
 'shall',
 'learning',
 'how',
 'to',
 'process',
 'Natural',
 'Language',
 'and']

In [41]:
nltk.pos_tag(corrected_word_list)

[('In', 'IN'),
 ('this', 'DT'),
 ('book', 'NN'),
 ('authored', 'VBN'),
 ('by', 'IN'),
 ('Soho', 'NNP'),
 ('Ghost', 'NNP'),
 ('and', 'CC'),
 ('Dwight', 'NNP'),
 ('Gunning', 'NNP'),
 (',', ','),
 ('we', 'PRP'),
 ('shall', 'MD'),
 ('learning', 'VB'),
 ('how', 'WRB'),
 ('to', 'TO'),
 ('process', 'VB'),
 ('Natural', 'NNP'),
 ('Language', 'NNP'),
 ('and', 'CC'),
 ('extract', 'JJ'),
 ('insights', 'NNS'),
 ('from', 'IN'),
 ('it', 'PRP'),
 ('.', '.'),
 ('The', 'DT'),
 ('first', 'JJ'),
 ('four', 'CD'),
 ('chapter', 'NN'),
 ('will', 'MD'),
 ('introduce', 'VB'),
 ('you', 'PRP'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('basics', 'NNS'),
 ('of', 'IN'),
 ('NLP', 'NNP'),
 ('.', '.'),
 ('Later', 'NNP'),
 ('chapters', 'NNS'),
 ('will', 'MD'),
 ('describe', 'VB'),
 ('how', 'WRB'),
 ('to', 'TO'),
 ('deal', 'VB'),
 ('with', 'IN'),
 ('complex', 'JJ'),
 ('NLP', 'NNP'),
 ('projects', 'NNS'),
 ('.', '.'),
 ('If', 'IN'),
 ('you', 'PRP'),
 ('want', 'VBP'),
 ('to', 'TO'),
 ('get', 'VB'),
 ('early', 'JJ'),
 ('access', 'N

In [42]:
stop_words = stopwords.words('english')
corrected_word_list_without_stopwords = []
for wd in corrected_word_list:
    if wd not in stop_words:
        corrected_word_list_without_stopwords.append(wd)
corrected_word_list_without_stopwords[:20]      

['In',
 'book',
 'authored',
 'Soho',
 'Ghost',
 'Dwight',
 'Gunning',
 ',',
 'shall',
 'learning',
 'process',
 'Natural',
 'Language',
 'extract',
 'insights',
 '.',
 'The',
 'first',
 'four',
 'chapter']

In [43]:
stemmer = nltk.stem.PorterStemmer()
corrected_word_list_without_stopwords_stemmed = []
for wd in corrected_word_list_without_stopwords:
    corrected_word_list_without_stopwords_stemmed.append(stemmer.stem(wd))
corrected_word_list_without_stopwords_stemmed[:20]

['In',
 'book',
 'author',
 'soho',
 'ghost',
 'dwight',
 'gun',
 ',',
 'shall',
 'learn',
 'process',
 'natur',
 'languag',
 'extract',
 'insight',
 '.',
 'the',
 'first',
 'four',
 'chapter']

In [44]:
lemmatizer = WordNetLemmatizer()
corrected_word_list_without_stopwords_lemmatized = []
for wd in corrected_word_list_without_stopwords:
    corrected_word_list_without_stopwords_lemmatized.append(lemmatizer.lemmatize(wd))
corrected_word_list_without_stopwords_lemmatized[:20]

['In',
 'book',
 'authored',
 'Soho',
 'Ghost',
 'Dwight',
 'Gunning',
 ',',
 'shall',
 'learning',
 'process',
 'Natural',
 'Language',
 'extract',
 'insight',
 '.',
 'The',
 'first',
 'four',
 'chapter']

In [45]:
sent_tokenize(corrected_sentence)

[' In this book authored by Soho Ghost and Dwight Gunning, we shall learning how to process Natural Language and extract insights from it.',
 'The first four chapter will introduce you to the basics of NLP.',
 'Later chapters will describe how to deal with complex NLP projects.',
 'If you want to get early access of it, you should book your order now.']