# Basic NLP Tasks with NLTK

In [8]:
import nltk

# Fetch the data this notebook needs without opening the interactive
# downloader: a bare nltk.download() waits for user action, which blocks
# "Restart Kernel -> Run All".
# The "book" collection provides the nltk.book texts together with the udhr,
# wordnet and punkt resources used further down.
# NOTE(review): newer NLTK releases may additionally require "punkt_tab" for
# the tokenizers -- confirm against the installed version.
# Data installation guide: https://www.nltk.org/data.html
nltk.download("book", quiet=True)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [9]:
# The star import loads text1..text9 and sent1..sent9 into the namespace.
# NOTE(review): FreqDist (used in section 2) is presumably also provided by
# this import, since no other import of it is visible -- confirm.
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


## 1. Counting vocabulary of words

In [10]:
# Evaluating a Text object displays its title.
text1

<Text: Moby Dick by Herman Melville 1851>

In [11]:
# Total number of tokens in text1 (every occurrence is counted).
len(text1)

260819

In [13]:
# sent1 is the sample sentence loaded alongside text1, as a list of tokens.
sent1

['Call', 'me', 'Ishmael', '.']

In [15]:
# Vocabulary size: number of distinct tokens in text1.
len(set(text1))

19317

In [16]:
# Switch to the Wall Street Journal text for the remaining examples.
text7

<Text: Wall Street Journal>

In [17]:
# Total number of tokens in text7 (every occurrence is counted).
len(text7)

100676

In [18]:
# sent7 is the sample sentence loaded alongside text7, as a list of tokens.
sent7

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [19]:
# Number of tokens in the sample sentence (punctuation marks are tokens too).
len(sent7)

18

In [21]:
# Vocabulary size: number of distinct tokens in text7.
len(set(text7))

12408

In [22]:
# Same display as the earlier text7 cell.
text7

<Text: Wall Street Journal>

In [24]:
# Peek at 10 distinct tokens. A set has no defined order, so which 10 tokens
# appear here is arbitrary and can differ between kernel sessions.
list(set(text7))[:10]

['pension-fund',
 'jumping',
 'minted',
 'TRUST',
 'jobs',
 'label',
 'Rita',
 '3.42',
 '135',
 'fire']

## 2. Frequency of words

In [26]:
# Count how often each distinct token occurs in text7.
# The number of entries equals the vocabulary size, len(set(text7)).
dist = FreqDist(text7)
len(dist)

12408

In [57]:
# A FreqDist behaves like a dictionary of key/value pairs:
#   keys   -> the tokens (words, punctuation, ...)
#   values -> how many times each token occurs in the text
dist

FreqDist({',': 4885, 'the': 4045, '.': 3828, 'of': 2319, 'to': 2164, 'a': 1878, 'in': 1572, 'and': 1511, '*-1': 1123, '0': 1099, ...})

In [39]:
vocab1 = dist.keys()
# In Python 3, dict.keys() returns a view object rather than a list, so it
# cannot be sliced directly (vocab1[:10] fails); convert it to a list first.

list(vocab1)[:10]

['Pierre', 'Vinken', ',', '61', 'years', 'old', 'will', 'join', 'the', 'board']

In [40]:
# Number of occurrences of the word 'four' in text7
dist['four']

20

In [42]:
# Keep the words that are longer than 5 characters and occur more than 5 times.
freqwords = [
    word
    for word in vocab1
    if len(word) > 5
    if dist[word] > 5
]
freqwords[:10]

['director',
 'chairman',
 'publishing',
 'former',
 'British',
 'industrial',
 'asbestos',
 'filters',
 'caused',
 'percentage']

In [43]:
# How many words passed the length and frequency filter.
len(freqwords)

1175

## 3. Normalization and stemming

In [44]:
# Normalization: lower-case every space-separated token of a small sample
# made of inflected forms of "list".
input1 = "List listed lists listing listings"
words1 = [piece.lower() for piece in input1.split(' ')]
words1

['list', 'listed', 'lists', 'listing', 'listings']

In [45]:
# Reduce each normalized word to its stem with the Porter stemmer.

porter = nltk.PorterStemmer()
[porter.stem(t) for t in words1]

['list', 'list', 'list', 'list', 'list']

## 4. Lemmatization

In [47]:
# Load the English text of the Universal Declaration of Human Rights (udhr
# corpus) as a sequence of words; it is used below to compare stemming with
# lemmatization.
udhr = nltk.corpus.udhr.words('English-Latin1')
udhr[:20]

['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'rights',
 'of']

In [49]:
# Stem only the first 20 tokens. Slicing before stemming yields the same 20
# results without running the stemmer over the whole corpus, and matches how
# the lemmatizer cell slices udhr[:20].
[porter.stem(t) for t in udhr[:20]]

['univers',
 'declar',
 'of',
 'human',
 'right',
 'preambl',
 'wherea',
 'recognit',
 'of',
 'the',
 'inher',
 'digniti',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalien',
 'right',
 'of']

**After stemming on udhr**

    * Universal --> univers
    * Declaration --> declar
    * Rights --> right
    * rights --> right
<br>
and so on... <br>[Several of these stems, e.g. "univers" and "declar", are not real words, so they are poor representations of the original words]

#### Now let's try 'nltk.WordNetLemmatizer()'

In [54]:
# Lemmatize the same first 20 tokens with the WordNet lemmatizer.
# NOTE(review): lemmatize() is called without a pos argument, so NLTK's
# default part of speech applies (documented as noun) -- confirm if verbs or
# adjectives should be handled too.
WNlemma = nltk.WordNetLemmatizer()
[WNlemma.lemmatize(t) for t in udhr[:20]]

['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'right',
 'of']

In [56]:
# The same lemmatization written as an explicit for loop instead of a list
# comprehension (kept commented out, for reference only):
# for t in udhr[:20]:
#     print(WNlemma.lemmatize(t))

**After Lemmatizing on udhr**

    * Universal --> Universal
    * Declaration --> Declaration
    * Rights --> Rights
**But**<br>

    * rights --> right
**So in this case lemmatization gives a better word representation than stemming: every result is still a real word.**

## 5. Tokenization

In [59]:
# Baseline: split on whitespace only.
text11 = "Children shouldn't drink a sugary drink before bed."
text11.split()

['Children', "shouldn't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']

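Splitting on whitespace alone leaves **"shouldn't"** as a single token and keeps the period attached to the last word: **'bed.'**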

In [60]:
# Let's apply NLTK's word tokenizer to text11.
nltk.word_tokenize(text11)

['Children',
 'should',
 "n't",
 'drink',
 'a',
 'sugary',
 'drink',
 'before',
 'bed',
 '.']

Here **'should'** and **"n't"** are separated into two tokens, and so are **'bed'** and **'.'**.

### Tokenize on sentences

In [61]:
# Sample containing abbreviation periods ("U.S.") and a decimal point
# ("$2.99") that a naive split on '.' would mistake for sentence boundaries.
text12 = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"
sentences = nltk.sent_tokenize(text12)
len(sentences)

4

In [62]:
# The detected sentences; "U.S." and "$2.99" did not trigger a split.
sentences

['This is the first sentence.',
 'A gallon of milk in the U.S. costs $2.99.',
 'Is this the third sentence?',
 'Yes, it is!']