<a href="https://colab.research.google.com/github/ShankarDhandapani/Google-colab/blob/master/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing (NLP)

In [0]:
#@title Basic Imports
import nltk
import re
import heapq   
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [0]:
# Download the NLTK 'popular' data collection up front; presumably this is what
# supplies the tokenizer, WordNet, stop-word and tagger data used by later cells.
nltk.download('popular', halt_on_error=False)

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

## Tokenization

In [0]:
#@title Data
# Sample text (a single sentence) that every later cell tokenizes and analyses.
data = "It originated from the idea that there are readers who prefer learning new skills from the comforts of their drawing rooms"

In [0]:
#@title Tokenizing sentences
# Split the sample text into a list of sentence strings.
sentences = nltk.sent_tokenize(data)
# Bare expression so the notebook displays the resulting list.
sentences

['It originated from the idea that there are readers who prefer learning new skills from the comforts of their drawing rooms']

In [0]:
#@title Tokenizing words
# Split the sample text into a flat list of word tokens.
words = nltk.word_tokenize(data)
# Bare expression so the notebook displays the resulting list.
words

['It',
 'originated',
 'from',
 'the',
 'idea',
 'that',
 'there',
 'are',
 'readers',
 'who',
 'prefer',
 'learning',
 'new',
 'skills',
 'from',
 'the',
 'comforts',
 'of',
 'their',
 'drawing',
 'rooms']

## Stemming the words

In [0]:
#@title Stemming
from nltk.stem.porter import PorterStemmer

# Tokenize the sample text, then print every token next to its Porter stem.
porter_stemmer = PorterStemmer()
nltk_tokens = nltk.word_tokenize(data)
for token in nltk_tokens:
    stem = porter_stemmer.stem(token)
    print("Actual: %s  Stem: %s" % (token, stem))

Actual: It  Stem: It
Actual: originated  Stem: origin
Actual: from  Stem: from
Actual: the  Stem: the
Actual: idea  Stem: idea
Actual: that  Stem: that
Actual: there  Stem: there
Actual: are  Stem: are
Actual: readers  Stem: reader
Actual: who  Stem: who
Actual: prefer  Stem: prefer
Actual: learning  Stem: learn
Actual: new  Stem: new
Actual: skills  Stem: skill
Actual: from  Stem: from
Actual: the  Stem: the
Actual: comforts  Stem: comfort
Actual: of  Stem: of
Actual: their  Stem: their
Actual: drawing  Stem: draw
Actual: rooms  Stem: room


In [0]:
#@title Lemmatization 
import nltk
from nltk.stem import WordNetLemmatizer

# Tokenize the sample text, then print every token next to its WordNet lemma.
# No part-of-speech argument is passed to lemmatize(); in the recorded output
# verb forms such as "originated" and "learning" come back unchanged.
wordnet_lemmatizer = WordNetLemmatizer()
nltk_tokens = nltk.word_tokenize(data)
for token in nltk_tokens:
    lemma = wordnet_lemmatizer.lemmatize(token)
    print("Actual: %s  Lemma: %s" % (token, lemma))

Actual: It  Lemma: It
Actual: originated  Lemma: originated
Actual: from  Lemma: from
Actual: the  Lemma: the
Actual: idea  Lemma: idea
Actual: that  Lemma: that
Actual: there  Lemma: there
Actual: are  Lemma: are
Actual: readers  Lemma: reader
Actual: who  Lemma: who
Actual: prefer  Lemma: prefer
Actual: learning  Lemma: learning
Actual: new  Lemma: new
Actual: skills  Lemma: skill
Actual: from  Lemma: from
Actual: the  Lemma: the
Actual: comforts  Lemma: comfort
Actual: of  Lemma: of
Actual: their  Lemma: their
Actual: drawing  Lemma: drawing
Actual: rooms  Lemma: room


## Stop Words

In [0]:
#@title Removing stopwords
# Build the stop-word lookup once. Evaluating stopwords.words('english') inside
# the filter condition rebuilds the whole list for every token, and a set gives
# constant-time membership tests instead of a linear scan of that list.
english_stopwords = set(stopwords.words('english'))

sentences = nltk.sent_tokenize(data)
for i, sentence in enumerate(sentences):
    # Keep only the tokens that are not in the English stop-word list.
    # Matching is case-sensitive, so a capitalised token such as "It" is kept.
    words = [word for word in nltk.word_tokenize(sentence) if word not in english_stopwords]
    # Re-join the surviving tokens so each entry is a plain string again.
    sentences[i] = ' '.join(words)
sentences

['It originated idea readers prefer learning new skills comforts drawing rooms']

## Part-Of-Speech Tagging

In [0]:
#@title POS Tagging
# Tokenize the sample text and attach a part-of-speech tag to every token.
words = nltk.word_tokenize(data)
tagged_words = nltk.pos_tag(words)

# Tagged word paragraph: glue each token to its tag as "token_TAG",
# then join the pieces with single spaces.
word_tags = [token + "_" + tag for token, tag in tagged_words]
tagged_paragraph = ' '.join(word_tags)
tagged_paragraph

'It_PRP originated_VBD from_IN the_DT idea_NN that_IN there_EX are_VBP readers_NNS who_WP prefer_VBP learning_VBG new_JJ skills_NNS from_IN the_DT comforts_NNS of_IN their_PRP$ drawing_NN rooms_NNS'

## Word Count

In [0]:
#@title Creating word histogram (Word Count or Map Reduce)
# Normalise each sentence: lower-case it, replace every non-word character with
# a space, then collapse each run of whitespace into a single space.
dataset = [
    re.sub(r'\s+', ' ', re.sub(r'\W', ' ', sentence.lower()))
    for sentence in nltk.sent_tokenize(data)
]

# Tally how often each token occurs across the cleaned sentences.
word2count = {}
for cleaned in dataset:
    words = nltk.word_tokenize(cleaned)
    for token in words:
        # Unseen tokens start from 0, so the first sighting stores 1.
        word2count[token] = word2count.get(token, 0) + 1
word2count

{'are': 1,
 'comforts': 1,
 'drawing': 1,
 'from': 2,
 'idea': 1,
 'it': 1,
 'learning': 1,
 'new': 1,
 'of': 1,
 'originated': 1,
 'prefer': 1,
 'readers': 1,
 'rooms': 1,
 'skills': 1,
 'that': 1,
 'the': 2,
 'their': 1,
 'there': 1,
 'who': 1}

In [0]:
#@title Selecting best features
# Tokenize sentences, then lower-case each one and replace punctuation and
# repeated whitespace with single spaces.
dataset = []
for sentence in nltk.sent_tokenize(data):
    lowered = sentence.lower()
    without_punct = re.sub(r'\W', ' ', lowered)
    dataset.append(re.sub(r'\s+', ' ', without_punct))

# Creating word histogram
word2count = {}
for d1 in dataset:
    words = nltk.word_tokenize(d1)
    for word in words:
        word2count[word] = word2count.get(word, 0) + 1

# Pick the three words with the highest counts.
freq_words = heapq.nlargest(3, word2count, key=word2count.get)
freq_words

['from', 'the', 'it']

## Synonyms and Antonyms

In [0]:
#@title synonyms and antonyms
# Collect every lemma from every WordNet synset of "good", in synset order.
good_lemmas = [
    lemma
    for synset in wordnet.synsets("good")
    for lemma in synset.lemmas()
]

# Lemma names are the synonyms; the names of each lemma's antonyms are the antonyms.
synonyms = [lemma.name() for lemma in good_lemmas]
antonyms = [
    opposite.name()
    for lemma in good_lemmas
    for opposite in lemma.antonyms()
]

# Print as sets so repeated names appear only once.
print(set(synonyms))
print(set(antonyms))

{'well', 'honest', 'safe', 'unspoiled', 'estimable', 'practiced', 'goodness', 'just', 'good', 'near', 'secure', 'beneficial', 'proficient', 'adept', 'dear', 'full', 'undecomposed', 'ripe', 'honorable', 'unspoilt', 'sound', 'dependable', 'right', 'trade_good', 'commodity', 'serious', 'thoroughly', 'effective', 'expert', 'skilful', 'in_force', 'skillful', 'respectable', 'soundly', 'upright', 'in_effect', 'salutary'}
{'bad', 'ill', 'evilness', 'badness', 'evil'}


## Exercises
**1. Scrape the NITT Wikipedia page (https://en.wikipedia.org/wiki/National_Institute_of_Technology,_Tiruchirappalli) and apply the above concepts**

**2. Scrape palindrome words from (http://www.word-buff.com/single-word-palindromes.html), find their synonyms and antonyms on both CPU and GPU, and calculate the processing time for each**