Nltk book is here: http://www.nltk.org/book/

NLP applications: http://blog.mashape.com/list-of-25-natural-language-processing-apis/

In [1]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet


In [2]:
# we have a sentence, and we want to extract all the words from it.
sentence = "Camera quality is not upto the mark I visited one plus store and the store representative checked my phone and compared the camera quality with his demo phone He noticed a significant difference between the two I request Amazon to replace my phone with a new one as it is evident that this phone has some problems"

In [3]:
sentence.split(" ")

['Camera',
 'quality',
 'is',
 'not',
 'upto',
 'the',
 'mark',
 'I',
 'visited',
 'one',
 'plus',
 'store',
 'and',
 'the',
 'store',
 'representative',
 'checked',
 'my',
 'phone',
 'and',
 'compared',
 'the',
 'camera',
 'quality',
 'with',
 'his',
 'demo',
 'phone',
 'He',
 'noticed',
 'a',
 'significant',
 'difference',
 'between',
 'the',
 'two',
 'I',
 'request',
 'Amazon',
 'to',
 'replace',
 'my',
 'phone',
 'with',
 'a',
 'new',
 'one',
 'as',
 'it',
 'is',
 'evident',
 'that',
 'this',
 'phone',
 'has',
 'some',
 'problems']

In [4]:
# We can split the function on a space (” “) to get all the words. 
# However, The problem with this is, we cannot extract punctuation marks like full stops, 
# and this simple parser will not be able to handle every single type of sentence.

# Which is why we should use the word tokenizer provided by the NLTK library. This correctly identifies punctuation marks:
from nltk.tokenize import word_tokenize
sentence = sentence.lower()
word_tokenize(sentence)

['camera',
 'quality',
 'is',
 'not',
 'upto',
 'the',
 'mark',
 'i',
 'visited',
 'one',
 'plus',
 'store',
 'and',
 'the',
 'store',
 'representative',
 'checked',
 'my',
 'phone',
 'and',
 'compared',
 'the',
 'camera',
 'quality',
 'with',
 'his',
 'demo',
 'phone',
 'he',
 'noticed',
 'a',
 'significant',
 'difference',
 'between',
 'the',
 'two',
 'i',
 'request',
 'amazon',
 'to',
 'replace',
 'my',
 'phone',
 'with',
 'a',
 'new',
 'one',
 'as',
 'it',
 'is',
 'evident',
 'that',
 'this',
 'phone',
 'has',
 'some',
 'problems']

In [5]:
# Nltk comes inbuilt with a list of stop words for all main languages. 
# To see the stop words for English:


stopwords.words('german')


['aber',
 'alle',
 'allem',
 'allen',
 'aller',
 'alles',
 'als',
 'also',
 'am',
 'an',
 'ander',
 'andere',
 'anderem',
 'anderen',
 'anderer',
 'anderes',
 'anderm',
 'andern',
 'anderr',
 'anders',
 'auch',
 'auf',
 'aus',
 'bei',
 'bin',
 'bis',
 'bist',
 'da',
 'damit',
 'dann',
 'der',
 'den',
 'des',
 'dem',
 'die',
 'das',
 'dass',
 'daß',
 'derselbe',
 'derselben',
 'denselben',
 'desselben',
 'demselben',
 'dieselbe',
 'dieselben',
 'dasselbe',
 'dazu',
 'dein',
 'deine',
 'deinem',
 'deinen',
 'deiner',
 'deines',
 'denn',
 'derer',
 'dessen',
 'dich',
 'dir',
 'du',
 'dies',
 'diese',
 'diesem',
 'diesen',
 'dieser',
 'dieses',
 'doch',
 'dort',
 'durch',
 'ein',
 'eine',
 'einem',
 'einen',
 'einer',
 'eines',
 'einig',
 'einige',
 'einigem',
 'einigen',
 'einiger',
 'einiges',
 'einmal',
 'er',
 'ihn',
 'ihm',
 'es',
 'etwas',
 'euer',
 'eure',
 'eurem',
 'euren',
 'eurer',
 'eures',
 'für',
 'gegen',
 'gewesen',
 'hab',
 'habe',
 'haben',
 'hat',
 'hatte',
 'hatten',
 '

## POS Tagging

Remove all punct
Identify the parts of speech 

In [6]:
# 
# Now, let's get a tag associated with each and every token and see what part of speech these are.
# Whether they're noun, pronoun, adverb, adjective etc.

# By doing so, we can learn more about the constituents of a statement/tweet and see what kind of worlds are 
# present in it.
w = word_tokenize(sentence)
tokensLC = list()
for words in w:
    tokensLC.append(words.lower())

nltk.pos_tag(tokensLC)

[('camera', 'NN'),
 ('quality', 'NN'),
 ('is', 'VBZ'),
 ('not', 'RB'),
 ('upto', 'JJ'),
 ('the', 'DT'),
 ('mark', 'NN'),
 ('i', 'NN'),
 ('visited', 'VBD'),
 ('one', 'CD'),
 ('plus', 'CC'),
 ('store', 'NN'),
 ('and', 'CC'),
 ('the', 'DT'),
 ('store', 'NN'),
 ('representative', 'NN'),
 ('checked', 'VBD'),
 ('my', 'PRP$'),
 ('phone', 'NN'),
 ('and', 'CC'),
 ('compared', 'VBN'),
 ('the', 'DT'),
 ('camera', 'NN'),
 ('quality', 'NN'),
 ('with', 'IN'),
 ('his', 'PRP$'),
 ('demo', 'NN'),
 ('phone', 'NN'),
 ('he', 'PRP'),
 ('noticed', 'VBD'),
 ('a', 'DT'),
 ('significant', 'JJ'),
 ('difference', 'NN'),
 ('between', 'IN'),
 ('the', 'DT'),
 ('two', 'CD'),
 ('i', 'NN'),
 ('request', 'VBP'),
 ('amazon', 'NN'),
 ('to', 'TO'),
 ('replace', 'VB'),
 ('my', 'PRP$'),
 ('phone', 'NN'),
 ('with', 'IN'),
 ('a', 'DT'),
 ('new', 'JJ'),
 ('one', 'CD'),
 ('as', 'IN'),
 ('it', 'PRP'),
 ('is', 'VBZ'),
 ('evident', 'JJ'),
 ('that', 'IN'),
 ('this', 'DT'),
 ('phone', 'NN'),
 ('has', 'VBZ'),
 ('some', 'DT'),
 ('prob

List of tags: http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

![image.png](attachment:image.png)

In [7]:
# The Nltk has many great features, like finding the meaning of words, finding examples of words, 
# finding similar and opposite words etc. 

# You can see how useful these features would be if you were building like a search engine, or a text parser.

# Let’s look at a few of these features.

# The first thing you can do it, find the definition of any word.
syn = wordnet.synsets("irregular")
print(syn)
print(syn[0].name())
print(syn[0].definition())

print(syn[1].name())
print(syn[1].definition())

[Synset('guerrilla.n.01'), Synset('irregular.n.02'), Synset('irregular.a.01'), Synset('irregular.s.02'), Synset('irregular.a.03'), Synset('irregular.a.04'), Synset('irregular.s.05'), Synset('atypical.s.02'), Synset('irregular.s.07'), Synset('irregular.s.08'), Synset('irregular.s.09')]
guerrilla.n.01
a member of an irregular armed force that fights a stronger force by sabotage and harassment
irregular.n.02
merchandise that has imperfections; usually sold at a reduced price without the brand name


In [8]:
syn = wordnet.synsets("feature")
syn[0].examples()

['the map showed roads and other features',
 'generosity is one of his best characteristics']

In [9]:
syn = wordnet.synsets("set")
syn[1].examples()
syn[2].examples()

['he did four sets of the incline bench press']

In [10]:
# We can get words closer to a certain word using synsets, hypernyms and hyponymns.
# Eg., we use the word "Speak" here. and what we get is the synonymns of speak.

# Hymernym : a word with a broad meaning constituting a category into which words with more 
#            specific meanings fall

# Hyponyms : each of two or more words having the same spelling or pronunciation but 
#            different meanings and origins 

syn = wordnet.synsets("speak")[0]
print(syn.hypernyms())
print("**********************************************")
print(syn.hyponyms())

[Synset('communicate.v.02')]
**********************************************
[Synset('babble.v.01'), Synset('bark.v.01'), Synset('bay.v.01'), Synset('begin.v.04'), Synset('blubber.v.02'), Synset('blurt_out.v.01'), Synset('bumble.v.03'), Synset('cackle.v.01'), Synset('chatter.v.04'), Synset('chatter.v.05'), Synset('deliver.v.01'), Synset('drone.v.02'), Synset('enthuse.v.02'), Synset('generalize.v.02'), Synset('gulp.v.02'), Synset('hiss.v.03'), Synset('lip_off.v.01'), Synset('mumble.v.01'), Synset('murmur.v.01'), Synset('open_up.v.07'), Synset('peep.v.04'), Synset('rant.v.01'), Synset('rasp.v.02'), Synset('read.v.03'), Synset('shout.v.01'), Synset('sing.v.02'), Synset('slur.v.03'), Synset('snap.v.01'), Synset('snivel.v.01'), Synset('speak_in_tongues.v.01'), Synset('speak_up.v.02'), Synset('swallow.v.04'), Synset('talk_of.v.01'), Synset('tone.v.01'), Synset('tone.v.02'), Synset('troll.v.07'), Synset('verbalize.v.01'), Synset('vocalize.v.05'), Synset('whiff.v.05'), Synset('whisper.v.01'), S

In [11]:
# Here in this example, we try to find the opposite words of "good"

# lemma in NLTK is a canonical form of a word.
syn = wordnet.synsets("speak")

for s in syn:
    for l in s.lemmas():
        print(l)

Lemma('talk.v.02.talk')
Lemma('talk.v.02.speak')
Lemma('talk.v.02.utter')
Lemma('talk.v.02.mouth')
Lemma('talk.v.02.verbalize')
Lemma('talk.v.02.verbalise')
Lemma('talk.v.01.talk')
Lemma('talk.v.01.speak')
Lemma('speak.v.03.speak')
Lemma('speak.v.03.talk')
Lemma('address.v.02.address')
Lemma('address.v.02.speak')
Lemma('speak.v.05.speak')


In [12]:
# Lemmas can be used to find all similar words:

# And it heps us to reduce/substitute a set of words to one single word


syn = wordnet.synsets("book")
print("*************************************************************************************")
print("Synonyms of book")
print("-------------------------------------------------------------------------------------")
print(syn)
print("-------------------------------------------------------------------------------------")

print("*************************************************************************************")
print("Lemmas of book - Words that are similar to the word book and unique words in NLTK")
print("-------------------------------------------------------------------------------------")
for s in syn:
    print(s.lemmas())
print("-------------------------------------------------------------------------------------")


*************************************************************************************
Synonyms of book
-------------------------------------------------------------------------------------
[Synset('book.n.01'), Synset('book.n.02'), Synset('record.n.05'), Synset('script.n.01'), Synset('ledger.n.01'), Synset('book.n.06'), Synset('book.n.07'), Synset('koran.n.01'), Synset('bible.n.01'), Synset('book.n.10'), Synset('book.n.11'), Synset('book.v.01'), Synset('reserve.v.04'), Synset('book.v.03'), Synset('book.v.04')]
-------------------------------------------------------------------------------------
*************************************************************************************
Lemmas of book - Words that are similar to the word book and unique words in NLTK
-------------------------------------------------------------------------------------
[Lemma('book.n.01.book')]
[Lemma('book.n.02.book'), Lemma('book.n.02.volume')]
[Lemma('record.n.05.record'), Lemma('record.n.05.record_book'), L

In [16]:
from textblob import TextBlob
from textblob import Word

In [17]:
# spelling correction
TextBlob('15 minuten late').correct()

TextBlob("15 minutes late")

In [18]:
s="this is bcz"
TextBlob(s).correct()

TextBlob("this is bc")

In [19]:
Word('studi').spellcheck()

[('study', 0.9664429530201343),
 ('stud', 0.020134228187919462),
 ('studio', 0.013422818791946308)]

In [20]:
TextBlob('This is a bright day').detect_language()

AttributeError: 'TextBlob' object has no attribute 'detect_language'

In [28]:
TextBlob('Aaj bahut acha din hai #AchcheDin').detect_language()

'hi'

In [29]:
TextBlob('Chala manchi roju idi').detect_language()

'te'

In [34]:
TextBlob.translator.translate('Aaj bahut shubh din hai',to_lang="en")

"I'm looking forward to this"

In [40]:
# https://en.wikipedia.org/wiki/Cadet_Nurse_Corps
from nltk.tokenize import word_tokenize

para = """
AnalytixLabs - leading Capability Building and Training Solutions Provider.

Our courses are crafted by experts to keep you ahead of the curve in industry best practices. 
Case study based modules ensure that participants learn practical applications along with the theoretical concepts. 

Further to this, new courses are continuously launched and old ones keep evolving as per the latest and upcoming 
industry trends.

High degree of commitment & personal attention is given through small batch size and individual counselling. 
Hands-on sessions and practice assignments on real life business datasets are included to ensure assimilated learning.

"""
words = word_tokenize(para)
print("*************************************************************************************")
print("Words with all stopwords")
print("-------------------------------------------------------------------------------------")
print(words)
print("-------------------------------------------------------------------------------------")

useful_words = [word.lower() for word in words if word not in stopwords.words('english')]
print("*************************************************************************************")
print("Sentence is clean now - no stop words included")
print("-------------------------------------------------------------------------------------")
print(useful_words)
print("-------------------------------------------------------------------------------------")


*************************************************************************************
Words with all stopwords
-------------------------------------------------------------------------------------
['AnalytixLabs', '-', 'leading', 'Capability', 'Building', 'and', 'Training', 'Solutions', 'Provider', '.', 'Our', 'courses', 'are', 'crafted', 'by', 'experts', 'to', 'keep', 'you', 'ahead', 'of', 'the', 'curve', 'in', 'industry', 'best', 'practices', '.', 'Case', 'study', 'based', 'modules', 'ensure', 'that', 'participants', 'learn', 'practical', 'applications', 'along', 'with', 'the', 'theoretical', 'concepts', '.', 'Further', 'to', 'this', ',', 'new', 'courses', 'are', 'continuously', 'launched', 'and', 'old', 'ones', 'keep', 'evolving', 'as', 'per', 'the', 'latest', 'and', 'upcoming', 'industry', 'trends', '.', 'High', 'degree', 'of', 'commitment', '&', 'personal', 'attention', 'is', 'given', 'through', 'small', 'batch', 'size', 'and', 'individual', 'counselling', '.', 'Hands-on', 'sessio

In [41]:
# This is how the Naive Bayes classifier expects the input

def create_word_features(words):
    useful_words = [word for word in words if word not in stopwords.words("english")]
    my_dict = dict([(word, True) for word in useful_words])
    return my_dict

create_word_features(["the", "quick", "brown", "quick", "a", "fox"])

{'quick': True, 'brown': True, 'fox': True}

In [None]:
#Abbrevations and Words correction
def clean_text(text):
    import re
    text = text.lower()
    text = text.strip()
    text = re.sub(r' +', ' ', text)
    text = re.sub(r"[-()\"#/@;:{}`+=~|.!?,'0-9]", "", text)
    return(text)

In [None]:
cleanedWords = []
for words in useful_words:
    
    cleanedWords.append(clean_text(words))
cleanedWords

In [46]:
sentence = "Player's Unknown Battle Grounds"
s = ""
for i in sentence.split(" "):
    print(i[0])


P
U
B
G
