# Natural Language Processing

In [28]:
import nltk

In [29]:
paragraph = """ John does his work intelligently. John is an intelligent man. John is always working """
paragraph

' John does his work intelligently. John is an intelligent man. John is always working '

### Import PorterSetemmer from NLTK library

In [30]:
from nltk.stem import PorterStemmer

#### Tokenization is a process in which a sequence is broken into pieces such as words,sentences,phrases etc.
#### It will break the whole paragarh into diffrent sentences as per ",","." etc in a list.

In [31]:
sentence = nltk.sent_tokenize(paragraph)

In [32]:
sentence

[' John does his work intelligently.',
 'John is an intelligent man.',
 'John is always working']

In [33]:
stemmer = PorterStemmer()

### Now we can see how stemming works 

#### STEMMING : The Process of reducing infected or derived words to their base form or root form of words. 
#### eg intelligent/intelligence/intelligently -----> make it intellig which has no meaning, That's a drawback of stemming

In [34]:
for i in range(len(sentence)):
    words = nltk.word_tokenize(sentence[i])
    newwords = [stemmer.stem(word) for word in words]
    print("words:",words)
    print("Newwords:",newwords)
    sentence[i] = ' '.join(newwords)
    print("Sentences with newwords ----------->",sentence[i])

words: ['John', 'does', 'his', 'work', 'intelligently', '.']
Newwords: ['john', 'doe', 'hi', 'work', 'intellig', '.']
Sentences with newwords -----------> john doe hi work intellig .
words: ['John', 'is', 'an', 'intelligent', 'man', '.']
Newwords: ['john', 'is', 'an', 'intellig', 'man', '.']
Sentences with newwords -----------> john is an intellig man .
words: ['John', 'is', 'always', 'working']
Newwords: ['john', 'is', 'alway', 'work']
Sentences with newwords -----------> john is alway work


### LEMMATIZATION

#### To Get away the drawback of Stemming we can use Lemmatization  eg. in stemming very---> veri but lemmatization take the word which has meaning  very ----> very 

In [35]:
from nltk.stem import WordNetLemmatizer

In [36]:
lemmatizer = WordNetLemmatizer()

In [37]:
for i in range(len(sentence)):
    words = nltk.word_tokenize(sentence[i])
    newwords = [lemmatizer.lemmatize(word) for word in words]
    print(newwords)
    print("words:",words)
    print("Newwords:",newwords)
    sentence[i] = ' '.join(newwords)

['john', 'doe', 'hi', 'work', 'intellig', '.']
words: ['john', 'doe', 'hi', 'work', 'intellig', '.']
Newwords: ['john', 'doe', 'hi', 'work', 'intellig', '.']
['john', 'is', 'an', 'intellig', 'man', '.']
words: ['john', 'is', 'an', 'intellig', 'man', '.']
Newwords: ['john', 'is', 'an', 'intellig', 'man', '.']
['john', 'is', 'alway', 'work']
words: ['john', 'is', 'alway', 'work']
Newwords: ['john', 'is', 'alway', 'work']


## Stop Words

In [38]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Tripti
[nltk_data]     Rotake\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [39]:
paragraph = """ John does his work intelligently. John is an intelligent man. John is always working """
paragraph

' John does his work intelligently. John is an intelligent man. John is always working '

In [40]:
sentence = nltk.sent_tokenize(paragraph)

In [41]:
from nltk.corpus import stopwords

In [42]:
for i in range(len(sentence)):
    words = nltk.word_tokenize(sentence[i])
    newwords = [word for word in words if word not in stopwords.words('english')]
    sentence[i] = ' '.join(newwords)
    print(newwords)  

['John', 'work', 'intelligently', '.']
['John', 'intelligent', 'man', '.']
['John', 'always', 'working']


### Parts  Of Speech Tagging  

In [43]:
paragraph = """ John does his work intelligently. John is an intelligent man. John is always working """
paragraph

' John does his work intelligently. John is an intelligent man. John is always working '

In [44]:
words = nltk.word_tokenize(paragraph)
words

['John',
 'does',
 'his',
 'work',
 'intelligently',
 '.',
 'John',
 'is',
 'an',
 'intelligent',
 'man',
 '.',
 'John',
 'is',
 'always',
 'working']

In [45]:
tagged_words = nltk.pos_tag(words)
tagged_words 

[('John', 'NNP'),
 ('does', 'VBZ'),
 ('his', 'PRP$'),
 ('work', 'NN'),
 ('intelligently', 'RB'),
 ('.', '.'),
 ('John', 'NNP'),
 ('is', 'VBZ'),
 ('an', 'DT'),
 ('intelligent', 'JJ'),
 ('man', 'NN'),
 ('.', '.'),
 ('John', 'NNP'),
 ('is', 'VBZ'),
 ('always', 'RB'),
 ('working', 'VBG')]

##### The output is in tuple, so we can't use them for formulating models.  So we going to generate other paragrah of list

In [46]:
word_tags = []
for tw in tagged_words:
    word_tags.append(tw[0]+"_"+tw[1])
print(word_tags)

['John_NNP', 'does_VBZ', 'his_PRP$', 'work_NN', 'intelligently_RB', '._.', 'John_NNP', 'is_VBZ', 'an_DT', 'intelligent_JJ', 'man_NN', '._.', 'John_NNP', 'is_VBZ', 'always_RB', 'working_VBG']


In [47]:
#convert it to in paragarh
tagged_paragrah = ' '.join(word_tags)

In [48]:
tagged_paragrah

'John_NNP does_VBZ his_PRP$ work_NN intelligently_RB ._. John_NNP is_VBZ an_DT intelligent_JJ man_NN ._. John_NNP is_VBZ always_RB working_VBG'

### Named Entity Recognization

In [49]:
import nltk
paragraph = " The Taj MAhal was built by Emperor Shah Jahan"
paragraph

' The Taj MAhal was built by Emperor Shah Jahan'

In [50]:
words = nltk.word_tokenize(paragraph)
words

['The', 'Taj', 'MAhal', 'was', 'built', 'by', 'Emperor', 'Shah', 'Jahan']

In [51]:
tagged_words = nltk.pos_tag(words)
tagged_words

[('The', 'DT'),
 ('Taj', 'NNP'),
 ('MAhal', 'NNP'),
 ('was', 'VBD'),
 ('built', 'VBN'),
 ('by', 'IN'),
 ('Emperor', 'NNP'),
 ('Shah', 'NNP'),
 ('Jahan', 'NNP')]

In [52]:
namedEnt = nltk.ne_chunk(tagged_words)
namedEnt.draw()

### Bag Of Words (BoW)

### TEXT PREPROCESSING

In [64]:
import nltk
import re
paragraph = """ In a small Italian town, hundreds of years ago, a small business owner owed a large sum of money to a loan-shark. The loan-shark was a very old, unattractive looking guy that just so happened to fancy the business owner’s daughter.

He decided to offer the businessman a deal that would completely wipe out the debt he owed him. However, the catch was that we would only wipe out the debt if he could marry the businessman’s daughter.

Needless to say, this proposal was met with a look of disgust.

The loan-shark said that he would place two pebbles into a bag, one white and one black.

The daughter would then have to reach into the bag and pick out a pebble. If it was black, the debt would be wiped, but the loan-shark would then marry her. If it was white, the debt would also be wiped, but the daughter wouldn’t have to marry the loan-shark.

Standing on a pebble-strewn path in the businessman’s garden, the loan-shark bent over and picked up two pebbles.

Whilst he was picking them up, the daughter noticed that he’d picked up two black pebbles and placed them both into the bag.

He then asked the daughter to reach into the bag and pick one.

The daughter naturally had three choices as to what she could have done:

Refuse to pick a pebble from the bag.
Take both pebbles out of the bag and expose the loan-shark for cheating.
Pick a pebble from the bag fully well knowing it was black and sacrifice herself for her father’s freedom.
She drew out a pebble from the bag, and before looking at it ‘accidentally’ dropped it into the midst of the other pebbles. She said to the loan-shark;

 

“Oh, how clumsy of me. Never mind, if you look into the bag for the one that is left, you will be able to tell which pebble I picked.”

 

The pebble left in the bag is obviously black, and seeing as the loan-shark didn’t want to be exposed, he had to play along as if the pebble the daughter dropped was white, and clear her father’s debt. """

In [65]:
paragraph

' In a small Italian town, hundreds of years ago, a small business owner owed a large sum of money to a loan-shark. The loan-shark was a very old, unattractive looking guy that just so happened to fancy the business owner’s daughter.\n\nHe decided to offer the businessman a deal that would completely wipe out the debt he owed him. However, the catch was that we would only wipe out the debt if he could marry the businessman’s daughter.\n\nNeedless to say, this proposal was met with a look of disgust.\n\nThe loan-shark said that he would place two pebbles into a bag, one white and one black.\n\nThe daughter would then have to reach into the bag and pick out a pebble. If it was black, the debt would be wiped, but the loan-shark would then marry her. If it was white, the debt would also be wiped, but the daughter wouldn’t have to marry the loan-shark.\n\nStanding on a pebble-strewn path in the businessman’s garden, the loan-shark bent over and picked up two pebbles.\n\nWhilst he was pickin

In [70]:
dataset = nltk.sent_tokenize(paragraph)

In [71]:
dataset

[' In a small Italian town, hundreds of years ago, a small business owner owed a large sum of money to a loan-shark.',
 'The loan-shark was a very old, unattractive looking guy that just so happened to fancy the business owner’s daughter.',
 'He decided to offer the businessman a deal that would completely wipe out the debt he owed him.',
 'However, the catch was that we would only wipe out the debt if he could marry the businessman’s daughter.',
 'Needless to say, this proposal was met with a look of disgust.',
 'The loan-shark said that he would place two pebbles into a bag, one white and one black.',
 'The daughter would then have to reach into the bag and pick out a pebble.',
 'If it was black, the debt would be wiped, but the loan-shark would then marry her.',
 'If it was white, the debt would also be wiped, but the daughter wouldn’t have to marry the loan-shark.',
 'Standing on a pebble-strewn path in the businessman’s garden, the loan-shark bent over and picked up two pebbles.',

In [72]:
len(dataset)

18

In [73]:
for i in range(len(dataset)):
    dataset[i] = dataset[i].lower()            ## Lower Casing
    dataset[i] = re.sub(r'\W',' ',dataset[i])  ## substitute all non-words with single space (ALL PUNCTUATION)
    dataset[i] = re.sub(r'\s+',' ',dataset[i]) ## substitute all extra ( more then one spaces) spaces by single space
    print(dataset[i])

 in a small italian town hundreds of years ago a small business owner owed a large sum of money to a loan shark 
the loan shark was a very old unattractive looking guy that just so happened to fancy the business owner s daughter 
he decided to offer the businessman a deal that would completely wipe out the debt he owed him 
however the catch was that we would only wipe out the debt if he could marry the businessman s daughter 
needless to say this proposal was met with a look of disgust 
the loan shark said that he would place two pebbles into a bag one white and one black 
the daughter would then have to reach into the bag and pick out a pebble 
if it was black the debt would be wiped but the loan shark would then marry her 
if it was white the debt would also be wiped but the daughter wouldn t have to marry the loan shark 
standing on a pebble strewn path in the businessman s garden the loan shark bent over and picked up two pebbles 
whilst he was picking them up the daughter noticed

### Creating Histogram

In [74]:
word2count = {}
for data in dataset:
    words = nltk.word_tokenize(data)
    for word in words:
        if word in word2count:
            word2count[word]+=1
        else: 
            word2count[word]=1
print(word2count)

{'in': 3, 'a': 13, 'small': 2, 'italian': 1, 'town': 1, 'hundreds': 1, 'of': 6, 'years': 1, 'ago': 1, 'business': 2, 'owner': 2, 'owed': 2, 'large': 1, 'sum': 1, 'money': 1, 'to': 13, 'loan': 9, 'shark': 9, 'the': 37, 'was': 8, 'very': 1, 'old': 1, 'unattractive': 1, 'looking': 2, 'guy': 1, 'that': 6, 'just': 1, 'so': 1, 'happened': 1, 'fancy': 1, 's': 5, 'daughter': 8, 'he': 8, 'decided': 1, 'offer': 1, 'businessman': 3, 'deal': 1, 'would': 7, 'completely': 1, 'wipe': 2, 'out': 5, 'debt': 5, 'him': 1, 'however': 1, 'catch': 1, 'we': 1, 'only': 1, 'if': 5, 'could': 2, 'marry': 3, 'needless': 1, 'say': 1, 'this': 1, 'proposal': 1, 'met': 1, 'with': 1, 'look': 2, 'disgust': 1, 'said': 2, 'place': 1, 'two': 3, 'pebbles': 5, 'into': 6, 'bag': 10, 'one': 4, 'white': 3, 'and': 10, 'black': 5, 'then': 3, 'have': 3, 'reach': 2, 'pick': 4, 'pebble': 8, 'it': 5, 'be': 4, 'wiped': 2, 'but': 2, 'her': 3, 'also': 1, 'wouldn': 1, 't': 2, 'standing': 1, 'on': 1, 'strewn': 1, 'path': 1, 'garden': 1, '

### Filtering words

In [61]:
# Determine which word has a largest count
import heapq
freq_words = heapq.nlargest(100,word2count,key = word2count.get) # most freq 100 words

In [34]:
freq_words

['the',
 'a',
 'to',
 'bag',
 'and',
 'loan',
 'shark',
 'was',
 'daughter',
 'he',
 'pebble',
 'would',
 'of',
 'that',
 'into',
 's',
 'out',
 'debt',
 'if',
 'pebbles',
 'black',
 'it',
 'one',
 'pick',
 'be',
 'in',
 'businessman',
 'marry',
 'two',
 'white',
 'then',
 'have',
 'her',
 'picked',
 'up',
 'as',
 'she',
 'from',
 'for',
 'small',
 'business',
 'owner',
 'owed',
 'looking',
 'wipe',
 'could',
 'look',
 'said',
 'reach',
 'wiped',
 'but',
 't',
 'them',
 'both',
 'had',
 'father',
 'dropped',
 'you',
 'is',
 'left',
 'italian',
 'town',
 'hundreds',
 'years',
 'ago',
 'large',
 'sum',
 'money',
 'very',
 'old',
 'unattractive',
 'guy',
 'just',
 'so',
 'happened',
 'fancy',
 'decided',
 'offer',
 'deal',
 'completely',
 'him',
 'however',
 'catch',
 'we',
 'only',
 'needless',
 'say',
 'this',
 'proposal',
 'met',
 'with',
 'disgust',
 'place',
 'also',
 'wouldn',
 'standing',
 'on',
 'strewn',
 'path',
 'garden']

#### BoW Model

##### In BoW model each documents represent as a vector of 0's and 1's.

In [76]:
X = []
for data in dataset:
    vector = []
    for word in freq_words:
        if word in nltk.word_tokenize(data):
            vector.append(1)
        else: 
            vector.append(0)
    X.append(vector)

In [77]:
X

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [36]:
## Now we going to convert it in 2D-array
import numpy as np

In [37]:
X = np.array(X)

In [38]:
X

array([[0, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0]])

### TF-IDF 

### IDF

In [48]:
word_idfs = {}
for word in freq_words:
    doc_count =0
    for data in dataset:
        if word in nltk.word_tokenize(data):
            doc_count = doc_count + 1
    word_idfs[word]=np.log((len(dataset)/doc_count))

In [51]:
word_idfs

{'the': 0.11778303565638346,
 'a': 0.5877866649021191,
 'to': 0.5877866649021191,
 'bag': 0.6931471805599453,
 'and': 0.6931471805599453,
 'loan': 0.6931471805599453,
 'shark': 0.6931471805599453,
 'was': 0.8109302162163288,
 'daughter': 0.8109302162163288,
 'he': 1.0986122886681098,
 'pebble': 1.0986122886681098,
 'would': 1.0986122886681098,
 'of': 1.2809338454620642,
 'that': 1.0986122886681098,
 'into': 1.0986122886681098,
 's': 1.2809338454620642,
 'out': 1.2809338454620642,
 'debt': 1.2809338454620642,
 'if': 1.5040773967762742,
 'pebbles': 1.2809338454620642,
 'black': 1.2809338454620642,
 'it': 1.5040773967762742,
 'one': 1.791759469228055,
 'pick': 1.5040773967762742,
 'be': 1.791759469228055,
 'in': 1.791759469228055,
 'businessman': 1.791759469228055,
 'marry': 1.791759469228055,
 'two': 1.791759469228055,
 'white': 1.791759469228055,
 'then': 1.791759469228055,
 'have': 1.791759469228055,
 'her': 1.791759469228055,
 'picked': 1.791759469228055,
 'up': 2.1972245773362196,
 '

In [52]:
len(word_idfs)

100

### TF

In [57]:
tf_matrix = {}
for word in freq_words:
    doc_tf = []
    for data in dataset:
        frequency = 0
        for w in nltk.word_tokenize(data):
            if w ==word:
                frequency+= 1
        tf_word = frequency/len(nltk.word_tokenize(data))
        doc_tf.append(tf_word)
    tf_matrix[word] = doc_tf
                
                
                
                

In [60]:
tf_matrix

{'the': [0.0,
  0.09523809523809523,
  0.1111111111111111,
  0.15,
  0.0,
  0.05555555555555555,
  0.13333333333333333,
  0.11764705882352941,
  0.14285714285714285,
  0.09523809523809523,
  0.08333333333333333,
  0.15384615384615385,
  0.09523809523809523,
  0.14285714285714285,
  0.05,
  0.13043478260869565,
  0.09090909090909091,
  0.1076923076923077],
 'a': [0.17391304347826086,
  0.047619047619047616,
  0.05555555555555555,
  0.0,
  0.08333333333333333,
  0.05555555555555555,
  0.06666666666666667,
  0.0,
  0.0,
  0.047619047619047616,
  0.0,
  0.0,
  0.047619047619047616,
  0.0,
  0.05,
  0.043478260869565216,
  0.0,
  0.0],
 'to': [0.043478260869565216,
  0.047619047619047616,
  0.05555555555555555,
  0.0,
  0.08333333333333333,
  0.0,
  0.06666666666666667,
  0.0,
  0.047619047619047616,
  0.0,
  0.0,
  0.07692307692307693,
  0.09523809523809523,
  0.0,
  0.0,
  0.0,
  0.09090909090909091,
  0.046153846153846156],
 'bag': [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.05555555555555555,

In [61]:
len(tf_matrix)

100

### TFIDF

In [74]:
tfidf_matrix = []
for word in tf_matrix:
    tfidf = []
    for value in tf_matrix[word]:
        score=value*word_idfs[word]
        tfidf.append(score)
    tfidf_matrix.append(tfidf)

In [75]:
tfidf_matrix


[[0.0,
  0.011217431967274615,
  0.013087003961820384,
  0.01766745534845752,
  0.0,
  0.006543501980910192,
  0.01570440475418446,
  0.013856827724280407,
  0.016826147950911922,
  0.011217431967274615,
  0.009815252971365287,
  0.018120467024058996,
  0.011217431967274615,
  0.016826147950911922,
  0.005889151782819173,
  0.015363004650832623,
  0.01070754869603486,
  0.012684326916841297],
 [0.10222376780906418,
  0.02798984118581519,
  0.03265481471678439,
  0.0,
  0.048982222075176586,
  0.03265481471678439,
  0.03918577766014127,
  0.0,
  0.0,
  0.02798984118581519,
  0.0,
  0.0,
  0.02798984118581519,
  0.0,
  0.029389333245105953,
  0.025555941952266046,
  0.0,
  0.0],
 [0.025555941952266046,
  0.02798984118581519,
  0.03265481471678439,
  0.0,
  0.048982222075176586,
  0.0,
  0.03918577766014127,
  0.0,
  0.02798984118581519,
  0.0,
  0.0,
  0.04521435883862455,
  0.05597968237163038,
  0.0,
  0.0,
  0.0,
  0.0534351513547381,
  0.027128615303174727],
 [0.0,
  0.0,
  0.0,
  0.

In [76]:
X = np.asarray(tfidf_matrix)

In [77]:
X.shape

(100, 18)

In [80]:
X=np.transpose(X)

In [81]:
X.shape

(18, 100)

### N-Grams

In [199]:
import random

In [200]:
text = """ In a small Italian town, hundreds of years ago, a small business owner owed a large sum of money to a loan-shark. The loan-shark was a very old, unattractive looking guy that just so happened to fancy the business owner’s daughter.

He decided to offer the businessman a deal that would completely wipe out the debt he owed him. However, the catch was that we would only wipe out the debt if he could marry the businessman’s daughter.

Needless to say, this proposal was met with a look of disgust. """

In [201]:
n=3 # trigrams
ngrams = {}
len(text)-n

497

In [202]:
#creating n-grams
for i in range(len(text)-n):
    gram =text[i:i+n]
    if gram not in ngrams:
        ngrams[gram]=[]
    ngrams[gram].append(text[i+n])

In [203]:
#Testing are n_gram Model

current_gram = text[0:n]
result = current_gram
for i in range(500):
    if current_gram not in ngrams:
        break
    possibilities = ngrams[current_gram]
    next_item = possibilities[random.randrange(len(possibilities))]
    result+=next_item
    current_gram = result[len(result)-n:len(result)]
print(result)

 In a loan-shark. The decided a small Italian a debt he out that just so happened towner the loan-shark. The loan-shark. The loan-shark. The deal was that we would completely wipe out that we would completely wipe out the businessman’s daughter.

He debt he catch was met would completely with a look of disgust. 


In [198]:
import random
import nltk
text = """ In a small Italian town, hundreds of years ago, a small business owner owed a large sum of money to a loan-shark. The loan-shark was a very old, unattractive looking guy that just so happened to fancy the business owner’s daughter.

He decided to offer the businessman a deal In a small that would completely wipe out the debt he owed him. However, the catch was that we would only wipe out the debt if he could marry the businessman’s daughter.

Needless to say, this proposal was met with a look of disgust. """
n = 6
ngrams ={}

len(nltk.word_tokenize(text))

107

In [193]:
words = nltk.word_tokenize(text)
for i in range(len(words)-n):
    gram =' '.join(word[i:i+n])
    if gram not in ngrams:
        ngrams[gram] = []
    ngrams[gram].append(words[i+n])
    
currentGram = ' '.join(words[0:n])
result =currentGram
for i in range(95):
    if currentGram not in ngrams:
        break
    possibilities = ngrams[currentGram]
    nextItem = possibilities[random.randrange(len(possibilities))]
    result += ' '+nextItem
    rwords = nltk.word_tokenize(result)
    currentGram = ' '.join(rwords[len(rwords)-n:len(rwords)])

In [194]:
print(currentGram)

In a small Italian town ,


### Latent Semantic Analysis

In [1]:
#importing Libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
#Sample Data
dataset = ["The amount of polution is increasing day by day",
           "The concert was just greast",
           "I Love to see Gordan Ramsay cook",
           "Google is introducing a new technology",
           "AI Robots are examples of great technology present today",
           "All of us were singing in the concert",
           "We have launch campaign to stop pollution and global warming"]

In [3]:
len(dataset)

7

##### PREPROCESSING

#### Lower Casing

In [12]:
# Lower Casing
dataset = [line.lower() for line in dataset]
dataset

['the amount of polution is increasing day by day',
 'the concert was just greast',
 'i love to see gordan ramsay cook',
 'google is introducing a new technology',
 'ai robots are examples of great technology present today',
 'all of us were singing in the concert',
 'we have launch campaign to stop pollution and global warming']

In [13]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

In [20]:
print(X[0])

  (0, 5)	0.3211483974289089
  (0, 9)	0.6422967948578178
  (0, 18)	0.3211483974289089
  (0, 20)	0.2665807498646048
  (0, 27)	0.3211483974289089
  (0, 25)	0.2278643877752444
  (0, 2)	0.3211483974289089
  (0, 35)	0.2278643877752444


In [23]:
lsa = TruncatedSVD(n_components=4,n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=4, n_iter=100,
             random_state=None, tol=0.0)

In [30]:
row1 = lsa.components_[0]
row1

array([ 8.85113667e-02,  1.98744709e-01,  1.33350030e-01, -4.15918698e-17,
        8.85113667e-02,  1.33350030e-01, -4.15918698e-17,  3.46979719e-01,
       -1.91662727e-16,  2.66700060e-01,  8.85113667e-02, -4.15918698e-17,
        1.05538944e-01, -1.91662727e-16,  2.19259895e-01,  8.85113667e-02,
       -4.15918698e-17,  1.98744709e-01,  1.33350030e-01,  1.05538944e-01,
        1.98298365e-01,  2.19259895e-01, -4.15918698e-17, -1.91662727e-16,
        1.05538944e-01,  2.98432605e-01, -4.15918698e-17,  1.33350030e-01,
        8.85113667e-02, -1.91662727e-16,  8.85113667e-02, -1.91662727e-16,
        1.98744709e-01, -4.15918698e-17,  1.61078423e-01,  3.91202594e-01,
       -1.93621347e-16,  8.85113667e-02,  1.98744709e-01, -4.15918698e-17,
        2.19259895e-01, -4.15918698e-17,  1.98744709e-01])

In [32]:
terms = vectorizer.get_feature_names()
for i,comp in enumerate(lsa.components_):
    componentTerms = zip(terms,comp)
    sortedTerms = sorted(componentTerms,key=lambda x:x[1],reverse=True)
    sortedTerms = sortedTerms[:10]
    print("\nConcept",i,":")
    for term in sortedTerms:
        print(term)
#Here we Seperate all the words according to Cocept like food,Tech,Music,News with there probabilities.


Concept 0 :
('the', 0.39120259386723283)
('concert', 0.34697971904847097)
('of', 0.29843260469823923)
('day', 0.2667000602240798)
('greast', 0.2192598949413184)
('just', 0.2192598949413184)
('was', 0.2192598949413184)
('all', 0.19874470880199513)
('in', 0.19874470880199513)
('singing', 0.19874470880199513)

Concept 1 :
('technology', 0.3813428511192244)
('google', 0.28970320674398875)
('introducing', 0.28970320674398875)
('new', 0.28970320674398875)
('is', 0.28954931014004825)
('ai', 0.16969847762410875)
('are', 0.1696984776241085)
('examples', 0.1696984776241085)
('great', 0.1696984776241085)
('present', 0.1696984776241085)

Concept 2 :
('to', 0.4157884439670067)
('cook', 0.28359165793510666)
('gordan', 0.28359165793510666)
('love', 0.28359165793510666)
('ramsay', 0.28359165793510666)
('see', 0.28359165793510666)
('and', 0.21730644711292504)
('campaign', 0.21730644711292504)
('global', 0.21730644711292504)
('have', 0.21730644711292504)

Concept 3 :
('examples', 0.2605506321958195)
('

In [43]:
concept_words = {}

terms = vectorizer.get_feature_names()
for i,comp in enumerate(lsa.components_):
    componentTerms = zip(terms,comp)
    sortedTerms = sorted(componentTerms,key=lambda x:x[1],reverse=True)
    sortedTerms = sortedTerms[:10]
    concept_words["Concept "+str(i)] = sortedTerms
#Here we Seperate all the words according to Cocept like food,Tech,Music,News with there probabilities.

In [44]:
concept_words

{'Concept 0': [('the', 0.39120259386723283),
  ('concert', 0.34697971904847097),
  ('of', 0.29843260469823923),
  ('day', 0.2667000602240798),
  ('greast', 0.2192598949413184),
  ('just', 0.2192598949413184),
  ('was', 0.2192598949413184),
  ('all', 0.19874470880199513),
  ('in', 0.19874470880199513),
  ('singing', 0.19874470880199513)],
 'Concept 1': [('technology', 0.3813428511192244),
  ('google', 0.28970320674398875),
  ('introducing', 0.28970320674398875),
  ('new', 0.28970320674398875),
  ('is', 0.28954931014004825),
  ('ai', 0.16969847762410875),
  ('are', 0.1696984776241085),
  ('examples', 0.1696984776241085),
  ('great', 0.1696984776241085),
  ('present', 0.1696984776241085)],
 'Concept 2': [('to', 0.4157884439670067),
  ('cook', 0.28359165793510666),
  ('gordan', 0.28359165793510666),
  ('love', 0.28359165793510666),
  ('ramsay', 0.28359165793510666),
  ('see', 0.28359165793510666),
  ('and', 0.21730644711292504),
  ('campaign', 0.21730644711292504),
  ('global', 0.217306447

In [56]:
import nltk
for key in concept_words:
    sentence_score = []
    for sentence in dataset:
        words = nltk.word_tokenize(sentence)
        score = 0
        for word in words:
            for word_with_score in concept_words[key]:
                if word == word_with_score[0]:
                    score+= word_with_score[1]
        sentence_score.append(score)
    print("\n"+key+":")
    for sentence_score in sentence_score:
        print(sentence_score)


Concept 0:
1.2230353190136318
1.3959619977396591
0
0
0.29843260469823923
1.6328490440199284
0

Concept 1:
0.28954931014004825
0
0
1.540001781491239
1.2298352392397671
0
0

Concept 2:
0
0
1.8337467336425401
0
0
0
1.285014232418707

Concept 3:
0.09557673896071192
0.09129690746961269
0
0.11178024069487177
2.03121140502632
0.1868736464303246
0


### Word Synonyms and Antonyms using NLTK

In [57]:
from nltk.corpus import wordnet

In [58]:
synonyms = []
antonyms = []


In [60]:
wordnet.synsets("good")

[Synset('good.n.01'),
 Synset('good.n.02'),
 Synset('good.n.03'),
 Synset('commodity.n.01'),
 Synset('good.a.01'),
 Synset('full.s.06'),
 Synset('good.a.03'),
 Synset('estimable.s.02'),
 Synset('beneficial.s.01'),
 Synset('good.s.06'),
 Synset('good.s.07'),
 Synset('adept.s.01'),
 Synset('good.s.09'),
 Synset('dear.s.02'),
 Synset('dependable.s.04'),
 Synset('good.s.12'),
 Synset('good.s.13'),
 Synset('effective.s.04'),
 Synset('good.s.15'),
 Synset('good.s.16'),
 Synset('good.s.17'),
 Synset('good.s.18'),
 Synset('good.s.19'),
 Synset('good.s.20'),
 Synset('good.s.21'),
 Synset('well.r.01'),
 Synset('thoroughly.r.02')]

##### SYNONYMS

In [62]:
for syn in wordnet.synsets("good"):
    for s in syn.lemmas():
          synonyms.append(s.name())
print(set(synonyms))

{'effective', 'in_effect', 'honorable', 'practiced', 'skillful', 'near', 'undecomposed', 'soundly', 'secure', 'honest', 'serious', 'unspoiled', 'safe', 'good', 'in_force', 'commodity', 'trade_good', 'right', 'estimable', 'respectable', 'dear', 'thoroughly', 'well', 'skilful', 'upright', 'just', 'unspoilt', 'goodness', 'proficient', 'full', 'dependable', 'beneficial', 'expert', 'salutary', 'sound', 'adept', 'ripe'}


##### ANTONYMS

In [70]:
for syn in wordnet.synsets("good"):
    for s in syn.lemmas():
        synonyms.append(s.name())
        for a in s.antonyms():
                antonyms.append(a.name())
                
print("--------------Antonyms--------------",set(antonyms))
print("--------------Synonyms--------------",set(synonyms))

--------------Antonyms-------------- {'evilness', 'evil', 'ill', 'bad', 'badness'}
--------------Synonyms-------------- {'effective', 'in_effect', 'honorable', 'practiced', 'skillful', 'near', 'undecomposed', 'soundly', 'secure', 'honest', 'serious', 'unspoiled', 'safe', 'good', 'in_force', 'commodity', 'trade_good', 'right', 'estimable', 'respectable', 'dear', 'thoroughly', 'well', 'skilful', 'upright', 'just', 'unspoilt', 'goodness', 'proficient', 'full', 'dependable', 'beneficial', 'expert', 'salutary', 'sound', 'adept', 'ripe'}


### Word Negation Tracking
* All about negative (-ve) and positive (+ve) Sentences.

In [72]:
import nltk
sentence = "I was not happy with the team's performance"
sentence

"I was not happy with the team's performance"

In [73]:
words = nltk.word_tokenize(sentence)
words

['I', 'was', 'not', 'happy', 'with', 'the', 'team', "'s", 'performance']

In [74]:
# here not happy is converted into not_happy.
new_word = []
temp_word = ""
for word in words:
    if word == "not":
        temp_word = "not_"
    elif temp_word == "not_":
        word = temp_word + word
        temp_word = ""
    if word != "not":
        new_word.append(word)

In [76]:
sentence = " ".join(new_word)

In [78]:
sentence 


"I was not_happy with the team 's performance"

In [97]:
# we are going to convert the word not happy with antonyms of happy i.e unhappy or sad. 
new_word = []
temp_word = ""
for word in words:
    antonyms = []
    if word == "not":
        temp_word = "not_"
    elif temp_word == "not_":
        for syn in wordnet.synsets(word):
            for s in syn.lemmas():
                for a in s.antonyms():
                    antonyms.append(a.name())
        if len(antonyms)>=1:
            word = antonyms[0]
        else:
            word = temp_word + word
        temp_word = ""
    if word != "not":
        new_word.append(word)
        
sentence = " ".join(new_word)

In [98]:
sentence

"I was unhappy with the team 's performance"