In [1]:
import pandas as pd
from fractions import Fraction

import nltk
from nltk import FreqDist

In [2]:
#Corpus
Ram = ['I wish you the best', 'I hope to reach home by 6 P M', 'I wish to go home early',
      'I do not want to buy this', 'I hope it rains today']
Raj = ['I hope to play tennis tonight', 'I hope to win this tournament', 'I hope to buy this car in the next year',
      'I wish to get a good score this time', 'I wish they would come']

In [3]:
#Calculate number of words in Ram, Raj and calculate total words
ramWords = []
for i in range(0,len(Ram)):
    #Split the strings based on blankspace
    sen = Ram[i].split(' ')
    #Extend the list by adding
    ramWords.extend(sen)
print("Number of words in Ram: ", len(ramWords))

rajWords = []
for i in range(0,len(Raj)):
    #Split the strings based on blankspace
    sen = Raj[i].split(' ')
    #Extend the list by adding
    rajWords.extend(sen)
print("Number of words in Raj: ", len(rajWords))

totWords = len(ramWords) + len(rajWords)
print("Total words in both the corpus: ", totWords)

Number of words in Ram:  32
Number of words in Raj:  36
Total words in both the corpus:  68


In [4]:
uniqRamWords = list(set(ramWords))
uniqRajWords = list(set(rajWords))
UniqWords = uniqRamWords + uniqRajWords
ttlUniqWords = set(UniqWords)

print("Vocabulary of ram corpus: ", len(uniqRamWords))
print("Vocabulary of raj corpus: ", len(uniqRajWords))
print("Vocabulary of combined corpus: ", len(ttlUniqWords))

Vocabulary of ram corpus:  23
Vocabulary of raj corpus:  24
Vocabulary of combined corpus:  40


In [5]:
#Store the frequency distribution of words in the respective corpus as a dictionary 
fDistRam = dict(nltk.FreqDist(ramWords))
fDistRaj = dict(nltk.FreqDist(rajWords))
print("Frequency of words in Ram Corpus\n", fDistRam)
print("Frequency of words in Raj Corpus\n", fDistRaj)

Frequency of words in Ram Corpus
 {'I': 5, 'wish': 2, 'you': 1, 'the': 1, 'best': 1, 'hope': 2, 'to': 3, 'reach': 1, 'home': 2, 'by': 1, '6': 1, 'P': 1, 'M': 1, 'go': 1, 'early': 1, 'do': 1, 'not': 1, 'want': 1, 'buy': 1, 'this': 1, 'it': 1, 'rains': 1, 'today': 1}
Frequency of words in Raj Corpus
 {'I': 5, 'hope': 3, 'to': 4, 'play': 1, 'tennis': 1, 'tonight': 1, 'win': 1, 'this': 3, 'tournament': 1, 'buy': 1, 'car': 1, 'in': 1, 'the': 1, 'next': 1, 'year': 1, 'wish': 2, 'get': 1, 'a': 1, 'good': 1, 'score': 1, 'time': 1, 'they': 1, 'would': 1, 'come': 1}


In [6]:
#Calculate P(X1|y) = Count(X1,y)/Count(Y)
#y are class labels (Ram or Raj)
#X1 are words (I, wish, hope etc.)
#Y is the total number of words in both the corpus (ie) 68

#Define a function to calculate probability and store result as a fraction
probRam = {}
probRaj = {}
def probRamXY(w1):
    probRam[w1] = 0
    for key, value in fDistRam.items():
        if w1 in key:
            probRam[w1] = Fraction(value,totWords)
    return probRam[w1]

def probRajXY(w1):
    probRaj[w1] = 0
    for key, value in fDistRaj.items():
        if w1 in key:
            probRaj[w1] = Fraction(value,totWords)
    return probRaj[w1]

In [7]:
probRajXY('hope')

Fraction(3, 68)

In [8]:
probRamXY('I')

Fraction(5, 68)

In [9]:
#Calculate P(X1|y) for all unique words in Ram and Raj corpus and store it in a list
prRam = {}
prRaj = {}
allWords = ramWords + rajWords
print("Total number of words in the combined corpus: ", len(allWords))
uniqWords = set(allWords)
print("\nUnique words in the combined corpus: ", len(uniqWords))

for words in uniqWords:
    prRam[words] = probRamXY(words)
    prRaj[words] = probRajXY(words)

print("\nProbabilities of words in Ram corpus: \n", prRam)
print("\n\nLength of words for which probability calculated in Ram corpus: ", len(prRam))
print("\nProbabilities of words in Raj corpus: \n", prRaj)
print("\n\nLength of words for which probability calculated in Raj corpus: ", len(prRaj))

Total number of words in the combined corpus:  68

Unique words in the combined corpus:  40

Probabilities of words in Ram corpus: 
 {'tennis': 0, 'by': Fraction(1, 68), 'today': Fraction(1, 68), 'home': Fraction(1, 34), 'car': 0, 'reach': Fraction(1, 68), 'best': Fraction(1, 68), 'do': Fraction(1, 68), 'come': 0, 'want': Fraction(1, 68), 'win': 0, 'tonight': 0, 'go': Fraction(1, 68), 'I': Fraction(5, 68), 'they': 0, 'next': 0, 'P': Fraction(1, 68), 'you': Fraction(1, 68), 'buy': Fraction(1, 68), 'time': 0, 'rains': Fraction(1, 68), '6': Fraction(1, 68), 'early': Fraction(1, 68), 'would': 0, 'M': Fraction(1, 68), 'this': Fraction(1, 68), 'score': 0, 'not': Fraction(1, 68), 'wish': Fraction(1, 34), 'tournament': 0, 'good': 0, 'the': Fraction(1, 68), 'year': 0, 'it': Fraction(1, 68), 'get': 0, 'hope': Fraction(1, 34), 'to': Fraction(1, 68), 'in': Fraction(1, 68), 'play': 0, 'a': Fraction(1, 68)}


Length of words for which probability calculated in Ram corpus:  40

Probabilities of words

In [10]:
#Prior probability P(y) = count(y)/count(Y). As there are only two classes it is 1/2
PrProb = Fraction(1,2)
print("Prior probability :", PrProb)

Prior probability : 1/2


In [11]:
#Guess who wrote the sentence "I wish you would come"
#For Ram Corpus
def bRam(w1,w2,w3,w4,w5):
    lstVal = []
    for key, value in prRam.items():
        if key == w1:
            lstVal.append(value)
        if key == w2:
            lstVal.append(value)
        if key == w3:
            lstVal.append(value)
        if key == w4:
            lstVal.append(value)
        if key == w5:
            lstVal.append(value)
    finProb = 1
    for i in range(len(lstVal)):
        finProb = finProb*lstVal[i]
    print("Baye's Probability from Ram Corpus is: ", PrProb*finProb)
    
    return lstVal

In [12]:
bRam('I','wish','you','would','come')

Baye's Probability from Ram Corpus is:  0


[0, Fraction(5, 68), Fraction(1, 68), 0, Fraction(1, 34)]

In [13]:
def bRaj(w1,w2,w3,w4,w5):
    lstVal = []
    for key, value in prRaj.items():
        if key == w1:
            lstVal.append(value)
        if key == w2:
            lstVal.append(value)
        if key == w3:
            lstVal.append(value)
        if key == w4:
            lstVal.append(value)
        if key == w5:
            lstVal.append(value)
    #print(any(x == 0 for x in lstVal))
    
    finProb = 1
    for i in range(len(lstVal)):
        finProb = finProb*lstVal[i]
    print("Baye's Probability from Raj Corpus is: ", PrProb*finProb)
    
    return lstVal

In [14]:
bRaj('I','wish','you','would','come')

Baye's Probability from Raj Corpus is:  0


[Fraction(1, 68), Fraction(5, 68), 0, Fraction(1, 68), Fraction(1, 34)]

In [15]:
#Both probabilities are zero.
#Hence add 1 to each of the words in the numerator only

#Get the keys of Ram corpus for which the value is zero and store the keys separately
keyRam0 = []
keyRaj0 = []
for k, v in prRam.items():
    if v == 0:
        keyRam0.append(k)
for k, v in prRaj.items():
    if v == 0:
        keyRaj0.append(k)
#print(keyRam0)
#print("Number of words in combined corpus but not in Ram corpus: ", len(keyRam0))
#print(keyRaj0)
#print("Number of words in combined corpus but not in Raj corpus: ", len(keyRaj0))

#Increase numerator values by 1 in the respective dictionary
def upProbRamXY(w1):
    probRam[w1] = Fraction(1,68)
    for key, value in fDistRam.items():
        if w1 in key:
            probRam[w1] = Fraction(value+1,totWords)
    return probRam[w1]

def upProbRajXY(w1):
    probRaj[w1] = Fraction(1,68)
    for key, value in fDistRaj.items():
        if w1 in key:
            probRaj[w1] = Fraction(value+1,totWords)
    return probRaj[w1]

#print("Probability of missing word car in Ram corpus", upProbRamXY('car'))
#print("Probability of missing word home in Raj corpus",upProbRajXY('home'))
#print("Original Probability of present word I in Ram corpus", probRamXY('I'))
#print("Updated Probability of present word I in Ram corpus", upProbRamXY('I'))
#print("Original Probability of present word I in Raj corpus", probRajXY('I'))
#print("Updated Probability of present word I in Raj corpus", upProbRajXY('I'))

#update P(X1|y) for all unique words in Ram and Raj corpus and store it in a list
uprRam = {}
uprRaj = {}

for words in uniqWords:
    uprRam[words] = upProbRamXY(words)
    uprRaj[words] = upProbRajXY(words)

#print("\nUpdated Probabilities of words in Ram corpus: \n", uprRam)
#print("\n\nUpdated number of words for which probability calculated in Ram corpus: ", len(uprRam))
#print("\nUpdated Probabilities of words in Raj corpus: \n", uprRaj)
#print("\n\nUpdated number of words for which probability calculated in Raj corpus: ", len(uprRaj))

def ubRam(w1,w2,w3,w4,w5):
    lstVal = []
    for key, value in uprRam.items():
        if key == w1:
            lstVal.append(value)
        if key == w2:
            lstVal.append(value)
        if key == w3:
            lstVal.append(value)
        if key == w4:
            lstVal.append(value)
        if key == w5:
            lstVal.append(value)
    finProb = 1
    for i in range(len(lstVal)):
        finProb = finProb*lstVal[i]
    print("Baye's Probability from revised Ram Corpus is: ", PrProb*finProb)
    
    return finProb

def ubRaj(w1,w2,w3,w4,w5):
    lstVal = []
    for key, value in uprRaj.items():
        if key == w1:
            lstVal.append(value)
        if key == w2:
            lstVal.append(value)
        if key == w3:
            lstVal.append(value)
        if key == w4:
            lstVal.append(value)
        if key == w5:
            lstVal.append(value)
       
    finProb = 1
    for i in range(len(lstVal)):
        finProb = finProb*lstVal[i]
    print("Baye's Probability from revised Raj Corpus is: ", PrProb*finProb)
    
    return float(finProb)

#print(bRam('I','wish','you','would','come'))
#print(bRaj('I','wish','you','would','come'))
valUpdatedRam = ubRam('I','wish','you','would','come')
valUpdatedRaj = ubRaj('I','wish','you','would','come')
print("Ram sent the mail") if valUpdatedRam > valUpdatedRaj else print("Raj sent the mail")

Baye's Probability from revised Ram Corpus is:  9/726966784
Baye's Probability from revised Raj Corpus is:  9/363483392
Raj sent the mail


In [16]:
#Find the sender of the email - Ram or Raj
#A new mail arrives with just three words - motivate, profit and product
#Historical information provided
import pandas as pd
data = [['motivate',0.24,0.05],['profit',0.3,0.35],['product',0.26,0.35],['leadership',0.08,0.15],['operations',0.12,0.10]]
df = pd.DataFrame(data, columns = ['Word','Ram','Raj'])
df.set_index('Word', inplace = True)
print(df)

             Ram   Raj
Word                  
motivate    0.24  0.05
profit      0.30  0.35
product     0.26  0.35
leadership  0.08  0.15
operations  0.12  0.10


In [17]:
#Create a wordlist for search words and calculate Bayesian probability for Ram and Raj
#Max value of Bayesian product will be the sender of the email
wordList = ['motivate', 'profit', 'product']
probRam = 1
probRaj = 1
for i in wordList:
    valRam = df.loc[i,'Ram']
    valRaj = df.loc[i,'Raj']
    probRam = valRam*probRam
    probRaj = valRaj*probRaj
print("Probability mail sent by Ram is: ", probRam)
print("Probability mail sent by Raj is: ", probRaj)
print("Mail sent by Ram") if probRam > probRaj else print("Mail sent by Raj")

Probability mail sent by Ram is:  0.01872
Probability mail sent by Raj is:  0.006124999999999999
Mail sent by Ram


In [18]:
#Product sentiments
#Assume the following likelihood for each word being part of positive or negative review
#Equal prior probabilities for each class (P(positive) = 0.5 and P(negative) =0.5)
#What class Naive Bayes classifier would assign to the sentence "I do not like to fill in the application form"
data2 = [['I',0.09,0.16],['love',0.07,0.06],['to',0.05,0.07],['fill',0.29,0.06],
        ['credit',0.04,0.15],['card',0.08,0.11],['application',0.06,0.04]]
df2 = pd.DataFrame(data2, columns=['Word','Positive','Negative'])
#df2.set_index('Word', inplace = True)
print(df2)

          Word  Positive  Negative
0            I      0.09      0.16
1         love      0.07      0.06
2           to      0.05      0.07
3         fill      0.29      0.06
4       credit      0.04      0.15
5         card      0.08      0.11
6  application      0.06      0.04


In [19]:
words = ['I','do','not','like','to','fill','in','the','application','form']
#Out of vocab words are: do, not, like, in, the, form (six words)

#Create two separate empty lists and populate them with matched vocabulary and out of vocabulary words
wordsMatch = []
wordsNoMatch = []

for i in words:
    if (df2['Word'] == i).any():
        wordsMatch.append(i)
    else:
        wordsNoMatch.append(i)

#print("List of matched words: ", wordsMatch)
#print("List of out of vocabulary words: ", wordsNoMatch)
#print("Total number of words: ", len(words))
#print("Number of matched words: ", len(wordsMatch))
#print("Number of out of vocabulary words: ", len(wordsNoMatch))

In [20]:
#Subset df2 containing matched words into a new dataframe newDF
newDF = pd.DataFrame(columns=['Word','Positive','Negative'])

for i in wordsMatch:
    if (df2['Word'].str.contains(i)).any():
        newDF = newDF.append(df2.loc[df2['Word'] == i])
        #print(i, "is there")

#Create a new dataframe called oov (out of vocabulary) with words from wordsNoMatch as words and probability values 0.5
oovDF = pd.DataFrame(columns=['Word','Positive','Negative'])
for i in range(len(wordsNoMatch)):
    oovDF.loc[i] = [wordsNoMatch[i]] + [0.5] + [0.5]

#Concatenate newDF and oovDF into one single dataframe and set the index as Word column
frames = [newDF, oovDF]
merged = pd.concat(frames, ignore_index=True)
merged.set_index('Word',inplace = True)
print(merged)

             Positive  Negative
Word                           
I                0.09      0.16
to               0.05      0.07
fill             0.29      0.06
application      0.06      0.04
do               0.50      0.50
not              0.50      0.50
like             0.50      0.50
in               0.50      0.50
the              0.50      0.50
form             0.50      0.50


In [21]:
#Calculate Bayesian probability for positive and negative reviews
words = ['I','do','not','like','to','fill','in','the','application','form']
probPos = 1
probNeg = 1
for i in words:
    valPos = merged.loc[i,'Positive']
    valNeg = merged.loc[i,'Negative']
    probPos = valPos*probPos
    probNeg = valNeg*probNeg
print("Probability Review is positive: ", probPos)
print("Probability Review is negative: ", probNeg)
print("Positive Review") if probPos > probNeg else print("Negative Review")

Probability Review is positive:  1.2234374999999997e-06
Probability Review is negative:  4.2000000000000006e-07
Positive Review
