Exercise 1

In [3]:
'''
Lisp Code

(let ((g (* 2 (or (gethash word good) 0)))
      (b (or (gethash word bad) 0)))
   (unless (< (+ g b) 5)
     (max .01
          (min .99 (float (/ (min 1 (/ b nbad))
                             (+ (min 1 (/ g ngood))   
                                (min 1 (/ b nbad)))))))))

Translation from Lisp to something actually remotely readable

for word in email {
    let g = 2 * getNumTimesInGoodList(word)
    let b = 2 * getNumTimesInBadList(word)
    if (g + b > 5) {
        result = null
        let nBadMin = min(1, b / nbad)
        let nAllMin = min(1, g / ngood) + min(1, b / nbad)
        let percentBad = nBadMin / nAllMin
        probSpam = min(.99, percentBad)
        result = max(.01, probSpam)
        return result
    }
}


'''

spam_corpus = [["I", "am", "spam", "spam", "I", "am"], ["I", "do", "not", "like", "that", "spamiam"]]
ham_corpus = [["do", "i", "like", "green", "eggs", "and", "ham"], ["i", "do"]]

# Create the test email
def buildWordList(all_emails):
    wordList = []
    for email in all_emails:
        for word in email:
            wordList.append(word)
    return wordList

# Create a dict() object from a list of collected emails
def buildDictionary(corpus):
    result = dict()
    for email in corpus:
        for word in email:
            if(word in result.keys()):
                result[word] = result[word] + 1
            else:
                result[word] = 1
    return result

# Lookup the value in the dictionary, returning 0 if it doesn't exist
def lookupWordInDictionary(dictionary, word):
    if(word not in dictionary.keys()):
        return 0
    return dictionary[word]


# Calculate the Probability of being spam using Paul Graham's Lisp algorithm
def calculateProbabilityOfSpam(word):
    good = 2 * lookupWordInDictionary(goodDict, word)
    bad = 2 * lookupWordInDictionary(spamDict, word)
    totalBad = len(spam_corpus)
    totalGood = len(ham_corpus)
    if(good + bad > 1):
        result = 0
        minBad = min(1, bad / totalBad)
        minAll = min(1, good / totalGood) + minBad
        percentBad = minBad / minAll
        probSpam = min(.99, percentBad)
        result = max(.01, probSpam)
        return result
    else:
        return 0

def buildProbSpamDictionary(email):
    result = {}
    for word in email:
        wordLower = word.lower()
        result[word] = calculateProbabilityOfSpam(word)
    return result

# Create both hash tables
goodDict = buildDictionary(ham_corpus)
spamDict = buildDictionary(spam_corpus)

# Create email to test from words of both lists
email = buildWordList(ham_corpus + spam_corpus)

# Create Probability hash table
probSpamDictionary = buildProbSpamDictionary(email)

print(probSpamDictionary)



{'do': 0.5, 'i': 0.01, 'like': 0.5, 'green': 0.01, 'eggs': 0.01, 'and': 0.01, 'ham': 0.01, 'I': 0.99, 'am': 0.99, 'spam': 0.99, 'not': 0.99, 'that': 0.99, 'spamiam': 0.99}


Exercise 2

b) The number of independent values in the full joint probablility will be 2 ^ (number of random variables) - 1 since the last value could be calculated using previous values because all values must add to 1.0. Since there are 4 random variables, there are 15 independent values.

c) By looking at the Bayesian Netowork, it appears that only the 'Cloudy' random variable is completely indepedent as all other values are dependent on 'Cloudy' directly or indirectly.

In [7]:
import sys
sys.path.insert(0, '../tools/aima')
from probability import BayesNet, enumeration_ask, elimination_ask, gibbs_ask

# Utility variables
T, F = True, False

# Create Network
wetGrassNetwork = BayesNet([
    ('Cloudy', '', 0.5),
    ('Sprinkler', 'Cloudy', {T:0.10, F:0.50 }),
    ('Rain', 'Cloudy', { T:0.80, F:0.20 }),
    ('WetGrass', 'Sprinkler Rain', { (T,T): 0.99, 
                               (T,F): 0.9,
                               (F,T):0.9,
                               (F,F):0.0
                            })
])

# Print Statements are the computer generated versions

# Comments are the by hand version

print("BP ( Cloudy )")
print(enumeration_ask('Cloudy', dict(), wetGrassNetwork).show_approx())
print("\n")
'''
BP ( Cloudy ) = < P(Cloudy), P(not Cloudy) >
= < 0.5, 0.5 > ( Cloudy is an independent random variable )
'''

print("BP ( Sprinkler | Cloudy )")
print(enumeration_ask('Sprinkler', dict(Cloudy=T), wetGrassNetwork).show_approx())
print("\n")

'''
BP ( Sprinkler | Cloudy ) = < P( Sprinkler | Cloudy ), P( not Sprinkler | Cloudy ) >
= < 0.1, 0.9 > ( Conditional given by the diagram )
'''

print("BP ( Cloudy | Sprinkler && No Rain )")
print(enumeration_ask('Cloudy', dict(Sprinkler=T, Rain=F), wetGrassNetwork).show_approx())
print("\n")

'''
BP ( Cloudy | Sprinkler && No Rain ) = < P ( Cloudy | Sprinkler && No Rain), P( not Cloudy | Sprinkler && No Rain ) >
= alpha * < P(Cloudy && Sprinkler && No Rain), P(not Cloudy && Sprinkler && No Rain) >
= alpha * < .5 * .1 * .2, .5 * .5 * .8 >
= alpha * < 0.01, 0.2 > ( alpha = 1 / (0.01 + 0.2) = 4.7619)
= < .0476, 0.952 >
( True, False )
'''

print("BP ( WetGrass | Cloudy && Sprinkler && Raining )")
print(enumeration_ask('WetGrass', dict(Cloudy=T, Sprinkler=T, Rain=T), wetGrassNetwork).show_approx())
print("\n")

'''
BP ( WetGrass | Cloudy && Sprinkler && Raining ) 
= < P ( WetGrass | Cloudy && Sprinkler && Raining), P( no WetGrass | Cloudy && Sprinkler && Raining ) >
= alpha * < P(WetGrass && Cloudy && Sprinkler && Raining), P(not WetGrass && Cloudy && Sprinkler && Raining) >
= alpha * < .5 * .1 * .8 * .99, .5 * .1 * .8 * .01 >
= alpha * < .0396, 0.0004 > ( alpha = 1 / (0.0396 + 0.0004) = 25)
= < 0.99, 0.01 >
 ( True, False )
'''

print("BP ( Cloudy | Grass not Wet )")
print(enumeration_ask('Cloudy', dict(WetGrass=F), wetGrassNetwork).show_approx())

# I am close, but I'm missing a path or two somewhere
'''
BP ( Cloudy | not WetGrass ) = < P( Cloudy | not WetGrass ), P( not Cloudy | not WetGrass ) >
= alpha * < P( Cloudy && not WetGrass), P( not Cloudy && not WetGrass )
= alpha * < .5 * (.1 * .8 * .99 + .9 * .8 * .9 + .1 * .2 * .9), P( not Cloudy && not WetGrass) >
= alpha * < .3726, .5 * (.5 * .8 ) + .5 * (.5 * .2 * .01 + .5 * .8 * .1 + .5 * .2 * .1)
= alpha * < .3726, .2255 > ( alpha = 1 / (.3726 + .2255) = 1.67196)
= < .623, .377 > 
'''

BP ( Cloudy )
False: 0.5, True: 0.5


BP ( Sprinkler | Cloudy )
False: 0.9, True: 0.1


BP ( Cloudy | Sprinkler && No Rain )
False: 0.952, True: 0.0476


BP ( WetGrass | Cloudy && Sprinkler && Raining )
False: 0.01, True: 0.99


BP ( Cloudy | Grass not Wet )
False: 0.639, True: 0.361
