# Homework 2
**Exercise 1**

In [4]:
"""
This module implements a Bayesian spam filter based on Paul Graham's "A Plan for Spam".
Part 1 of Homework 2 for CS 344

@author: ljh27
"""


def parse_corpus(corpus):
    corpus_dict = {}
    for message in corpus:
        for word in message:
            #force lowercase
            word = word.lower()

            #if the word is already in the dict, increment its count by 1
            if word in corpus_dict:
                corpus_dict[word] += 1

            #otherwise, add the word to the dict and set its count to 1
            else:
                corpus_dict[word] = 1

    return corpus_dict


def get_unique_words(corpus_list):
    unique_words = []
    for corpus in corpus_list:
        for message in corpus:
            for word in message:
                word = word.lower()
                if word not in unique_words:
                    unique_words.append(word)
    return unique_words


def create_probs_dict(words, good, bad, ngood, nbad):
    probs_dict = {}
    for word in words:
        if word in good:
            g = 2 * good[word]
        else:
            g = 0
        if word in bad:
            b = bad[word]
        else:
            b = 0

        #use a minimum count threshold of 1
        if g+b > 1:
            probs_dict[word] = max(0.01, min(0.99, min(1.0, b / nbad) / (min(1.0, g / ngood) + min(1.0, b / nbad))))
        else:
            probs_dict[word] = 0

    return probs_dict


def evaluate_msg(msg, probs):
    prod = 1
    comp_prod = 1
    for word in msg:
        if word in probs:
            prod *= probs[word]
            comp_prod *= (1 - probs[word])
        else:
            prod *= 0.4
            comp_prod *= 0.6
    spam_prob = prod / (prod + comp_prod)

    if spam_prob > 0.9:
        return ["SPAM", spam_prob]
    else:
        return ["Not spam", spam_prob]


#Test it out!
spam_corpus = [["I", "am", "spam", "spam", "I", "am"], ["I", "do", "not", "like", "that", "spamiam"]]
ham_corpus = [["do", "i", "like", "green", "eggs", "and", "ham"], ["i", "do"]]
ngood = len(ham_corpus)
nbad = len(spam_corpus)

good = parse_corpus(ham_corpus)
bad = parse_corpus(spam_corpus)
words = get_unique_words([ham_corpus, spam_corpus])
probs = create_probs_dict(words, good, bad, ngood, nbad)

print("good: " + str(good))
print("bad: " + str(bad))
print("\nprobs: ")
print(probs)


print("\n\n--- TESTS ---")
test_msg = ["I", "do", "not", "like", "green", "eggs", "and", "ham", "i", "do", "not", "like", "them", "Sam-I-am"]
test_result = evaluate_msg(test_msg, probs)
print(" Message: " + " ".join(test_msg))
print("\tResult: " + test_result[0] + " (Probability = " + str(test_result[1]) + ")")

test_msg2 = ["You", "can't", "have", "egg", "bacon", "spam", "and", "sausage", "without", "the", "spam"]
test_result2 = evaluate_msg(test_msg2, probs)
print("\n Message: " + " ".join(test_msg2))
print("\tResult: " + test_result2[0] + " (Probability = " + str(test_result2[1]) + ")")

test_msg3 = ["spam", "egg", "spam", "spam", "bacon", "and", "spam"]
test_result3 = evaluate_msg(test_msg3, probs)
print("\n Message: " + " ".join(test_msg3))
print("\tResult: " + test_result3[0] + " (Probability = " + str(test_result3[1]) + ")")


good: {'do': 2, 'i': 2, 'like': 1, 'green': 1, 'eggs': 1, 'and': 1, 'ham': 1}
bad: {'i': 3, 'am': 2, 'spam': 2, 'do': 1, 'not': 1, 'like': 1, 'that': 1, 'spamiam': 1}

probs: 
{'do': 0.3333333333333333, 'i': 0.5, 'like': 0.3333333333333333, 'green': 0.01, 'eggs': 0.01, 'and': 0.01, 'ham': 0.01, 'am': 0.99, 'spam': 0.99, 'not': 0, 'that': 0, 'spamiam': 0}


--- TESTS ---
 Message: I do not like green eggs and ham i do not like them Sam-I-am
	Result: Not spam (Probability = 0.0)

 Message: You can't have egg bacon spam and sausage without the spam
	Result: Not spam (Probability = 0.7943582510578278)

 Message: spam egg spam spam bacon and spam
	Result: SPAM (Probability = 0.9999976811325348)


Graham says in "A Plan for Spam" that this approach uses a "Bayesian combination of the spam probabilities of individual words." It is also Bayesian in that it starts out by using existing data (i.e. the two corpuses) and continues to adapt by analyzing new data (i.e. new emails) as they come in.


**Exercise 2**
Part a:

In [1]:
"""
Implements a Bayesian network for Figure 14.12a
Part 2 of Homework 2 for CS 344
(based on network.py by kvlinden)

@author: ljh27
"""

from probability import BayesNet, enumeration_ask

# Utility variables
T, F = True, False

# From AIMA code (probability.py)
cloudyNet = BayesNet([
    ('Cloudy', '', 0.5),
    ('Sprinkler', 'Cloudy', {T: 0.10, F: 0.50}),
    ('Rain', 'Cloudy', {T: 0.80, F: 0.20}),
    ('WetGrass', 'Sprinkler Rain', {(T, T): 0.99, (T, F): 0.90, (F, T): 0.90, (F, F): 0.00})
    ])


Part b:
There are 4 variables, each with 2 possible values (True or False), so the number of values in the full joint probability distribution is 2^4 = 16

Part c:
The number of values in the Bayesian network is found by counting the number of values in the probability tables given in the diagram (Figure 14.12a), which in this case is 9

Part d:
_Hand calculations for these problems can be found in Homework2-HandCalculations.pdf_

In [2]:
# i. P(Cloudy)
print('P(Cloudy):')
print('\t' + enumeration_ask('Cloudy', dict(), cloudyNet).show_approx())

# ii. P(Sprinkler | cloudy)
print('\nP(Sprinkler | cloudy):')
print('\t' + enumeration_ask('Sprinkler', dict(Cloudy=T), cloudyNet).show_approx())

# iii. P(Cloudy | sprinkler ∧ ¬rain)
print('\nP(Cloudy | sprinkler ^ ¬rain):')
print('\t' + enumeration_ask('Cloudy', dict(Sprinkler=T, Rain=F), cloudyNet).show_approx())

# iv. P(WetGrass | cloudy ∧ sprinkler ∧ rain)
print('\nP(WetGrass | cloudy ∧ sprinkler ∧ rain):')
print('\t' + enumeration_ask('WetGrass', dict(Cloudy=T, Sprinkler=T, Rain=T), cloudyNet).show_approx())

# v. P(Cloudy | ¬WetGrass)
print('\nP(Cloudy | ¬WetGrass):')
print('\t' + enumeration_ask('Cloudy', dict(WetGrass=F), cloudyNet).show_approx())


P(Cloudy):
	False: 0.5, True: 0.5

P(Sprinkler | cloudy):
	False: 0.9, True: 0.1

P(Cloudy | sprinkler ^ ¬rain):
	False: 0.952, True: 0.0476

P(WetGrass | cloudy ∧ sprinkler ∧ rain):
	False: 0.01, True: 0.99

P(Cloudy | ¬WetGrass):
	False: 0.639, True: 0.361
