In [33]:
def get_count_dictionary(corpus):
    """Returns a dictionary with each word and the number of times it occurs in that corpus"""
    new_dict = {}
    for i in range(len(corpus)):
        for w in range(len(corpus[i])):
            word = corpus[i][w].lower()
            if word in new_dict:
                new_dict[word] = new_dict[word] + 1
            else:
                new_dict.update({word: 1})
    return new_dict
    
    
def get_number_words(dictionary):
    """Returns the number of words in a dictionary (adds up each key's values)"""
    count = 0
    for i in dictionary:
        count = count + dictionary[i]
    return count
    

def get_probability_dictionary(good_dict, bad_dict):
    """Returns a dictionary with the weighted probability depending on the number of times a word occurs in
        the good or bad dictionaries"""
    new_dict = {}
    num_bad = get_number_words(good_dict)
    num_good = get_number_words(bad_dict)
    for i in good_dict:
        b = 0
        word_val = 0
        
        g = 2 * good_dict[i]
        if i in bad_dict:
            b = bad_dict[i]
        if g + b > 1:
            word_val = max(0.01, min(0.99, min(1.0, b / num_bad) / (min(1.0, g / num_good) + min(1.0, b/num_bad))))
            
        new_dict[i] = word_val
        
    for i in bad_dict:
        if i not in new_dict:
            g = 0
            word_val = 0
            
            if i in good_dict:
                g = 2 * good_dict[i]
            b = bad_dict[i]
            if g + b > 1:
                word_val = max(0.01, min(0.99, min(1.0, b / num_bad) / (min(1.0, g / num_good) + min(1.0, b/num_bad))))
                
            new_dict[i] = word_val
            
    return new_dict


def calculate_spam_probability(prob_dict, message):
    """Returns the probability that the message is spam"""
    prod = 1
    complement_prod = 1
    
    for i in range(len(message)):
        word = message[i]
        if word in prob_dict:
            if prob_dict[word] > 0:
                prod = prod * prob_dict[word]
                complement_prod = complement_prod * (1 - prob_dict[word])
        else:
            prod = prod * .4
            complement_prod = complement_prod * .6
    return prod / (prod + complement_prod)

    
if __name__ == '__main__':
    spam_corpus = [["I", "am", "spam", "spam", "I", "am"], ["I", "do", "not", "like", "that", "spamiam"]]
    ham_corpus = [["do", "i", "like", "green", "eggs", "and", "ham"], ["i", "do"]]
    test_corpus = ['See', 'if', 'I', 'am', 'spam', 'or', 'not', 'am', 'i', 'ham']
    test2_corpus = ['Do', 'I', 'seem', 'eggs', 'green', 'ham']

    good_dict = {}
    bad_dict = {}
    prob_dict = {}
    
    good_dict = get_count_dictionary(ham_corpus)
    bad_dict = get_count_dictionary(spam_corpus)
    prob_dict = get_probability_dictionary(good_dict, bad_dict)
    print(bad_dict)
    print(good_dict)
    print(prob_dict)
    
    probability_spam = calculate_spam_probability(prob_dict, test_corpus)
    print("The probability of the message being spam is: " + str(probability_spam))
    
    probability_spam = calculate_spam_probability(prob_dict, test2_corpus)
    print("The probability of the message being spam is: " + str(probability_spam))


{'i': 3, 'that': 1, 'spamiam': 1, 'am': 2, 'spam': 2, 'do': 1, 'not': 1, 'like': 1}
{'i': 2, 'green': 1, 'and': 1, 'eggs': 1, 'do': 2, 'ham': 1, 'like': 1}
{'i': 0.5, 'green': 0.01, 'am': 0.99, 'spamiam': 0, 'do': 0.25, 'that': 0, 'ham': 0.01, 'and': 0.01, 'not': 0, 'spam': 0.99, 'eggs': 0.01, 'like': 0.39999999999999997}
The probability of the message being spam is: 0.9994837377387713
The probability of the message being spam is: 3.0536587775264924e-07


In [None]:
from probability import BayesNet, enumeration_ask

# Utility variables
T, F = True, False

wet_grass = BayesNet([
    ('Cloudy', '', 0.50),
    ('Sprinkler', 'Cloudy', {T: 0.10, F: 0.50}),
    ('Rain', 'Cloudy', {T: 0.80, F: 0.20}),
    ('WetGrass', 'Sprinkler Rain', {(T, T): 0.99, (T, F): 0.90, (F, T): 0.90, (F, F): 0.00}),
    ])

# a) Multiply the number of values for each variable together to get the number of possible combinations
#       2 * 2 * 2 * 2 = 16

# b) Counted the number of values in the left Bayesian net 9

# c)
# i)
# P(Cloudy)
print(enumeration_ask('Cloudy', dict(), wet_grass).show_approx())
# Output - False: 0.5, True: 0.5
# Hand-calculation -
#       P(Cloudy) = < 0.5, 0.5 > (Looked it up in the chart)

# ii)
# P(Sprinkler | cloudy)
print(enumeration_ask('Sprinkler', dict(Cloudy=T), wet_grass).show_approx())
# Output - False: 0.9, True: 0.9
# Hand-calculation -
#       P(Sprinkler | Cloudy) = < 0.1, 0.9 > (Looked it up in the chart)

# iii)
# P(Cloudy| sprinkler=T and rain=F)
print(enumeration_ask('Cloudy', dict(Sprinkler=T, Rain=F), wet_grass).show_approx())
# Output - False: 0.952, True: 0.0476
# Hand-calculation - 
#       P(Cloudy| sprinkler=T and rain=F) = alpha < P(C) * P(s and not r | C), P(not C) * P(s and not r | not C) >
#                                         = alpha < .5 * .02, .5 * .4 >
#                                         = alpha < .01, .2 >
#                                         = < .01 / .21, .2 / .21 > = < .0476, .952 >

# iv)
# P(WetGrass | cloudy=T and sprinkler=T and rain=T)
print(enumeration_ask('WetGrass', dict(Cloudy=T, Sprinkler=T, Rain=T), wet_grass).show_approx())
# Output - False: 0.01, True: 0.99
# Hand-calculation - 
#       P(W | c and s and r) = alpha < P(W) * P(c and s and r | W), P(not W) * P(c and s and r | not W) >
#                            = alpha P(c) < P(s and r | c) * P(W | s and r), P(s and r | c) * P(not W | s and r) >
#                            = alpha * .5 * < .08 * .99, .08 * .01 >
#                            = alpha * .5 < .0792, .0008 >
#                            = alpha < .0396, .0004 > = < .99, .01 >

# v)
# P(Cloudy | not wetgrass)
print(enumeration_ask('Cloudy', dict(WetGrass=F), wet_grass).show_approx())
# Output - False: 0.639, True: 0.361
# Hand-calculation - 
#       P(C | not w) = sum respect to s and sum respect to r ( P(C) * P(s and r | C) * P(not w | s and r) )
#                    = alpha < P(C) * ( P(s and r | C) * P(not w | s and r) + P(not s and r | C) * 
#                               P(not w | not s and r) + P(s and not r | C) * P(not w | s and not r) + 
#                               P(not s and not r | C) * P(not w | not s and not r) ), 
#           `                   P(not C) * ( P(s and r | C) * P(not w | s and r) + P(not s and r | C) * 
#                               P(not w | not s and r) + P(s and not r | C) * P(not w | s and not r) + 
#                               P(not s and not r | C) * P(not w | not s and not r) ) >
#                     = alpha < .5 * ( .08 * .01 + .02 * .10 + .72 * .10 + .18 * 1 ), .5 * ( .10 * .01 + .40 * .10 + 
#                               .10 * .10 + .40 * 1) > 
#                     = alpha < .1274, .2255 >
#                     = < .361, .639 >


False: 0.5, True: 0.5
False: 0.9, True: 0.1
False: 0.952, True: 0.0476
False: 0.01, True: 0.99
False: 0.639, True: 0.361
