# 1. Spam Filtering

In [None]:
import string

# Tiny hand-made training corpora: each inner list holds the word tokens of one message.
spam_corpus = [
    "I am spam spam I am".split(),
    "I do not like that spamiam".split(),
]
ham_corpus = [
    "do i like green eggs and ham".split(),
    "i do".split(),
]

# BayesSpam scores a token only when its combined weighted count exceeds this value.
THRESHOLD = 1
# Fallback BayesSpam uses for tokens missing from a corpus or from the score table.
BASE_PROB = 0.5


class BayesSpam:
    """Naive Bayes style spam scorer built from tokenised ham and spam corpora.

    Each corpus is a list of messages and each message is a list of word
    tokens. Tokens are upper-cased before counting, so both training and
    scoring are case-insensitive. Constructing an instance prints the
    resulting per-token score table.
    """

    def __init__(self, ham, spam):
        """Count tokens in both corpora and derive a spam score per token.

        Args:
            ham: Non-spam messages, each a list of word tokens.
            spam: Spam messages, each a list of word tokens.

        Raises:
            ZeroDivisionError: If either corpus contains no messages while
                some token still passes the threshold.
        """
        self.threshold = THRESHOLD
        self.unknown_probability_value = BASE_PROB
        self.ngood = len(ham)
        self.nbad = len(spam)

        self.ham_hash = self.hash_occurances(ham)
        self.spam_hash = self.hash_occurances(spam)

        # Every distinct token seen in either corpus.
        self.token_list = set(self.ham_hash) | set(self.spam_hash)

        # token -> spam score, clamped to [0.01, 0.99]
        self.score_hash = {}

        for word in self.token_list:
            # Ham counts are doubled; a token absent from a corpus uses the
            # unknown-probability value in place of a count.
            if word in self.ham_hash:
                good = 2 * self.ham_hash[word]
            else:
                good = self.unknown_probability_value
            bad = self.spam_hash.get(word, self.unknown_probability_value)

            # Tokens at or below the threshold get no entry in the table.
            if good + bad > self.threshold:
                bad_ratio = min(1.0, bad / self.nbad)
                good_ratio = min(1.0, good / self.ngood)
                self.score_hash[word] = max(0.01, min(0.99, bad_ratio / (good_ratio + bad_ratio)))

        print("Scores:")
        print(self.score_hash)

    @staticmethod
    def hash_occurances(corpus):
        """Return a dict mapping each upper-cased token to its total count.

        Args:
            corpus: Iterable of messages, each an iterable of string tokens.

        Returns:
            dict: Upper-cased token -> number of occurrences across all
            messages. Empty when the corpus is empty.
        """
        counts = {}
        for message in corpus:
            for token in message:
                # Normalise once and use the same key for lookup and update,
                # so "spam", "Spam" and "SPAM" share a single running tally.
                key = token.upper()
                counts[key] = counts.get(key, 0) + 1
        return counts

    def filter_spam(self, mail):
        """Return the combined spam probability of *mail*.

        Punctuation is stripped and the text is upper-cased and split on
        whitespace. Each word contributes its score from the score table;
        words without an entry contribute the unknown-probability value.

        Args:
            mail: Message text as a single string.

        Returns:
            float: ``p / (p + q)`` where ``p`` is the product of the word
            scores and ``q`` the product of their complements. An empty
            message yields 0.5.
        """
        stripped = mail.translate(str.maketrans('', '', string.punctuation))

        spam_product = 1
        complement_product = 1
        for word in stripped.upper().split():
            prob = self.score_hash.get(word, self.unknown_probability_value)
            spam_product *= prob
            complement_product *= 1 - prob

        # NOTE(review): for very long messages both products can underflow
        # to 0.0, in which case this division raises ZeroDivisionError.
        return spam_product / (spam_product + complement_product)


if __name__ == "__main__":
    # The demo has to run after the class statement has finished; inside
    # the class body the name BayesSpam is not bound yet.
    # NOTE: token counting is fully case-insensitive here, so printed scores
    # may differ from output recorded before that counting fix.
    spam_filter = BayesSpam(ham_corpus, spam_corpus)
    print(spam_filter.filter_spam("i"))
    print(spam_filter.filter_spam("I am sam, sam I am, do you like green eggs and ham?"))
    print(spam_filter.filter_spam("spamiam"))
    print(spam_filter.filter_spam("I am spam, spam I am"))


# I was having kernel issues with Jupyter, so I ran the code as a standalone .py file. Here is the printout:
'''
Scores:
{'NOT': 0.6666666666666666, 'GREEN': 0.2, 'LIKE': 0.3333333333333333, 'SPAM': 0.6666666666666666, 'AND': 0.2, 'I': 0.5, 'SPAMIAM': 0.6666666666666666, 'THAT': 0.6666666666666666, 'HAM': 0.2, 'DO': 0.3333333333333333, 'EGGS': 0.2, 'AM': 0.6666666666666666}
0.5
0.0038910505836575863
0.6666666666666666
0.9411764705882353
'''

# Feel free to verify if desired.