In [58]:
import glob
import math
import os
import random
import re
from collections import defaultdict, Counter

from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB

from machine_learning import split_data

In [35]:
def tokenize(message):
    """Return the set of distinct lowercase alphanumeric tokens in ``message``.

    The text is lowercased, then every maximal run of the characters
    ``a-z`` / ``0-9`` becomes one token. Because the result is a set,
    repeated words collapse to a single entry and order is not preserved.
    """
    return set(re.findall("[a-z0-9]+", message.lower()))

In [36]:
def count_words(training_set):
    """Tally, for each word, how many spam and non-spam messages contain it.

    Args:
        training_set: iterable of ``(message, is_spam)`` pairs.

    Returns:
        A ``defaultdict`` mapping ``word -> [spam_count, non_spam_count]``.
        Each message contributes at most 1 per word, because ``tokenize``
        returns a set.
    """
    tallies = defaultdict(lambda: [0, 0])
    for text, is_spam in training_set:
        # Slot 0 holds the spam tally, slot 1 the non-spam tally.
        slot = 0 if is_spam else 1
        for token in tokenize(text):
            tallies[token][slot] += 1
    return tallies

In [37]:
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """Convert per-word message counts into smoothed conditional probabilities.

    Args:
        counts: mapping ``word -> (spam_count, non_spam_count)``.
        total_spams: number of spam messages in the training data.
        total_non_spams: number of non-spam messages in the training data.
        k: additive smoothing constant (pseudocount).

    Returns:
        A list of ``(word, p(word | spam), p(word | not spam))`` triples,
        in the iteration order of ``counts``.
    """
    triples = []
    for word, (spam, non_spam) in counts.items():
        p_word_if_spam = (spam + k) / (total_spams + 2 * k)
        p_word_if_not_spam = (non_spam + k) / (total_non_spams + 2 * k)
        triples.append((word, p_word_if_spam, p_word_if_not_spam))
    return triples

In [38]:
def spam_probability(word_probs, message):
    """Estimate the probability that ``message`` is spam.

    Every vocabulary word contributes to both likelihoods: its probability
    if it occurs in the message, or one minus its probability if it does
    not. The two classes are weighted equally (no class prior is applied).

    Args:
        word_probs: iterable of ``(word, p(word | spam), p(word | not spam))``.
        message: raw message text; it is split with ``tokenize``.

    Returns:
        A float in ``[0.0, 1.0]``.

    Raises:
        ValueError: from ``math.log`` if any probability is exactly 0 or 1.
    """
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    for word, prob_if_spam, prob_if_not_spam in word_probs:
        if word in message_words:
            # Word is present: use the probability of seeing it.
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            # Word is absent: use the probability of NOT seeing it.
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    # With a large vocabulary both log-likelihoods become so negative that
    # exponentiating them separately underflows to 0.0 and the ratio turns
    # into 0 / 0. Work with their difference instead:
    #     e^a / (e^a + e^b) == 1 / (1 + e^(b - a))
    # and always exponentiate a non-positive number so exp() cannot overflow.
    delta = log_prob_if_not_spam - log_prob_if_spam
    if delta >= 0:
        ratio = math.exp(-delta)  # in (0, 1]
        return ratio / (1.0 + ratio)
    return 1.0 / (1.0 + math.exp(delta))

In [39]:
class NaiveBayesClassifier:
    """Naive Bayes spam classifier over tokenized message text."""

    def __init__(self, k=0.5):
        # k: smoothing constant handed to word_probabilities during training.
        self.k = k
        # (word, p(word | spam), p(word | not spam)) triples; empty until trained.
        self.word_probs = []

    def train(self, training_set):
        """Fit the per-word probabilities from ``(message, is_spam)`` pairs.

        ``training_set`` must support ``len()`` and be iterable more than
        once, since it is traversed for the class totals and again for the
        word counts.
        """
        spam_total = sum(1 for _, is_spam in training_set if is_spam)
        ham_total = len(training_set) - spam_total

        self.word_probs = word_probabilities(
            count_words(training_set), spam_total, ham_total, self.k
        )

    def classify(self, message):
        """Return the estimated probability that ``message`` is spam."""
        return spam_probability(self.word_probs, message)

In [40]:
def get_subject_data(path):
    """Collect ``(subject, is_spam)`` pairs from the files matching ``path``.

    Args:
        path: glob pattern selecting the e-mail files to read.

    Returns:
        A list with one ``(subject, is_spam)`` tuple per line that starts
        with ``"Subject:"``. A file may contribute several entries if such
        a line appears more than once.

    A file is labelled spam unless the substring ``"ham"`` occurs anywhere
    in its matched path.
    NOTE(review): that check also looks at parent directories, so a base
    directory containing "ham" would label every file as non-spam.
    """
    data = []

    # \s* (not \s+) so that "Subject:foo" with no space after the colon
    # still has its header prefix removed.
    subject_regex = re.compile(r"^Subject:\s*")

    for fn in glob.glob(path):
        # The pattern can also match directories; open() would fail on them.
        if not os.path.isfile(fn):
            continue

        is_spam = "ham" not in fn

        with open(fn, 'r', encoding='ISO-8859-1') as email_file:
            for line in email_file:
                if line.startswith("Subject:"):
                    subject = subject_regex.sub("", line).strip()
                    data.append((subject, is_spam))

    return data

In [41]:
def train_and_test_model(path):
    """Train a NaiveBayesClassifier on subject lines and print diagnostics.

    Loads ``(subject, is_spam)`` pairs from ``path``, splits them with
    ``split_data(data, 0.5)``, trains on the first part and scores the
    second. Prints, in order: the confusion counts at a 0.5 threshold, the
    five highest-scoring non-spam subjects, the five lowest-scoring spam
    subjects, and the five words with the highest / lowest
    ``p_spam_given_word``. Returns ``None``.
    """
    data = get_subject_data(path)
    # Seed the global RNG before splitting.
    # NOTE(review): presumably split_data draws from `random`, making the
    # split reproducible; split_data is imported, so confirm there.
    random.seed(0)
    train_data, test_data = split_data(data, 0.5)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # Rows are (subject, actual is_spam, predicted spam probability).
    classified = []
    for subject, is_spam in test_data:
        classified.append((subject, is_spam, classifier.classify(subject)))

    # Keys are (actual is_spam, predicted is_spam).
    counts = Counter((actual, score > 0.5) for _, actual, score in classified)

    print(counts)
    print('\n\n')

    # Ascending by predicted probability: least spammy first.
    classified.sort(key=lambda row: row[2])

    ham_rows = [row for row in classified if not row[1]]
    spam_rows = [row for row in classified if row[1]]
    spammiest_hams = ham_rows[-5:]
    hammiest_spams = spam_rows[:5]

    print("spammiest_hams", spammiest_hams)
    print('\n\n')

    print("hammiest_spams", hammiest_spams)
    print('\n\n')

    ranked_words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = ranked_words[-5:]
    hammiest_words = ranked_words[:5]

    print("spammiest_words", spammiest_words)
    print('\n\n')
    print("hammiest_words", hammiest_words)



In [42]:
def p_spam_given_word(word_prob):
    """Return p(spam | word) for one ``(word, p(word|spam), p(word|not spam))`` triple.

    Both classes are weighted equally, so the value is simply the spam
    likelihood divided by the sum of the two likelihoods.
    """
    _, given_spam, given_not_spam = word_prob
    total = given_spam + given_not_spam
    return given_spam / total


In [43]:
# Train and evaluate on every file matched by the glob pattern below.
# NOTE(review): this is a raw string, so each "\\" stays as two backslash
# characters in the pattern; the recorded output shows the run succeeded
# anyway. The path is an absolute, machine-specific location, so the cell
# cannot be re-run elsewhere without editing it.
train_and_test_model(r"D:\\Programowanie\\Python\\MachineLearning\\spam\\*\\**")

Counter({(False, False): 2009, (True, True): 826, (False, True): 194, (True, False): 142})



spammiest_hams [("W3C approves HTML 4 'emotitags' - Now you'll be able to say it with feeling", False, 0.9993790303535273), ("Month by month, 'Most Beautiful Man' winners and their galleries. sfw", False, 0.9993893093351801), ('=?iso-8859-1?Q?Matrox_Parhelia=99_now_available?=', False, 0.9998144809098227), ('"I meditated in a cave for 12 years and now I\'m here to tell you', False, 0.9999356238656534), ('RE: FREE ORIGINAL STAR WARS CARDS Adv:', False, 0.9999852296284948)]



hammiest_spams [('I was so scared... my very first DP', True, 0.0008494763375696866), ('RE: Public Notice, Immune Support              25834', True, 0.001580285113986115), ('RE: Own An Automated Shopping Mall                       32736', True, 0.0018278272632820112), ('Re: Funny', True, 0.00292653415800908), ('Invite: Content Management Summit, Oct. 10th New York City', True, 0.0038710953187596924)]



spammiest_words [('

In [44]:
# NAIVE BAYES EXAMPLE FROM SCIKIT-LEARN


In [45]:
# Toy data set: one entry per observed day.
weather=['Sunny','Sunny','Overcast','Rainy','Rainy','Rainy','Overcast','Sunny','Sunny',
'Rainy','Sunny','Overcast','Overcast','Rainy']
temp=['Hot','Hot','Hot','Mild','Cool','Cool','Cool','Mild','Cool','Mild','Mild','Mild','Hot','Mild']

# Target: whether play took place on that day.
play=['No','No','Yes','Yes','Yes','No','Yes','No','Yes','Yes','Yes','Yes','Yes','No']

# Map the weather strings to integer codes.
le = preprocessing.LabelEncoder()
weather_encoded=le.fit_transform(weather)
# Was print(wheather_encoded): a misspelled name that only resolved through
# leftover kernel state and raises NameError on a fresh run.
print(weather_encoded)

[2 2 0 1 1 1 0 2 2 1 2 0 0 1]


In [55]:
# Re-fit the same encoder on each remaining column in turn. After this cell
# `le` holds the mapping for `play` only.
temp_encoded = le.fit_transform(temp)
label = le.fit_transform(play)

# One (weather_code, temp_code) tuple per observation.
features = [pair for pair in zip(weather_encoded, temp_encoded)]
print(features)

[(2, 1), (2, 1), (0, 1), (1, 2), (1, 0), (1, 0), (0, 0), (2, 2), (2, 0), (1, 2), (2, 2), (0, 2), (0, 1), (1, 2)]


In [56]:
# Fit a Gaussian Naive Bayes model on the encoded features; fit() returns
# the estimator itself, so construction and training can be chained.
model = GaussianNB().fit(features, label)

# Query a single observation. Judging by the encoded outputs recorded above,
# weather code 0 is 'Overcast' and temp code 2 is 'Mild'.
# NOTE(review): the predicted class 1 presumably maps to 'Yes' - confirm
# with le.classes_.
predicted = model.predict([[0, 2]])
print("Predicted Value:", predicted)

Predicted Value: [1]
