In [221]:
import re
import random
import numpy as np
import math

In [222]:
def bag_of_words(documents):
    result = set()
    for words in documents:
        result |= set(words)
    return result

In [223]:
def word2vec(bag_of_words, words):
    result = np.zeros(len(bag_of_words))
    
    for i, word in enumerate(bag_of_words):
        if word in words:
            # For every word found, increment by one.
            result[i] += 1

    return result

In [224]:
def tokenize(sentence):
    regexp = re.compile('\W+')
    return [word.lower() 
            for word in regexp.split(sentence)
            if len(word) > 2]

In [225]:
with open('data/email/ham/6.txt', 'r', encoding='latin-1') as f:
    text = f.read()
    print(text)
    print()
    print(tokenize(text))

Hello,

Since you are an owner of at least one Google Groups group that uses the customized welcome message, pages or files, we are writing to inform you that we will no longer be supporting these features starting February 2011. We made this decision so that we can focus on improving the core functionalities of Google Groups -- mailing lists and forum discussions.  Instead of these features, we encourage you to use products that are designed specifically for file storage and page creation, such as Google Docs and Google Sites.

For example, you can easily create your pages on Google Sites and share the site (http://www.google.com/support/sites/bin/answer.py?hl=en&answer=174623) with the members of your group. You can also store your files on the site by attaching files to pages (http://www.google.com/support/sites/bin/answer.py?hl=en&answer=90563) on the site. If youre just looking for a place to upload your files so that your group members can download them, we suggest you try Googl

In [226]:
X = []
y = []
fulltext = []

for i in range(1, 26):
    with open(f'data/email/spam/{i}.txt', 'r', encoding='latin-1') as f:
        words = tokenize(f.read())
        X.append(words)
        y.append(1)
        fulltext.extend(words)

    with open(f'data/email/ham/{i}.txt', 'r', encoding='latin-1') as f:
        words = tokenize(f.read())
        X.append(words)
        y.append(0)
        fulltext.extend(words)

In [227]:
bow = bag_of_words(X) # Or set(fulltext)
X = np.array([word2vec(bow, X_i) 
              for X_i in X])

In [228]:
indices = list(range(50))
train_size = 40
random.shuffle(indices)

train_indices, test_indices = indices[:train_size], indices[train_size:]

X_train, y_train = [], []
for i in train_indices:
    X_train.append(X[i])
    y_train.append(y[i])

X_train = np.array(X_train)
    
X_test, y_test = [], []
for i in test_indices:
    X_test.append(X[i])
    y_test.append(y[i])

X_test = np.array(X_test)

In [229]:
def train(X, y):
    n_samples, n_words = X.shape
    p_abusive = sum(y) / n_samples
    
    p0_num, p1_num = np.ones(n_words), np.ones(n_words)
    p0_den, p1_den = 1, 1
    
    for i in range(n_samples):
        if y[i] == 1:
            # Matrix addition.
            p1_num += X[i]
            p1_den += sum(X[i])
        else:
            p0_num += X[i]
            p0_den += sum(X[i])

    p1 = np.log(p1_num / p1_den)
    p0 = np.log(p0_num / p0_den)
    
    return p0, p1, p_abusive

In [230]:
def classify(X, p0, p1, p_class1):
    p1 = sum(X * p1) + math.log(p_class1)
    p0 = sum(X * p0) + math.log(1 - p_class1)
    return 1 if p1 > p0 else 0

In [231]:
p0, p1, p_spam = train(X_train, y_train)
p_spam

0.5

In [233]:
error = 0
for i, X_i in enumerate(X_test):
    if classify(X_i, p0, p1, p_spam) != y_test[i]:
        error += 1
error / len(X_test)

0.2