In [0]:
# Mount Google Drive at '/content/drive/' so that the data files this notebook
# reads from '/content/drive/My Drive/Data' are reachable.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime -- confirm before running this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive/')

In [0]:
import os
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
from scipy.io import loadmat
from sklearn.svm import LinearSVC

import utils

# Enable auto reload
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Data Reference: [SpamAssassin Public Corpus](http://spamassassin.apache.org/old/publiccorpus/). 

In [0]:
def getVocabList(vocab_path=None):
    """Read the vocabulary file and return its words as a list of str.

    The file is parsed as whitespace-separated columns and the second column
    (column index 1) is returned, in file order.

    Parameters
    ----------
    vocab_path : str, optional
        Location of the vocabulary file. When omitted, the file
        'vocab.txt' under '/content/drive/My Drive/Data' is used.

    Returns
    -------
    list of str
        The entries of the second column, one per row of the file.
    """
    if vocab_path is None:
        vocab_path = os.path.join('/content/drive/My Drive/Data', 'vocab.txt')

    # genfromtxt collapses a one-row file to a 1-D array; atleast_2d restores
    # the (rows, columns) layout so the column slice below always works.
    vocabList = np.atleast_2d(np.genfromtxt(vocab_path, dtype=object))
    return list(vocabList[:, 1].astype(str))

In [0]:
# Patterns used by process_email. They are raw strings so that regex escapes
# such as \s, \[ and \- are not interpreted (or warned about) as Python
# string escapes.
_HTML_TAG_RE = re.compile(r'<[^<>]+>')
_NUMBER_RE = re.compile(r'[0-9]+')
_URL_RE = re.compile(r'(http|https)://[^\s]*')
_EMAIL_ADDR_RE = re.compile(r'[^\s]+@[^\s]+')
_DOLLAR_RE = re.compile(r'[$]+')
# Word delimiters. The hyphen is escaped so that '.-:' is three literal
# characters rather than a range, and the apostrophe is a real member of the
# class (a doubled '' inside a Python literal silently ends the string and
# starts a new one, dropping the quote character).
_DELIMITER_RE = re.compile(r"""[ @$/#.\-:&*+=\[\]?!(){},'">_<;%\n\r]""")
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9]')


def process_email(email_contents, verbose=True):
    """Normalize an email body and map its words to vocabulary indices.

    The text is lower-cased; HTML tags are replaced by a space; digit runs,
    URLs, email addresses and dollar signs are replaced by the placeholder
    words 'number', 'httpaddr', 'emailaddr' and 'dollar'. The result is split
    on punctuation/whitespace, each token is stripped of any remaining
    non-alphanumeric characters and stemmed with ``utils.PorterStemmer``, and
    every stemmed word found in the vocabulary contributes its index.

    Parameters
    ----------
    email_contents : str
        Text of the email.
    verbose : bool, optional
        When True (default), print the processed words joined by spaces.

    Returns
    -------
    list of int
        Zero-based positions, within the list returned by ``getVocabList()``,
        of the processed words that occur in the vocabulary, in order of
        appearance (repeats included).
    """
    # Load Vocabulary
    vocabList = getVocabList()

    # Map each word to its first position so lookups are O(1) while matching
    # what list.index() would return.
    vocab_index = {}
    for position, vocab_word in enumerate(vocabList):
        vocab_index.setdefault(vocab_word, position)

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================
    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers
    # hdrstart = email_contents.find(chr(10) + chr(10))
    # email_contents = email_contents[hdrstart:]

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML: anything that starts with <, ends with > and has no
    # < or > in between is replaced by a space
    email_contents = _HTML_TAG_RE.sub(' ', email_contents)

    # Handle Numbers: one or more characters between 0-9
    email_contents = _NUMBER_RE.sub(' number ', email_contents)

    # Handle URLS: strings starting with http:// or https://
    email_contents = _URL_RE.sub(' httpaddr ', email_contents)

    # Handle Email Addresses: strings with @ in the middle
    email_contents = _EMAIL_ADDR_RE.sub(' emailaddr ', email_contents)

    # Handle $ sign
    email_contents = _DOLLAR_RE.sub(' dollar ', email_contents)

    # Split on punctuation / whitespace and drop the empty pieces
    tokens = [token for token in _DELIMITER_RE.split(email_contents) if token]

    # Stem the email contents word by word
    stemmer = utils.PorterStemmer()
    processed_email = []

    for token in tokens:
        # Remove any remaining non alphanumeric characters in word
        word = _NON_ALNUM_RE.sub('', token).strip()
        if not word:
            # Nothing left to stem; keep blanks out of the processed output
            continue

        word = stemmer.stem(word)
        if not word:
            continue
        processed_email.append(word)

        # Words that are not in the vocabulary are simply skipped
        index = vocab_index.get(word)
        if index is not None:
            word_indices.append(index)

    if verbose:
        print('----------------')
        print('Processed email:')
        print('----------------')
        print(' '.join(processed_email))
    return word_indices

In [0]:
# Run the preprocessing pipeline on the first sample email
with open(os.path.join('/content/drive/My Drive/Data', 'emailSample1.txt')) as fid:
    file_contents = fid.read()

word_indices = process_email(file_contents)

# Show the resulting vocabulary indices under a three-line banner
print('-------------', 'Word Indices:', '-------------', sep='\n')
print(word_indices)

In [0]:
def email_features(word_indices, n=1899):
    """Build a binary bag-of-words feature vector from vocabulary indices.

    Parameters
    ----------
    word_indices : iterable of int
        Zero-based vocabulary indices of the words present in an email.
        Repeated indices are allowed; indices outside ``[0, n)`` are ignored.
    n : int, optional
        Total number of words in the dictionary, i.e. the length of the
        returned vector. Defaults to 1899.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n,)`` holding 1 at every position listed in
        ``word_indices`` and 0 elsewhere.
    """
    x = np.zeros(n)

    # list() lets any iterable through; dtype=int keeps an empty input usable
    # as an index array.
    indices = np.asarray(list(word_indices), dtype=int)

    # Drop out-of-range entries so that negative values do not wrap around
    # and too-large values do not raise.
    in_range = indices[(indices >= 0) & (indices < n)]
    x[in_range] = 1
    return x

In [0]:
# Turn the first sample email into a feature vector
with open(os.path.join('/content/drive/My Drive/Data', 'emailSample1.txt')) as fid:
    file_contents = fid.read()

word_indices = process_email(file_contents)
features = email_features(word_indices)

# Report the vector length and how many of its entries are set
print('\nLength of feature vector: %d' % features.shape[0])
print('Number of non-zero entries: %d' % np.count_nonzero(features > 0))

In [0]:
### Training SVM for Spam Classification

# Read the spam training file and unpack its 'X' and 'y' entries
data = loadmat(os.path.join('/content/drive/My Drive/Data', 'spamTrain.mat'))
X = data['X'].astype(float)
y = data['y'][:, 0]  # keep column 0 of 'y' only

print('Training Linear SVM (Spam Classification)')

# C is the regularization parameter handed to the classifier
C = 0.1
model = LinearSVC(
    penalty='l2',
    loss='hinge',
    C=C,
    random_state=5566,
)
model.fit(X, y)


In [0]:
# Read the test file and unpack its 'Xtest' and 'ytest' entries
data = loadmat(os.path.join('/content/drive/My Drive/Data', 'spamTest.mat'))
Xtest = data['Xtest'].astype(float)
ytest = data['ytest'][:, 0]  # keep column 0 of 'ytest' only

print('Evaluating the trained Linear SVM on a test set ...')
p = model.predict(Xtest)

# Share of predictions equal to the true label, as a percentage
print('Test Accuracy: %.2f' % (100 * np.mean(p == ytest)))

In [0]:
# Sort the weights and obtain the vocabulary list

weights = model.coef_[0]

idx = np.argsort(weights)
top_idx = idx[::-1][:15]  # positions of the 15 largest weights, largest first
vocabList = getVocabList()

print('Top predictors of spam:')
print('%-15s %-15s' % ('word', 'weight'))
print('----' + ' '*12 + '------')
for position in top_idx:
    word, w = vocabList[position], weights[position]
    print('%-15s %0.2f' % (word, w))
