In [86]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
from scipy.io import loadmat
from sklearn.svm import SVC 
from sklearn.svm import LinearSVC
# Shared Porter stemmer instance; stem_tokens() reads it as a module-level global.
ps = PorterStemmer()

In [87]:
# All data files are looked up in a 'data' folder under the current working directory.
cwd = os.getcwd()
path = os.path.join(cwd, 'data')

def get_sample(fn):
    '''
    Read a whole text file into memory.

    fn: str - path to the file
    return: str - the complete file content
    '''
    with open(fn, 'r') as handle:
        return handle.read()
    
# Load the first sample email; the bare `content` on the last line makes the
# notebook display the raw text as the cell output.
fn=  os.path.join(path , 'emailSample1.txt')
content = get_sample(fn)
content

"> Anyone knows how much it costs to host a web portal ?\n>\nWell, it depends on how many visitors you're expecting.\nThis can be anywhere from less than 10 bucks a month to a couple of $100. \nYou should checkout http://www.rackspace.com/ or perhaps Amazon EC2 \nif youre running something big..\n\nTo unsubscribe yourself from this mailing list, send an email to:\ngroupname-unsubscribe@egroups.com\n\n"

In [88]:
def word_tokeniize(content):
    '''
    Split a mail body into tokens on runs of whitespace.

    content: str - body of mail 
    return: list of tokens (str) e.g. ['>', 'Anyone', 'knows', 'how', 'much', 'it', 'costs', 'to', 'host', 'a']

    Leading or trailing whitespace in content produces an empty-string token at
    that end of the list (re.split semantics); an empty content yields [''].
    '''
    # Raw string: "\s" inside a plain literal is an invalid escape sequence and
    # raises a SyntaxWarning on recent Python versions.
    tokens = re.split(r"\s+", content)
    return tokens

In [89]:
# Tokenize an inline copy of the email text (the \n escapes inside the literal
# become real newlines); `tokens` is rebound by each following pipeline cell.
tokens  = word_tokeniize('''> Anyone knows how much it costs to host a web portal ?\n>\nWell, it depends on how many visitors you're expecting.\nThis can be anywhere from less than 10 bucks a month to a couple of $100. \nYou should checkout http://www.rackspace.com/ or perhaps Amazon EC2 \nif youre running something big..\n\nTo unsubscribe yourself from this mailing list, send an email to:\ngroupname-unsubscribe@egroups.com\n\n''')
print(tokens)

['>', 'Anyone', 'knows', 'how', 'much', 'it', 'costs', 'to', 'host', 'a', 'web', 'portal', '?', '>', 'Well,', 'it', 'depends', 'on', 'how', 'many', 'visitors', "you're", 'expecting.', 'This', 'can', 'be', 'anywhere', 'from', 'less', 'than', '10', 'bucks', 'a', 'month', 'to', 'a', 'couple', 'of', '$100.', 'You', 'should', 'checkout', 'http://www.rackspace.com/', 'or', 'perhaps', 'Amazon', 'EC2', 'if', 'youre', 'running', 'something', 'big..', 'To', 'unsubscribe', 'yourself', 'from', 'this', 'mailing', 'list,', 'send', 'an', 'email', 'to:', 'groupname-unsubscribe@egroups.com', '']


In [90]:
def lower_case(tokens):
    '''
    Convert every token to lower case.

    tokens: iterable of str
    return: list of the lower-cased tokens (str), in the original order
    '''
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

In [91]:
# Rebinds `tokens` to its lower-cased version, so re-running this cell alone
# operates on whatever `tokens` currently holds.
tokens = lower_case(tokens)
print(tokens)

['>', 'anyone', 'knows', 'how', 'much', 'it', 'costs', 'to', 'host', 'a', 'web', 'portal', '?', '>', 'well,', 'it', 'depends', 'on', 'how', 'many', 'visitors', "you're", 'expecting.', 'this', 'can', 'be', 'anywhere', 'from', 'less', 'than', '10', 'bucks', 'a', 'month', 'to', 'a', 'couple', 'of', '$100.', 'you', 'should', 'checkout', 'http://www.rackspace.com/', 'or', 'perhaps', 'amazon', 'ec2', 'if', 'youre', 'running', 'something', 'big..', 'to', 'unsubscribe', 'yourself', 'from', 'this', 'mailing', 'list,', 'send', 'an', 'email', 'to:', 'groupname-unsubscribe@egroups.com', '']


In [92]:
def normalize_tokens (tokens):
    '''
    Replace numbers, URLs, e-mail addresses and "$" with placeholder words and
    strip the remaining non-word characters from every token.

    tokens: iterable of str
    return: list of normalized tokens (str), same length and order as the input.
            A token that consisted only of stripped characters becomes '' -
            empty tokens are NOT removed here.
    '''
    # (pattern, replacement) pairs applied to each token in exactly this order.
    # Order matters: digits are rewritten before the "$" step, so "$100" ends
    # up as "dollarnumber". Patterns are raw strings because "\d", "\$", "\."
    # and "\W" are invalid escape sequences in plain string literals.
    substitutions = (
        (r">", ""),                # drop every '>' character
        (r"\d+", "number"),        # each run of digits -> "number"
        (r"^http.*", "httpaddr"),  # token starting with "http" -> "httpaddr"
        (r".*@.*", "emailaddr"),   # token containing "@" -> "emailaddr"
        (r"\$", "dollar"),         # each "$" -> "dollar"
        (r"\.|,|\?|!|:|;", ""),    # common punctuation
        (r"\W", ""),               # any other non-word character ('_' is kept)
    )

    normalized = []
    for token in tokens:
        for pattern, replacement in substitutions:
            token = re.sub(pattern, replacement, token)
        normalized.append(token)

    return normalized

In [93]:
# Rebinds `tokens` to the normalized list; tokens that were pure punctuation
# show up as '' in the printed result.
tokens = normalize_tokens(tokens)
print(tokens)

['', 'anyone', 'knows', 'how', 'much', 'it', 'costs', 'to', 'host', 'a', 'web', 'portal', '', '', 'well', 'it', 'depends', 'on', 'how', 'many', 'visitors', 'youre', 'expecting', 'this', 'can', 'be', 'anywhere', 'from', 'less', 'than', 'number', 'bucks', 'a', 'month', 'to', 'a', 'couple', 'of', 'dollarnumber', 'you', 'should', 'checkout', 'httpaddr', 'or', 'perhaps', 'amazon', 'ecnumber', 'if', 'youre', 'running', 'something', 'big', 'to', 'unsubscribe', 'yourself', 'from', 'this', 'mailing', 'list', 'send', 'an', 'email', 'to', 'emailaddr', '']


In [94]:
def filter_short_tokens (tokens):
    '''
    Drop empty-string tokens and report how many tokens remain.

    Despite the name, only tokens equal to "" are removed; no other length
    threshold is applied.

    tokens: sequence of str (anything supporting len() and iteration)
    return: new list with the non-empty tokens in their original order.
            The input sequence is left unmodified.
    '''
    original_tokens_len = len(tokens)
    # Build a new list in one pass instead of repeatedly calling
    # tokens.remove(""), which rescanned the list for every empty token and
    # mutated the caller's list.
    kept = [token for token in tokens if token != ""]
    print ('Original len= {}\nRemaining len= {}'.format(original_tokens_len, len(kept)))

    return kept

In [95]:
# Rebinds `tokens` to the list without empty strings; the function itself
# prints the before/after lengths.
tokens = filter_short_tokens(tokens)
print(tokens)

Original len= 65
Remaining len= 61
['anyone', 'knows', 'how', 'much', 'it', 'costs', 'to', 'host', 'a', 'web', 'portal', 'well', 'it', 'depends', 'on', 'how', 'many', 'visitors', 'youre', 'expecting', 'this', 'can', 'be', 'anywhere', 'from', 'less', 'than', 'number', 'bucks', 'a', 'month', 'to', 'a', 'couple', 'of', 'dollarnumber', 'you', 'should', 'checkout', 'httpaddr', 'or', 'perhaps', 'amazon', 'ecnumber', 'if', 'youre', 'running', 'something', 'big', 'to', 'unsubscribe', 'yourself', 'from', 'this', 'mailing', 'list', 'send', 'an', 'email', 'to', 'emailaddr']


In [96]:
def stem_tokens(tokens):
    '''
    Stem every token with the module-level stemmer `ps`.

    tokens: iterable of str
    return: list of stemmed tokens (str), in the original order, e.g.
            ['anyon', 'know', 'how', 'much', 'it', 'cost', 'to', 'host', 'a',
             'web', 'portal', 'well', 'it', 'depend', 'on', 'how', 'mani', ...]
    '''
    stemmed = []
    for word in tokens:
        stemmed.append(ps.stem(word))
    return stemmed

In [97]:
# Rebinds `tokens` to the stemmed list; this is the form later matched against the vocabulary.
tokens = stem_tokens(tokens)
print(tokens)

['anyon', 'know', 'how', 'much', 'it', 'cost', 'to', 'host', 'a', 'web', 'portal', 'well', 'it', 'depend', 'on', 'how', 'mani', 'visitor', 'your', 'expect', 'thi', 'can', 'be', 'anywher', 'from', 'less', 'than', 'number', 'buck', 'a', 'month', 'to', 'a', 'coupl', 'of', 'dollarnumb', 'you', 'should', 'checkout', 'httpaddr', 'or', 'perhap', 'amazon', 'ecnumb', 'if', 'your', 'run', 'someth', 'big', 'to', 'unsubscrib', 'yourself', 'from', 'thi', 'mail', 'list', 'send', 'an', 'email', 'to', 'emailaddr']


In [98]:
def get_vocabulary(fn):
    '''
    Load the vocabulary words from a header-less, tab-separated file.

    fn: str - full path to file; the second column of each row holds the word
    return: ndarray of str e.g. array(['aa', 'ab', 'abil', ..., 'zdnet', 'zero', 'zip'], dtype=object)

    Also prints the number of words loaded.
    '''
    # keep_default_na=False: with pandas' default NA handling, words such as
    # "null" or "nan" would be parsed as missing values (float NaN) instead of
    # being kept as strings.
    vocab_list = pd.read_table(fn, header=None, keep_default_na=False)
    vocab = np.array(vocab_list)[:,1] # first column is the index, select only the words column
    print ('len(vocab)= {:,}'.format(len(vocab)))
    return vocab

# Load the vocabulary once; `vocab` is reused by every later cell. The bare
# `vocab` on the last line displays the array as the cell output.
fn=  os.path.join(path , 'vocab.txt')
vocab = get_vocabulary(fn)
vocab

len(vocab)= 1,899


array(['aa', 'ab', 'abil', ..., 'zdnet', 'zero', 'zip'], dtype=object)

In [99]:
def represent_features(tokens, vocab):
    '''
    Encode a mail as a binary bag-of-words vector over the vocabulary.

    tokens: iterable of str - (preprocessed) tokens of the mail
    vocab: iterable of str - vocabulary words; fixes the order of the output
    return: list of int, one entry per vocabulary word: 1 if the word occurs
            among the tokens, 0 otherwise

    Also prints how many vocabulary words were found.
    '''
    # START_CODE 
    # Set lookup is O(1) per vocabulary word; testing against the token list
    # rescanned every token for each of the vocabulary words.
    token_set = set(tokens)
    tokens_represented = [int(word in token_set) for word in vocab]
    # END_CODE     

    print ('{} word(s) from vocab are in the tokens.'.format(np.sum(tokens_represented)))

    return tokens_represented

In [100]:
# Encode the sample's tokens against the vocabulary and peek at the first five entries.
tokens_represented = represent_features(tokens, vocab)
print(tokens_represented[:5])

44 word(s) from vocab are in the tokens.
[0, 0, 0, 0, 0]


In [101]:
def preprocess (content, vocab):
    '''
    Run the complete text pipeline on one mail body and encode it against the
    vocabulary.

    content: str - body of mail 
    vocab: ndarray of str - list of considered words 
    return: ndarray with a single row holding one 1/0 entry per vocabulary word
            (1 if the word occurs in the mail, 0 otherwise)
    '''
    # START_CODE 

    # text clean-up, innermost call first:
    # tokenize -> lower case -> normalize -> drop empty tokens -> stem
    stemmed = stem_tokens(
        filter_short_tokens(
            normalize_tokens(
                lower_case(
                    word_tokeniize(content)))))

    # binary feature list -> array reshaped to one row
    features = np.array(represent_features(stemmed, vocab)).reshape(1, -1)
    # END_CODE     

    return features


In [102]:
# End-to-end check of the pipeline on the sample email loaded earlier; the result is displayed, not stored.
preprocess(content, vocab)

Original len= 65
Remaining len= 61
44 word(s) from vocab are in the tokens.


array([[0, 0, 0, ..., 0, 0, 0]])

In [103]:
# Training set: feature matrix under key 'X', labels under key 'y'
# (flattened to 1-D with ravel()).
fn=  os.path.join(path , 'spamTrain.mat')

mat= loadmat(fn)
X_train= mat['X']
y_train= mat['y'].ravel()

# The shapes go through .format(); previously they were passed as a second
# print() argument, so the literal "{}" appeared in the output.
print ('X_train.shape= {}'.format(X_train.shape))
print ('y_train.shape= {}'.format(y_train.shape))

# Test set: same layout under keys 'Xtest' / 'ytest'.
fn=  os.path.join(path , 'spamTest.mat')
mat= loadmat(fn)
X_test = mat['Xtest']
y_test = mat['ytest'].ravel() 

print ('X_test.shape= {}'.format(X_test.shape))
print ('y_test.shape= {}'.format(y_test.shape))
index = 0 
print ('Sample with index  ={}: \n{}'.format(index, X_train[index]))


X_train.shape= {} (4000, 1899)
y_train.shape= {} (4000,)
X_test.shape= {} (1000, 1899)
y_test.shape= {} (1000,)
Sample with index  =0: 
[0 0 0 ... 0 0 0]


In [104]:
# Regularization strength of the linear SVM.
C = .1
# Fixed random_state: LinearSVC's solver uses a pseudo-random number generator,
# so without a seed the fitted weights (and scores) can differ between runs.
clf= LinearSVC(C=C, random_state=0)
clf.fit(X_train,y_train)
print ('Score train= {}'.format(clf.score(X_train,y_train)))
print ('Score test= {}'.format(clf.score(X_test,y_test)))

Score train= 0.99975
Score test= 0.992


In [105]:
# Vocabulary words with the largest coefficients in the trained linear model.
# np.argpartition only guarantees that the last n_top positions hold the n_top
# largest coefficients; the order among them is unspecified (not ranked).
n_top = 20
arr = clf.coef_
ind = np.argpartition(arr, -n_top)[0, -n_top:]
top_spam_contributors = vocab[ind]
print(top_spam_contributors)

['hour' 'wi' 'credit' 'dollarnumb' 'send' 'click' 'guarante' 'nbsp'
 'numberb' 'remov' 'price' 'basenumb' 'visit' 'our' 'most' 'bodi' 'dollar'
 'below' 'will' 'lo']


In [109]:
# Display names indexed by the predicted class (0 -> 'Not Spam', 1 -> 'Spam');
# built once instead of on every loop iteration.
label = np.array(['Not Spam','Spam'])

for sfn in [ 'emailSample1.txt', 'emailSample2.txt', 'spamSample1.txt', 'spamSample2.txt']:
    fn = os.path.join(path,sfn)    
    content = get_sample(fn)
    email = preprocess(content, vocab)
    prediction = clf.predict(email)

    # predict() returns an array with one entry per input row; take the scalar
    # so the label prints as plain text instead of an array repr like "['Spam']".
    print (f'{sfn} is {label[prediction[0]]}\n')

# `content` still holds the text of the last file read in the loop above.
print ('Latter sample:\n{1}\n{0}\n{1}'.format(content, '='*50))

Original len= 65
Remaining len= 61
44 word(s) from vocab are in the tokens.
emailSample1.txt is ['Not Spam']

Original len= 228
Remaining len= 222
122 word(s) from vocab are in the tokens.
emailSample2.txt is ['Not Spam']

Original len= 99
Remaining len= 97
46 word(s) from vocab are in the tokens.
spamSample1.txt is ['Spam']

Original len= 35
Remaining len= 31
18 word(s) from vocab are in the tokens.
spamSample2.txt is ['Spam']

Latter sample:
Best Buy Viagra Generic Online

Viagra 100mg x 60 Pills $125, Free Pills & Reorder Discount, Top Selling 100% Quality & Satisfaction guaranteed!

We accept VISA, Master & E-Check Payments, 90000+ Satisfied Customers!
http://medphysitcstech.ru



