In [29]:

from __future__ import print_function, division
import nltk
import os
import random
from collections import Counter
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import NaiveBayesClassifier, classify
 
# English stop words from NLTK's stopwords corpus; get_features() drops any token found here.
stoplist = stopwords.words('english')
 
def init_lists(folder):
    """Read every file in ``folder`` and return the contents as a list of strings.

    Args:
        folder: Path of the directory to read. A trailing path separator is
            optional.

    Returns:
        A list with one string per directory entry, in the order reported by
        ``os.listdir``. An empty directory yields an empty list.

    Raises:
        OSError: If ``folder`` cannot be listed or an entry cannot be opened.
    """
    a_list = []
    for a_file in os.listdir(folder):
        # os.path.join works whether or not ``folder`` ends with a separator.
        path = os.path.join(folder, a_file)
        # The context manager closes each handle as soon as it has been read,
        # instead of only closing the last one after the loop.
        with open(path, 'r', encoding='ISO-8859-1') as f:
            a_list.append(f.read())
    return a_list
 
def preprocess(sentence):
    """Tokenize ``sentence`` and return its lowercased, lemmatized tokens.

    Args:
        sentence: Text to split with ``word_tokenize``.

    Returns:
        A list with one entry per token, each lowercased and then passed
        through ``WordNetLemmatizer.lemmatize``.
    """
    wnl = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(sentence):
        lemmas.append(wnl.lemmatize(token.lower()))
    return lemmas
 
def get_features(text, setting):
    """Turn ``text`` into a feature dict keyed by its non-stop-word tokens.

    Args:
        text: Raw text; it is tokenized and lemmatized by ``preprocess``.
        setting: ``'bow'`` maps each token to its occurrence count. Any other
            value maps each token to ``True``.

    Returns:
        A dict from token to count (``'bow'``) or to ``True`` (otherwise).
        Tokens present in ``stoplist`` are left out.
    """
    # Build the set once per call so each membership test is a hash lookup
    # rather than a linear scan of the stop-word list.
    stop_words = frozenset(stoplist)
    tokens = preprocess(text)
    if setting == 'bow':
        return {word: count for word, count in Counter(tokens).items()
                if word not in stop_words}
    return {word: True for word in tokens if word not in stop_words}
 


In [30]:
#import sys
#if sys.version_info[0] >= 3:
#   unicode = str


In [31]:
def train(features, samples_proportion):
    """Split ``features`` into train and test parts and fit a Naive Bayes classifier.

    Args:
        features: Sequence of labelled feature sets; it is sliced, not copied
            element by element.
        samples_proportion: Fraction of ``features`` (taken from the front)
            used for training. The remainder becomes the test set.

    Returns:
        A ``(train_set, test_set, classifier)`` tuple.
    """
    # Index separating the leading training slice from the trailing test slice.
    cutoff = int(len(features) * samples_proportion)
    train_set = features[:cutoff]
    test_set = features[cutoff:]
    print('Training set size = ' + str(len(train_set)) + ' emails')
    print('Test set size = ' + str(len(test_set)) + ' emails')
    # Fit the model on the training slice only.
    model = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, model
 
def evaluate(train_set, test_set, classifier):
    """Print the classifier's accuracy on both sets and its top features.

    Args:
        train_set: Labelled feature sets the classifier was trained on.
        test_set: Labelled feature sets held out from training.
        classifier: Trained classifier to score and inspect.
    """
    # Accuracy on the data the model has already seen.
    train_accuracy = classify.accuracy(classifier, train_set)
    print('Accuracy on the training set = ' + str(train_accuracy))
    # Accuracy on the held-out data.
    test_accuracy = classify.accuracy(classifier, test_set)
    print('Accuracy of the test set = ' + str(test_accuracy))
    # List the 20 features the classifier finds most informative.
    classifier.show_most_informative_features(20)
 


In [32]:
if __name__ == '__main__':
    # Load the raw text of every email in the spam and ham folders.
    spam = init_lists('enron/enron1/spam/')
    ham = init_lists('enron/enron1/ham/')

    # Attach a label to each email and combine both classes.
    all_emails = [(email, 'spam') for email in spam]
    all_emails += [(email, 'ham') for email in ham]

    # Shuffle with a dedicated, seeded generator so the train/test split and
    # the reported accuracies are the same on every run. Using a separate
    # Random instance leaves the global random state untouched.
    random.Random(42).shuffle(all_emails)
    print ('Corpus size = ' + str(len(all_emails)) + ' emails')

    # Extract the features. Any setting other than 'bow' makes get_features
    # return word-presence (True) features.
    all_features = [(get_features(email, ''), label) for (email, label) in all_emails]
    print ('Collected ' + str(len(all_features)) + ' feature sets')

    # Train the classifier on the first 80% of the shuffled corpus.
    train_set, test_set, classifier = train(all_features, 0.8)

    # Evaluate its performance on both splits.
    evaluate(train_set, test_set, classifier)

Corpus size = 15525 emails
Collected 15525 feature sets
Training set size = 12420 emails
Test set size = 3105 emails
Accuracy on the training set = 0.7981481481481482
Accuracy of the test set = 0.7716586151368761
Most Informative Features
                kaminski = True              ham : spam   =    425.7 : 1.0
                 shirley = True              ham : spam   =    163.0 : 1.0
                     hpl = True              ham : spam   =    144.3 : 1.0
                   vince = True              ham : spam   =    139.2 : 1.0
                  valium = True             spam : ham    =     73.2 : 1.0
                 melissa = True              ham : spam   =     63.6 : 1.0
                 actuals = True              ham : spam   =     61.7 : 1.0
                     hou = True              ham : spam   =     61.2 : 1.0
                   zimin = True              ham : spam   =     56.6 : 1.0
                  vasant = True              ham : spam   =     56.0 : 1.0
           

In [None]:
# Reference: https://cambridgecoding.wordpress.com/2016/01/25/implementing-your-own-spam-filter/