In [4]:
# open python and nltk packages needed for processing
import os
import sys
import random
import nltk
from nltk.corpus import stopwords

In [5]:
# function to read spam and ham files, train and test a classifier 
def processspamham(dirPath,limitStr):
  # convert the limit argument from a string to an int
  limit = int(limitStr)
  
  # start lists for spam and ham email texts
  hamtexts = []
  spamtexts = []
  os.chdir(dirPath)
  # process all files in directory that end in .txt up to the limit
  #    assuming that the emails are sufficiently randomized
  for file in os.listdir("./spam"):
    if (file.endswith(".txt")) and (len(spamtexts) < limit):
      # open file for reading and read entire file into a string
      f = open("./spam/"+file, 'r', encoding="latin-1")
      spamtexts.append (f.read())
      f.close()
  for file in os.listdir("./ham"):
    if (file.endswith(".txt")) and (len(hamtexts) < limit):
      # open file for reading and read entire file into a string
      f = open("./ham/"+file, 'r', encoding="latin-1")
      hamtexts.append (f.read())
      f.close()
  
  # print number emails read
  #print ("Number of spam files:",len(spamtexts))
 # print ("Number of ham files:",len(hamtexts))
  
  
  # create list of mixed spam and ham email documents as (list of words, label)
  emaildocs = []
  # add all the spam
  for spam in spamtexts:
    tokens = nltk.word_tokenize(spam)
    emaildocs.append((tokens, 'spam'))
  # add all the regular emails
  for ham in hamtexts:
    tokens = nltk.word_tokenize(ham)
    emaildocs.append((tokens, 'ham'))
  
  # randomize the list
  random.shuffle(emaildocs)
  return emaildocs
  # print a few token lists
  #for email in emaildocs[:1]:   
   # print (email)

In [6]:
direc = 'C:\\Users\\udayv\\Downloads\\FinalProject_Data_N\\FinalProjectData\\EmailSpamCorpora\\corpus'
#passing the directory and the no of files to be picked up from it.
res = processspamham(direc, 1000)


In [7]:
all_words_list = [word for (sent,cat) in res for word in res]

In [8]:
new_list = []
for x in all_words_list:
    for i in x:
        new_list.append(i)

In [9]:
result = []
for i in new_list:
    for j in i:
        if len(j) > 2 and j not in ('ham','spam','Subject'):
            result.append(j)

In [10]:
all_words = nltk.FreqDist(result)
all_words

FreqDist({'the': 18816000, 'ect': 10980000, 'and': 10032000, 'for': 7700000, 'you': 6926000, 'this': 5712000, 'hou': 5698000, 'your': 3802000, 'that': 3620000, '2000': 3410000, ...})

In [16]:
# get the 3000 most frequently appearing keywords in the corpus
word_items = all_words.most_common(1000)
word_features = [word for (word,count) in word_items]
print(word_features[:100])


['the', 'ect', 'and', 'for', 'you', 'this', 'hou', 'your', 'that', '2000', 'enron', 'with', 'will', 'have', 'from', 'are', 'please', 'not', 'com', 'all', 'our', 'meter', 'subject', 'can', 'gas', 'deal', 'any', 'http', '000', 'has', 'corp', 'new', 'thanks', 'get', 'was', 'know', 'here', 'need', 'more', 'forwarded', 'out', 'only', 'may', 'daren', 'hpl', 'there', 'information', 'these', 'into', 'company', 'mmbtu', 'www', 'let', 'time', 'would', 'but', 'been', 'price', 'should', 'one', 'now', 'month', 'mail', 'contract', 'email', 'nbsp', 'sitara', 'what', 'see', 'which', 'day', 'also', 'they', 'volume', 'free', 'about', 'like', 'energy', 'deals', 'font', 'their', 'pills', 'change', 'some', 'ami', 'business', 'just', 'message', 'want', 'volumes', '2004', 'over', 'attached', 'other', 'xls', 'farmer', 'its', 'questions', 'who', 'contact']


In [17]:
# define features (keywords) of a document for a BOW/unigram baseline
# each feature is 'contains(keyword)' and is true or false depending
# on whether that keyword is in the document
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    return features

In [18]:
# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, word_features), c) for (d, c) in res]

In [19]:
# training using naive Baysian classifier, training set is approximately 90% of data
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [20]:
# evaluate the accuracy of the classifier
nltk.classify.accuracy(classifier, test_set)


0.96

## Filtering by Stop Words


In [21]:
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords)

179

In [22]:
# this list of additional stop words includes some other words like hardly and rarely
additionalstopwords = ['no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather', 'hardly', 
                 'scarcely', 'rarely', 'seldom', 'neither', 'nor']

In [23]:
newstopwords = [word for word in stopwords if word not in additionalstopwords]
len(newstopwords)
#newstopwords

176

In [24]:
len(result)

523866000

In [None]:
new_all_words_list = [word for word in result if word not in newstopwords]
len(new_all_words_list)

In [23]:
new_all_words = nltk.FreqDist(new_all_words_list)
len(new_all_words)

36469

In [None]:
new_word_items = new_all_words.most_common(2000)

In [None]:
new_word_features = [word for (word,count) in new_word_items]

In [None]:
print(new_word_features[:50])

In [27]:
# get features sets for a document, including keyword features and category feature
featuresetsWithStop = [(document_features(d, new_word_features), c) for (d, c) in res]

In [28]:
# training using naive Baysian classifier, training set is approximately 90% of data
train_set, test_set = featuresetsWithStop[100:], featuresetsWithStop[:100]
classifier    = nltk.NaiveBayesClassifier.train(train_set)

In [29]:
# evaluate the accuracy of the classifier
nltk.classify.accuracy(classifier, test_set)

0.95

## Experimenting with Bigraphs

In [30]:
####   adding Bigram features   ####
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [31]:
finder = BigramCollocationFinder.from_words(result)

In [32]:
bigram_features = finder.nbest(bigram_measures.chi_sq, 500)

In [33]:
# define features that include words as before 
# add the most frequent significant bigrams
# this function takes the list of words in a document as an argument and returns a feature dictionary
# it depends on the variables word_features and bigram_features
def bigram_document_features(document, word_features, bigram_features):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)    
    return features

In [34]:
# use this function to create feature sets for all sentences
bigram_featuresets = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in res]

In [35]:
# number of features for document 0
print(len(bigram_featuresets[0][0].keys()))

3500


In [36]:
# features in document 0
print(bigram_featuresets[0][0])



In [37]:
# train a classifier and report accuracy
train_set, test_set = bigram_featuresets[100:], bigram_featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.95

## POS Tag Featutres

In [38]:
# this function takes a document list of words and returns a feature dictionary
# it runs the default pos tagger (the Stanford tagger) on the document
#   and counts 4 types of pos tags to use as features
def POS_features(document, word_features):
    document_words = set(document)
    tagged_words = nltk.pos_tag(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    numNoun = 0
    numVerb = 0
    numAdj = 0
    numAdverb = 0
    for (word, tag) in tagged_words:
        if tag.startswith('N'): numNoun += 1
        if tag.startswith('V'): numVerb += 1
        if tag.startswith('J'): numAdj += 1
        if tag.startswith('R'): numAdverb += 1
    features['nouns'] = numNoun
    features['verbs'] = numVerb
    features['adjectives'] = numAdj
    features['adverbs'] = numAdverb
    return features

In [39]:
# define feature sets using this function
POS_featuresets = [(POS_features(d, word_features), c) for (d, c) in res]
# number of features for document 0
print(len(POS_featuresets[0][0].keys()))

3004


In [40]:
# the first sentence
print(res[0])
# the pos tag features for this sentence
print('num nouns', POS_featuresets[0][0]['nouns'])
print('num verbs', POS_featuresets[0][0]['verbs'])
print('num adjectives', POS_featuresets[0][0]['adjectives'])
print('num adverbs', POS_featuresets[0][0]['adverbs'])


(['Subject', ':', 'i', 'put', 'in', '35000', 'for', 'the', 'expected', 'volume', 'for', 'the', 'sale', 'to', 'sds', '/', 'tufco', '.', 'sitara', '276366'], 'ham')
num nouns 6
num verbs 2
num adjectives 2
num adverbs 0


## NLTK Naïve Bayes classifier to train and test a classifier 

In [41]:
# train and test the classifier
train_set, test_set = POS_featuresets[100:], POS_featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)


0.95

## Cross Validating

In [42]:
## cross-validation ##
# this function takes the number of folds, the feature sets
# it iterates over the folds, using different sections for training and testing in turn
#   it prints the accuracy for each fold and the average accuracy at the end
def cross_validation_accuracy(num_folds, featuresets):
    subset_size = int(len(featuresets)/num_folds)
    print('Each fold size:', subset_size)
    accuracy_list = []
    # iterate over the folds
    for i in range(num_folds):
        test_this_round = featuresets[(i*subset_size):][:subset_size]
        train_this_round = featuresets[:(i*subset_size)] + featuresets[((i+1)*subset_size):]
        # train using train_this_round
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        # evaluate against test_this_round and save accuracy
        accuracy_this_round = nltk.classify.accuracy(classifier, test_this_round)
        print (i, accuracy_this_round)
        accuracy_list.append(accuracy_this_round)
    # find mean accuracy over all rounds
    print ('mean accuracy', sum(accuracy_list) / num_folds)

In [64]:
num_folds = 5
cross_validation_accuracy(num_folds, featuresets)

Each fold size: 400
0 0.96
1 0.98
2 0.96
3 0.9675
4 0.96
mean accuracy 0.9654999999999999


In [44]:
num_folds = 5
cross_validation_accuracy(num_folds, bigram_featuresets)

Each fold size: 400
0 0.96
1 0.98
2 0.96
3 0.9675
4 0.96
mean accuracy 0.9654999999999999


## Comparing Cross Validation of POS_featuresets

In [45]:
num_folds = 5
cross_validation_accuracy(num_folds, POS_featuresets)

Each fold size: 400
0 0.9575
1 0.98
2 0.9625
3 0.97
4 0.9625
mean accuracy 0.9665000000000001


In [46]:
num_folds = 10
cross_validation_accuracy(num_folds, POS_featuresets)

Each fold size: 200
0 0.96
1 0.95
2 0.98
3 0.98
4 0.975
5 0.955
6 0.98
7 0.96
8 0.96
9 0.96
mean accuracy 0.966


In [47]:
goldlist = []
predictedlist = []
for (features, label) in test_set:
    	goldlist.append(label)
    	predictedlist.append(classifier.classify(features))

In [49]:
# look at the first 50 examples
print(goldlist[:50])
print(predictedlist[:50])

['ham', 'spam', 'ham', 'spam', 'spam', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham']
['ham', 'ham', 'ham', 'spam', 'spam', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham']


## Confusion Matrix

In [50]:
cm = nltk.ConfusionMatrix(goldlist, predictedlist)
print(cm.pretty_format(sort_by_count=True, truncate=9))


     |     s |
     |  h  p |
     |  a  a |
     |  m  m |
-----+-------+
 ham |<47> 4 |
spam |  1<48>|
-----+-------+
(row = reference; col = test)



In [51]:
# or show the results as percentages
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

     |             s |
     |      h      p |
     |      a      a |
     |      m      m |
-----+---------------+
 ham | <47.0%>  4.0% |
spam |   1.0% <48.0%>|
-----+---------------+
(row = reference; col = test)



## Precision, Recall and F1 Score

In [52]:
# Function to compute precision, recall and F1 for each label
#  and for any number of labels
# Input: list of gold labels, list of predicted labels (in same order)
# Output:  prints precision, recall and F1 for each label
def eval_measures(gold, predicted):
    # get a list of labels
    labels = list(set(gold))
    # these lists have values for each label 
    recall_list = []
    precision_list = []
    F1_list = []
    for lab in labels:
        # for each label, compare gold and predicted lists and compute values
        TP = FP = FN = TN = 0
        for i, val in enumerate(gold):
            if val == lab and predicted[i] == lab:  TP += 1
            if val == lab and predicted[i] != lab:  FN += 1
            if val != lab and predicted[i] == lab:  FP += 1
            if val != lab and predicted[i] != lab:  TN += 1
        # use these to compute recall, precision, F1
        recall = TP / (TP + FP)
        precision = TP / (TP + FN)
        recall_list.append(recall)
        precision_list.append(precision)
        F1_list.append( 2 * (recall * precision) / (recall + precision))

    # the evaluation measures in a table with one row per label
    print('\tPrecision\tRecall\t\tF1')
    # print measures for each label
    for i, lab in enumerate(labels):
        print(lab, '\t', "{:10.3f}".format(precision_list[i]), \
          "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))

In [53]:
# call the function with our data
eval_measures(goldlist, predictedlist)

	Precision	Recall		F1
spam 	      0.980      0.923      0.950
ham 	      0.922      0.979      0.949


In [54]:
from nltk.classify import SklearnClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC, NuSVC

In [55]:
#support vector classification
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(train_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, test_set))*100)

SVC_classifier accuracy percent: 68.0


In [56]:
#kernal parameter is set to linear
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(train_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, test_set))*100)



LinearSVC_classifier accuracy percent: 96.0


In [57]:
#SVC with lower and upper bounds for margin error
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(train_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, test_set))*100)

NuSVC_classifier accuracy percent: 91.0
