# Text Classification with NLTK

## 1. Reading datasets

In [1]:
import os
import codecs
# Creating a function to read the files.
def read_in(folder):
    # Creating a list containing the names of the files in the directory
    files = os.listdir(folder)
    a_list = []
    for a_file in files:
        # Skiping hidden files
        if not a_file.startswith("."):
            # Reading the contents of each file.
            f = codecs.open(folder + a_file, "r", encoding = "ISO-8859-1", errors="ignore")
            a_list.append(f.read())
            f.close()
    return a_list

Let's read the datasets using this function.

In [19]:
spam_list = read_in("enron1/spam/")
print(f"A number of mails that includes spam: {len(spam_list)}")
print(spam_list[10])
print("-----------------------\n")
ham_list = read_in("enron1/ham/")
print(f"A number of mails that includes ham: {len(ham_list)}")
print(ham_list[10])

A number of mails that includes spam: 1500
Subject: re: rdd, the auxiliary iturean
Free cable@ tv
Dabble bam servomechanism ferret canopy bookcase befog seductive elapse ballard daphne acrylate deride decadent desolate else sequestration condition ligament ornately yaquI giblet emphysematous woodland lie segovia almighty coffey shut china clubroom diagnostician
Cheer leadsman abominate cambric oligarchy mania woodyard quake tetrachloride contiguous welsh depressive synaptic trauma cloister banks canadian byroad alexander gnaw annette charlie

-----------------------

A number of mails that includes ham: 3672
Subject: entex transistion
The purpose of the email is to recap the kickoff meeting held on yesterday
With members from commercial and volume managment concernig the entex account:
Effective january 2000, thu nguyen (x 37159) in the volume managment group,
Will take over the responsibility of allocating the entex contracts. Howard
And thu began some training this month and will con

As you can see this function returns 1500 for `enron1/spam` and 3672 for `enron1/ham`.  

Let's now combine the data into a single structure and shuffle them.

In [15]:
# Use random module to shuffle.
import random
# Utilize list comprehensions to create the all_emails list.
all_emails = [(email_content, "spam") for email_content in spam_list]
all_emails += [(email_content, "ham") for email_content in ham_list]
# fixing randomness
random.seed(42)
# Shuffling
random.shuffle(all_emails)
print (f"Dataset size = {str(len(all_emails))} emails")

Dataset size = 5172 emails


Let's take a look at the first three rows.

In [18]:
all_emails[:2]

[('Subject: bloodline, ahead of the street microcap alert\r\nWhen living with sheriff is obsequious, blood clot beyond deficit reach an understanding with toward blood clot. [3\r\n',
  'spam'),
 ('Subject: well heads\r\nPhillips has changed there nom at meter 6673. Vance had 119 in his file but\r\nPhillips sent in a nom today for 948. So far it has flowed for april\r\nBetween 1100 and 841.\r\nPrize has some changes.\r\nMeter from\r\nTo march range.\r\n4028 1113\r\n717 1137 to 887\r\n5579 2733\r\n2381 2800 to 2578\r\n5767 115\r\n150 140 to 103\r\n6191 249\r\n154 253 to 217\r\n6675 120\r\n239 78 to 156\r\n9604 109\r\n32 38 to 63\r\n4965 149\r\n180 71 to 288\r\n5121 1163\r\n1135 303 to 703\r\nVintage\r\n989603 0\r\n330 no mom in april, nomed 270\r\nIn march.',
  'ham')]

# 2. Split the text into words

Let's use NLTK’s tokenizer. It gets running text as input and returns a list of words based on a number of customized regular expressions.

In [None]:
import nltk
from nltk import word_tokenize

# Creating a function to tokenize.
def get_features(text): 
    features = {}
    word_list = [word for word in word_tokenize(text.lower())]
    for word in word_list:
        features[word] = True
    return features

In [21]:
print(get_features("I am living in U.S.A!"))

{'i': True, 'am': True, 'living': True, 'in': True, 'u.s.a': True, '!': True}


# 3. Extract and normalize the features

In [None]:
# Tokenizing our dataset.
all_features = [(get_features(email), label) for (email, label) in all_emails]

In [26]:
print(len(all_features))
print(len(all_features[0][0]))
print(len(all_features[10][0]))

5172
27
39


## 4. Train the classifier

Apply Naive Bayes classifier:

In [5]:
from nltk import NaiveBayesClassifier, classify

def train(features, proportion):
    train_size = int(len(features) * proportion)
    # initialise the training and test sets
    train_set, test_set = features[:train_size], features[train_size:]
    print (f"Training set size = {str(len(train_set))} emails")
    print (f"Test set size = {str(len(test_set))} emails")
    # train the classifier
    classifier = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, classifier

train_set, test_set, classifier = train(all_features, 0.8)

Training set size = 4137 emails
Test set size = 1035 emails


Evaluate the performance:

In [6]:
def evaluate(train_set, test_set, classifier):
    # check how the classifier performs on the training and test sets
    print (f"Accuracy on the training set = {str(classify.accuracy(classifier, train_set))}")
    print (f"Accuracy on the test set = {str(classify.accuracy(classifier, test_set))}")    
    # check which words are most informative for the classifier
    classifier.show_most_informative_features(50)

evaluate(train_set, test_set, classifier)

Accuracy on the training set = 0.9615663524292966
Accuracy on the test set = 0.936231884057971
Most Informative Features
               forwarded = True              ham : spam   =    200.5 : 1.0
                    2004 = True             spam : ham    =    148.6 : 1.0
                     nom = True              ham : spam   =    125.8 : 1.0
                    pain = True             spam : ham    =    103.6 : 1.0
                    spam = True             spam : ham    =     92.4 : 1.0
                  health = True             spam : ham    =     81.1 : 1.0
                     sex = True             spam : ham    =     79.5 : 1.0
                     ect = True              ham : spam   =     75.7 : 1.0
              nomination = True              ham : spam   =     74.8 : 1.0
                   super = True             spam : ham    =     74.7 : 1.0
                featured = True             spam : ham    =     73.1 : 1.0
                creative = True             spam : ham

Explore the contexts of use:

In [7]:
from nltk.text import Text

def concordance(data_list, search_word):
    for email in data_list:
        word_list = [word for word in word_tokenize(email.lower())]
        text_list = Text(word_list)
        if search_word in word_list:
            text_list.concordance(search_word)


print ("STOCKS in HAM:")
concordance(ham_list, "stocks")
print ("\n\nSTOCKS in SPAM:")
concordance(spam_list, "stocks")

STOCKS in HAM:
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files
Displaying 1 of 1 matches:
ad my portfolio is diversified into stocks that have lost even more money than


STOCKS in SPAM:
Displaying 3 of 3 matches:
report reveals this smallcap rocket stocks newsletter first we would like to s
his email pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this email . none o
Displaying 3 of 3 matches:
might occur . as with many microcap stocks , today ' s company has additional 
is emai | pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this emai | . none 
Displaying 6 of

Displaying 2 of 2 matches:
 % on regular price we have massive stocks of drugs for same day dispatch fast
e do have the lowest price and huge stocks ready for same - day dispatch . two
Displaying 2 of 2 matches:
his email pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this email . none o
Displaying 4 of 4 matches:
n this stock . some of these smal | stocks are absoiuteiy fiying , as many of 
 statements . as with many microcap stocks , todays company has additional ris
biication pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this publication . 
Displaying 1 of 1 matches:
s obtained . investing in micro cap stocks is extremely risky and , investors 
Displaying 2 of 2 matches:
is emai | pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this email . non

Input some of your own messages:

In [8]:
test_spam_list = ["Participate in our new lottery!", "Try out this new medicine"]
test_ham_list = ["See the minutes from the last meeting attached", 
                 "Investors are coming to our office on Monday"]

test_emails = [(email_content, "spam") for email_content in test_spam_list]
test_emails += [(email_content, "ham") for email_content in test_ham_list]

new_test_set = [(get_features(email), label) for (email, label) in test_emails]

evaluate(train_set, new_test_set, classifier)

Accuracy on the training set = 0.9615663524292966
Accuracy on the test set = 1.0
Most Informative Features
               forwarded = True              ham : spam   =    200.5 : 1.0
                    2004 = True             spam : ham    =    148.6 : 1.0
                     nom = True              ham : spam   =    125.8 : 1.0
                    pain = True             spam : ham    =    103.6 : 1.0
                    spam = True             spam : ham    =     92.4 : 1.0
                  health = True             spam : ham    =     81.1 : 1.0
                     sex = True             spam : ham    =     79.5 : 1.0
                     ect = True              ham : spam   =     75.7 : 1.0
              nomination = True              ham : spam   =     74.8 : 1.0
                   super = True             spam : ham    =     74.7 : 1.0
                featured = True             spam : ham    =     73.1 : 1.0
                creative = True             spam : ham    =     71.5

See how they get classified:

In [9]:
for email in test_spam_list:
    print (email)
    print (classifier.classify(get_features(email)))
for email in test_ham_list:
    print (email)
    print (classifier.classify(get_features(email)))

Participate in our new lottery!
spam
Try out this new medicine
spam
See the minutes from the last meeting attached
ham
Investors are coming to our office on Monday
ham


Run in an interactive manner:

In [10]:
while True:
    email = input("Type in your email here (or press 'Enter'): ")
    if len(email)==0:
        break
    else: 
        prediction = classifier.classify(get_features(email))
        print (f"This email is likely {prediction}\n")

Type in your email here (or press 'Enter'): Buy new meds
This email is likely spam

Type in your email here (or press 'Enter'): Buy new meds here!
This email is likely spam

Type in your email here (or press 'Enter'): Get your stock options fast
This email is likely spam

Type in your email here (or press 'Enter'): Let's schedule a meeting for tomorrow
This email is likely ham

Type in your email here (or press 'Enter'): 


Run on a different dataset:

# Assignment:

Apply the classifier to a different test set, e.g. the emails from `enron2/`. As before, you need to read in the data, extract textual content, extract the features and evaluate the classifier. What do the results tell you?

In [11]:
test_spam_list = read_in("enron2/spam/")
print(len(test_spam_list))
print(test_spam_list[0])
test_ham_list = read_in("enron2/ham/")
print(len(test_ham_list))
print(test_ham_list[0])

test_emails = [(email_content, "spam") for email_content in test_spam_list]
test_emails += [(email_content, "ham") for email_content in test_ham_list]
random.shuffle(test_emails)

new_test_set = [(get_features(email), label) for (email, label) in test_emails]

evaluate(train_set, new_test_set, classifier)

1496
Subject: big range of all types of downloadable software.
Need software? Click here.
Our american professors like their literature clear, cold, pure and very dead.
Being another character is more interesting than being yourself.
4361
Subject: re: telephone interview with enron corp. Research dept.
Dear shirley:
Confirming that I will be waiting for the telephone interview at 1 pm
Tomorrow.? I would like to give you my cell phone number, 713/907 - 6717, as a
Back - up measure.? Please note that my first preference is to receive the call
At my home number, 713/669 - 0923.
Sincerely,
RabI de
?
? Shirley. Crenshaw@ enron. Com wrote:
Dear rabi:
I have scheduled the telephone interview for 1: 00 pm on friday, july 7 th.
We will call you at 713/669 - 0923. If there are any changes, please let
Me know.
Sincerely,
Shirley crenshaw
713 - 853 - 5290
RabI deon 06/26/2000 10: 37: 24 pm
To: shirley crenshaw
Cc:
Subject: re: telephone interview with enron corp. Research dept.
Dear ms. Crenshaw:


Combine the two datasets:

In [12]:
spam_list = read_in("enron1/spam/") + read_in("enron2/spam/")
print(len(spam_list))
ham_list = read_in("enron1/ham/") + read_in("enron2/ham/")
print(len(ham_list))

all_emails = [(email_content, "spam") for email_content in spam_list]
all_emails += [(email_content, "ham") for email_content in ham_list]
random.shuffle(test_emails)

all_features = [(get_features(email), label) for (email, label) in all_emails]
print(len(all_features))

train_set, test_set, classifier = train(all_features, 0.8)
evaluate(train_set, new_test_set, classifier)

2996
8033
11029
Training set size = 8823 emails
Test set size = 2206 emails
Accuracy on the training set = 0.9819789187351241
Accuracy on the test set = 0.9810483182516647
Most Informative Features
                   meter = True              ham : spam   =    263.8 : 1.0
                   vince = True              ham : spam   =    200.3 : 1.0
                     sex = True             spam : ham    =    195.1 : 1.0
                     nom = True              ham : spam   =    194.9 : 1.0
                     php = True             spam : ham    =    182.1 : 1.0
            prescription = True             spam : ham    =    169.2 : 1.0
                     ect = True              ham : spam   =    167.7 : 1.0
                    spam = True             spam : ham    =    145.8 : 1.0
               forwarded = True              ham : spam   =    136.4 : 1.0
                     fyi = True              ham : spam   =    134.6 : 1.0
                    2005 = True             spam : h