# MAT 345 Spam Filter
#### Joanna Li

## 1) Downloading the data set (Set A was used)

In [1]:
# import all needed libraries
import pandas as pd
import tarfile
import urllib.request
import os
import email
import shutil
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# url info for set A data
Download_URL = 'http://spamassassin.apache.org/old/publiccorpus/'
Easy_Ham_URL = Download_URL + '20021010_easy_ham.tar.bz2'
Hard_Ham_URL = Download_URL+ '20021010_hard_ham.tar.bz2'
Spam_URL = Download_URL + '20021010_spam.tar.bz2'

# paths for folders where the emails would be placed
Base_Path = 'training_set'
Spam_Path = os.path.join(Base_Path, 'spam')
Hard_Ham_Path = os.path.join(Base_Path, 'hard_ham')
Easy_Ham_Path = os.path.join(Base_Path, 'easy_ham')
Testing_Set_Path = 'testing_set'

# list of all the ham and spam emails
hamEmails = []
spamEmails = []

# dataframe containing the testing set
testingData = pd.DataFrame()

'''
Brief: Helps parse the email
Param: filepath
        - path where the file to be read is
Return: The subject line parsed from the email
'''
def ParseEmail(filePath):
    with open(filePath, 'rb') as file:
        msg = email.message_from_bytes(file.read())

    # Read the subject line only
    subject = msg.get('Subject')
    
    return subject


In [3]:
'''
Brief: Helps fetch all the data from the url and extracts them to their own folders
'''
def ExtractData():

    # make folder to place all the emails
    if not os.path.isdir(Base_Path):
        os.makedirs(Base_Path)

    # extract all the tar files into their own folder
    for filename, url in (('spam.tar.bz2', Spam_URL), ('easy_ham.tar.bz2', Easy_Ham_URL), ('hard_ham.tar.bz2', Hard_Ham_URL)):
        path = os.path.join(Base_Path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)

        tarFile = tarfile.open(path)
        tarFile.extractall(path = Base_Path)
        tarFile.close()

In [4]:
'''
Brief: Helps append the emails in the directory given to the list specified
          Also helpes separate the emails into testing and training sets
Param:
    - dirPath = path where the folder is located
    - list = list to append the email to
'''
def EmailToData(dirPath, list):
    # make folder for the testing set
    if not os.path.isdir(Testing_Set_Path):
        os.makedirs(Testing_Set_Path)

    index = 1
    for file in os.listdir(dirPath):
        # place every 4th message into the testing set folder
        if index == 4:
            filePath = f'{dirPath}/{file}'

            # remove file if already exists
            if os.path.exists(os.path.join(Testing_Set_Path, file)):
                os.remove(os.path.join(Testing_Set_Path, file))

            # move email to the testing set folder
            shutil.move(filePath, Testing_Set_Path)

            # parse the email and check if its spam or ham
            parsed = ParseEmail(os.path.join(Testing_Set_Path, file))
            label = 'ham'
            if(dirPath == Spam_Path):
                label = 'spam'

            currEmail = pd.DataFrame({'Subject': [parsed], 'Label': [label]})

            # add the email to the testing set
            global testingData
            testingData = pd.concat([testingData, currEmail], ignore_index = True)

            index = 0
        else:
            # leave rest in training set folder and add to the appropriate list (hamEmails or spamEmails)
            filePath = f'{dirPath}/{file}'
            parsed = ParseEmail(filePath)
            list.append(parsed)

        index = index + 1
        

In [5]:
'''
Brief: Helps clean up messages by converting everything to lowercase, removing punctuations, 
          numbers, words with numbers (a1, b12, etc.), and stop words (a, at, the, etc)
Param: 
          - data = set to clean all the messages (subject line) for
'''
def CleanMessage(data):
    # make everything lowercase
    data['Subject'] = data['Subject'].str.lower()

    # remove punctuations and words with numbers in them and numbers
    data['Subject'] = data['Subject'].str.replace('\W', ' ', regex = True)
    data['Subject'] = data['Subject'].str.replace(r'\w*\d\w*','', regex = True)

    # remove stop words
    stopWords = set(stopwords.words('english'))
    data['Subject'] = data.apply(lambda row: word_tokenize(row['Subject']), axis=1)
    data['Subject'] = data['Subject'].apply(lambda x: [item for item in x if item not in stopWords])

In [6]:
def SetUpData():
    # extracts the data from the files and put them into the correct list
    ExtractData()
    EmailToData(Spam_Path, spamEmails)
    EmailToData(Easy_Ham_Path, hamEmails)
    EmailToData(Hard_Ham_Path, hamEmails)

    # spam dataframe
    spamData = pd.DataFrame()
    spamData['Subject'] = spamEmails
    spamData['Label'] = 'spam'

    # ham dataframe
    hamData = pd.DataFrame()
    hamData['Subject'] = hamEmails
    hamData['Label'] = 'ham'

    # convert the values to needed type
    spamData['Subject'] = spamData['Subject'].astype(str)
    hamData['Subject'] = hamData['Subject'].astype(str)
    testingData['Subject'] = testingData['Subject'].astype(str)

    trainingData = pd.concat([spamData, hamData], ignore_index = True)
    return trainingData, testingData

In [7]:
trainingSet, testingSet = SetUpData()

#### What the testing set and training set currently looks like:

In [8]:
testingSet

Unnamed: 0,Subject,Label
0,Guaranteed to lose 10-12 lbs in 30 days ...,spam
1,"FORTUNE 500 COMPANY HIRING, AT HOME REPS.",spam
2,^^^^^Cell Phone Belt Clips $1.95^^^^^^ ...,spam
3,MSNBC: Rates Hit 35 year Low 4.75% ...12304,spam
4,A CRY FOR HELP,spam
...,...,...
819,"Core Java Technologies Tech Tips, September 10...",ham
820,You're signed up for the Reich Report!,ham
821,GOVERNMENT REGULATION IS KILLING THE STOCK MARKET,ham
822,[SQL] Database joins,ham


In [9]:
trainingSet

Unnamed: 0,Subject,Label
0,,spam
1,Life Insurance - Why Pay More?,spam
2,[ILUG] Guaranteed to lose 10-12 lbs in 30 days...,spam
3,Re: Fw: User Name & Password to Membership To ...,spam
4,[ILUG-Social] re: Guaranteed to lose 10-12 lbs...,spam
...,...,...
2473,[NOVICE] pl/pgsql and returns timestamp type,ham
2474,Geocaching.com Weekly Cache Notification,ham
2475,Securing multiple virtual hosts,ham
2476,Linux-Announce Digest #180,ham


In [10]:
'''
Brief: Creates the list of words to test against
Return: The list of words created
'''
def createVocab():
    CleanMessage(trainingSet)
    
    # the list of words
    vocab = []

    for line in trainingSet['Subject']:
        for word in line:
            vocab.append(word)

    # make sure there is no duplicate words
    vocab = list(set(vocab))
    return vocab

In [11]:
vocab = createVocab()

### Training Set after removing stop words like "a", "at", "the", and etc.

In [12]:
trainingSet

Unnamed: 0,Subject,Label
0,[none],spam
1,"[life, insurance, pay]",spam
2,"[ilug, guaranteed, lose, lbs, days]",spam
3,"[fw, user, name, password, membership, sites, ...",spam
4,"[ilug, social, guaranteed, lose, lbs, days]",spam
...,...,...
2473,"[novice, pl, pgsql, returns, timestamp, type]",ham
2474,"[geocaching, com, weekly, cache, notification]",ham
2475,"[securing, multiple, virtual, hosts]",ham
2476,"[linux, announce, digest]",ham


In [13]:
# Number of times word appears in the subject line
wordCountPerMess = {unique_word: [0] * len(trainingSet['Subject']) for unique_word in vocab}
for index, sms in enumerate(trainingSet['Subject']):
    for word in sms:
        wordCountPerMess[word][index] += 1

In [14]:
wordCounts = pd.DataFrame(wordCountPerMess)

In [15]:
# add the words count to the training set to form one dataframe
trainingSetClean = pd.concat([trainingSet, wordCounts], axis=1)
trainingSetClean.head()

Unnamed: 0,Subject,Label,young,beta,survey,nessus,back,wins,cartridges,blood,...,files,bishop,notebooks,formatted,skills,updated,recreations,lovers,folks,forty
0,[none],spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[life, insurance, pay]",spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"[ilug, guaranteed, lose, lbs, days]",spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[fw, user, name, password, membership, sites, ...",spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"[ilug, social, guaranteed, lose, lbs, days]",spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Naive Bayes

In [16]:
# total spam and ham messages
spamMess = trainingSetClean[trainingSetClean['Label'] == 'spam']
hamMess = trainingSetClean[trainingSetClean['Label'] == 'ham']

In [17]:
# P(spam) and P(ham)
pSpam = len(spamMess) / len(trainingSetClean)
pHam = len(hamMess) / len(trainingSetClean)

In [18]:
# smoothing
alpha = 1
beta = 2

In [19]:
# probabilies for each word 
probabilitesSpam = {unique_word:0 for unique_word in vocab}
probabilitesHam = {unique_word:0 for unique_word in vocab}

# probabilites for each word with smoothing
for word in vocab:
    # num of spam containing the word
    spamWithWord = spamMess[word].sum() 
    pWordGivenSpam = (spamWithWord + alpha) / (beta + len(spamMess))
    probabilitesSpam[word] = pWordGivenSpam
    
    # num of ham containing the word
    hamWithWord = hamMess[word].sum() 
    pWordGivenSpam = (hamWithWord + alpha) / (beta + len(hamMess))
    probabilitesHam[word] = pWordGivenSpam
    

In [20]:
# sort the result by the probability values calculated, output the top 5 Spammiest words
spamRes = sorted(probabilitesSpam.items(), key = lambda x:x[1], reverse=True)[:5]
print('Top 5 Spammiest words: ')
[print(i) for i in spamRes]

Top 5 Spammiest words: 
('ilug', 0.10317460317460317)
('free', 0.06878306878306878)
('get', 0.04497354497354497)
('adv', 0.042328042328042326)
('money', 0.03968253968253968)


[None, None, None, None, None]

In [21]:
# sort the result by the probability values calculated, output the top 5 Hammiest words
hamRes = sorted(probabilitesHam.items(), key = lambda x:x[1], reverse=True)[:5]
print('Top 5 Hammiest words')
[print(i) for i in hamRes]

Top 5 Hammiest words
('spambayes', 0.051330798479087454)
('satalk', 0.05038022813688213)
('razor', 0.04800380228136882)
('new', 0.0408745247148289)
('users', 0.03897338403041825)


[None, None, None, None, None]

# Testing filter with testing set

In [22]:
'''
Brief : Classifies the message passed as spam or ham
Param: 
            - message = line to be classified
Return: Ham or Spam
'''
def classify(message):
    pSpamGivenMess = pSpam
    pHamGivenMess = pHam
    
    for word in message:
        
        # P(X = a | Spam)
        if word in probabilitesSpam:
            pSpamGivenMess *= probabilitesSpam[word]
            
        # P(X = a | Ham)
        if word in probabilitesHam:
            pHamGivenMess *= probabilitesHam[word]
    
    # P(spam | X = a) = ( P(X = a | spam) P(spam) ) / ( (P(X = a | spam) P(spam)) + ( P(X = a | Ham) P(Ham) )
    result = pSpamGivenMess / (pSpamGivenMess + pHamGivenMess)
    
    # Threshold for the probaility, if more then 50% then classify it as spam
    if result > 0.5:
        return 'spam'
    else:
        return 'ham'

In [23]:
CleanMessage(testingSet)
testingSet['Predicted'] = testingSet['Subject'].apply(classify)

### Testing set after going through the spam filter created
- new column, Predicted, indicates what the spam filter labeled it as 

In [24]:
testingSet

Unnamed: 0,Subject,Label,Predicted
0,"[guaranteed, lose, lbs, days]",spam,spam
1,"[fortune, company, hiring, home, reps]",spam,spam
2,"[cell, phone, belt, clips]",spam,spam
3,"[msnbc, rates, hit, year, low]",spam,spam
4,"[cry, help]",spam,ham
...,...,...,...
819,"[core, java, technologies, tech, tips, septemb...",ham,ham
820,"[signed, reich, report]",ham,ham
821,"[government, regulation, killing, stock, market]",ham,spam
822,"[sql, database, joins]",ham,spam


# Accuracy

In [25]:
total = testingSet.shape[0]
correct = 0

for row in testingSet.iterrows():
    row = row[1]
    if row['Label'] == row['Predicted']:
        correct += 1

print('Accuracy:', correct/total)

Accuracy: 0.8094660194174758


# Precision

In [26]:
spamPredicted = 0
correctSpam = 0

for row in testingSet.iterrows():
    row = row[1]
    if row['Predicted'] == 'spam':
        spamPredicted += 1
        
        if row['Label'] == row['Predicted']:
            correctSpam += 1

print('Precision:', correctSpam/spamPredicted)

Precision: 0.4322033898305085


# Recall

In [27]:
totalSpam = 0
correctSpam = 0

for row in testingSet.iterrows():
    row = row[1]
    if row['Label'] == 'spam':
        totalSpam += 1
        
        if row['Label'] == row['Predicted']:
            correctSpam += 1

print('Recall:', correctSpam / totalSpam)

Recall: 0.816
