# Spam Filter with Naive Bayes

## Author: Salvatore Porcheddu
## Date: 2021-08-12

# Step 1: Exploring the Dataset (source: [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection))

In [1]:
import pandas as pd
import numpy as np

SMS_df = pd.read_csv("SMSSpamCollection", header=None, sep="\t", names=["Label", "SMS"])

SMS_df.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
print(SMS_df.shape)
SMS_df.Label.value_counts(normalize=True)

(5572, 2)


ham     0.865937
spam    0.134063
Name: Label, dtype: float64

# Step 2: Training and Test Set

In [3]:
# randomizing and splitting data into train and test set

randomized_sms = SMS_df.sample(frac=1, random_state=1)

train = randomized_sms.iloc[:4458, :].copy()
test = randomized_sms.iloc[4458:, :].copy()

train.reset_index(inplace=True, drop=True)
test.reset_index(inplace=True, drop=True)

print(train.head())
print(test.head())

  Label                                                SMS
0   ham                       Yep, by the pretty sculpture
1   ham      Yes, princess. Are you going to make me moan?
2   ham                         Welp apparently he retired
3   ham                                            Havent.
4   ham  I forgot 2 ask ü all smth.. There's a card on ...
  Label                                                SMS
0   ham          Later i guess. I needa do mcat study too.
1   ham             But i haf enuff space got like 4 mb...
2  spam  Had your mobile 10 mths? Update to latest Oran...
3   ham  All sounds good. Fingers . Makes it difficult ...
4   ham  All done, all handed in. Don't know if mega sh...


In [4]:
train_perc = train.shape[0] / randomized_sms.shape[0] * 100
test_perc = test.shape[0] / randomized_sms.shape[0] * 100

print(f"train: {train_perc}% of the original dataset")
print(f"test: {test_perc}% of the original dataset\n")

print(train["Label"].value_counts(normalize=True))
print(test["Label"].value_counts(normalize=True))

train: 80.00717875089734% of the original dataset
test: 19.992821249102658% of the original dataset

ham     0.86541
spam    0.13459
Name: Label, dtype: float64
ham     0.868043
spam    0.131957
Name: Label, dtype: float64


# Step 3: Letter Case and Punctuation 

In [5]:
# converting all text into lowercase and removing punctuation

train["SMS"] = train["SMS"].str.replace("\W", " ").str.lower()   # \W represents any character which is not from a-z, A-Z and 0-9
test["SMS"] = test["SMS"].str.replace("\W", " ").str.lower()

print(train.head())
print(test.head())

  Label                                                SMS
0   ham                       yep  by the pretty sculpture
1   ham      yes  princess  are you going to make me moan 
2   ham                         welp apparently he retired
3   ham                                            havent 
4   ham  i forgot 2 ask ü all smth   there s a card on ...
  Label                                                SMS
0   ham          later i guess  i needa do mcat study too 
1   ham             but i haf enuff space got like 4 mb   
2  spam  had your mobile 10 mths  update to latest oran...
3   ham  all sounds good  fingers   makes it difficult ...
4   ham  all done  all handed in  don t know if mega sh...


  train["SMS"] = train["SMS"].str.replace("\W", " ").str.lower()   # \W represents any character which is not from a-z, A-Z and 0-9
  test["SMS"] = test["SMS"].str.replace("\W", " ").str.lower()


# Step 4: Creating the Vocabulary

In [6]:
train["SMS"] = train["SMS"].str.split()

vocabulary = []     # this will contain all unique words that appear in the SMS column of the training set

for sms in train["SMS"]:
    for word in sms:
        vocabulary.append(word)
        
vocabulary = set(vocabulary)    # this removes duplicates
vocabulary = list(vocabulary)   # final result
        
len(vocabulary)

7783

# Step 5: Train Set Bag of Words Representation

In [7]:
word_counts_per_sms = {word: [0] * train.shape[0] for word in vocabulary}   # dictionary with a key for each unique word and array of zeros of train.shape[0] length

for index, sms in enumerate(train["SMS"]):    # update the dictionary with the counts of each word for each sms (row)
    for word in sms:
        word_counts_per_sms[word][index] += 1
        
word_counts_per_sms["prize"]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,


In [8]:
word_counts_per_sms = pd.DataFrame(word_counts_per_sms)
train_counts = pd.concat([train, word_counts_per_sms], axis=1)   # concatenate BoW representation with train set

train_counts.head()

Unnamed: 0,Label,SMS,carpark,quickly,stones,blankets,appropriate,inlude,sem,woods,...,pub,walls,incredible,warm,afford,89693,prestige,song,maruti,nowadays
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Step 6: Calculating NB constants 

The constants are p(spam), p(ham), n(spam), n(ham), n(vocabulary)

In [9]:
perc = train["Label"].value_counts(normalize=True)

p_ham, p_spam = perc[0], perc[1]

In [10]:
def counter(label):
    
    '''This function counts the number of words associated with
    the given label'''
    
    filtered = train[train["Label"] == label]
    count = 0
    for row in filtered["SMS"]:
        count += len(row)
    return count

In [11]:
n_ham = counter("ham")
n_spam = counter("spam")

print(n_ham)
print(n_spam)

57237
15190


In [12]:
n_vocab = len(vocabulary)
alpha = 1   # smoothing parameter

print(n_vocab)

7783


## Step 7: Calculating Parameters (p_w_ham, p_w_spam)

In [13]:
p_w_spam_dict = {word: 0 for word in vocabulary}
p_w_ham_dict = {word: 0 for word in vocabulary}

train_ham = train[train["Label"] == "ham"]
train_spam = train[train["Label"] == "spam"]

for word in vocabulary:
    n_w_ham = 0   # this variable will measure how many times the word will appear in ham messages
    n_w_spam = 0   
    for row in train_ham["SMS"]:
        for w in row:
            if w == word:
                n_w_ham += 1
    for row in train_spam["SMS"]:
        for w in row:
            if w == word:
                n_w_spam += 1
    p_w_ham = (n_w_ham + alpha) / (n_ham + (alpha * n_vocab))
    p_w_spam = (n_w_spam + alpha) / (n_spam + (alpha * n_vocab))
    
    p_w_ham_dict[word] = p_w_ham
    p_w_spam_dict[word] = p_w_spam
    
print(p_w_spam_dict["other"])   # example of probability

0.00021764680276846734


# Step 8: Classifying a new message

In [14]:
import re

def classify(message):
    
    '''This function takes a message as a string and classifies it as ham or spam;
    the function will notify the user if it cannot make a classification (because the probabilities 
    for ham and spam are the same)'''
    
    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()

    p_spam_given_message = p_spam 
    p_ham_given_message = p_ham 
    
    for word in message:
        if word in p_w_spam_dict.keys():
            p_spam_given_message *= p_w_spam_dict[word] 
        if word in p_w_ham_dict.keys():
            p_ham_given_message *= p_w_ham_dict[word]

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [15]:
print(classify("WINNER!! This is the secret code to unlock the money: C3421."))
print(classify("Sounds good, Tom, then see u there"))

P(Spam|message): 1.3481290211300841e-25
P(Ham|message): 1.9368049028589875e-27
Label: Spam
None
P(Spam|message): 2.4372375665888117e-25
P(Ham|message): 3.687530435009238e-21
Label: Ham
None


# Step 9: Measuring the Spam Filter's Accuracy

In [16]:
def classify_test_set(message):
    
    '''This function takes a message as a string and returns its labels (spam or ham) INSTEAD OF
    PRINTING THEM; the function will notify the user if it cannot make a classification 
    (because the probabilities for ham and spam are the same).'''
    
    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()

    p_spam_given_message = p_spam 
    p_ham_given_message = p_ham 
    
    for word in message:
        if word in p_w_spam_dict.keys():
            p_spam_given_message *= p_w_spam_dict[word] 
        if word in p_w_ham_dict.keys():
            p_ham_given_message *= p_w_ham_dict[word]

    if p_ham_given_message > p_spam_given_message:
        return "ham"
    elif p_ham_given_message < p_spam_given_message:
        return "spam"
    else:
        return "needs human classification"

In [17]:
test["Predicted"] = test["SMS"].apply(classify_test_set)

test.head()

Unnamed: 0,Label,SMS,Predicted
0,ham,later i guess i needa do mcat study too,ham
1,ham,but i haf enuff space got like 4 mb,ham
2,spam,had your mobile 10 mths update to latest oran...,spam
3,ham,all sounds good fingers makes it difficult ...,ham
4,ham,all done all handed in don t know if mega sh...,ham


In [18]:
accuracy = np.sum(test["Label"] == test["Predicted"]) / test.shape[0]

print(accuracy)

0.9874326750448833


The spam filter is very efficient in distinguishing relevant messages from spam, with an accuracy of 98.74%.