In [42]:
import pandas as pd
import re

In [43]:
sms_spam = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['Label', 'SMS'])
print(sms_spam.shape)
sms_spam.head()

(5572, 2)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Here, we have 5572 SMS, each labelled 'ham' (non spam) or 'spam'. 

In [44]:
# Let's split the dataset into training set and test set, but first, let's randomize them
data_randomize = sms_spam.sample(frac=1, random_state=1)

# Index for split
training_set_index = round(len(data_randomize) * 0.8)

training_set = data_randomize[:training_set_index].reset_index(drop=True)
testing_set = data_randomize[training_set_index:].reset_index(drop=True)

# reset index adds ano

print(training_set.shape)
print(testing_set.shape)

(4458, 2)
(1114, 2)


In [45]:
print(training_set.head())
print(testing_set.head())

  Label                                                SMS
0   ham                       Yep, by the pretty sculpture
1   ham      Yes, princess. Are you going to make me moan?
2   ham                         Welp apparently he retired
3   ham                                            Havent.
4   ham  I forgot 2 ask ü all smth.. There's a card on ...
  Label                                                SMS
0   ham          Later i guess. I needa do mcat study too.
1   ham             But i haf enuff space got like 4 mb...
2  spam  Had your mobile 10 mths? Update to latest Oran...
3   ham  All sounds good. Fingers . Makes it difficult ...
4   ham  All done, all handed in. Don't know if mega sh...


In [46]:
training_set['Label'].value_counts(normalize=True)

ham     0.86541
spam    0.13459
Name: Label, dtype: float64

Here we see that about 87% of messages in training set are ham and the remaining 13% are spam

# Naive Bayes


$$
P(spam | w_1, w_2, ... , w_n) \propto P(spam) \prod_{i=1}^{n} P(w_i | spam)
$$

$$
P(ham | w_1, w_2, ... , w_n) \propto P(ham) \prod_{i=1}^{n} P(w_i | ham)
$$

If $P(spam | w_1, w_2, ... , w_n)$ is greater than $P(ham | w_1, w_2, ... , w_n)$ then the message is spam \\


Each term in the product is calculated as follows:

$$
P(w_i | spam) = \frac{N_{w_i | spam} + \alpha}{N_{spam} + \alpha \cdot N_{vocabulary}}
$$

$$
P(w_i | ham) = \frac{N_{w_i | ham} + \alpha}{N_{ham} + \alpha \cdot N_{vocabulary}}
$$


$N_{w_i | spam} =$ the number of times the word w_i appears in spam messages

$N_{w_i | ham} =$ the number of times the word w_i appears in ham messages  

$N_{spam} =$ total number of spam words

$N_{ham} =$ total number of ham words

$N_{vocabulary} =$ total number of words in the vocabulary

$\alpha = 1$, the laplace smoothing parameter

We need to reformat and clean the data for further processing

In [47]:
# lets remove punctuation and convert all words to lowercase
training_set['SMS'] = training_set['SMS'].str.replace(
    '\W', ' ', regex=True) #using regex to remove a non word character (\W is a non-word character)
training_set['SMS'] = training_set['SMS'].str.lower()
training_set.head(3)

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired


In [48]:
training_set['SMS'] = training_set['SMS'].str.split() #split the string by space to a list

vocabulary = []
for sms in training_set['SMS']:
    for word in sms:
        vocabulary.append(word) #add every word to vocabulary

vocabulary = list(set(vocabulary)) #remove duplicates

In [49]:
print(len(vocabulary))

7783


There are 7783 unique words in our vocabulary

In [50]:
word_counts_per_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}


for index, sms in enumerate(training_set['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

Here, `word_counts_per_sms` is a dictionary, in which the keys are each unique word in the dictionary and the value of each key is a list of length equal to size of training set. $i^{th}$ item of that list is the number of occurence of that word in the $i^{th}$ SMS.

In [51]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,incident,shd,den,voucher,pattern,steps,wkend,wonderful,slow,kavalan,...,sticky,countin,door,coccooning,thy,8027,3650,je,intend,landlineonly
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Let's concatenate this dataframe to the training set

In [52]:
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,Label,SMS,incident,shd,den,voucher,pattern,steps,wkend,wonderful,...,sticky,countin,door,coccooning,thy,8027,3650,je,intend,landlineonly
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# Isolating the ham and spam messages first
spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']
ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']

# P(spam) and P(ham)
p_spam = len(spam_messages) / len(training_set_clean)
p_ham = len(ham_messages) / len(training_set_clean)

# N_spam: total number of words in spam messages
n_words_per_spam_message = spam_messages['SMS'].apply(len)
n_spam = n_words_per_spam_message.sum()

n_words_per_ham_message = ham_messages['SMS'].apply(len)
n_ham = n_words_per_ham_message.sum()

# N_vocabulary
n_vocabulary = len(vocabulary)

#Laplace smoothing
alpha = 1

Now, let's calculate parameters

In [54]:
# Initialize parameters
parameters_spam = {unique_word: 0 for unique_word in vocabulary}
parameters_ham = {unique_word: 0 for unique_word in vocabulary}

# Calculate the parameters
for word in vocabulary:
    n_word_given_spam = spam_messages[word].sum()
    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha * n_vocabulary)
    parameters_spam[word] = p_word_given_spam
    
    n_word_given_ham = ham_messages[word].sum()
    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha * n_vocabulary)
    parameters_ham[word] = p_word_given_ham

Our model is complete. Next, we will build a classify function, and we will test the accuracy of our model using  `testing_set`

In [55]:
def classify (message):
    """message: a string"""
    message = re.sub('\W', ' ', message) #remove non word characters
    message = message.lower().split() # lowercase and make list by splitting using spaces
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham #prior probabilities
    
    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
        
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
    
    print("P (Spam | message): ", p_spam_given_message)
    print("P (Ham | message): ", p_ham_given_message)
    
    if p_ham_given_message > p_spam_given_message:
        print("Label: Ham")
    elif p_ham_given_message < p_spam_given_message:
        print("Label: Spam")
    else:
        print("Have a human classify this!")

In [56]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P (Spam | message):  1.3481290211300841e-25
P (Ham | message):  1.9368049028589875e-27
Label: Spam


In [57]:
classify("Sounds good, Tom, then see u there")

P (Spam | message):  2.4372375665888117e-25
P (Ham | message):  3.687530435009238e-21
Label: Ham


There you go! it classified well for these two messages. Now we will feed the algorithm the entire testing set to see its accuracy

In [58]:
def classify_test_set(message):
    """message: a string"""
    
    message = re.sub('\W', ' ', message)
    message = message.lower().split()
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    
    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
    
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [59]:
testing_set['predicted'] = testing_set['SMS'].apply(classify_test_set)
testing_set.head()

Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


In [60]:
# calculating accuracy
correct = 0
total = testing_set.shape[0]

for row in testing_set.iterrows():
    row = row[1]
    if row['Label'] == row['predicted']:
        correct += 1

print("Correct: ", correct)
print("Incorrect: ", total - correct)
print("Accuracy: ", (correct*100 / total), "%")

Correct:  1100
Incorrect:  14
Accuracy:  98.74326750448833 %


That's about 99% accurate! Hurray!