In [1]:
#In this DataQuest project, we use the multinomial Naive Bayes algorithm
#together with a dataset of 5k SMS messages (that have already been classified
#as spam or otherwise) to build a spam filter.

In [2]:
#The algorithm tries to answer the following:
#what is the probability that this new message is spam given its content?
#What is the probability that the new message is not spam given its content?

#If the probability for spam is greater, the message is classified as spam.

#The algorithm is called "naive" because we assume conditional independence
#between words in a message to make the calculations tractable for messages
#of all lengths, but this approach is quite "naive" in practice, as words
#are often in a relationship of dependence.

#Despite this assumption, the algorithm still works well in practice and we
#will demonstrate this in this DataQuest project.

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Load the SMS Spam Collection: a tab-separated file with no header row.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# local copy of the dataset before running.
sms = pd.read_csv(
    "C:/Users/spzvl/OneDrive/Documents/Data/SMSSpamCollection",
    sep="\t",
    header=None,
)

In [5]:
# The file has no header row, so name the two columns ourselves:
# the class label and the raw message text.
sms.columns = ["Label", "SMS"]

In [6]:
# Preview the first rows to check that the file was parsed as expected.
sms.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
sms.info() #Dataset has 5572 entries; neither column contains nulls

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   5572 non-null   object
 1   SMS     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [8]:
# Share of each label in the full dataset, expressed as a percentage.
sms["Label"].value_counts(normalize = True)*100

ham     86.593683
spam    13.406317
Name: Label, dtype: float64

In [9]:
#13.4% of the entries are spam in this dataset, while 86.6% are not spam (we call it "ham" here)

In [10]:
#Now that we have a better understanding of how our data looks, we can move on
#to designing our software.
#First, let's plan out how we will test to see that it works.
#We can split the data into the "test" set and "training" set with the ratio
#20/80 respectively. (It is important to have as much training as possible
# but also have enough testing data) 

#We will treat the testing data as unclassified raw data and then compare
#the results of our algorithm with the actual classifications of the messages in the test set.

In [11]:
#First we shuffle the dataset so that spam is mixed with ham randomly,
#and then split it as described above.

In [12]:
# Shuffle every row so that the split below is random and both parts end up
# with roughly the same spam/ham proportions as the full dataset.
# random_state is fixed to make the shuffle (and every later result) reproducible.
data_randomized = sms.sample(frac=1, random_state=1)

# Position of the boundary between the training part (first 80% of the rows)
# and the test part (remaining 20%).
training_test_index = round(0.8 * len(data_randomized))

# Training/Test split: slice by position, then renumber each part from 0.
sms_train = data_randomized.iloc[:training_test_index].reset_index(drop=True)
sms_test = data_randomized.iloc[training_test_index:].reset_index(drop=True)

# Sanity check on the resulting sizes.
print(sms_train.shape)
print(sms_test.shape)

(4458, 2)
(1114, 2)


In [13]:
# Share of each label in the training set, as a percentage.
sms_train["Label"].value_counts(normalize = True)*100

ham     86.54105
spam    13.45895
Name: Label, dtype: float64

In [14]:
# Share of each label in the test set, as a percentage.
sms_test["Label"].value_counts(normalize = True)*100

ham     86.804309
spam    13.195691
Name: Label, dtype: float64

In [15]:
# Preview the shuffled training set before any cleaning is applied.
sms_train.head()

Unnamed: 0,Label,SMS
0,ham,"Yep, by the pretty sculpture"
1,ham,"Yes, princess. Are you going to make me moan?"
2,ham,Welp apparently he retired
3,ham,Havent.
4,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [16]:
#As we can see the proportions of spam/ham in the train and test sets correspond to
#those in our full dataset, indicating that our samples are representative of the data in general.

In [17]:
#Now we need to transform our data so it could be used together with the 
#Bayes algorithm. 
#First we remove all of the punctuation and make all words lowercase.

In [18]:
# Replace every non-word character (punctuation, symbols) with a space.
# - The pattern is a raw string, because "\W" in a normal string literal is an
#   invalid escape sequence.
# - regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
#   literal string by default, which would leave all punctuation in place.
sms_train["SMS"] = sms_train["SMS"].str.replace(r"\W", " ", regex=True)

In [19]:
# Lowercase the text so that e.g. "Free" and "free" count as the same word.
sms_train["SMS"] = sms_train["SMS"].str.lower() #make all words lowercase.

In [20]:
# Preview the training set after punctuation removal and lowercasing.
sms_train.head()

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [21]:
# Tokenize: turn each cleaned message into a list of its words.
sms_train["SMS"] = sms_train["SMS"].str.split()

# The vocabulary is every distinct word that occurs in the training messages.
# A set drops the duplicates; it is converted back to a list afterwards.
vocabulary = list({word for entry in sms_train["SMS"] for word in entry})

In [22]:
len(vocabulary) #number of unique words in the training vocabulary.

7783

In [23]:
#This block of code was taken from DataQuest.
#It builds a dictionary with one key per vocabulary word. Each value is a list
#with one slot per training message, holding how many times that word occurs
#in that message (filled in with nested loops and enumerate).
#The dictionary is converted to a dataframe in the next cell.
word_counts_per_sms = {unique_word: [0] * len(sms_train['SMS']) for unique_word
                       in vocabulary}

# The loop variable is deliberately NOT called `sms`: that name is the raw
# DataFrame loaded at the top of the notebook, and overwriting it here would
# break any earlier cell that is re-run afterwards.
for index, message_words in enumerate(sms_train['SMS']):
    for word in message_words:
        word_counts_per_sms[word][index] += 1

In [24]:
# One column per vocabulary word, one row per training message.
sms_train_transformed = pd.DataFrame(word_counts_per_sms)

In [25]:
# Preview the word-count table.
sms_train_transformed.head()

Unnamed: 0,lifetime,watchng,08717205546,deep,parts,rcb,4get,blue,ceiling,personal,...,arranging,ringing,befor,neighbor,detroit,varaya,arcade,act,being,docs
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
transformed_train = pd.concat([sms_train, sms_train_transformed], axis = 1)
#Place the label/message columns and the word-count columns side by side
#(axis = 1), giving one wide dataframe.

In [29]:
transformed_train.head()
#The resulting dataframe.
#Each row is one training message: its label (spam or ham), its list of words,
#and one column per vocabulary word holding the number of times that word
#occurs in the message.

Unnamed: 0,Label,SMS,lifetime,watchng,08717205546,deep,parts,rcb,4get,blue,...,arranging,ringing,befor,neighbor,detroit,varaya,arcade,act,being,docs
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
#Quantities needed by the Naive Bayes formulas.

# Training messages separated by class.
spam = transformed_train.loc[transformed_train["Label"] == "spam"]
ham = transformed_train.loc[transformed_train["Label"] == "ham"]

# Class priors: fraction of training messages in each class.
p_spam = len(spam) / len(transformed_train)
p_ham = len(ham) / len(transformed_train)

# Total number of words (with repetitions) over all messages of each class.
N_spam = spam["SMS"].map(len).sum()
N_ham = ham["SMS"].map(len).sum()

# Vocabulary size and the additive smoothing parameter.
N_vocab = len(vocabulary)
alpha = 1

In [31]:
#With the constants defined, compute P(word | class) for every vocabulary word
#using additive smoothing:
#    P(w | class) = (count of w in class + alpha) / (N_class + alpha * N_vocab)

# The denominators are the same for every word, so compute them once.
spam_denominator = N_spam + alpha*N_vocab
ham_denominator = N_ham + alpha*N_vocab

# word -> P(word | spam); sum(spam[word]) is the number of times the word
# occurs across all spam messages.
dict_spam = {
    word: (sum(spam[word]) + alpha) / spam_denominator
    for word in vocabulary
}

# word -> P(word | ham), computed the same way from the ham messages.
dict_ham = {
    word: (sum(ham[word]) + alpha) / ham_denominator
    for word in vocabulary
}

In [None]:
#One advantage of this kind of algorithm over other filtering algorithms
#is that we calculate a lot of values (like we do above) before even
#starting the algorithm, which makes it very fast.

#When a new message comes in, most calculations have already been complete,
#so the filtering process itself is almost instant.

#This advantage becomes more apparent as the number of messages increases
#to thousands or millions.
#Filtering them is quick and requires a lot less computational power.

In [32]:
import re

In [33]:
#Now we can create the spam filter
def spam_filter(message):  #input has to be a string
    """Classify one SMS message as "spam" or "ham" with multinomial Naive Bayes.

    The message is cleaned like the training data: non-word characters become
    spaces, the text is lowercased and split on whitespace. Each class score
    starts from the class prior (p_spam / p_ham) and is combined with the
    per-word parameters stored in dict_spam / dict_ham:

        P(class | w1, ..., wn) ∝ P(class) * ∏ P(wi | class)

    The product is evaluated as a sum of logarithms. Multiplying the raw
    probabilities underflows to 0.0 on long messages, which makes both scores
    equal and forces a "ham" verdict whatever the content.

    Parameters
    ----------
    message : str
        Raw text of the message to classify.

    Returns
    -------
    str
        "spam" if the spam score is strictly greater than the ham score,
        otherwise "ham" (on a tie the message gets the benefit of the doubt).
    """
    import math  # local import: the notebook's import cells do not provide math

    def _safe_log(probability):
        # log(0) is undefined; treat a zero probability as "impossible".
        return math.log(probability) if probability > 0 else float("-inf")

    # Same cleaning as the training data (raw string avoids an invalid escape).
    words = re.sub(r'\W', ' ', message).lower().split()

    # Start from the log-priors of each class.
    log_p_spam_given_message = _safe_log(p_spam)
    log_p_ham_given_message = _safe_log(p_ham)

    for word in words:
        # Words never seen in training are in neither dictionary and are ignored.
        if word in dict_spam:
            log_p_spam_given_message += _safe_log(dict_spam[word])
        if word in dict_ham:
            log_p_ham_given_message += _safe_log(dict_ham[word])

    # log is monotonic, so comparing log-scores equals comparing the products.
    if log_p_ham_given_message >= log_p_spam_given_message:
        return "ham"
    return "spam"


In [34]:
#Let's try the function on a hand-written message that looks like spam.
spam_filter("WINNER! Claim you reward soon by using the code: 1546CD")

'spam'

In [35]:
#And on a hand-written message that looks like an ordinary text.
spam_filter("Hey, Do you know Tom's phone number?")

'ham'

In [36]:
#We will now try to determine how well the spam filter works on
#our test set of over 1k messages.
#The algorithm will not see the actual labels in the test set, and will
#try to filter spam on its own.
#We can then compare the predictions to the actual labels.
#We need to make a separate column in the test data for predictions.

In [37]:
#Classify every test message and store the verdict in a new "Prediction" column.
sms_test["Prediction"] = sms_test["SMS"].apply(spam_filter)
sms_test.head()
#These are the first few entries of the test data with the predictions added.

Unnamed: 0,Label,SMS,Prediction
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


In [38]:
#Now we need to measure how accurate the spam filter is.
# Accuracy = number of correct predictions/total number of entries
total = len(sms_test)

# Comparing the two columns element-wise gives a boolean Series; summing it
# counts the rows where the prediction matches the label. This replaces a
# row-by-row iterrows() loop. int() keeps `correct` a plain Python integer.
correct = int((sms_test["Label"] == sms_test["Prediction"]).sum())

In [39]:
# Report the raw counts and the accuracy as a percentage of the test set.
print("Correct: ", correct )
print("Incorrect: ", total - correct)
print("Accuracy (%) : ", (correct/total)*100)

Correct:  1101
Incorrect:  13
Accuracy (%) :  98.8330341113106


In [None]:
#Great accuracy! Let's explore the messages that our filter made a mistake on

In [44]:
# Print the text of every test message whose prediction differs from its
# label, each followed by blank lines to keep the messages apart.
misclassified = sms_test["Label"] != sms_test["Prediction"]
for text in sms_test.loc[misclassified, "SMS"]:
    print(text)
    print("\n")

Not heard from U4 a while. Call me now am here all night with just my knickers on. Make me beg for it like U did last time 01223585236 XX Luv Nikiyu4.net


More people are dogging in your area now. Call 09090204448 and join like minded guys. Why not arrange 1 yourself. There's 1 this evening. A£1.50 minAPN LS278BB


Unlimited texts. Limited minutes.


26th OF JULY


Nokia phone is lovly..


No calls..messages..missed calls


We have sent JD for Customer Service cum Accounts Executive to ur mail id, For details contact us


Oh my god! I've found your number again! I'm so glad, text me back xafter this msgs cst std ntwk chg £1.50


Hi babe its Chloe, how r u? I was smashed on saturday night, it was great! How was your weekend? U been missing me? SP visionsms.com Text stop to stop 150p/text


0A$NETWORKS allow companies to bill for SMS, so they are responsible for their "suppliers", just as a shop has to give a guarantee on what they sell. B. G.


RCT' THNQ Adrian for U text. Rgds Vatian


In [None]:
#It looks like a lot of those messages contain numbers, codes or price tags
#which is why they could have been identified as spam even if they are not.
#Messages that mention brands could also be associated with a lot of spam
#messages. 

In [None]:
#Vladimir Sapozhnikov.