In [21]:
import pandas as pd
import numpy as np

sms_spam_data_set = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['Label', 'SMS'])
print("Examples of the data samples \n", sms_spam_data_set.head(3), "\n")
print("Dimension of the data set:\n", sms_spam_data_set.shape, "\n")
print("Distribution of the data set:\n", sms_spam_data_set['Label'].value_counts(normalize=True), "\n")

Examples of the data samples 
   Label                                                SMS
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina... 

Dimension of the data set:
 (5572, 2) 

Distribution of the data set:
 Label
ham     0.865937
spam    0.134063
Name: proportion, dtype: float64 



In [22]:
#Perform train/test split
sms_texts, labels = sms_spam_data_set.SMS, sms_spam_data_set.Label

from sklearn.model_selection import train_test_split
sms_texts_train, sms_texts_test, labels_train, labels_test = train_test_split(sms_texts, labels, test_size=0.2, random_state=123)

sms_texts_train = sms_texts_train.reset_index(drop=True)
labels_train = labels_train.reset_index(drop=True)

sms_texts_test = sms_texts_test.reset_index(drop=True)
labels_test = labels_test.reset_index(drop=True)

print("Distribtuion of the training data set:\n", labels_train.value_counts(normalize=True),  labels_train.shape[0], "\n")

print("Distribtuion of the testing data set:\n", labels_test.value_counts(normalize=True), labels_test.shape[0], "\n")

Distribtuion of the training data set:
 Label
ham     0.866726
spam    0.133274
Name: proportion, dtype: float64 4457 

Distribtuion of the testing data set:
 Label
ham     0.86278
spam    0.13722
Name: proportion, dtype: float64 1115 



In [23]:
#Function to convert input sms texts to feature vectors using BoW representation
def smsTextsToVectors(sms_texts):
    sms_texts = sms_texts.str.replace('\W', ' ') #Remove punctuation
    sms_texts = sms_texts.str.lower()
    sms_texts = sms_texts.str.split()

    vocabulary = []
    for sms in sms_texts:
        for word in sms:
            vocabulary.append(word)
        
    vocabulary = list(set(vocabulary))
    
    word_counts_per_sms = {unique_word: [0] * len(sms_texts) for unique_word in vocabulary}
    
    for index, sms in enumerate(sms_texts):
        for word in sms:
            word_counts_per_sms[word][index] += 1
            
    return word_counts_per_sms, vocabulary

word_counts_per_sms, vocabulary  = smsTextsToVectors(sms_texts_train) 
x_train = pd.DataFrame(word_counts_per_sms)
print("Features (the number of all possible words in the trainning data):\n", len(vocabulary), "\n")

training_data_set = pd.concat([labels_train, sms_texts_train, x_train], axis=1)
print("Examples of the training data \n", training_data_set.head(3), "\n")

Features (the number of all possible words in the trainning data):
 11789 

Examples of the training data 
   Label                                                SMS  spare  happiness  \
0  spam  Double mins and txts 4 6months FREE Bluetooth ...      0          0   
1   ham  Did you get any gift? This year i didnt get an...      0          0   
2   ham  Ever green quote ever told by Jerry in cartoon...      0          0   

   tuesday?  dear..i  08707808226.  beach  nah,  monday,  ...  alone"  \
0         0        0             0      0     0        0  ...       0   
1         0        0             0      0     0        0  ...       0   
2         0        0             0      0     0        0  ...       0   

   sarcasm  charity.  stopcost  relax,  lesson  hont.  bein  dt  l8tr  
0        0         0         0       0       0      0     0   0     0  
1        0         0         0       0       0      0     0   0     0  
2        0         0         0       0       0      0     0   

In [24]:
x_train_spam = x_train[labels_train == 'spam']
x_train_ham = x_train[labels_train == 'ham']

#Estimate P(y=spam) and P(y=ham)
p_spam = len(x_train_spam)/len(x_train)
print("Our estimate of P(y=spam) is ", p_spam)

p_ham = len(x_train_ham)/len(x_train)
print("Our estimate of P(y=ham) is ", p_ham)



Our estimate of P(y=spam) is  0.13327350235584473
Our estimate of P(y=ham) is  0.8667264976441552


In [25]:
#Initiate parameters
theta_spam = {unique_word:0 for unique_word in vocabulary}
theta_ham =  {unique_word:0 for unique_word in vocabulary}

#Estimate the probability distribution of selecting each word
# uncomment to implement the following
for word in vocabulary:
    # uncomment to implement the following
    theta_spam[word] = (x_train_spam[word].sum() + 0.001)/(len(x_train_spam))
    
    # uncomment to implement the following
    theta_ham[word] = (x_train_ham[word].sum() + 0.001)/(len(x_train_ham))

In [26]:
#implment Naive Bayes classifier
import re, math
def textToVector(message):
    message = re.sub('\W', ' ', message) #Remove punctuation
    message = message.lower().split()

    vocabulary = []
    for word in  message:
        vocabulary.append(word)
        
    vocabulary = list(set(vocabulary))
    
    word_counts = {unique_word: 0 for unique_word in vocabulary}
    
    for word in message:
            word_counts[word] += 1
            
    return word_counts, vocabulary

def naive_bayes_classify(sms_text):
    x_test, vocabulary_test = textToVector(sms_text)
    
    # uncomment to implement the following
    p_spam_given_sms = math.log(p_spam)
    p_ham_given_sms = math.log(p_ham)
    
    for word in vocabulary_test:
        if word in theta_spam:
            p_spam_given_sms += x_test[word] * math.log(theta_spam[word])
        if word in theta_ham:
            p_ham_given_sms += x_test[word] * math.log(theta_ham[word])

    print('Estimate of log(P(SPAM|message=',  sms_text, ')) =', p_spam_given_sms)
    print('Estimate of log(P(HAM|message=',  sms_text, ')) =', p_ham_given_sms)
    isSpam = True
    if(p_spam_given_sms > p_ham_given_sms):
        isSpam = True
    else:
        isSpam = False
    return isSpam
     

In [27]:
print(naive_bayes_classify("WINNER!! This is the secret code to unlock the money: C3421."))
print(naive_bayes_classify("Sounds good, Tom, then u there"))

Estimate of log(P(SPAM|message= WINNER!! This is the secret code to unlock the money: C3421. )) = -26.190071979610394
Estimate of log(P(HAM|message= WINNER!! This is the secret code to unlock the money: C3421. )) = -44.29183987582262
True
Estimate of log(P(SPAM|message= Sounds good, Tom, then u there )) = -42.7726562021452
Estimate of log(P(HAM|message= Sounds good, Tom, then u there )) = -26.346460806484025
False


In [28]:
def score(sms_texts, labels):
    mistakes = 0
    for i, message in enumerate(sms_texts):
        isSpam = naive_bayes_classify(message)
        if isSpam and labels[i] != "spam":
            mistakes += 1
        elif not isSpam and labels[i] == "spam":
            mistakes += 1
    return (len(sms_texts)-mistakes)/len(sms_texts)
    

In [29]:
#Calculate loss on training data
print("Training accuracy:", score(sms_texts_train, labels_train))
#Calculate generalization loss
print("Generalization accuracy:", score(sms_texts_test, labels_test))

Estimate of log(P(SPAM|message= Double mins and txts 4 6months FREE Bluetooth on Orange. Available on Sony, Nokia Motorola phones. Call MobileUpd8 on 08000839402 or call2optout/N9DX )) = -70.1436861418235
Estimate of log(P(HAM|message= Double mins and txts 4 6months FREE Bluetooth on Orange. Available on Sony, Nokia Motorola phones. Call MobileUpd8 on 08000839402 or call2optout/N9DX )) = -157.54747662322197
Estimate of log(P(SPAM|message= Did you get any gift? This year i didnt get anything. So bad )) = -67.33599534966545
Estimate of log(P(HAM|message= Did you get any gift? This year i didnt get anything. So bad )) = -47.81144239433357
Estimate of log(P(SPAM|message= Ever green quote ever told by Jerry in cartoon "A Person Who Irritates u Always Is the one Who Loves u Vry Much But Fails to Express It...!..!! :-) :-) gud nyt )) = -230.43272852997015
Estimate of log(P(HAM|message= Ever green quote ever told by Jerry in cartoon "A Person Who Irritates u Always Is the one Who Loves u Vry M