# Spam Detection using Naive Bayes 

## Collecting the Dataset 

In [1]:
spam = [
    "To use your credit, click the new WAP link in the next years txt message or click here", 
    "Thanks for your subscription to New Ringtone UK your new mobile will be charged £5/month Please confirm annoncement by replying", 
    "As a valued customer, I am pleased to advise you that following recent delivery waiting review of your Mob No. you are awarded with. Call us to review.", 
    "Please call our new customer service representative on", 
    "We are trying to contact you. Last weekends customer draw shows that you won a £1000 prize GUARANTEED. Calling years", 
]

In [2]:
# leave one sentence from spam for testing our model later 
spam_test = ["Customer service annoncement. You have a New Years delivery waiting for you. click"]

In [3]:
non = [
    "I don't think he goes to usf, he lives around here though", 
    "New car and house for my parents. i have only new job in hand", 
    "Great escape. I fancy the bridge but needs her lager. See you tomorrow", 
    "Tired. I haven't slept well the past few nights.",
    "Too late. I said i have the website. I didn't i have or dont have the slippers", 
    "I might come by tonight then if my class lets out early", 
    "Jos ask if u wana meet up?", 
    "That would be great. We'll be at the Guild. We can try meeting with the customer on Bristol road or somewhere"
    ]

In [4]:
# another sentence from non for testing our model 
spam_test_2 = ["That would be great. We'll be at the Guild. We can try meeting with the customer on Bristol road or somewhere"]

## Basic Pre-Processing

In [5]:
!pip install gensim

Collecting gensim
  Downloading gensim-3.8.3.tar.gz (23.4 MB)
[K     |████████████████████████████████| 23.4 MB 619 kB/s eta 0:00:01
Collecting smart_open>=1.8.1
  Downloading smart_open-4.1.2-py3-none-any.whl (111 kB)
[K     |████████████████████████████████| 111 kB 1.2 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: gensim
  Building wheel for gensim (setup.py) ... [?25ldone
[?25h  Created wheel for gensim: filename=gensim-3.8.3-cp39-cp39-macosx_11_0_arm64.whl size=24156377 sha256=e9e2e7c67e9b3ef94cf033d7f00ae93d120ca4da33e8cad32d3b0d4381a6494a
  Stored in directory: /Users/nam/Library/Caches/pip/wheels/ca/5d/af/618594ec2f28608c1d6ee7d2b7e95a3e9b06551e3b80a491d6
Successfully built gensim
Installing collected packages: smart-open, gensim
Successfully installed gensim-3.8.3 smart-open-4.1.2


In [6]:
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.porter import PorterStemmer
from gensim.utils import tokenize

In [7]:
#test_sentence = non[4]
#test_sentence = non[5]
test_sentence = spam[1]

print(test_sentence)

removed_stops = remove_stopwords(test_sentence)
print(removed_stops)

p = PorterStemmer()
stemmed = p.stem(removed_stops)
print(stemmed)

tokens = tokenize(stemmed)
print(list(tokens))

Thanks for your subscription to New Ringtone UK your new mobile will be charged £5/month Please confirm annoncement by replying
Thanks subscription New Ringtone UK new mobile charged £5/month Please confirm annoncement replying
thanks subscription new ringtone uk new mobile charged £5/month please confirm annoncement repli
['thanks', 'subscription', 'new', 'ringtone', 'uk', 'new', 'mobile', 'charged', 'month', 'please', 'confirm', 'annoncement', 'repli']


## Create a dictionary of words 

In [8]:
def tokenize_sentence(sentence): 
    p = PorterStemmer()
    removed_stops = remove_stopwords(sentence)
    stemmed = p.stem(removed_stops)
    tokens = tokenize(stemmed)
    return list(tokens)

In [9]:
dictionary = set()     # will have unique values only 
spams_tokenized = [] 
nons_tokenized = [] 


for sentence in spam:      
    sentence_tokens = tokenize_sentence(sentence)
    spams_tokenized.append(sentence_tokens)
    dictionary  = dictionary.union(sentence_tokens)   # add sentence words to the dictionary  
    
    

for sentence in non:      
    sentence_tokens = tokenize_sentence(sentence)
    nons_tokenized.append(sentence_tokens)
    dictionary  = dictionary.union(sentence_tokens)


    
print("Tokenized spam: ", spams_tokenized)
print("Tokenized non:  ", nons_tokenized)
print("Dictionary:     ", dictionary)

Tokenized spam:  [['to', 'use', 'credit', 'click', 'new', 'wap', 'link', 'years', 'txt', 'message', 'click'], ['thanks', 'subscription', 'new', 'ringtone', 'uk', 'new', 'mobile', 'charged', 'month', 'please', 'confirm', 'annoncement', 'repli'], ['as', 'valued', 'customer', 'i', 'pleased', 'advise', 'following', 'recent', 'delivery', 'waiting', 'review', 'mob', 'no', 'awarded', 'with', 'call', 'review'], ['please', 'new', 'customer', 'service', 'repres'], ['we', 'trying', 'contact', 'you', 'last', 'weekends', 'customer', 'draw', 'shows', 'won', 'prize', 'guaranteed', 'calling', 'year']]
Tokenized non:   [['i', 'don', 't', 'think', 'goes', 'usf', 'l'], ['new', 'car', 'house', 'parents', 'new', 'job', 'hand'], ['great', 'escape', 'i', 'fancy', 'bridge', 'needs', 'lager', 'see', 'tomorrow'], ['tired', 'i', 'haven', 't', 'slept', 'past', 'nights'], ['too', 'late', 'i', 'said', 'website', 'i', 'didn', 't', 'dont', 'slipp'], ['i', 'come', 'tonight', 'class', 'lets', 'earli'], ['jos', 'ask', '

## Basic Stats 

In [10]:
# These things do not depend on an individual word so let's calculate them separately once 

total_word_count = len(dictionary)
total_spam_messages = len(spams_tokenized) 
total_all_messages = len(spams_tokenized) + len(nons_tokenized)

print("Total Number of words: ", total_word_count)

Total Number of words:  101


In [11]:
# P(spam) ... does not depend on an individual word so let's calculate that separately once 

p_spam = total_spam_messages / total_all_messages

print("P(spam) = ", p_spam)

P(spam) =  0.38461538461538464


In [12]:
# Helper function to count occurances 

def count_word_in_messages(word, messages): 
    total_count = 0
    for msg in messages: 
        if word in msg:       # notice this ensured uniqueness automatically  
            total_count += 1 
            
    return total_count 

## The Actual Probability Computation 

In [16]:
final_prob = 1   # can't start from 0 


for test_sentence in spam_test_2: 
    test_sentence = tokenize_sentence(test_sentence)
    print(test_sentence)
    
    # let's run this for each word separately 
    for word in test_sentence: 
        print("----------------")
        print("Runnig for word:", word)
        
        # Find P( w | spam)
        spam_count = count_word_in_messages(word, spams_tokenized)
        p_w_spam = spam_count / total_spam_messages 
        print("P( w | spam)  = ", p_w_spam)
        
        # Find P( w )
        w_count = count_word_in_messages(word, spams_tokenized)
        w_count += count_word_in_messages(word, nons_tokenized)
        p_w = w_count / total_all_messages
        print("P( w )        = ", p_w)
        
        
        # Find P( spam | w )
        p_spam_w = (p_w_spam * p_spam) / p_w
        print("P( spam )     = ", p_spam)
        print("P( spam | w ) = ", p_spam_w)
        print("")
        final_prob *= p_spam_w
        
        
    print("P( spam | all_words ) = ", final_prob)

['that', 'great', 'we', 'll', 'guild', 'we', 'try', 'meeting', 'customer', 'bristol', 'road']
----------------
Runnig for word: that
P( w | spam)  =  0.0
P( w )        =  0.07692307692307693
P( spam )     =  0.38461538461538464
P( spam | w ) =  0.0

----------------
Runnig for word: great
P( w | spam)  =  0.0
P( w )        =  0.15384615384615385
P( spam )     =  0.38461538461538464
P( spam | w ) =  0.0

----------------
Runnig for word: we
P( w | spam)  =  0.2
P( w )        =  0.15384615384615385
P( spam )     =  0.38461538461538464
P( spam | w ) =  0.5

----------------
Runnig for word: ll
P( w | spam)  =  0.0
P( w )        =  0.07692307692307693
P( spam )     =  0.38461538461538464
P( spam | w ) =  0.0

----------------
Runnig for word: guild
P( w | spam)  =  0.0
P( w )        =  0.07692307692307693
P( spam )     =  0.38461538461538464
P( spam | w ) =  0.0

----------------
Runnig for word: we
P( w | spam)  =  0.2
P( w )        =  0.15384615384615385
P( spam )     =  0.38461538461538