# Spam Detection Using Naive Bayes

In [None]:
import numpy as np

## Data Gathering

In [None]:
spam = [
    "To use your credit, click the new WAP link in the next years txt message or click here", 
    "Thanks for your subscription to New Ringtone UK your new mobile will be charged £5/month Please confirm annoncement by replying", 
    "As a valued customer, I am pleased to advise you that following recent delivery waiting review of your Mob No. you are awarded with. Call us to review.", 
    "Please call our new customer service representative on", 
    "We are trying to contact you. Last weekends customer draw shows that you won a £1000 prize GUARANTEED. Calling years", 
]

In [2]:
# leave one sentence from spam for testing our model later 
spam_test = ["Customer service annoncement. You have a New Years delivery waiting for you. click"]

In [3]:
non = [
    "I don't think he goes to usf, he lives around here though", 
    "New car and house for my parents. i have only new job in hand", 
    "Great escape. I fancy the bridge but needs her lager. See you tomorrow", 
    "Tired. I haven't slept well the past few nights.",
    "Too late. I said i have the website. I didn't i have or dont have the slippers", 
    "I might come by tonight then if my class lets out early", 
    "Jos ask if u wana meet up?", 
    "That would be great. We'll be at the Guild. We can try meeting with the customer on Bristol road or somewhere"
    ]

In [4]:
# another sentence from non for testing our model 
spam_test_2 = ["That would be great. We'll be at the Guild. We can try meeting with the customer on Bristol road or somewhere"]

## Basic Pre-Processing

In [6]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.0.5-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Downloading wrapt-1.17.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.4 kB)
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylin

In [7]:
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.porter import PorterStemmer
from gensim.utils import tokenize

In [16]:
test_sentence = non[0]
test_sentence = non[5]
# test_sentence = spam[1]

In [20]:
print(test_sentence)

# 1. Remove stop words: 
removed_stops = remove_stopwords(test_sentence)
print(removed_stops)

# 2. Stemming
p = PorterStemmer()
stemmed = p.stem(removed_stops)
print(stemmed)

# 3. Tokenization
tokens = tokenize(stemmed)
print(list(tokens))

I might come by tonight then if my class lets out early
I come tonight class lets early
i come tonight class lets earli
['i', 'come', 'tonight', 'class', 'lets', 'earli']


### 1. Stop Words
Stop words are common words in a language (like "and," "the," "in," "is") that are often ignored in tasks like text processing or search engines because they don't carry significant meaning.

**Example:**  
In the sentence "The cat is on the mat," the stop words are "the" and "is."

### 2. Stemming
Stemming is the process of reducing words to their base or root form.

**Example:**

```python
stemmed_word = p.stem("running")  # Output: "run"
```
### 3. Tokenization

Tokenization is the process of splitting text into individual units, such as words or phrases, called **tokens**. These tokens are the building blocks for further text processing tasks.

**Example:**
The sentence "running on the mat" after stemming becomes "run on mat".


## Create a dictionary of words

In [21]:
def tokenize_sentence(sentence):
    removed_stops = remove_stopwords(sentence)
    p = PorterStemmer()
    stemmed = p.stem(removed_stops)
    tokens = tokenize(stemmed)

    return list(tokens)

In [22]:
dictionary = set() # will have unique values only
spams_tokenized = []
nons_tokenized = []

In [34]:
for sentence in spam:
    sentence_tokens = tokenize_sentence(sentence)
    spams_tokenized.append(sentence_tokens)
    dictionary = dictionary.union(sentence_tokens)


for sentence in non:
    sentence_tokens = tokenize_sentence(sentence)
    nons_tokenized.append(sentence_tokens)
    dictionary = dictionary.union(sentence_tokens)


print("Tokenized spam: ",spams_tokenized) # spams_tokenized hold spam tokens
print("Tokenized non: ", nons_tokenized) # nons_tokenized hold non spam tokens
print("dictionary: ", dictionary) # dictionary holds all tokens

Tokenized spam:  [['to', 'use', 'credit', 'click', 'new', 'wap', 'link', 'years', 'txt', 'message', 'click'], ['to', 'use', 'credit', 'click', 'new', 'wap', 'link', 'years', 'txt', 'message', 'click'], ['to', 'use', 'credit', 'click', 'new', 'wap', 'link', 'years', 'txt', 'message', 'click'], ['thanks', 'subscription', 'new', 'ringtone', 'uk', 'new', 'mobile', 'charged', 'month', 'please', 'confirm', 'annoncement', 'repli'], ['as', 'valued', 'customer', 'i', 'pleased', 'advise', 'following', 'recent', 'delivery', 'waiting', 'review', 'mob', 'no', 'awarded', 'with', 'call', 'review'], ['please', 'new', 'customer', 'service', 'repres'], ['we', 'trying', 'contact', 'you', 'last', 'weekends', 'customer', 'draw', 'shows', 'won', 'prize', 'guaranteed', 'calling', 'year'], ['to', 'use', 'credit', 'click', 'new', 'wap', 'link', 'years', 'txt', 'message', 'click'], ['to', 'use', 'credit', 'click', 'new', 'wap', 'link', 'years', 'txt', 'message', 'click'], ['to', 'use', 'credit', 'click', 'new',

## Basic Stats

In [35]:
# These things do not depend on an individual word so let's calculate them separately once 

total_word_count = len(dictionary) # total words
total_spam_messages = len(spams_tokenized)
total_all_messages = len(spams_tokenized) + len(nons_tokenized)

print("Total Number of words: ", total_word_count)

Total Number of words:  101


In [36]:
# P(spam) ... does not depend on an individual word so let's calculate that separately once 

p_spam = total_spam_messages / total_all_messages

print("P(spam) = ", p_spam)

P(spam) =  0.6046511627906976


In [37]:
# Helper function to count occurences

def count_word_in_messages(word, messages):
    total_count = 0
    for msg in messages: 
        if word in msg:
            total_count += 1
    return total_count

## The Actual Probability Computation

In [38]:
final_prob = 1 # can't start from 0

In [43]:
for test_sentence in spam_test_2: 
    test_sentence = tokenize_sentence(test_sentence)
    print(test_sentence)
    
    # let's run this for each word separately 
    for word in test_sentence: 
        print("----------------")
        print("Running for word:", word)
        
        # Find P( w | spam)
        spam_count = count_word_in_messages(word, spams_tokenized)
        p_w_spam = spam_count / total_spam_messages 
        print("P( w | spam)  = ", p_w_spam)
        
        # Find P( w )
        w_count = count_word_in_messages(word, spams_tokenized)
        w_count += count_word_in_messages(word, nons_tokenized)
        p_w = w_count / total_all_messages
        print("P( w )        = ", p_w)
        
        
        # Find P( spam | w )
        p_spam_w = (p_w_spam * p_spam) / p_w
        print("P( spam )     = ", p_spam)
        print("P( spam | w ) = ", p_spam_w)
        print("")
        final_prob *= p_spam_w
        
        
    print("P( spam | all_words ) = ", final_prob)

['that', 'great', 'we', 'll', 'guild', 'we', 'try', 'meeting', 'customer', 'bristol', 'road']
----------------
Running for word: that
P( w | spam)  =  0.07692307692307693
P( w )        =  0.09302325581395349
P( spam )     =  0.6046511627906976
P( spam | w ) =  0.5

----------------
Running for word: great
P( w | spam)  =  0.0
P( w )        =  0.09302325581395349
P( spam )     =  0.6046511627906976
P( spam | w ) =  0.0

----------------
Running for word: we
P( w | spam)  =  0.15384615384615385
P( w )        =  0.13953488372093023
P( spam )     =  0.6046511627906976
P( spam | w ) =  0.6666666666666666

----------------
Running for word: ll
P( w | spam)  =  0.15384615384615385
P( w )        =  0.13953488372093023
P( spam )     =  0.6046511627906976
P( spam | w ) =  0.6666666666666666

----------------
Running for word: guild
P( w | spam)  =  0.0
P( w )        =  0.046511627906976744
P( spam )     =  0.6046511627906976
P( spam | w ) =  0.0

----------------
Running for word: we
P( w | spam