In [1]:
!pip install pandas
!pip install numpy
!pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [3]:
spam_df = pd.read_csv('spam.csv')

In [4]:
spam_df['MESSAGE_ORIGINAL'] = spam_df['message']
spam_df['message'] = spam_df.apply(lambda row: row.message.lower(), axis=1)
spam_df['message'] = spam_df['message'].str.replace('\W', ' ', regex=True)
spam_df['message'] = spam_df['message'].str.split()
spam_df

Unnamed: 0,category,message,MESSAGE_ORIGINAL
0,ham,"[go, until, jurong, point, crazy, available, o...","Go until jurong point, crazy.. Available only ..."
1,ham,"[ok, lar, joking, wif, u, oni]",Ok lar... Joking wif u oni...
2,spam,"[free, entry, in, 2, a, wkly, comp, to, win, f...",Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,"[u, dun, say, so, early, hor, u, c, already, t...",U dun say so early hor... U c already then say...
4,ham,"[nah, i, don, t, think, he, goes, to, usf, he,...","Nah I don't think he goes to usf, he lives aro..."
...,...,...,...
5567,spam,"[this, is, the, 2nd, time, we, have, tried, 2,...",This is the 2nd time we have tried 2 contact u...
5568,ham,"[will, ü, b, going, to, esplanade, fr, home]",Will ü b going to esplanade fr home?
5569,ham,"[pity, was, in, mood, for, that, so, any, othe...","Pity, * was in mood for that. So...any other s..."
5570,ham,"[the, guy, did, some, bitching, but, i, acted,...",The guy did some bitching but I acted like i'd...


In [5]:
vocabulary = []

for msg in spam_df['message']:
    for word in msg:
        vocabulary.append(word)
        
vocabulary = list(set(vocabulary))
len(vocabulary)

8749

In [6]:
word_counts_per_sms = {unique_word: [0] * len(spam_df['message']) for unique_word in vocabulary}

for index, sms in enumerate(spam_df['message']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [7]:
word_counts = pd.DataFrame(word_counts_per_sms)

In [8]:
spam_df = spam_df.rename(columns={"message": "MESSAGE", "category": "CATEGORY"}, errors="raise")
spam_clean = pd.concat([spam_df, word_counts], axis=1)
spam_clean

Unnamed: 0,CATEGORY,MESSAGE,MESSAGE_ORIGINAL,recovery,neighbour,applebees,alex,gets,calculated,weds,...,opt,colour,sir,poor,seat,organizer,82242,dao,tune,urgnt
0,ham,"[go, until, jurong, point, crazy, available, o...","Go until jurong point, crazy.. Available only ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[ok, lar, joking, wif, u, oni]",Ok lar... Joking wif u oni...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,"[free, entry, in, 2, a, wkly, comp, to, win, f...",Free entry in 2 a wkly comp to win FA Cup fina...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,"[u, dun, say, so, early, hor, u, c, already, t...",U dun say so early hor... U c already then say...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[nah, i, don, t, think, he, goes, to, usf, he,...","Nah I don't think he goes to usf, he lives aro...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,spam,"[this, is, the, 2nd, time, we, have, tried, 2,...",This is the 2nd time we have tried 2 contact u...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,ham,"[will, ü, b, going, to, esplanade, fr, home]",Will ü b going to esplanade fr home?,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,ham,"[pity, was, in, mood, for, that, so, any, othe...","Pity, * was in mood for that. So...any other s...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,ham,"[the, guy, did, some, bitching, but, i, acted,...",The guy did some bitching but I acted like i'd...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
spam_clean = shuffle(spam_clean, random_state=0)
training_set_clean = spam_clean[:5000]
testing_set_clean = spam_clean[-573:]

word_counts

Unnamed: 0,recovery,neighbour,applebees,alex,gets,calculated,weds,guai,admirer,finalise,...,opt,colour,sir,poor,seat,organizer,82242,dao,tune,urgnt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
spam_messages = training_set_clean[training_set_clean['CATEGORY'] == 'spam']
ham_messages = training_set_clean[training_set_clean['CATEGORY'] == 'ham']

'''

"word" means every single word in the dictionary: w1, w2, w3, ..., wn.
Using Bayes' Law, 
P(spam|word) = P(spam)*P(word|spam) / P(word)
P(ham|word) = P(ham)*P(word|ham) / P(word)

Given that certain words appear in the string, we compare P(spam|word) and P(ham|word)
Since their denominators are the same, P(word) doesn't need to be calculated.
Just comparing the numerator will give the same comparison result.

'''

# P(spam) and P(ham)
p_spam = len(spam_messages) / len(training_set_clean)
p_ham = len(ham_messages) / len(training_set_clean)

# P(word|spam) = word_count_in_spam / total_spam_word_count, the probability of each word among spam messages
# P(word|ham) = word_count_in_ham / total_ham_word_count
word_count_per_spam = spam_messages['MESSAGE'].apply(len)
total_spam_word_count = word_count_per_spam.sum()

word_count_per_ham = ham_messages['MESSAGE'].apply(len)
total_ham_word_count = word_count_per_ham.sum()

# calculate parameters for each word
# aka the probability of a certain word appearing in a spam/ham msg
parameters_spam = {unique_word:0 for unique_word in vocabulary}
parameters_ham = {unique_word:0 for unique_word in vocabulary}

vocabulary_size = len(vocabulary)

for word in vocabulary:
    # count of word in spam
    word_count_in_spam = spam_messages[word].sum()
    
    # apply laplace smoothing with alpha=1
    p_word_in_spam = (word_count_in_spam + 1) / (total_spam_word_count + vocabulary_size)
    parameters_spam[word] = p_word_in_spam
    
    # count of word in ham
    word_count_in_ham = ham_messages[word].sum()
    
    # apply laplace smoothing
    p_word_in_ham = (word_count_in_ham + 1) / (total_ham_word_count + vocabulary_size)
    parameters_ham[word] = p_word_in_ham


In [21]:
import re

def classify(message):
    message = re.sub('\W', ' ', message).lower().split()
    
    # P(spam|word)
    p_spam_given_word = p_spam
    p_ham_given_word = p_ham
    
    for word in message:
        if word in parameters_spam:
            p_spam_given_word *= parameters_spam[word]
            
        if word in parameters_ham:
            p_ham_given_word *= parameters_ham[word]
            
    if p_spam_given_word > p_ham_given_word:
        return "spam"
    else: return "ham"
    

In [23]:
# test accuracy
test_correct = 0
test_total = len(testing_set_clean)

for index, row in testing_set_clean.iterrows():
    if(classify(row['MESSAGE_ORIGINAL']) == row['CATEGORY']):
        test_correct += 1

print(test_correct/test_total)

0.9860383944153578


In [24]:
classify("The egg tart is just 5 pounds. Do you wanna take a try tonight?")

'ham'

In [25]:
classify("Winner! You have won a free 100 pounds gift card. Call this number to receive the code now.")

'spam'