## In this project we are going to build a message spam filter using the naive Bayes algorithm

In [1]:
import math

import pandas as pd
# Load the tab-separated SMS file; header=None because the column names
# are supplied explicitly here.
d = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['Label', 'SMS'],
)
d

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [2]:
# Share of each label in the full dataset (fractions rather than raw counts).
d['Label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

NOW WE WILL SPLIT THE DATASET INTO TWO PARTS. ONE WILL BE THE TRAINING DATA (80%) AND THE REST (20%) WILL BE THE TESTING DATA.

In [3]:
# Shuffle all rows (frac=1) with a fixed seed so the shuffle is repeatable.
# NOTE: `d` is overwritten, so re-running this cell alone shuffles the
# already-shuffled frame again and yields a different row order.
d = d.sample(frac = 1 , random_state=1)

In [4]:
# Cut the shuffled frame at the 80% mark: rows before it form the training
# set, the remaining rows form the test set. Both get a fresh 0..n-1 index.
mark = round(0.8 * len(d))
dtr = d.iloc[:mark].reset_index(drop=True)  # training data
dts = d.iloc[mark:].reset_index(drop=True)  # testing data
print('Length of training data', len(dtr))
print('Length of testing data', len(dts))

Length of training data 4458
Length of testing data 1114


In [5]:
# Display the training split.
dtr

Unnamed: 0,Label,SMS
0,ham,"Yep, by the pretty sculpture"
1,ham,"Yes, princess. Are you going to make me moan?"
2,ham,Welp apparently he retired
3,ham,Havent.
4,ham,I forgot 2 ask ü all smth.. There's a card on ...
5,ham,Ok i thk i got it. Then u wan me 2 come now or...
6,ham,I want kfc its Tuesday. Only buy 2 meals ONLY ...
7,ham,No dear i was sleeping :-P
8,ham,Ok pa. Nothing problem:-)
9,ham,Ill be there on &lt;#&gt; ok.


NOW WE WILL FIND PERCENTAGES OF SPAM AND NON SPAM IN THESE DATASETS 
PS: HAM = NON-SPAM

FOR TRAINING DATASET

In [6]:
# Share of each label in the training split (fractions rather than raw counts).
dtr['Label'].value_counts(normalize=True)

ham     0.86541
spam    0.13459
Name: Label, dtype: float64

FOR TESTING DATASET

In [7]:
# Share of each label in the test split (fractions rather than raw counts).
dts['Label'].value_counts(normalize=True)

ham     0.868043
spam    0.131957
Name: Label, dtype: float64

## Now we will clean the data 

In [8]:
# Normalise the training text: every non-word character becomes a space,
# then everything is lowercased.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave punctuation untouched. The raw string
# avoids the invalid '\W' escape-sequence warning in a normal string literal.
dtr['SMS'] = dtr['SMS'].str.replace(r'\W', ' ', regex=True)
dtr['SMS'] = dtr['SMS'].str.lower()
dtr

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...
5,ham,ok i thk i got it then u wan me 2 come now or...
6,ham,i want kfc its tuesday only buy 2 meals only ...
7,ham,no dear i was sleeping p
8,ham,ok pa nothing problem
9,ham,ill be there on lt gt ok


NOW WE WILL CREATE A LIST OF WORDS (VOCABULARY) OF THIS DATASET

In [9]:
# Collect every distinct token of the training messages once, in first-seen
# order. dict.fromkeys deduplicates with O(1) lookups, whereas testing
# membership against a growing list rescans the list for every token.
vocab = list(dict.fromkeys(
    token
    for row in dtr['SMS']
    for token in row.split()
))
print(len(vocab))


7783


In [10]:
# Replace each cleaned message string with its list of whitespace-separated tokens.
dtr['SMS'] = dtr['SMS'].str.split()

In [11]:
# Convert the vocabulary to a set, which drops any duplicate words.
vocab = set(vocab)

In [12]:
# Back to a list, sorted so the word order (and hence the column order of the
# count table built from it) is the same on every kernel restart; iterating a
# set of strings directly depends on per-process hash randomisation.
vocab = sorted(vocab)

In [13]:
# Number of words in the vocabulary.
len(vocab)

7783

NOW WE WILL STORE THE WORD COUNTS OF EACH ROW FROM SMS COLUMN

In [14]:
# Map each vocabulary word to its own zero-filled list with one slot per
# training message; the slots are incremented when the messages are scanned.
word_count_per_sms = {
    word: [0] * len(dtr['SMS'])
    for word in vocab
}

In [15]:
# Walk every tokenised message and bump the counter of each token at the
# position of the message it occurs in.
for row_idx, tokens in enumerate(dtr['SMS']):
    for token in tokens:
        per_message_counts = word_count_per_sms[token]
        per_message_counts[row_idx] += 1

In [16]:
# Turn the word -> per-message counts dictionary into a DataFrame:
# one column per vocabulary word, one row per training message.
dtfm = pd.DataFrame(word_count_per_sms)

In [17]:
# Preview the first rows of the word-count DataFrame.
dtfm.head()

Unnamed: 0,0,00,000,000pes,008704050406,0089,01223585334,02,0207,02072069400,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


In [18]:
# Place the label/message columns and the word-count columns side by side
# (axis=1 concatenates column-wise, aligning rows on the index).
final = pd.concat([dtr,dtfm],axis=1)

In [19]:
# Preview the first 20 rows of the combined table.
final.head(20)

Unnamed: 0,Label,SMS,0,00,000,000pes,008704050406,0089,01223585334,02,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
5,ham,"[ok, i, thk, i, got, it, then, u, wan, me, 2, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,ham,"[i, want, kfc, its, tuesday, only, buy, 2, mea...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,ham,"[no, dear, i, was, sleeping, p]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,ham,"[ok, pa, nothing, problem]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,ham,"[ill, be, there, on, lt, gt, ok]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


LET'S CALCULATE THE CONSTANTS WE WILL USE FOR NAIVE BAYES

In [20]:
# Additive smoothing constant: added to every word count (and, scaled by the
# vocabulary size, to the denominator) when the per-word probabilities are computed.
alpha = 1

In [21]:
# Class priors: the fraction of training messages carrying each label.
label_counts = final['Label'].value_counts()
n_training_messages = len(final)
p_spam = label_counts.loc['spam'] / n_training_messages
p_ham = label_counts.loc['ham'] / n_training_messages
print(p_spam)
print(p_ham)

0.13458950201884254
0.8654104979811574


In [22]:
# Separate the training table by label so word totals can be taken per class.
spam_messages = final.loc[final['Label'] == 'spam']
ham_messages = final.loc[final['Label'] == 'ham']

In [23]:
# Total number of tokens across all spam training messages.
n_words_per_spam_msg = spam_messages['SMS'].map(len)
n_spam = n_words_per_spam_msg.sum()
print(n_spam)

15190


In [24]:
# Total number of tokens across all ham training messages.
n_words_per_ham_message = ham_messages['SMS'].map(len)
n_ham = n_words_per_ham_message.sum()
print(n_ham)

57237


In [25]:
# Vocabulary size, used in the smoothing denominator.
n_vocab = len(vocab)
print(n_vocab)

7783


CALCULATING PARAMETERS

In [26]:
# Start both per-word probability tables at 0 for every vocabulary word;
# the real values are filled in once the smoothed estimates are computed.
par_ham = dict.fromkeys(vocab, 0)
par_spam = dict.fromkeys(vocab, 0)

In [27]:
# Smoothed per-word likelihoods for each class:
#   (occurrences of the word in the class + alpha)
#   / (total words in the class + alpha * vocabulary size)
spam_denominator = n_spam + (alpha * n_vocab)
ham_denominator = n_ham + (alpha * n_vocab)

for word in vocab:
    par_spam[word] = (spam_messages[word].sum() + alpha) / spam_denominator
    par_ham[word] = (ham_messages[word].sum() + alpha) / ham_denominator

THE MAIN FUNCTION WHICH WILL CLASSIFY THE MESSAGE

In [30]:
import re
def classify(message):
    """Print the spam and ham scores for a raw message, then the predicted label.

    The message is cleaned like the training data: non-word characters become
    spaces, the text is lowercased and split on whitespace. Words missing from
    the trained tables are skipped.

    Scores are accumulated as sums of log-probabilities. Multiplying many small
    per-word probabilities directly underflows to 0.0 on long messages, which
    made both scores compare equal regardless of content.

    Reads the module-level ``p_spam``, ``p_ham``, ``par_spam`` and ``par_ham``.

    Args:
        message: the raw message text.

    Returns:
        None; the scores and the label are printed.
    """
    tokens = re.sub(r'\W', ' ', message).lower().split()

    log_spam = math.log(p_spam)
    log_ham = math.log(p_ham)

    for word in tokens:
        if word in par_spam:
            log_spam += math.log(par_spam[word])

        if word in par_ham:
            log_ham += math.log(par_ham[word])

    # exp() converts back to the product form for display only; it can show 0.0
    # for very long messages, but the decision below is made on the log scores.
    print('P(Spam|message):', math.exp(log_spam))
    print('P(Ham|message):', math.exp(log_ham))

    if log_ham > log_spam:
        print('Label: Ham')
    elif log_ham < log_spam:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [31]:
# Try the classifier on a hand-written message.
classify("Apna sapna secret money money")

P(Spam|message): 9.664298413039638e-17
P(Ham|message): 7.844211829134064e-16
Label: Ham


In [32]:
# Try the classifier on a hand-written message.
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message): 1.3481290211300841e-25
P(Ham|message): 1.9368049028589875e-27
Label: Spam


In [33]:
# Try the classifier on a hand-written message.
classify("Sounds good, Tom, then see u there")

P(Spam|message): 2.4372375665888117e-25
P(Ham|message): 3.687530435009238e-21
Label: Ham


LOOKING GOOD TILL HERE

## Checking the accuracy of our filter

In [34]:
def classify_test_set(message):
    """Return the predicted label for one raw message.

    The message is cleaned like the training data: non-word characters become
    spaces, the text is lowercased and split on whitespace. Words missing from
    the trained tables are skipped.

    Scores are accumulated as sums of log-probabilities. Multiplying many small
    per-word probabilities directly underflows to 0.0 on long messages, which
    made both scores tie and sent the message to the fallback label.

    Reads the module-level ``p_spam``, ``p_ham``, ``par_spam`` and ``par_ham``.

    Args:
        message: the raw message text.

    Returns:
        'ham' or 'spam', or 'needs human classification' when both scores
        are exactly equal.
    """
    tokens = re.sub(r'\W', ' ', message).lower().split()

    log_spam = math.log(p_spam)
    log_ham = math.log(p_ham)

    for word in tokens:
        if word in par_spam:
            log_spam += math.log(par_spam[word])

        if word in par_ham:
            log_ham += math.log(par_ham[word])

    if log_ham > log_spam:
        return 'ham'
    elif log_spam > log_ham:
        return 'spam'
    else:
        return 'needs human classification'

In [36]:
# Classify every raw test message and store the prediction next to the true label.
dts['predicted'] = dts['SMS'].map(classify_test_set)
dts.head()

Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


In [42]:
# Count the test messages whose predicted label equals the true label.
# A vectorised column comparison replaces the row-by-row iterrows() loop;
# int() keeps `correct` a plain Python integer as before.
total = len(dts)
correct = int((dts['Label'] == dts['predicted']).sum())
print(correct)

1100


In [45]:
# Accuracy as a percentage of correctly labelled test messages.
accuracy = correct / total *100

In [46]:
# Show the accuracy percentage.
print(accuracy)

98.74326750448833
