In [2]:
import pandas as pd
sms_spam = pd.read_csv('SMSSpamCollection', sep= '\t', header = None, names= (['Label', 'SMS']))

sms_spam.shape
                       

(5572, 2)

In [3]:
sms_spam.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
sms_spam.Label.value_counts(normalize= True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

In [6]:
data_randomized= sms_spam.sample(frac= 1, random_state=2)

training_test_index = round(len(data_randomized) * 0.8)


In [7]:
training_set = data_randomized[:training_test_index].reset_index(drop= True)
test_set = data_randomized[training_test_index:].reset_index(drop=True)


In [8]:
print(training_set.shape)
print(test_set.shape)

(4458, 2)
(1114, 2)


In [9]:
training_set['Label'].value_counts(normalize=True)

ham     0.867654
spam    0.132346
Name: Label, dtype: float64

In [10]:
test_set['Label'].value_counts(normalize=True)

ham     0.859066
spam    0.140934
Name: Label, dtype: float64

In [11]:
training_set.head()

Unnamed: 0,Label,SMS
0,ham,Omg if its not one thing its another. My cat h...
1,ham,I hope you know I'm still mad at you.
2,ham,Waqt se pehle or naseeb se zyada kisi ko kuch ...
3,ham,What time should I tell my friend to be around?
4,ham,Yo theres no class tmrw right?


In [12]:
training_set['SMS']= training_set['SMS'].str.replace('\W', ' ').str.lower()
training_set.head()

Unnamed: 0,Label,SMS
0,ham,omg if its not one thing its another my cat h...
1,ham,i hope you know i m still mad at you
2,ham,waqt se pehle or naseeb se zyada kisi ko kuch ...
3,ham,what time should i tell my friend to be around
4,ham,yo theres no class tmrw right


In [13]:
training_set['SMS'] = training_set['SMS'].str.split()

vocabulary = []
for sms in training_set['SMS']:
    for word in sms:
        vocabulary.append(word)
        
vocabulary = list(set(vocabulary))


In [14]:
len(vocabulary)

7785

In [15]:
word_counts_per_sms = {unique_word: [0]* len(training_set['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(training_set['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] +=1 

In [16]:
word_counts = {unique_word: [0]* len(training_set['SMS']) for unique_word in vocabulary}


In [17]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,subs16,fishhead,triple,situation,mitsake,evening,callfreefone,jelly,rude,finishing,...,courage,hint,drivin,centre,havnt,answer,knee,tscs,lol,fundamentals
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
training_set_clean = pd.concat([training_set, word_counts], axis=1)

In [19]:
spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']
ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']

p_spam= len(spam_messages)/len(training_set_clean)
p_ham = len(ham_messages)/len(training_set_clean)

n_words_per_spam_message = spam_messages['SMS'].apply(len)
n_spam = n_words_per_spam_message.sum()

n_words_per_ham_message = ham_messages['SMS'].apply(len)
n_ham = n_words_per_ham_message.sum()

n_vocabulary = len(vocabulary)

alpha = 1

In [20]:
parameters_spam = {word: 0 for word in vocabulary}
parameters_ham = {word:0 for word in vocabulary}

for word in vocabulary:
    n_word_given_spam = spam_messages[word].sum()
    p_word_given_spam = (n_word_given_spam + alpha)/ (n_spam + alpha * n_vocabulary)
    parameters_spam[word] = p_word_given_spam
    
    n_word_given_ham = ham_messages[word].sum()
    p_word_given_ham = (n_word_given_ham + alpha)/ (n_ham + alpha * n_vocabulary)
    parameters_ham[word] = p_word_given_ham

In [21]:
import re


def classify(message):
    '''message a string'''
    
    message = re.sub('\W', ' ', message).lower().split()
    #message = message.
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    
    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
            
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
            
    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)
    
    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')   
        
    

In [22]:
classify('WINNER: This is the secret code to unlock the money: C3421.')


P(Spam|message): 1.2656035752364677e-25
P(Ham|message): 3.249373610715579e-27
Label: Spam


In [23]:

classify("you won")

P(Spam|message): 3.896143183040672e-06
P(Ham|message): 5.243297444420137e-06
Label: Ham


In [24]:
 def classify_test_set(message):  
    '''
    message: a string
    '''
    
    message = re.sub('\W', ' ', message)
    message = message.lower().split()
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
            
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
    
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [25]:
test_set['predicted'] = test_set['SMS'].apply(classify_test_set)
test_set.head()

Unnamed: 0,Label,SMS,predicted
0,ham,You are a big chic. Common. Declare,ham
1,ham,Eh u remember how 2 spell his name... Yes i di...,ham
2,ham,I sent them. Do you like?,ham
3,ham,Hows the pain dear?y r u smiling?,ham
4,ham,Goodnight da thangam I really miss u dear.,ham


In [28]:
correct = 0
total = test_set.shape[0]
    
for row in test_set.iterrows():
    row = row[1]
    if row['Label'] == row['predicted']:
        correct += 1
        
print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Correct: 1097
Incorrect: 17
Accuracy: 0.9847396768402155


In [29]:
test_set.loc[test_set['Label'] != test_set['predicted']]

Unnamed: 0,Label,SMS,predicted
5,spam,"I want some cock! My hubby's away, I need a re...",ham
16,ham,"Madam,regret disturbance.might receive a refer...",spam
49,spam,2/2 146tf150p,ham
73,spam,FreeMsg: Hey - I'm Buffy. 25 and love to satis...,ham
136,spam,"SMS. ac sun0819 posts HELLO:""You seem cool, wa...",ham
208,spam,LIFE has never been this much fun and great un...,ham
353,spam,"Hi babe its Jordan, how r u? Im home from abro...",ham
373,spam,dating:i have had two of these. Only started a...,ham
400,ham,Waiting for your call.,spam
421,spam,Sorry I missed your call let's talk when you h...,ham


In [34]:
classify('plese call, the offer is ging to end soon man')

P(Spam|message): 6.991911498638692e-24
P(Ham|message): 1.115941655993273e-22
Label: Ham


In [31]:
classify('For me the love should start with attraction')


P(Spam|message): 1.1204216748898042e-21
P(Ham|message): 2.3516173288367196e-18
Label: Ham
