In [1]:
import math
import re

import pandas as pd
# Load the tab-separated SMS corpus. The file carries no header row, so the
# two columns are named explicitly here.
df_spam = pd.read_csv(
    "C:/Users/jpran/Downloads/SMSSpamCollection",
    sep='\t',
    header=None,
    names=['Label', 'SMS'],
)
print(df_spam.shape)
df_spam.head(2)

(5572, 2)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


In [2]:
# Share of each label across the whole corpus (fractions rather than counts).
df_spam['Label'].value_counts(normalize=True)

Label
ham     0.865937
spam    0.134063
Name: proportion, dtype: float64

In [3]:
# Shuffle every row once; the fixed random_state keeps the split reproducible.
df_randomized = df_spam.sample(frac=1, random_state=1)

# The first 80% of the shuffled rows become the training set and the rest is
# held out for testing. Both get a fresh 0..n-1 index.
train_test_index = round(len(df_randomized) * 0.8)
training_set = df_randomized.iloc[:train_test_index].reset_index(drop=True)
test_set = df_randomized.iloc[train_test_index:].reset_index(drop=True)

for subset in (training_set, test_set):
    print(subset.shape)

(4458, 2)
(1114, 2)


In [4]:

# Label proportions in the training split, for comparison with the full corpus.
training_set['Label'].value_counts(normalize=True)

Label
ham     0.86541
spam    0.13459
Name: proportion, dtype: float64

In [5]:
# Label proportions in the held-out split, for comparison with the full corpus.
test_set['Label'].value_counts(normalize=True)

Label
ham     0.868043
spam    0.131957
Name: proportion, dtype: float64

In [6]:
# Normalise the training text: every non-word character becomes a space and
# everything is lower-cased. The 'SMS' column is overwritten in place.
training_set['SMS'] = (
    training_set['SMS']
    .replace('\\W', ' ', regex=True)
    .str.lower()
)
training_set.head()

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [7]:
# Tokenise each cleaned message on whitespace; 'SMS' now holds lists of words.
# NOTE(review): the column is overwritten in place, so re-running this cell
# would call .str.split() on lists instead of strings -- confirm before re-running.
training_set['SMS'] = training_set['SMS'].str.split()

# Collect every distinct word seen in the training messages.
unique_words = set()
for tokens in training_set['SMS']:
    unique_words.update(tokens)
vocabulary = list(unique_words)

In [8]:
# Number of distinct words collected from the training messages.
len(vocabulary)

7783

In [9]:
# One list of per-message counts for every vocabulary word, all starting at 0.
n_messages = len(training_set['SMS'])
word_counts_per_sms = {}
for unique_word in vocabulary:
    word_counts_per_sms[unique_word] = [0] * n_messages

# Walk the tokenised messages and bump the count of each word at the
# position of the message it occurs in.
for row_number, tokens in enumerate(training_set['SMS']):
    for token in tokens:
        word_counts_per_sms[token][row_number] += 1

In [10]:
# Turn the per-word count lists into a frame: one column per vocabulary word,
# one row per training message.
word_counts=pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,turned,it,luck,textand,maniac,them,nokia6600,minute,1013,ec2a,...,dl,anjie,poboxox36504w45wq,bringing,brolly,trusting,spjanuary,masked,tonights,professional
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Put the word-count columns next to the Label/SMS columns; the two frames
# are joined column-wise on their row index.
df_training_cleaned=pd.concat([training_set,word_counts],axis=1)
df_training_cleaned.head()

Unnamed: 0,Label,SMS,turned,it,luck,textand,maniac,them,nokia6600,minute,...,dl,anjie,poboxox36504w45wq,bringing,brolly,trusting,spjanuary,masked,tonights,professional
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Split the training frame by label.
is_spam = df_training_cleaned['Label'] == 'spam'
is_ham = df_training_cleaned['Label'] == 'ham'
spam_messages = df_training_cleaned[is_spam]
ham_messages = df_training_cleaned[is_ham]

# Class priors: the fraction of training messages carrying each label.
n_training_messages = len(df_training_cleaned)
p_spam = len(spam_messages) / n_training_messages
p_ham = len(ham_messages) / n_training_messages

# Total number of word tokens in each class.
n_words_per_spam_messages = spam_messages['SMS'].apply(len)
n_spam = n_words_per_spam_messages.sum()
n_words_per_ham_messages = ham_messages['SMS'].apply(len)
n_ham = n_words_per_ham_messages.sum()

# Vocabulary size and the additive smoothing constant used for the
# per-word probabilities.
n_vocabulary = len(vocabulary)
alpha = 1

In [13]:
# Smoothed per-word probabilities for each class:
#     (count of word in class + alpha) / (tokens in class + alpha * vocabulary size)
# The denominators do not depend on the word, so they are computed once.
spam_denominator = n_spam + alpha * n_vocabulary
ham_denominator = n_ham + alpha * n_vocabulary

parameters_spam = {}
parameters_ham = {}
for word in vocabulary:
    parameters_spam[word] = (spam_messages[word].sum() + alpha) / spam_denominator
    parameters_ham[word] = (ham_messages[word].sum() + alpha) / ham_denominator

In [15]:
import re
def classify(message):
    '''
    Print the spam and ham scores for a message and the resulting label.

    message: a string. Non-word characters are replaced by spaces, the text
    is lower-cased and split on whitespace. Words that are missing from the
    parameter tables are ignored.

    Reads the module-level p_spam, p_ham, parameters_spam and parameters_ham;
    both priors are assumed to be strictly positive.

    Scores are accumulated as sums of logarithms rather than as a running
    product. A product of many small probabilities underflows to 0.0 for long
    messages, which made the comparison meaningless once both sides hit zero.
    The printed values are exp() of the log scores and may still display as
    0.0, but the label is decided on the log scores.
    '''
    message = re.sub('\\W', ' ', message)
    message = message.lower().split()

    log_spam_score = math.log(p_spam)
    log_ham_score = math.log(p_ham)
    for word in message:
        if word in parameters_spam:
            log_spam_score += math.log(parameters_spam[word])
        if word in parameters_ham:
            log_ham_score += math.log(parameters_ham[word])

    # math.exp returns 0.0 (it does not raise) when the result underflows.
    print('P(Spam|message):', math.exp(log_spam_score))
    print('P(Ham|message):', math.exp(log_ham_score))

    if log_ham_score > log_spam_score:
        print('Label: Ham')
    elif log_ham_score < log_spam_score:
        print('Label: Spam')
    else:
        print('Equal probabilities, have a human classify this! ')


In [16]:
# Try the classifier on a long prize-notification style text.
classify('Subject: 📢 You Won $1,000,000! Claim Your Prize Now! 🎉Dear Winner,CONGRATULATIONS!!! 🎊 You are our lucky winner of $1,000,000 in our exclusive prize draw. Your email was selected among millions of participants, and you are just one step away from claiming your life-changing prize!To claim your winnings, click the link below and follow the simple steps,🔗 Claim Your Prize HURRY! This offer is available for a limited time only, and you must act fast to avoid missing out! Please provide the following information to verify your identity:Our team is eagerly waiting to transfer your prize! 💸 Dont delay—click now to secure your future Best regards,')

P(Spam|message): 2.173641330419176e-298
P(Ham|message): 0.0
Label: Spam


In [17]:
# Try the classifier on a short everyday greeting.
classify('hi how are you?')

P(Spam|message): 9.005579833206856e-13
P(Ham|message): 6.76830280260844e-10
Label: Ham


In [18]:
# Try the classifier on a conversational message that mentions money.
classify('hello can you help me out i need 10 million dollar and ill never return it')

P(Spam|message): 3.789991845261025e-50
P(Ham|message): 7.650861164497053e-43
Label: Ham


In [23]:
def classify_test_set(message):
    '''
    Return the predicted label for a message instead of printing it.

    message: a string. It is cleaned the same way as in classify():
    non-word characters become spaces, then lower-case, then split on
    whitespace. Words missing from the parameter tables are ignored.

    Returns 'ham', 'spam', or 'needs human classification' when the two
    scores are exactly equal.

    Reads the module-level p_spam, p_ham, parameters_spam and parameters_ham;
    both priors are assumed to be strictly positive.

    Two fixes relative to the earlier version of this cell:
    - the ham accumulator was misspelled on update, which raised
      UnboundLocalError for any message containing a known word;
    - scores are summed in log space, because a running product of many
      small probabilities underflows to 0.0 for long messages.

    Note: this name is defined again in a later cell; whichever definition
    was executed last is the one in effect.
    '''
    message = re.sub('\\W', ' ', message)
    message = message.lower().split()

    log_spam_score = math.log(p_spam)
    log_ham_score = math.log(p_ham)
    for word in message:
        if word in parameters_spam:
            log_spam_score += math.log(parameters_spam[word])
        if word in parameters_ham:
            log_ham_score += math.log(parameters_ham[word])

    if log_ham_score > log_spam_score:
        return 'ham'
    elif log_spam_score > log_ham_score:
        return 'spam'
    else:
        return 'needs human classification'

In [26]:
def classify_test_set(message):
    '''
    Return the predicted label for a message.

    message: a string. Non-word characters are replaced by spaces, the text
    is lower-cased and split on whitespace. Words that are missing from the
    parameter tables are ignored.

    Returns 'ham', 'spam', or 'needs human classification' when the two
    scores are exactly equal.

    Reads the module-level p_spam, p_ham, parameters_spam and parameters_ham;
    both priors are assumed to be strictly positive.

    Scores are accumulated as sums of logarithms. Multiplying the raw
    probabilities underflows to 0.0 on long messages, which turned a clear
    decision into a spurious tie.
    '''

    message = re.sub('\\W', ' ', message)
    message = message.lower().split()

    log_spam_score = math.log(p_spam)
    log_ham_score = math.log(p_ham)

    for word in message:
        if word in parameters_spam:
            log_spam_score += math.log(parameters_spam[word])

        if word in parameters_ham:
            log_ham_score += math.log(parameters_ham[word])

    if log_ham_score > log_spam_score:
        return 'ham'
    elif log_spam_score > log_ham_score:
        return 'spam'
    else:
        return 'needs human classification'

In [27]:
# Predict a label for every held-out message. The raw text is passed in;
# classify_test_set does its own cleaning and tokenising.
test_set['predicted']=test_set['SMS'].apply(classify_test_set)
test_set.head()

Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


In [28]:
# A prediction is correct only when it equals the true label exactly, so
# rows returned as 'needs human classification' count as incorrect.
total = test_set.shape[0]
matches = test_set['Label'] == test_set['predicted']
correct = int(matches.sum())

print('Correct: ', correct)
print('Incorrect: ', total - correct)
print('Accuracy: ', correct / total)

Correct:  1100
Incorrect:  14
Accuracy:  0.9874326750448833
