In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string

In [14]:
df = pd.read_csv('spam.csv', encoding="ISO-8859-1")

In [15]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [16]:
df = df[['v1', 'v2']]

In [17]:
df.rename(columns = {"v1": "spam", "v2": "text"}, inplace = True)

In [18]:
df.head()

Unnamed: 0,spam,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
df.spam = df.spam.apply(lambda x: True if x == 'spam' else False)

In [20]:
df.text = df.text.apply(lambda x: x.lower().translate(str.maketrans('', '', string.punctuation)))

In [21]:
df = df.sample(frac = 1)
train_df = df.iloc[ : int(len(df) * 0.7)]
test_df = df.iloc[int(len(df) * 0.7) : ]
FRAC_SPAM_WORDS = train_df.spam.mean()

In [23]:
train_spam_words = ' '.join(train_df[train_df.spam == True].text).split(' ')
train_non_spam_words = ' '.join(train_df[train_df.spam == False].text).split(' ')

In [25]:
common_words = set(train_spam_words).intersection(set(train_non_spam_words))

In [26]:
train_spam_bow = dict()
for w in common_words:
  train_spam_bow[w] = train_spam_words.count(w) / len(train_spam_words)

In [28]:
train_non_spam_bow = dict()
for w in common_words:
  train_non_spam_bow[w] = train_non_spam_words.count(w) / len(train_non_spam_words)

In [35]:
def predict_text(t, verbose=False):

    valid_words = [w for w in t if w in common_words]
    spam_probs = [train_spam_bow[w] for w in valid_words]
    non_spam_probs = [train_non_spam_bow[w] for w in valid_words]
    
    if verbose:
        data_df = pd.DataFrame()
        data_df['word'] = valid_words
        data_df['spam_prob'] = spam_probs
        data_df['non_spam_prob'] = non_spam_probs
        data_df['ratio'] = [s/n if n > 0 else np.inf for s,n in zip(spam_probs, non_spam_probs)]
        print(data_df)
     
    #calculate spam score as sum of logs for all probabilities
    spam_score = sum([np.log(p) for p in spam_probs]) + np.log(FRAC_SPAM_WORDS)
    
    #calculate non-spam score as sum of logs for all probabilities
    non_spam_score = sum([np.log(p) for p in non_spam_probs]) + np.log(1-FRAC_SPAM_WORDS)
    
    if verbose:
        print('Spam Score: %s'%spam_score)
        print('Non-Spam Score: %s'%non_spam_score)
   
    return (spam_score >= non_spam_score)

In [36]:
predict_text('urgent call this number'.split(), verbose=True)

     word  spam_prob  non_spam_prob       ratio
0  urgent   0.003464       0.000021  166.126762
1    call   0.020471       0.003274    6.252600
2    this   0.004724       0.003608    1.309459
3  number   0.001653       0.000813    2.033020
Spam Score: -23.307975584013764
Non-Spam Score: -29.385251530982313


True

In [37]:
predict_text('hey do you want to go a movie tonight'.split(), verbose=True)

    word  spam_prob  non_spam_prob     ratio
0    hey   0.000157       0.001606  0.098068
1     do   0.001417       0.005297  0.267563
2    you   0.016298       0.025983  0.627248
3   want   0.001338       0.002377  0.563029
4     to   0.038343       0.021854  1.754505
5     go   0.002047       0.003628  0.564171
6      a   0.022124       0.015452  1.431776
7  movie   0.000079       0.000292  0.269686
Spam Score: -50.755291480694844
Non-Spam Score: -43.264556609583934


False

In [38]:
predictions = test_df.text.apply(lambda x: predict_text(x.split()))

In [40]:
fraction_correctly_detected = np.sum((predictions == True) & (test_df.spam == True)) / np.sum(test_df.spam == True)
fraction_wrongly_sent_to_spam = np.sum((predictions == True) & (test_df.spam == False)) / np.sum(test_df.spam == False)
print('Fraction of spam messages correctly detected: {:.2f}'.format(fraction_correctly_detected))
print('Fraction of valid messages sent to spam: {:.2f}' .format(fraction_wrongly_sent_to_spam))

Fraction of spam messages correctly detected: 0.88
Fraction of valid messages sent to spam: 0.03
