In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import string


In [2]:
#read dataset
spam_df = pd.read_csv(r"C:\Users\Swapnali\Desktop\Data_ML\spam.csv", encoding="ISO-8859-1")

#subset and rename columns
spam_df = spam_df[['v1', 'v2']]
spam_df.rename(columns={'v1': 'spam', 'v2': 'text'}, inplace=True)

#convert spam column to binary
spam_df.spam = spam_df.spam.apply(lambda s: True if s=='spam' else False)

#lowercase everything and remove punctuation
spam_df.text = spam_df.text.apply(lambda t: t.lower().translate(str.maketrans('', '', string.punctuation)))

#shuffle
spam_df = spam_df.sample(frac=1)

In [3]:
spam_df


Unnamed: 0,spam,text
3251,False,come to medical college at 7pm forward it da
765,True,ur awarded a city break and could win a å£200 ...
1039,False,they just talking thats it de they wont any other
2668,True,wanna get laid 2nite want real dogging locatio...
2319,False,in which place do you want da
...,...,...
832,True,dear voucher holder to claim this weeks offer ...
329,False,im reading the text i just sent you its meant ...
2969,False,u should make a fb list
3394,False,then i buy


In [5]:
for t in spam_df[spam_df.spam == True].iloc[:5].text:
    print(t)
    print('-------')


ur awarded a city break and could win a å£200 summer shopping spree every wk txt store to 88039  skilgme tscs087147403231winawkage16 å£150perwksub
-------
wanna get laid 2nite want real dogging locations sent direct to ur mobile join the uks largest dogging network txt park to 69696 now nyt ec2a 3lp å£150msg
-------
urgent we are trying to contact u todays draw shows that you have won a å£800 prize guaranteed call 09050001808 from land line claim m95 valid12hrs only
-------
sms services for your inclusive text credits pls goto wwwcomuknet login  unsubscribe with stop no extra charge help08700469649 po box420 ip4 5we
-------
freemsg hi baby wow just got a new cam moby wanna c a hot pic or fancy a chatim w8in 4utxt  rply chat to 82242 hlp 08712317606 msg150p 2rcv
-------


In [6]:
for t in spam_df[spam_df.spam == False].iloc[:5].text:
    print(t)
    print('-------')


come to medical college at 7pm forward it da
-------
they just talking thats it de they wont any other
-------
in which place do you want da
-------
o shore are you takin the bus
-------
oh kafter that placement there ah
-------


In [7]:
#get training set
train_spam_df = spam_df.iloc[:int(len(spam_df)*0.7)]

#get testing set
test_spam_df = spam_df.iloc[int(len(spam_df)*0.7):]

In [8]:
#Fraction of texts that are spam
FRAC_SPAM_TEXTS = train_spam_df.spam.mean()
print(FRAC_SPAM_TEXTS)


0.1317948717948718


In [9]:
#get all words from spam and non-spam datasets
train_spam_words = ' '.join(train_spam_df[train_spam_df.spam == True].text).split(' ')
train_non_spam_words = ' '.join(train_spam_df[train_spam_df.spam == False].text).split(' ')

common_words = set(train_spam_words).intersection(set(train_non_spam_words))

In [10]:
train_spam_bow = dict()
for w in common_words:
    train_spam_bow[w] = train_spam_words.count(w) / len(train_spam_words)


In [11]:
train_non_spam_bow = dict()
for w in common_words:
    train_non_spam_bow[w] = train_non_spam_words.count(w) / len(train_non_spam_words)


In [12]:
train_spam_bow

{'': 0.014140918751021742,
 'draw': 0.0017982671244073892,
 'allow': 8.173941474579042e-05,
 'post': 0.00016347882949158083,
 'her': 0.0006539153179663233,
 'no1': 0.00032695765898316167,
 'photo': 0.0002452182442373713,
 'access': 0.00032695765898316167,
 'mths': 0.0002452182442373713,
 'valentines': 0.00016347882949158083,
 'choice': 0.00016347882949158083,
 'phone': 0.0026156612718652933,
 'chatting': 0.00016347882949158083,
 'response': 8.173941474579042e-05,
 'with': 0.005394801373222168,
 'spanish': 0.00016347882949158083,
 'receiving': 0.00016347882949158083,
 'stay': 0.0002452182442373713,
 'ready': 0.0002452182442373713,
 'offer': 0.001389570050678437,
 'look': 8.173941474579042e-05,
 'personal': 0.00016347882949158083,
 'getting': 0.0002452182442373713,
 'wife': 8.173941474579042e-05,
 'maybe': 0.00016347882949158083,
 'giv': 8.173941474579042e-05,
 'left': 8.173941474579042e-05,
 'points': 0.000980872976949485,
 'village': 0.00016347882949158083,
 'open': 8.173941474579042e-

In [13]:
def predict_text(t, verbose=False):
    #if some word doesnt appear in either spam or non-spam BOW, disregard it
    valid_words = [w for w in t if w in train_spam_bow]

    #get the probabilities of each valid word showing up in spam and non-spam BOW
    spam_probs = [train_spam_bow[w] for w in valid_words]
    non_spam_probs = [train_non_spam_bow[w] for w in valid_words]
    
    #print probs if requested
    if verbose:
        data_df = pd.DataFrame()
        data_df['word'] = valid_words
        data_df['spam_prob'] = spam_probs
        data_df['non_spam_prob'] = non_spam_probs
        data_df['ratio'] = [s/n if n > 0 else np.inf for s,n in zip(spam_probs, non_spam_probs)]
        print(data_df)

    #calculate spam score as sum of logs for all probabilities
    spam_score = sum([np.log(p) for p in spam_probs]) + np.log(FRAC_SPAM_TEXTS)

    #calculate non-spam score as sum of logs for all probabilities
    non_spam_score = sum([np.log(p) for p in non_spam_probs]) + np.log(1-FRAC_SPAM_TEXTS)

#if verbose, report the two scores
    if verbose:
        print('Spam Score: %s'%spam_score)
        print('Non-Spam Score: %s'%non_spam_score)
   
    #if spam score is higher, mark this as spam
    return (spam_score >= non_spam_score)

In [14]:
predict_text('urgent call this number'.split(), verbose=True)


     word  spam_prob  non_spam_prob      ratio
0  urgent   0.003842       0.000041  92.887813
1    call   0.019127       0.003371   5.674389
2    this   0.005068       0.003619   1.400376
3  number   0.001471       0.000931   1.581069
Spam Score: -23.351430665863596
Non-Spam Score: -28.528446496624277


True

In [15]:
predict_text('hey do you want to go a movie tonight'.split(), verbose=True)


    word  spam_prob  non_spam_prob     ratio
0    hey   0.000245       0.001551  0.158107
1     do   0.001471       0.005728  0.256852
2    you   0.016103       0.026966  0.597145
3   want   0.001308       0.002399  0.545196
4     to   0.039725       0.022127  1.795326
5     go   0.001553       0.003557  0.436632
6      a   0.020925       0.015220  1.374843
7  movie   0.000082       0.000310  0.263512
Spam Score: -50.60170093999412
Non-Spam Score: -43.131781136527884


False

In [17]:
#test on the test data set
predictions = test_spam_df.text.apply(lambda t: predict_text(t.split()))


In [18]:
frac_spam_messages_correctly_detected = np.sum((predictions == True) & (test_spam_df.spam == True)) / np.sum(test_spam_df.spam == True)
print('Fraction Spam Correctly Detected: %s'%frac_spam_messages_correctly_detected)


Fraction Spam Correctly Detected: 0.9098712446351931


In [19]:
frac_valid_sent_to_spam = np.sum((predictions == True) & (test_spam_df.spam == False)) / np.sum(test_spam_df.spam == False)
print('Fraction Valid Messages Sent to Spam: %s'%frac_valid_sent_to_spam)


Fraction Valid Messages Sent to Spam: 0.022237665045170257


In [21]:
predict_text('call me'.split(), verbose=True)

   word  spam_prob  non_spam_prob     ratio
0  call   0.019127       0.003371  5.674389
1    me   0.001471       0.011105  0.132492
Spam Score: -12.504764173013728
Non-Spam Score: -10.334311510239111


False