# Student : Arora, Sanjana (V00966221)

Question 3

In [4]:
import numpy as np
import pandas as pd

Upload the smsspamcollection.zip folder to Google Colab

In [5]:
!unzip '/content/smsspamcollection.zip'

Archive:  /content/smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [6]:
df =pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['Label', 'SMS'])

In [7]:
df.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
df_clean = df.copy()

In [9]:
# removing the punctuation
df_clean['SMS'] = df_clean['SMS'].str.replace('\W', ' ')

In [10]:
# converting the messages into lower string
df_clean['SMS'] = df_clean['SMS'].str.lower()

In [11]:
df_clean['SMS'].head()

0    go until jurong point  crazy   available only ...
1                        ok lar    joking wif u oni   
2    free entry in 2 a wkly comp to win fa cup fina...
3    u dun say so early hor    u c already then say   
4    nah i don t think he goes to usf  he lives aro...
Name: SMS, dtype: object

In [12]:
df_clean['SMS'] = df_clean['SMS'].str.split()

In [13]:
df_clean['SMS'].head()

0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, 2, a, wkly, comp, to, win, f...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, don, t, think, he, goes, to, usf, he,...
Name: SMS, dtype: object

In [14]:
df_clean['Label'].value_counts() / df_clean['Label'].shape[0]*100

ham     86.593683
spam    13.406317
Name: Label, dtype: float64

In [15]:
# splitting the dataset into training and testing
def train_validation_split(dataset):
    dataset = dataset.sample(frac = 1)
    validation_split = .3
    dataset_size = len(dataset)
    split = int(np.floor(validation_split * dataset_size))
    training_data = dataset.iloc[split:].reset_index(drop=True)
    validation_data = dataset.iloc[:split].reset_index(drop=True)
    return training_data,validation_data

In [16]:
train_data, test_data = train_validation_split(df_clean)

In [17]:
train_data['Label'].value_counts() / train_data.shape[0]*100

ham     86.87516
spam    13.12484
Name: Label, dtype: float64

Creating a vocabulary of the words included in the messages and counting the number of separate words in the same

In [18]:
vocabulary = []
for sms in train_data['SMS']:
   for word in sms:
      vocabulary.append(word)

vocabulary = list(set(vocabulary))

In [19]:
vocabulary[1:11]

['voila',
 '2',
 'slurp',
 'mobypobox734ls27yf',
 'bsnl',
 '09057039994',
 'voda',
 '08717205546',
 'waz',
 '5pm']

In [20]:
len(vocabulary)

7209

 bag of words

In [21]:
word_counts_per_sms = pd.DataFrame([[row[1].count(word) for word in vocabulary] for _, row in train_data.iterrows()], columns=vocabulary)

In [22]:
train_data = pd.concat([train_data.reset_index(), word_counts_per_sms], axis=1).iloc[:,1:]

In [23]:
train_data.head()


Unnamed: 0,Label,SMS,thats,voila,2,slurp,mobypobox734ls27yf,bsnl,09057039994,voda,08717205546,waz,5pm,mon,weiyi,try,account,tag,coupla,chatter,2000,9758,asked,bam,glands,bookedthe,inviting,wan,fyi,08712400200,sonyericsson,subletting,carly,night,cuppa,545,burn,abta,wasted,5,...,4years,20p,app,yaxx,ettans,hppnss,4a,secs,07753741225,dracula,nigh,touch,coz,once,uniform,site,pathaya,b4280703,concentrating,credits,victors,html,piece,aig,ip4,bbq,divorce,tunji,sdryb8i,140ppm,seriously,unconvinced,fren,stories,deleted,1lemon,two,hits,academic,temales
0,ham,"[hi, its, kate, how, is, your, evening, i, hop...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,ham,"[i, can, do, that, i, want, to, please, you, b...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,ham,"[got, it, seventeen, pounds, for, seven, hundr...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,ham,"[hi, hope, ur, day, good, back, from, walk, ta...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,ham,"[easy, ah, sen, got, selected, means, its, good]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Probability of message to be Spam

In [24]:
spam_number = train_data['Label'].value_counts()['spam']
Pspam = spam_number / train_data.shape[0]

Probability of message to be Non-Spam

In [25]:
ham_number = train_data['Label'].value_counts()['ham']
Pham = ham_number / train_data.shape[0]

Number of words in Spam Messages

In [26]:
Nspam = len((train_data.loc[train_data['Label'] == 'spam', 'SMS']).sum())

Number of words in Non-Spam Messages

In [27]:
Nham = len(train_data.loc[train_data['Label'] == 'ham','SMS'].sum())
                      

Size of the Vocabulary

In [28]:
Nvoc = len(vocabulary)

In [29]:
# coefficient for the cases when a word is absent in the message
alpha = 1

In [30]:
# calculating the probability of a word present in spam messages

def p_w_spam(word):
    if word in train_data.columns:
        p_w_spam = (train_data.loc[train_data['Label'] == 'spam', word].sum() + alpha) / (Nspam + alpha*Nvoc)
        return p_w_spam
    else:
        return 1   

In [31]:
# calculating the probability of a word present in non-spam messages
def p_w_ham(word):
    if word in train_data.columns:
        return (train_data.loc[train_data['Label'] == 'ham', word].sum() + alpha) / (Nham + alpha*Nvoc)
    else:
        return 1

In [32]:
def bayes_classifier(message):
    p_spam_message = Pspam
    p_ham_message = Pham
    for word in message:
        p_spam_message *= p_w_spam(word)
        p_ham_message *= p_w_ham(word)
    if p_ham_message > p_spam_message:
        return 'ham'
    elif p_ham_message < p_spam_message:
        return 'spam'

In [33]:
bayes_classifier('secret')

'ham'

In [34]:
bayes_classifier(['secret', 'source', 'of', 'infinite', 'power'])

'ham'

In [35]:
test_data['predicted'] = test_data['SMS'].apply(bayes_classifier)

In [36]:
test_data.head()

Unnamed: 0,Label,SMS,predicted
0,ham,"[am, slow, in, using, biola, s, fne]",ham
1,ham,"[tap, spile, at, seven, is, that, pub, on, gas...",ham
2,ham,"[first, answer, my, question]",ham
3,ham,"[tomarrow, i, want, to, got, to, court, at, lt...",ham
4,ham,"[r, u, here, yet, i, m, wearing, blue, shirt, ...",ham


In [37]:
# calculating the testing accuracy
correct = (test_data['predicted'] == test_data['Label']).sum() / test_data.shape[0] * 100

In [38]:
# misclassified dataset
test_data.loc[test_data['predicted'] != test_data['Label']]

Unnamed: 0,Label,SMS,predicted
161,spam,"[babe, u, want, me, dont, u, baby, im, nasty, ...",ham
205,spam,"[you, won, t, believe, it, but, it, s, true, i...",ham
207,ham,"[gettin, rdy, to, ship, comp]",spam
254,spam,"[rct, thnq, adrian, for, u, text, rgds, vatian]",ham
264,ham,"[mathews, or, tait, or, edwards, or, anderson]",spam
532,spam,"[0a, networks, allow, companies, to, bill, for...",ham
548,spam,"[check, out, choose, your, babe, videos, sms, ...",ham
584,spam,"[i, want, some, cock, my, hubby, s, away, i, n...",ham
770,spam,"[dating, i, have, had, two, of, these, only, s...",ham
790,spam,"[how, come, it, takes, so, little, time, for, ...",ham


In [39]:
# testing accuracy
correct

98.62357869539198

In [40]:
train_data['predicted'] = train_data['SMS'].apply(bayes_classifier)

In [41]:
# calculating the training accuracy
correct_train = (train_data['predicted'] == train_data['Label']).sum() / train_data.shape[0] * 100

In [42]:
correct_train

99.15406306075366

As indicated in the training and testing accuracy numbers, the simple bayesian classifier is performing really well in classifying the SMS message as spam or not spam.

Sources Referred : https://towardsdatascience.com/build-sms-spam-classification-model-using-naive-bayes-random-forest-43465d6617ed


https://serokell.io/blog/machine-learning-algorithm-classification-overview