### Importing the dataset and inspecting its basic information


In [None]:
import pandas as pd

# Load the tab-separated messages file. It has no header row, so the two
# columns are named explicitly.
data = pd.read_csv(
    "/content/MessagesDataset.txt",
    sep="\t",
    header=None,
    names=["Type", "Message"],
)

# Rename the raw labels to more readable class names in one pass.
data["Type"] = data["Type"].replace({"ham": "Non-Spam", "spam": "Spam"})

print(data.shape)
data.head(10)

(5572, 2)


Unnamed: 0,Type,Message
0,Non-Spam,"Go until jurong point, crazy.. Available only ..."
1,Non-Spam,Ok lar... Joking wif u oni...
2,Spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,Non-Spam,U dun say so early hor... U c already then say...
4,Non-Spam,"Nah I don't think he goes to usf, he lives aro..."
5,Spam,FreeMsg Hey there darling it's been 3 week's n...
6,Non-Spam,Even my brother is not like to speak with me. ...
7,Non-Spam,As per your request 'Melle Melle (Oru Minnamin...
8,Spam,WINNER!! As a valued network customer you have...
9,Spam,Had your mobile 11 months or more? U R entitle...


In [None]:
# Fraction (not raw count) of messages in each class across the whole dataset.
data["Type"].value_counts(normalize=True)

Non-Spam    0.865937
Spam        0.134063
Name: Type, dtype: float64

### Splitting dataset into test and training sets

In [None]:
# Shuffle all rows; the fixed seed keeps the shuffle repeatable.
data_random = data.sample(frac=1, random_state=42)

# Row position that separates the first 80% (training) from the rest (test).
training_test_index = round(len(data_random) * 0.8)

# Positional slices of the shuffled frame, each re-indexed from zero.
training = data_random.iloc[:training_test_index].reset_index(drop=True)
test = data_random.iloc[training_test_index:].reset_index(drop=True)

print(training.shape)
print(test.shape)

(4458, 2)
(1114, 2)


In [None]:
# Class proportions of each split, printed one after the other for comparison.
print(training["Type"].value_counts(normalize=True))
print(test["Type"].value_counts(normalize=True))

Non-Spam    0.866981
Spam        0.133019
Name: Type, dtype: float64
Non-Spam    0.861759
Spam        0.138241
Name: Type, dtype: float64


### Data Processing

In [None]:
# Normalise the training text: every non-word character becomes a space and
# the result is lowercased.
#
# `regex=True` must be passed explicitly: without it older pandas emits a
# FutureWarning, and pandas >= 2.0 treats the pattern as the literal two
# characters backslash + W, so no punctuation would be removed at all.
# The raw string keeps `\W` from being read as an (invalid) string escape.
training.Message = training.Message.str.replace(r"\W", " ", regex=True)
training.Message = training.Message.str.lower()

# Show the cleaned column (only the last expression of a cell is displayed).
training.Message

  training.Message = training.Message.str.replace("\W", " ")


0       squeeeeeze   this is christmas hug   if u lik ...
1       and also i ve sorta blown him off a couple tim...
2       mmm thats better now i got a roast down me  i ...
3           mm have some kanji dont eat anything heavy ok
4       so there s a ring that comes with the guys cos...
                              ...                        
4453    maybe if you woke up before fucking 3 this wou...
4454    me hungry buy some food good lei    but mum n ...
4455               wow v v impressed  have funs shopping 
4456    yo you around  a friend of mine s lookin to pi...
4457                              i fetch yun or u fetch 
Name: Message, Length: 4458, dtype: object

In [None]:
# Split each cleaned message on whitespace; the column now holds lists of words.
training.Message = training.Message.str.split()

# Vocabulary: every distinct word that occurs in any training message.
unique_words = list({word for message in training.Message for word in message})
len(unique_words)

7816

### Feature Extraction

In [None]:
# One column per vocabulary word, one row per training message, initially all
# zeros. Each word gets its own list so the counts are independent.
word_count = {word: [0] * len(training.Message) for word in unique_words}

# Count how many times each word occurs in each message.
# The loop variable is deliberately NOT called `data`: that name is the
# DataFrame loaded at the top of the notebook, and rebinding it here would
# break any earlier cell that is re-run afterwards.
for index, tokens in enumerate(training.Message):
    for word in tokens:
        word_count[word][index] += 1

word_count = pd.DataFrame(word_count)
word_count.head(10)

Unnamed: 0,velachery,kath,unintentionally,thenampet,volcanoes,7oz,sg,general,definite,09066612661,...,hearts,regretted,cds,09061702893,breaker,zogtorius,practising,performed,plan,alertfrom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Place the per-word count columns alongside the label and token-list columns.
clean_training = pd.concat((training, word_count), axis="columns")
clean_training.head(10)

Unnamed: 0,Type,Message,velachery,kath,unintentionally,thenampet,volcanoes,7oz,sg,general,...,hearts,regretted,cds,09061702893,breaker,zogtorius,practising,performed,plan,alertfrom
0,Non-Spam,"[squeeeeeze, this, is, christmas, hug, if, u, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Non-Spam,"[and, also, i, ve, sorta, blown, him, off, a, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Non-Spam,"[mmm, thats, better, now, i, got, a, roast, do...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Non-Spam,"[mm, have, some, kanji, dont, eat, anything, h...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Non-Spam,"[so, there, s, a, ring, that, comes, with, the...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Non-Spam,"[sary, just, need, tim, in, the, bollox, it, h...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Non-Spam,"[love, isn, t, a, decision, it, s, a, feeling,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Non-Spam,"[my, supervisor, find, 4, me, one, lor, i, thk...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Non-Spam,"[dear, good, morning, now, only, i, am, up]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Non-Spam,"[i, m, in, chennai, velachery]",1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Estimating the Naive Bayes parameters and checking messages for spam

In [None]:
# Additive-smoothing constant and vocabulary size used by the per-word estimates.
alpha = 1
n_unique = len(unique_words)

# Partition the training rows by label.
spam_messages = clean_training.loc[clean_training["Type"] == "Spam"]
non_spam_messages = clean_training.loc[clean_training["Type"] == "Non-Spam"]

# Class priors: the share of training messages carrying each label.
p_spam = len(spam_messages) / len(clean_training)
p_non_spam = len(non_spam_messages) / len(clean_training)

# Total number of word occurrences in each class (sum of the token-list lengths).
words_per_spam = spam_messages["Message"].apply(len)
words_per_non_spam = non_spam_messages["Message"].apply(len)
n_spam = words_per_spam.sum()
n_non_spam = words_per_non_spam.sum()

In [None]:
# The smoothed denominators do not depend on the word, so compute them once.
spam_denominator = n_spam + alpha * n_unique
non_spam_denominator = n_non_spam + alpha * n_unique

# P(word | Spam): smoothed share of spam word occurrences that are `word`.
parameters_spam = {
    word: (spam_messages[word].sum() + alpha) / spam_denominator
    for word in unique_words
}

# P(word | Non-Spam): the same estimate over the non-spam messages.
parameters_non_spam = {
    word: (non_spam_messages[word].sum() + alpha) / non_spam_denominator
    for word in unique_words
}

In [None]:
import re

def classify(message):
    """Label a raw message as "Spam" or "Non-Spam" using the fitted parameters.

    The message is cleaned the same way as the training text (non-word
    characters replaced by spaces, lowercased, split on whitespace). Each
    class score starts from its prior (`p_spam` / `p_non_spam`) and is
    combined with the per-word probabilities from `parameters_spam` /
    `parameters_non_spam`. Words missing from those dictionaries are ignored.

    Scores are accumulated as sums of logarithms rather than products of
    probabilities. The ordering of the two scores is the same, but a long
    message no longer underflows both products to 0.0 and gets reported as
    a tie.

    Parameters
    ----------
    message : str
        Raw, unprocessed message text.

    Returns
    -------
    str
        "Spam", "Non-Spam", or the "Equal probabilities, ..." notice when the
        two scores are exactly equal.
    """
    import math  # local import so this cell does not depend on another cell's imports

    def _log(probability):
        # A zero probability (possible only without smoothing) maps to -inf,
        # mirroring how a zero factor would zero out the original product.
        return math.log(probability) if probability > 0 else float("-inf")

    # Raw string: "\W" in a normal string literal is an invalid escape sequence.
    words = re.sub(r"\W", " ", message).lower().split()

    log_p_spam_given_message = _log(p_spam)
    log_p_non_spam_given_message = _log(p_non_spam)

    for word in words:

        if word in parameters_spam:
            log_p_spam_given_message += _log(parameters_spam[word])

        if word in parameters_non_spam:
            log_p_non_spam_given_message += _log(parameters_non_spam[word])

    if log_p_spam_given_message > log_p_non_spam_given_message:
        return "Spam"
    elif log_p_non_spam_given_message > log_p_spam_given_message:
        return "Non-Spam"
    else:
        return "Equal probabilities, human needed to classify this!"

# Try the classifier on a single short, informal message.
classify("Hey Tirath, wassup")

'Non-Spam'

In [None]:
# Classify every raw test message and keep the result beside the true label.
test["Prediction"] = test["Message"].map(classify)
test.head(10)

Unnamed: 0,Type,Message,Prediction
0,Non-Spam,Was playng 9 doors game and gt racing on phone...,Non-Spam
1,Non-Spam,I dont thnk its a wrong calling between us,Non-Spam
2,Non-Spam,All e best 4 ur exam later.,Non-Spam
3,Non-Spam,Hey what how about your project. Started aha da.,Non-Spam
4,Non-Spam,"Dunno, my dad said he coming home 2 bring us o...",Non-Spam
5,Non-Spam,Can i meet ü at 5.. As 4 where depends on wher...,Non-Spam
6,Non-Spam,"Sorry, I'll call later",Non-Spam
7,Non-Spam,Yeah we do totes. When u wanna?,Non-Spam
8,Non-Spam,X2 &lt;#&gt; . Are you going to get that,Non-Spam
9,Spam,PRIVATE! Your 2003 Account Statement for <fone...,Spam


### Checking accuracy of model

In [None]:
total = test.shape[0]

# Row-wise comparison of true label and prediction. Anything that is not an
# exact match (including the "Equal probabilities" notice) counts as wrong.
is_match = test["Type"] == test["Prediction"]
correct = int(is_match.sum())
wrong = total - correct

print("Correct Predictions: ", correct)
print("Incorrect Predictions: ", wrong)
print("Accuracy: ", correct / total)

Correct Predictions:  1092
Incorrect Predictions:  22
Accuracy:  0.9802513464991023
