### Importing the dataset and getting basic information about it


In [13]:
import pandas as pd

In [14]:
# Load only the two columns used below: "v1" (label) and "v2" (message text).
data = pd.read_csv(
    "/content/MessagesDataset.csv",
    usecols=["v1", "v2"],
    encoding="ISO-8859-1",
)

In [15]:
# Give the raw "v1"/"v2" columns descriptive names. Reassigning the result
# (instead of mutating with inplace=True) keeps the step chainable.
data = data.rename(columns={"v1": "Type", "v2": "Message"})

In [16]:
# Map the dataset's raw labels onto the names used in the rest of the notebook.
label_names = {"ham": "Non-Spam", "spam": "Spam"}
data["Type"] = data["Type"].replace(label_names)

In [17]:
# Print the (rows, columns) tuple, then preview the first ten rows.
print(tuple(data.shape))
data.head(n=10)

(5572, 2)


Unnamed: 0,Type,Message
0,Non-Spam,"Go until jurong point, crazy.. Available only ..."
1,Non-Spam,Ok lar... Joking wif u oni...
2,Spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,Non-Spam,U dun say so early hor... U c already then say...
4,Non-Spam,"Nah I don't think he goes to usf, he lives aro..."
5,Spam,FreeMsg Hey there darling it's been 3 week's n...
6,Non-Spam,Even my brother is not like to speak with me. ...
7,Non-Spam,As per your request 'Melle Melle (Oru Minnamin...
8,Spam,WINNER!! As a valued network customer you have...
9,Spam,Had your mobile 11 months or more? U R entitle...


In [18]:
# Relative frequency of each label in the full dataset.
data["Type"].value_counts(normalize=True)

Non-Spam    0.865937
Spam        0.134063
Name: Type, dtype: float64

### Splitting dataset into test and training sets

In [19]:
# Shuffle every row with a fixed seed so the split is reproducible.
data_random = data.sample(frac=1, random_state=42)

# Position of the 80 % boundary: rows before it train, rows from it onward test.
training_test_index = round(len(data_random) * 0.8)

training = data_random.iloc[:training_test_index].reset_index(drop=True)
test = data_random.iloc[training_test_index:].reset_index(drop=True)

print(training.shape)
print(test.shape)

(4458, 2)
(1114, 2)


In [20]:
# Compare the label proportions of the two splits.
print(training["Type"].value_counts(normalize=True))
print(test["Type"].value_counts(normalize=True))

Non-Spam    0.867654
Spam        0.132346
Name: Type, dtype: float64
Non-Spam    0.859066
Spam        0.140934
Name: Type, dtype: float64


### Data Processing

In [21]:
# Normalise the training text: replace every non-word character with a space,
# then lower-case the result.
#
# `regex=True` is passed explicitly because the default of `Series.str.replace`
# became literal matching in pandas 2.0; without it the two characters `\W`
# would be searched for verbatim and punctuation would never be removed.
# The raw string avoids the invalid "\W" escape-sequence warning.
training.Message = training.Message.str.replace(r"\W", " ", regex=True).str.lower()
training.Message

  training.Message = training.Message.str.replace("\W", " ")


0       funny fact nobody teaches volcanoes 2 erupt  t...
1       i sent my scores to sophas and i had to do sec...
2       we know someone who you know that fancies you ...
3       only if you promise your getting out as soon a...
4       congratulations ur awarded either å 500 of cd ...
                              ...                        
4453                              howz that persons story
4454    i probably won t eat at all today  i think i m...
4455                           i am on the way to ur home
4456    double eviction this week   spiral and michael...
4457                     thank you  i like you as well   
Name: Message, Length: 4458, dtype: object

In [22]:
# Tokenise each message on whitespace, replacing the string by its word list.
training.Message = training.Message.str.split()

# Collect every distinct word seen in the training messages.
unique_words = list({word for message in training.Message for word in message})
len(unique_words)

7687

### Feature Extraction

In [23]:
# One zero-initialised counter list per vocabulary word, with one slot per
# training message.
word_count = {word: [0] * len(training.Message) for word in unique_words}

# Count how often each word occurs in each message.
# The loop variable is named `tokens`, not `data`: reusing `data` would
# overwrite the DataFrame loaded at the top of the notebook and break any
# re-run of the earlier cells that use it.
for index, tokens in enumerate(training.Message):
    for word in tokens:
        word_count[word][index] += 1

# Turn the dict of columns into a frame: one row per message, one column per word.
word_count = pd.DataFrame(word_count)
word_count.head(10)

Unnamed: 0,sayin,if,latr,aiya,aberdeen,aint,400mins,piss,addicted,cres,...,goodfriend,complaint,paranoid,pimples,living,pound,lock,giv,m100,cw25wx
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Put the word-count columns side by side with the Type/Message columns.
clean_training = pd.concat([training, word_count], axis="columns")
clean_training.head(n=10)

Unnamed: 0,Type,Message,sayin,if,latr,aiya,aberdeen,aint,400mins,piss,...,goodfriend,complaint,paranoid,pimples,living,pound,lock,giv,m100,cw25wx
0,Non-Spam,"[funny, fact, nobody, teaches, volcanoes, 2, e...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Non-Spam,"[i, sent, my, scores, to, sophas, and, i, had,...",0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Spam,"[we, know, someone, who, you, know, that, fanc...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Non-Spam,"[only, if, you, promise, your, getting, out, a...",0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Spam,"[congratulations, ur, awarded, either, å, 500,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Non-Spam,"[i, ll, text, carlos, and, let, you, know, han...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Non-Spam,"[k, i, did, t, see, you, k, where, are, you, now]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Non-Spam,"[no, message, no, responce, what, happend]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Non-Spam,"[get, down, in, gandhipuram, and, walk, to, cr...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Non-Spam,"[you, flippin, your, shit, yet]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Estimating the classifier parameters and checking for spam

In [25]:
# Partition the training rows by label.
spam_messages = clean_training[clean_training["Type"] == "Spam"]
non_spam_messages = clean_training[clean_training["Type"] == "Non-Spam"]

# Share of training messages carrying each label.
p_spam = spam_messages.shape[0] / clean_training.shape[0]
p_non_spam = non_spam_messages.shape[0] / clean_training.shape[0]

# Total number of words across all messages of each label.
words_per_spam = spam_messages["Message"].apply(len)
words_per_non_spam = non_spam_messages["Message"].apply(len)
n_spam = words_per_spam.sum()
n_non_spam = words_per_non_spam.sum()

# Vocabulary size and the additive smoothing constant used below.
n_unique = len(unique_words)
alpha = 1

In [26]:
# The smoothing denominators do not depend on the word, so compute them once.
spam_denominator = n_spam + alpha * n_unique
non_spam_denominator = n_non_spam + alpha * n_unique

# Smoothed estimate per word: (occurrences in the class + alpha) / denominator.
parameters_spam = {
    word: (spam_messages[word].sum() + alpha) / spam_denominator
    for word in unique_words
}
parameters_non_spam = {
    word: (non_spam_messages[word].sum() + alpha) / non_spam_denominator
    for word in unique_words
}

In [27]:
import math
import re


def _log_prob(probability):
    """Return log(probability), or -inf for a non-positive probability."""
    return math.log(probability) if probability > 0 else float("-inf")


def classify(message):
    """Label a raw message as "Spam" or "Non-Spam".

    The message is cleaned the same way as the training text (non-word
    characters replaced by spaces, lower-cased, split on whitespace). Each
    class is scored with its prior (`p_spam` / `p_non_spam`) combined with the
    per-word values in `parameters_spam` / `parameters_non_spam`; words absent
    from those dicts are ignored.

    Scores are accumulated as sums of logarithms rather than products of
    probabilities. A product of many small factors underflows to 0.0 for long
    messages, which made both scores compare equal; the log form preserves the
    ordering without underflow.

    Parameters
    ----------
    message : str
        The raw message text.

    Returns
    -------
    str
        "Spam", "Non-Spam", or the "Equal probabilities, ..." notice when the
        two scores tie.
    """
    words = re.sub(r"\W", " ", message).lower().split()

    log_spam_given_message = _log_prob(p_spam)
    log_non_spam_given_message = _log_prob(p_non_spam)

    for word in words:

        if word in parameters_spam:
            log_spam_given_message += _log_prob(parameters_spam[word])

        if word in parameters_non_spam:
            log_non_spam_given_message += _log_prob(parameters_non_spam[word])

    if log_spam_given_message > log_non_spam_given_message:
        return "Spam"
    elif log_non_spam_given_message > log_spam_given_message:
        return "Non-Spam"
    else:
        return "Equal probabilities, human needed to classify this!"

In [28]:
# Quick sanity check on a single hand-written message.
classify(message="Hey Tirath, wassup")

'Non-Spam'

In [29]:
# Run the classifier over every held-out message and store its label.
test["Prediction"] = [classify(text) for text in test["Message"]]
test.head(n=10)

Unnamed: 0,Type,Message,Prediction
0,Spam,"New Tones This week include: 1)McFly-All Ab..,...",Spam
1,Non-Spam,I am not sure about night menu. . . I know onl...,Non-Spam
2,Non-Spam,Hope ur head doesn't hurt 2 much ! Am ploughin...,Non-Spam
3,Non-Spam,Hey what how about your project. Started aha da.,Non-Spam
4,Non-Spam,Desires- u going to doctor 4 liver. And get a ...,Non-Spam
5,Non-Spam,Waiting 4 my tv show 2 start lor... U leh stil...,Non-Spam
6,Non-Spam,"Sorry, I'll call later",Non-Spam
7,Non-Spam,Ok i found dis pierre cardin one which looks n...,Non-Spam
8,Spam,UR awarded a City Break and could WIN a å£200 ...,Spam
9,Spam,PRIVATE! Your 2003 Account Statement for <fone...,Spam


### Checking accuracy of model

In [30]:
# Count the rows whose predicted label equals the true label; the rest are wrong.
total = len(test)
correct = sum(
    actual == predicted
    for actual, predicted in zip(test["Type"], test["Prediction"])
)
wrong = total - correct

print("Correct Predictions: ", correct)
print("Incorrect Predictions: ", wrong)
print("Accuracy: ", correct / total)

Correct Predictions:  1094
Incorrect Predictions:  20
Accuracy:  0.9820466786355476
