In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the SMS corpus. Because `names` is passed, pandas treats the first line
# of the file as data and labels the two columns with the names given here.
df = pd.read_csv(
    'SMSSpamCollection.csv',
    names=['Label', 'Messages'],
)

# Preview the first few rows.
df.head()

Unnamed: 0,Label,Messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Encode the label column as integers for the classifier: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}

# Only encode while the column still holds the raw text labels. Without this
# guard, re-running the cell would push the already-encoded 0/1 values through
# the mapping again and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(df['Label']):
    df['Label'] = df['Label'].map(label_codes)

df.head()

Unnamed: 0,Label,Messages
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


Now let's create a list containing the spam words.

In [4]:
# Small hand-written sample of short messages built from typical spam words;
# it is used below to demonstrate how CountVectorizer counts words.
spamwords = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

In [5]:
# Learn the vocabulary (the set of unique, lower-cased words) of the sample list.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
cv.fit(spamwords)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. Prefer the new method when it
# exists and fall back to the old one so the cell also runs on legacy installs.
# The result is converted to a plain list so `words` keeps the same type.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out().tolist()
else:
    words = cv.get_feature_names()
words

['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

In [6]:
# Count how often each vocabulary word occurs in every sample sentence.
# `transform` yields a sparse matrix; densify it so the counts can be inspected.
spam_counts_sparse = cv.transform(spamwords)
wordarray = spam_counts_sparse.toarray()
wordarray

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [7]:
# Wrap the count array in a DataFrame: one row per sample sentence and one
# column per vocabulary word, which makes the counts easier to read.
spam_matrix = pd.DataFrame(wordarray, columns=words)
spam_matrix

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


In [8]:
from sklearn.model_selection import train_test_split

# Hold out part of the messages for evaluation. No test size is passed, so
# scikit-learn's default split applies; the fixed seed keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['Messages'],
    df['Label'],
    random_state=123,
)

# Report how many messages ended up in each partition.
print('Samples in the total set: {}'.format(len(df)))
print('Samples in the training set: {}'.format(len(X_train)))
print('Samples in the test set: {}'.format(len(X_test)))

Samples in the total set: 5572
Samples in the training set: 4179
Samples in the test set: 1393


In [9]:
# Learn the vocabulary from the training messages and encode them as a
# document-term count matrix. This re-fits the same `cv` object that was
# fitted on `spamwords` earlier, so that toy vocabulary is replaced.
train = cv.fit_transform(X_train)

# Encode the held-out messages with the vocabulary learned from the training
# data; `transform` does not re-fit the vectorizer.
test = cv.transform(X_test)

In [10]:
# Train a multinomial Naive Bayes classifier on the training word counts,
# then predict labels for the held-out messages.
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB().fit(train, y_train)  # fit() returns the fitted estimator
y_pred = nb.predict(test)

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the predictions against the true test labels. Precision, recall and F1
# use scikit-learn's default positive class (label 1, i.e. spam in this notebook).
for metric_label, metric_fn in (
    ('Accuracy score: ', accuracy_score),
    ('Precision score: ', precision_score),
    ('Recall score: ', recall_score),
    ('F1 score: ', f1_score),
):
    print(metric_label, format(metric_fn(y_test, y_pred)))


Accuracy score:  0.9820531227566404
Precision score:  0.9562841530054644
Recall score:  0.9114583333333334
F1 score:  0.9333333333333332
