In [1]:
# import libraries
import pandas as pd
import numpy as np


In [2]:
# ML Packages For Vectorization of Text For Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Load the labelled e-mail data set from the CSV file (path is relative to the notebook)
csv_path = "spam_or_not_spam.csv"
df = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows of the data
df.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [5]:
# Total number of cells (rows x columns), not the number of rows;
# use len(df) or df.shape when the row count is wanted
df.size

6000

In [6]:
# Column labels of the loaded frame
df.columns

Index(['email', 'label'], dtype='object')

In [7]:
# Data type of each column
df.dtypes

email    object
label     int64
dtype: object

In [8]:
# Count the missing values in each column.
# isnull() must be applied exactly once: calling it again on the resulting
# boolean mask always yields all-False, so the sum would report 0 regardless
# of how many values are actually missing.
df.isnull().sum()

email    0
label    0
dtype: int64

In [9]:
# Keep only the text column and its label
df_data = df.loc[:, ["email", "label"]]

In [10]:
# Confirm which columns were kept
df_data.columns

Index(['email', 'label'], dtype='object')

In [11]:
# Separate the feature (e-mail text) from the target (label)
df_x, df_y = df_data['email'], df_data['label']

In [12]:
# Toy demonstration: convert two short sentences into a matrix of token counts
sample_docs = ["Great song but check this out","What is this song?"]
cv = CountVectorizer()
ex = cv.fit_transform(sample_docs)

In [13]:
# Show the toy count matrix as a dense array (one row per sentence, one column per token)
ex.toarray()

array([[1, 1, 1, 0, 1, 1, 1, 0],
       [0, 0, 0, 1, 0, 1, 1, 1]], dtype=int64)

In [14]:
# See the vectorization: the learned vocabulary, in the same order as the matrix columns.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# get_feature_names_out() when the installed version provides it and fall back
# to the old method otherwise. .tolist() keeps the result a plain list of str.
(cv.get_feature_names_out().tolist()
 if hasattr(cv, "get_feature_names_out")
 else cv.get_feature_names())

['but', 'check', 'great', 'is', 'out', 'song', 'this', 'what']

In [15]:
# Use the e-mail text column as the corpus to vectorise
corpus = df_x

# Display the corpus
corpus

0        date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...
1       martin a posted tassos papadopoulos the greek ...
2       man threatens explosion in moscow thursday aug...
3       klez the virus that won t die already the most...
4        in adding cream to spaghetti carbonara which ...
5        i just had to jump in here as carbonara is on...
6       the scotsman NUMBER august NUMBER playboy want...
7       martin adamson wrote isn t it just basically a...
8       the scotsman thu NUMBER aug NUMBER meaningful ...
9       i have been trying to research via sa mirrors ...
10      hello have you seen and discussed this article...
11      yes great minds think alike but even withput e...
12      on mon aug NUMBER NUMBER at NUMBER NUMBER NUMB...
13       from chris garrigues cwg exmh deepeddy com da...
14      spamassassin is hurting democracy owen URL int...
15      hi all apologies for the possible silly questi...
16       in forteana y d mcmann dmcmann b wrote robert...
17      in a n

In [16]:
# Create a fresh, unfitted vectoriser; rebinding `cv` replaces the instance
# that was fitted on the two example sentences
cv = CountVectorizer()
# Display the vectoriser's configuration
cv

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [17]:
# Learn the vocabulary from the whole corpus and build the document-term matrix.
# Every entry is cast to a unicode string first (presumably to guard against
# non-string cells such as NaN -- TODO confirm).
documents = corpus.astype('U').values
X = cv.fit_transform(documents)

In [18]:
# Inspect the count matrix as a dense array; this materialises every zero,
# so it is only meant for a quick look
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]], dtype=int64)

In [19]:
# Vocabulary learned from the corpus, in the same order as the columns of X.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# get_feature_names_out() when the installed version provides it and fall back
# to the old method otherwise. .tolist() keeps the result a plain list of str.
(cv.get_feature_names_out().tolist()
 if hasattr(cv, "get_feature_names_out")
 else cv.get_feature_names())

['__',
 '___',
 '____',
 '_____',
 '______',
 '_______',
 '________',
 '_________',
 '__________',
 '______________',
 '_______________',
 '________________',
 '___________________',
 '____________________',
 '_______________________',
 '________________________',
 '_________________________',
 '__________________________',
 '______________________________',
 '_______________________________',
 '________________________________',
 '_________________________________',
 '___________________________________',
 '______________________________________',
 '__________________________________________',
 '_____________________________________________',
 '______________________________________________',
 '_______________________________________________',
 '________________________________________________',
 '_________________________________________________',
 '__________________________________________________',
 '___________________________________________________',
 '_________________________

In [20]:
# Import for train and test the model
from sklearn.model_selection import train_test_split

In [21]:
# Split features and labels into training and evaluation sets:
# 33% is held out for testing and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df_y,
    test_size=0.33,
    random_state=42,
)

In [22]:
# Show the shape and storage format of the training matrix
X_train

<2010x34117 sparse matrix of type '<class 'numpy.int64'>'
	with 228906 stored elements in Compressed Sparse Row format>

In [23]:
# Train a multinomial Naive Bayes classifier on the training counts and
# report its mean accuracy on the held-out split
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train, y_train)  # fit() returns the estimator itself
clf.score(X_test, y_test)

0.9848484848484849

In [24]:
# Accuracy of our model on the test split, expressed as a percentage
accuracy_pct = clf.score(X_test, y_test) * 100
print("Accuracy of Model", accuracy_pct, "%")

Accuracy of Model 98.48484848484848 %


In [25]:
# Predicted label for every sample in the held-out test split
clf.predict(X_test)

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,

In [26]:
# Sample prediction: encode a new message with the vocabulary fitted on the corpus
comment = ["Check this out"]
comment_counts = cv.transform(comment)
vect = comment_counts.toarray()

In [27]:
# Classify the encoded sample message
clf.predict(vect)

array([0], dtype=int64)

In [28]:
# Mapping from human-readable class name to the numeric label
class_dict = dict(ham=0, spam=1)

In [29]:
# Numeric labels used by the mapping
class_dict.values()

dict_values([0, 1])

In [30]:
# Turn the numeric prediction for the single sample into a readable label:
# a prediction equal to 1 prints "Spam", anything else prints "Ham"
print("Spam" if clf.predict(vect) == 1 else "Ham")

Ham


In [31]:
# Sample prediction 2: encode another message and classify it
comment1 = ["Great song Friend"]
comment1_counts = cv.transform(comment1)
vect = comment1_counts.toarray()
clf.predict(vect)

array([0], dtype=int64)

In [32]:
import pickle

In [33]:
# Save the trained classifier to disk.
# The context manager guarantees the file is flushed and closed even if
# pickle.dump raises, instead of relying on a separate close() call.
# Only the classifier is stored here; the fitted CountVectorizer `cv` is not,
# yet it is still needed to encode text before calling predict().
with open("spam_model.pkl","wb") as naivebayesML:
    pickle.dump(clf,naivebayesML)

In [36]:
# Load the model back from disk.
# The context manager closes the file handle once the object has been read
# (the handle was previously left open).
# CAUTION: pickle.load can execute arbitrary code; only unpickle files that
# come from a trusted source, such as the one written by this notebook.
with open("spam_model.pkl","rb") as spam_model:
    new_model = pickle.load(spam_model)

In [38]:
# Display the unpickled estimator
new_model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [39]:
# Sample prediction 3: classify a message with the model loaded from disk,
# still encoding the text with the in-memory vectoriser `cv`
comment2 = ["Hey Music Fans I really appreciate all of you,but see this song too"]
comment2_counts = cv.transform(comment2)
vect = comment2_counts.toarray()
new_model.predict(vect)

array([0], dtype=int64)

In [40]:
# Turn the reloaded model's numeric prediction into a readable label:
# a prediction equal to 1 prints "Spam", anything else prints "Ham"
verdict = "Spam" if new_model.predict(vect) == 1 else "Ham"
print(verdict)

Ham
