In [2]:
# Let's import all the necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
# Let's use pandas to read the spam/ham e-mail dataset ("spam_ham_dataset.csv").
# The path is relative, so the CSV must be in the notebook's working directory.
df = pd.read_csv('spam_ham_dataset.csv')

In [4]:
# Let's preview the first 5 rows to check the columns and some sample values
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


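We can see that each row is one email: `text` holds the raw message, `label` is the class name (ham or spam), and `label_num` is the same class encoded as a number (0 = ham, 1 = spam). The `Unnamed: 0` column does not describe the email itself.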

In [5]:
# Class balance of the numeric target column "label_num"
# (0 means "Ham", 1 means "Spam").
# `df` already is a DataFrame (it comes straight from pd.read_csv), so it is
# used as is instead of being re-wrapped in pd.DataFrame(...).
df.label_num.value_counts()

0    3672
1    1499
Name: label_num, dtype: int64

In [6]:
# (rows, columns) of the dataset before any cleaning
df.shape

(5171, 4)

In [7]:
# The "Unnamed: 0" column holds nothing useful for classification, so discard it.
df = df.drop(columns='Unnamed: 0')

In [8]:
# Printing the first five rows of the updated dataframe
df.head()

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [9]:
# Let's remove duplicate emails, if there are any. A row counts as a duplicate
# only when every remaining column matches an earlier row.
# The result is re-assigned rather than mutated with inplace=True, so the cell
# always states explicitly which frame it produces.
# NOTE: the original row labels are kept (the index is not reset), so the
# index has gaps after this step.
df = df.drop_duplicates()
df.shape

(4993, 3)

We can see that the shape of the dataframe has reduced, because there were duplicate emails, and they are now removed

In [10]:
# Count the missing values per column (all zeros means nothing to impute or drop)
df.isna().sum()

label        0
text         0
label_num    0
dtype: int64

In [11]:
# Feature/target split: X holds the raw email texts, y holds our numeric labels
X, y = df['text'], df['label_num']

In [57]:
# Inspect the raw email texts that are about to be vectorized
X

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5165    Subject: fw : crosstex energy , driscoll ranch...
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 4993, dtype: object

# Let's represent text as numerical data

We will use CountVectorizer to "convert text into a matrix of token counts"

In [13]:
# Import and instantiate CountVectorizer with its default parameters.
# Nothing is learned yet; the vocabulary is built when fit() is called.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

In [14]:
# Let's learn the vocabulary from every email in X.
# fit() returns the vectorizer itself, which is what gets displayed below.
count_vect.fit(X)

CountVectorizer()

In [15]:
# Let's examine the fitted vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the plain-list display this cell had before.
count_vect.get_feature_names_out().tolist()

['00',
 '000',
 '0000',
 '000000',
 '000000000002858',
 '000000000049773',
 '000080',
 '000099',
 '0001',
 '00018',
 '00020608',
 '0004',
 '0005',
 '0008',
 '001',
 '0010',
 '001001',
 '0012',
 '001452',
 '002',
 '0022',
 '00221',
 '0025',
 '0027',
 '0028',
 '0029',
 '00298',
 '003',
 '0030',
 '003002',
 '0031',
 '0033',
 '0038',
 '004',
 '0042',
 '0043',
 '0044',
 '0045',
 '0046',
 '0047',
 '0049',
 '005',
 '00501723',
 '0051',
 '00534580',
 '006',
 '006600',
 '0067',
 '007',
 '0071',
 '0074',
 '008',
 '0080',
 '0085201238',
 '009',
 '0090',
 '01',
 '010',
 '0100',
 '0101',
 '011',
 '0117',
 '012',
 '012603',
 '013',
 '0130',
 '014',
 '01405',
 '01408304990',
 '01474',
 '015',
 '016',
 '017',
 '0170',
 '01778',
 '0182',
 '0184',
 '019',
 '019017',
 '01915',
 '02',
 '020',
 '0200',
 '02010207',
 '021',
 '022',
 '0232',
 '024',
 '025',
 '0255',
 '025648',
 '026',
 '027',
 '028',
 '02886',
 '02897893',
 '029',
 '03',
 '030',
 '0300',
 '03090806',
 '031',
 '0310041',
 '03149',
 '032',
 '0

In [16]:
# Let's transform the data into a document-term matrix.
# Note that X is the whole dataset here; no train/test split has been made yet.
X_dtm = count_vect.transform(X)

In [17]:
# X_dtm is a sparse matrix with one row per email and one column per
# vocabulary term; the next cell converts it into a dense matrix.
X_dtm

<4993x50447 sparse matrix of type '<class 'numpy.int64'>'
	with 447953 stored elements in Compressed Sparse Row format>

In [18]:
# Converting the sparse matrix into a dense matrix, for display only.
# NOTE(review): with 4993 x 50447 int64 entries the dense copy needs roughly
# 2 GB of memory.
X_dtm.toarray()

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 4, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [19]:
# Let's form a dataframe using the rows of the dense matrix as rows and the
# terms of the fitted vocabulary as column names.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and works directly as `columns`.
pd.DataFrame(X_dtm.toarray(), columns=count_vect.get_feature_names_out())

Unnamed: 0,00,000,0000,000000,000000000002858,000000000049773,000080,000099,0001,00018,...,zynve,zyqtaqlt,zyrtec,zyyqywp,zzezrjok,zzn,zzo,zzocb,zzso,zzsyt
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4989,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4990,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4991,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# We will use a Multinomial Naive Bayes classifier to create our model

In [20]:
# Import and instantiate a Multinomial Naive Bayes model with its default
# parameters; it is trained in a later cell.
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [61]:
# Number of documents (rows) in the document-term matrix.
# Reading it from .shape gives the same number as len(X_dtm.toarray()) without
# building a dense copy of the whole matrix just to count its rows.
X_dtm.shape[0]

4993

In [62]:
# Number of labels; it has to match the number of rows in X_dtm
len(y)

4993

In [21]:
# Let's train our model on the complete data: X_dtm (the vectorized X) and y
nb.fit(X_dtm,y)

MultinomialNB()

In [22]:
# Let's predict on the very same data the model was trained on (in-sample)
y_pred = nb.predict(X_dtm)

In [23]:
# Let's calculate the prediction accuracy with accuracy_score, since this is a
# classification problem. y holds the true labels, y_pred the predicted ones.
from sklearn import metrics
print(metrics.accuracy_score(y,y_pred))

0.9901862607650711


The accuracy measured is 99 %, which is when the model is trained with data X, and y, and predictions were also made using X.

Accuracy measured this way is not a good estimate because, in real life, the model will face new data that it has neither seen nor been trained on. We will therefore use train_test_split to split X and y into training and testing sets.

# Using train_test_split

In [24]:
# Hold out 40 % of the emails for testing. random_state is fixed so that the
# same split is produced on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0
)

In [25]:
# Print the size of every split, one per line, in the order
# X_train, X_test, y_test, y_train
for split in (X_train, X_test, y_test, y_train):
    print(split.shape)

(2995,)
(1998,)
(1998,)
(2995,)


In [26]:
# Check which type the split returned for the test texts
print(type(X_test))

<class 'pandas.core.series.Series'>


In [27]:
# Let's instantiate a fresh vectorizer for the train/test workflow, separate
# from count_vect, which was fitted on all of X.
vect = CountVectorizer()

In [28]:
# Let's learn the vocabulary from X_train only and convert X_train into a
# sparse document-term matrix.
# fit_transform() combines fit() and transform() in one step. The cell
# previously did the work twice, and its second statement was indented, which
# is an IndentationError that stops the whole cell from running.
X_train_dtm = vect.fit_transform(X_train)

In [29]:
# Vocabulary learned from the training emails only.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the plain-list display this cell had before.
vect.get_feature_names_out().tolist()

['00',
 '000',
 '0000',
 '000000',
 '000000000002858',
 '000080',
 '000099',
 '0001',
 '00020608',
 '0004',
 '001',
 '001001',
 '0012',
 '001452',
 '002',
 '0022',
 '0025',
 '0027',
 '00298',
 '003',
 '0030',
 '003002',
 '0031',
 '0033',
 '0038',
 '004',
 '0044',
 '0047',
 '005',
 '00534580',
 '006',
 '006600',
 '0067',
 '007',
 '0071',
 '008',
 '009',
 '01',
 '010',
 '0100',
 '0101',
 '011',
 '0117',
 '012',
 '013',
 '0130',
 '014',
 '01405',
 '01408304990',
 '015',
 '016',
 '017',
 '0170',
 '01778',
 '019',
 '019017',
 '01915',
 '02',
 '020',
 '0200',
 '02010207',
 '021',
 '022',
 '0232',
 '025',
 '026',
 '027',
 '028',
 '02886',
 '02897893',
 '029',
 '03',
 '030',
 '0300',
 '03090806',
 '03149',
 '0325567',
 '033',
 '0331',
 '034',
 '0347',
 '035',
 '0357',
 '036',
 '0361770',
 '0363',
 '036474336',
 '037',
 '0373',
 '0375',
 '038',
 '0380',
 '039',
 '0393',
 '04',
 '040',
 '0400',
 '0401',
 '041',
 '0413',
 '043',
 '0435',
 '0439',
 '044',
 '045',
 '046',
 '04604902',
 '0469',
 '04

In [30]:
# Let's convert the training document-term matrix (X_train_dtm, a sparse
# matrix) into a dense matrix.
# The dense array is derived from X_train rather than from X_train_dtm itself,
# so the cell can be re-run: calling .toarray() on its own previous output
# fails the second time, because a NumPy array has no .toarray().
X_train_dtm = vect.transform(X_train).toarray()

In [31]:
# Let's print our new dense matrix, which has replaced the previous sparse
# matrix under the same name, 'X_train_dtm'
print(X_train_dtm)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [32]:
# Let's form a dataframe using the rows of the dense training matrix as rows
# and the terms of the fitted vocabulary as column names.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and works directly as `columns`.
pd.DataFrame(X_train_dtm, columns=vect.get_feature_names_out())

Unnamed: 0,00,000,0000,000000,000000000002858,000080,000099,0001,00020608,0004,...,zyban,zyjvit,zykfe,zynve,zyqtaqlt,zyrtec,zyyqywp,zzn,zzo,zzsyt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2990,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2991,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2992,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2993,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Time to transform the X_test into a document term matrix

In [33]:
# Transform the testing data into a document-term matrix using the vocabulary
# already fitted on X_train. Only transform() is called here, so the test
# emails do not change the vocabulary.
X_test_dtm = vect.transform(X_test)

In [34]:
# Let's instantiate a new Multinomial Naive Bayes model. It replaces the
# earlier `nb`, which was trained on the complete dataset.
nb = MultinomialNB()

In [35]:
# Train the model on the training split only
nb.fit(X_train_dtm,y_train)

MultinomialNB()

In [36]:
# Let's make predictions for the held-out testing data (the vectorized X_test)
y_pred_new = nb.predict(X_test_dtm)

# Let's calculate the accuracy on data the model has not been trained on
metrics.accuracy_score(y_test, y_pred_new)

0.9719719719719719

In [37]:
# Let's make an out-of-sample prediction with this updated model.
# The message is wrapped in a list because it is passed to transform() as a
# collection holding a single document.
spam_message = ['Hello sujan, you have won $50. Please send me your credit card number and confirmation code to confirm the prize']

# Let's transform the spam message into a sparse matrix
spam_message_dtm = vect.transform(spam_message)

In [65]:
# Inspect the vectorized message: a sparse matrix with a single row
spam_message_dtm

<1x37062 sparse matrix of type '<class 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [39]:
#spam_message_dtm = pd.DataFrame(spam_message_dtm.toarray(), columns=vect.get_feature_names())

In [66]:
# Classify the vectorized message (1 = spam, 0 = ham)
nb.predict(spam_message_dtm)

array([1], dtype=int64)

 `array([1])` means the model classified the message in `spam_message` as spam (label 1)

# Lets try another testing spam message

In [41]:
# This email is predicted to be spam.
# Vectorizing and predicting are done in one expression here.
spam_message2= ['Congratulations. You have won 100 pounds. Please go to this email address to confirm the prize. And, also send us your social security number']
nb.predict(vect.transform(spam_message2))

array([1], dtype=int64)

In [56]:
# Readable class names, indexed by the numeric label (0 = Ham, 1 = Spam)
lis = ['Ham','Spam']

message = 'Sulav has won $ 10000. Please buy this product for $ 300. You can access the product form this link: http://ww.ssp.sp'
# predict() returns an array with one prediction per document. Take element
# [0] explicitly before converting: calling int() on the whole array relies on
# array-to-scalar conversion, which NumPy has deprecated for arrays with
# ndim > 0.
result = int(nb.predict(vect.transform([message]))[0])
print(lis[result])

Spam


# Lets try a non-spam or ham message

In [42]:
# This email is predicted to be non-spam (ham), even though it mentions
# winning money
ham_message = ['Congratulations sujan for winning $300000. Lets party on the coming weekends on vegas. What do you say?']
nb.predict(vect.transform(ham_message))

array([0], dtype=int64)

In [43]:
# A longer, multi-line ham example. The string is wrapped in a list below
# because it is passed to transform() as a collection holding one document.
message = '''
        Dear Students,

        We hope you and your family are doing well and keeping safe!

        We are delighted to inform you about the internship opportunity at the ING call center. Kindly send us your cv before 10 AM tomorrow (Sunday, 12 Sept. 2021)  if you are interested in the internship opportunity. 

        Please find the job description attached along with this email for your reference.

        Thanks and regards. '''
nb.predict(vect.transform([message]))

array([0], dtype=int64)

Since this is a binary classification problem, every prediction is one of four cases: a true positive, a true negative, a false positive or a false negative. So, let's build a confusion matrix to count each of these cases.

In [44]:
# Let's create our confusion matrix for the test-set predictions.
# Argument order is (true labels, predicted labels).
cm = metrics.confusion_matrix(y_test, y_pred_new)
cm

array([[1412,   20],
       [  36,  530]], dtype=int64)

In [45]:
# Unpack the 2x2 confusion matrix into its four cells.
# First row: true negatives and false positives.
# Second row: false negatives and true positives.
(TN, FP), (FN, TP) = cm

In the confusion matrix above, It is presented as: [[TN,FP],[FN,TP]], where:

        TN = True Negative
        FP = False Positive
        FN = False Negative
        TP = True Positive

In [46]:
# Here, using the confusion matrix, accuracy can be measured as
# ((TP+TN) / (TP+TN+FP+FN)): correct predictions divided by all predictions.
# ((FP+FN) / (TP+TN+FP+FN) would be the error rate instead.)
print("The overall accuracy is: ", (TP+TN)/ float(TP+TN+FP+FN))

The overall accuracy is:  0.9719719719719719


# Thank you - By Sujan Neupane