# Email Spam detection with Machine learning

In [15]:
#Import libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the messages dataset from 'messages.csv' (path is relative to the working directory)
spam =pd.read_csv('messages.csv')

In [3]:
# Preview the loaded DataFrame
spam

Unnamed: 0,subject,message,label
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0
1,,"lang classification grimes , joseph e . and ba...",0
2,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0
3,risk,a colleague and i are researching the differin...,0
4,request book information,earlier this morning i was on the phone with a...,0
5,call for abstracts : optimality in syntactic t...,content - length : 4437 call for papers is the...,0
6,m . a . in scandinavian linguistics,m . a . in scandinavian linguistics at the uni...,0
7,call for papers : linguistics session of the m...,call for papers linguistics session - - midwes...,0
8,foreign language in commercials,content - length : 1937 greetings ! i ' m wond...,0
9,fulbright announcement : please post / dissemi...,fulbright announcement : please post / dissemi...,0


In [4]:
# Show the first rows of the DataFrame
spam.head()

Unnamed: 0,subject,message,label
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0
1,,"lang classification grimes , joseph e . and ba...",0
2,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0
3,risk,a colleague and i are researching the differin...,0
4,request book information,earlier this morning i was on the phone with a...,0


In [5]:
# Dimensions of the raw data as (rows, columns)
spam.shape

(2893, 3)

In [7]:
# List the column names of the DataFrame
spam.columns

Index(['subject', 'message', 'label'], dtype='object')

In [8]:
# Remove duplicated rows (this modifies `spam` in place; it does not merely check for them)
spam.drop_duplicates(inplace=True)

In [9]:
# Show the new shape (rows, columns) after the duplicates were dropped
spam.shape

(2876, 3)

In [10]:
# Count the missing values (NaN / None) in each column
spam.isna().sum()

subject    62
message     0
label       0
dtype: int64

As shown above, the `subject` column has 62 missing values.

In [11]:
# Fill the missing subjects with a placeholder value.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# selection spam['subject'] is chained assignment, which pandas warns about and
# which, under Copy-on-Write, only modifies a temporary object and leaves `spam`
# unchanged.
spam['subject'] = spam['subject'].fillna('Missing')

In [12]:
# Re-count the missing values per column to confirm the fill
spam.isna().sum()

subject    0
message    0
label      0
dtype: int64

As we can see, there are no missing values left in the data.

Let's download the stopwords corpus for the NLTK package.

In [14]:
# Download the NLTK 'stopwords' corpus (read later through stopwords.words('english'))
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\my
[nltk_data]     pc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [23]:
#Define the text-cleaning function (punctuation and stopword removal)
# Cache of stopword sets keyed by language, so the NLTK corpus is read once
# instead of once per word.
_STOPWORDS_CACHE = {}


def process_text(message, stop_words=None):
    """Strip punctuation and stopwords from a message and return its tokens.

    Parameters
    ----------
    message : str
        Raw message text.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stopword
        list is used (loaded once and cached as a frozenset).

    Returns
    -------
    list of str
        Whitespace-separated tokens of ``message`` with every character in
        ``string.punctuation`` removed and stopwords dropped. Tokens keep
        their original casing; only the stopword comparison is done in
        lower case.
    """
    if stop_words is None:
        if 'english' not in _STOPWORDS_CACHE:
            _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORDS_CACHE['english']

    # 1. Remove punctuation characters
    nopunc = ''.join(char for char in message if char not in string.punctuation)

    # 2. Drop stopwords (compared in lower case)
    clean_words = [word for word in nopunc.split() if word.lower() not in stop_words]

    # 3. Return the list of clean tokens
    return clean_words

Let's apply tokenization, the first step of the NLP process.

In [24]:
# Show the tokenization of the first messages: each message becomes a list of tokens
# (punctuation and stopwords removed; no lemmatization is applied)
spam['message'].head().apply(process_text)

0    [content, length, 3386, appleiss, research, ce...
1    [lang, classification, grimes, joseph, e, barb...
2    [posting, inquiry, sergei, atamas, satamas, um...
3    [colleague, researching, differing, degrees, r...
4    [earlier, morning, phone, friend, mine, living...
Name: message, dtype: object

Convert the collection of messages to a matrix of token counts (bag of words).

In [26]:
# Convert the collection of messages to a matrix of token counts (bag of words).
# The fitted vectorizer is kept in its own variable so the same vocabulary can
# later be applied to unseen messages with vectorizer.transform(...).
# NOTE(review): the vocabulary is learned from all messages before the train/test
# split, so words that occur only in test messages still shape the feature space;
# consider fitting on the training messages only.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer=process_text)
messages_bow = vectorizer.fit_transform(spam['message'])

Now let's split the data into training and test sets.

In [28]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state keeps the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    messages_bow,
    spam['label'],
    test_size=0.20,
    random_state=0,
)

In [29]:
# Shape of the bag-of-words matrix built from spam['message']
messages_bow.shape

(2876, 64661)

# Multinomial Naive Bayes classifier

Let's create and train a Naive Bayes classifier.

In [31]:
# Build a multinomial Naive Bayes estimator and fit it on the training split
from sklearn.naive_bayes import MultinomialNB
nb_estimator = MultinomialNB()
classifier = nb_estimator.fit(X_train, y_train)

In [33]:
# Print the model's predictions for the training set
print(classifier.predict(X_train))

[0 0 0 ... 0 0 0]


Let's print the actual labels for comparison.

In [34]:
# Print the actual labels of the training set
print(y_train.values)

[0 0 0 ... 0 0 0]


In [36]:
# Evaluate the model on the training data set
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(X_train)
train_report = classification_report(y_train, pred)
train_matrix = confusion_matrix(y_train, pred)
print(train_report)
print()
print('confusion matrix :\n', train_matrix)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1914
           1       0.99      0.99      0.99       386

   micro avg       1.00      1.00      1.00      2300
   macro avg       0.99      1.00      1.00      2300
weighted avg       1.00      1.00      1.00      2300


confusion matrix :
 [[1910    4]
 [   2  384]]


In [37]:
# Evaluate the model on the training data set
# NOTE(review): this cell repeats the preceding training-set evaluation and
# produces the same report; it can be removed.
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
pred =classifier.predict(X_train)
print(classification_report(y_train,pred))
print()
print('confusion matrix :\n',confusion_matrix(y_train,pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1914
           1       0.99      0.99      0.99       386

   micro avg       1.00      1.00      1.00      2300
   macro avg       0.99      1.00      1.00      2300
weighted avg       1.00      1.00      1.00      2300


confusion matrix :
 [[1910    4]
 [   2  384]]


# Accuracy score for training data

In [38]:
# Evaluate the model on the training data set, including the overall accuracy
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(X_train)
train_report = classification_report(y_train, pred)
train_matrix = confusion_matrix(y_train, pred)
train_accuracy = accuracy_score(y_train, pred)
print(train_report)
print()
print('confusion matrix :\n', train_matrix)
print()
print('Accuracy: ', train_accuracy)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1914
           1       0.99      0.99      0.99       386

   micro avg       1.00      1.00      1.00      2300
   macro avg       0.99      1.00      1.00      2300
weighted avg       1.00      1.00      1.00      2300


confusion matrix :
 [[1910    4]
 [   2  384]]

Accuracy:  0.9973913043478261


Let's evaluate the model on the test data.

In [40]:
# Print the model's predictions for the test set
print(classifier.predict(X_test))

[0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0
 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 1
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0
 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0
 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0
 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0
 0 1 0 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0
 0 0 0 0 0 0 0 1 0 0 0 0 

In [41]:
# Print the actual labels of the test set
print(y_test.values)

[0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0
 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 1
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0
 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0
 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0
 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0
 0 1 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0
 0 0 0 0 0 0 0 1 0 0 0 0 

In [42]:
# Evaluate the model on the test data set (held-out rows, not the training data)
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
pred =classifier.predict(X_test)
print(classification_report(y_test,pred))
print()
print('confusion matrix :\n',confusion_matrix(y_test,pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       494
           1       0.93      1.00      0.96        82

   micro avg       0.99      0.99      0.99       576
   macro avg       0.97      0.99      0.98       576
weighted avg       0.99      0.99      0.99       576


confusion matrix :
 [[488   6]
 [  0  82]]


# Accuracy score for test data

In [43]:
# Evaluate the model on the test data set, including the overall accuracy
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(X_test)
test_report = classification_report(y_test, pred)
test_matrix = confusion_matrix(y_test, pred)
test_accuracy = accuracy_score(y_test, pred)
print(test_report)
print()
print('confusion matrix :\n', test_matrix)
print()
print('Accuracy: ', test_accuracy)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       494
           1       0.93      1.00      0.96        82

   micro avg       0.99      0.99      0.99       576
   macro avg       0.97      0.99      0.98       576
weighted avg       0.99      0.99      0.99       576


confusion matrix :
 [[488   6]
 [  0  82]]

Accuracy:  0.9895833333333334


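The accuracy on the test data is about 99% (0.9896): all 82 messages of class 1 were identified and only 6 of the 494 class-0 messages were misclassified, so the model performs very well.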