# Email Spam Detection Using Machine Learning

In [9]:
#importing libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
# Stopwords are very common words in a language (such as "the", "and", "is"). They are usually
# removed during text preprocessing because they carry little meaning for tasks like
# classification or clustering.
# The string module provides constants that help with common text processing; this notebook
# uses string.punctuation to strip punctuation characters.

[nltk_data] Downloading package stopwords to C:\Users\Siddharth
[nltk_data]     Shukla\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


## Import the csv file and print first five rows

In [2]:
# Load the dataset; "emails.csv" is resolved relative to the notebook's working directory
df=pd.read_csv("emails.csv")
# Preview the first five rows
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [4]:
# Get the number of rows and columns as a (rows, columns) tuple
df.shape

(5728, 2)

In [5]:
# Check the column labels
df.columns

Index(['text', 'spam'], dtype='object')

## Check for duplicates and remove them

In [6]:
# Remove fully duplicated rows from the dataset.
# inplace=True mutates df itself, so after this cell runs the frame as originally
# loaded is no longer available without re-reading the CSV.
df.drop_duplicates(inplace=True)
# Display the (rows, columns) shape after removing duplicates
print(df.shape)

(5695, 2)


## Count the missing values in each column

In [8]:
# Count the missing (null) values in each column of the dataframe
print(df.isnull().sum())

text    0
spam    0
dtype: int64


## Download the Stopwords

In [10]:
# Download the NLTK 'stopwords' corpus; it must be present before stopwords.words('english') is called
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Siddharth
[nltk_data]     Shukla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# Now create a function to clean the text and return tokens
#->First remove punctuation and then remove the stopwords
# Cache for stopword sets, keyed by language, so the NLTK corpus is read only once.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading the corpus only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def process(text, stop_words=None):
    """Strip punctuation from ``text`` and return its non-stopword tokens.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : container of str, optional
        Lower-case words to drop. When omitted, NLTK's English stopword
        list is used (loaded once and reused across calls).

    Returns
    -------
    list of str
        Whitespace-separated tokens with every ``string.punctuation``
        character removed. Tokens keep their original case; only the
        stopword comparison is case-insensitive.
    """
    if stop_words is None:
        # Previously stopwords.words('english') was re-evaluated for every
        # single token and scanned as a list; a cached set makes each lookup O(1).
        stop_words = _english_stopwords()
    # Remove punctuation
    nopunc = ''.join(char for char in text if char not in string.punctuation)
    # Split on whitespace and drop stopwords
    return [word for word in nopunc.split() if word.lower() not in stop_words]
# Sanity check: apply the function to the first five messages only and preview the tokens
df['text'].head().apply(process)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

## Convert the text into a matrix of token counts

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
# Build the matrix of token counts, with process() as the analyzer so that
# each message is tokenized by it.
# NOTE(review): the vocabulary is fitted on every row of df before the train/test
# split below, so words that occur only in test messages still get a column.
# Consider fitting on the training split only and calling transform() on the test split.
message=CountVectorizer(analyzer=process).fit_transform(df['text'])

In [14]:
# Split the data: 80% training and 20% test (random_state=0 fixes the split so it is reproducible)
from sklearn.model_selection import train_test_split
xtrain, xtest,ytrain,ytest=train_test_split(message,df['spam'],test_size=0.20,random_state=0)
# Shape of the full count matrix (before splitting), not of the train or test part
print(message.shape)

(5695, 37229)


## Create and train the Naive Bayes classifier

In [15]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes model with default settings on the training token counts
classifier=MultinomialNB().fit(xtrain,ytrain)

## Compare the classifier's predictions with the actual labels on the training set

In [16]:
# Both arrays are printed in abbreviated form, so this is only a quick visual comparison
print(classifier.predict(xtrain))  # predicted labels for the training set
print(ytrain.values)     # actual labels for the training set

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [18]:
# Evaluating the model on training dataset
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
pred=classifier.predict(xtrain)   # predicted labels
print("Training dataset results:")
print("Classification Report:")
print(classification_report(ytrain, pred))   # ytrain holds the actual labels
print()
print("Confusion Matrix\n",confusion_matrix(ytrain,pred))
# How to read the matrix printed above: rows are the actual classes and columns
# are the predicted classes. Treating class 1 (spam) as the positive class:

# True Negatives (TN): 3445 (actual class 0 and predicted class 0)
# False Positives (FP): 12 (actual class 0 but predicted as class 1)
# False Negatives (FN): 1 (actual class 1 but predicted as class 0)
# True Positives (TP): 1098 (actual class 1 and predicted class 1)
print("Accuracy:",accuracy_score(ytrain,pred))

Training dataset results:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3457
           1       0.99      1.00      0.99      1099

    accuracy                           1.00      4556
   macro avg       0.99      1.00      1.00      4556
weighted avg       1.00      1.00      1.00      4556


Confusion Matrix
 [[3445   12]
 [   1 1098]]
Accuracy: 0.9971466198419666


In [20]:
# Print the predictions on the test dataset
print(classifier.predict(xtest))
# Print the actual labels of the test dataset for a quick visual comparison
print(ytest.values)

[1 0 0 ... 0 0 0]
[1 0 0 ... 0 0 0]


## Evaluate the model on the test dataset

In [21]:
# Evaluating the model on the test dataset
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
pred=classifier.predict(xtest)   # predicted labels for the held-out test messages
# This cell scores the test split, so the heading must say "Test"
# (it previously printed "Training dataset results:", copied from the training cell).
print("Test dataset results:")
print("Classification Report:")
print(classification_report(ytest, pred))   # ytest holds the actual labels
print()
# Rows of the matrix are actual classes, columns are predicted classes
print("Confusion Matrix\n",confusion_matrix(ytest,pred))
print("Accuracy:",accuracy_score(ytest,pred))

Training dataset results:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       870
           1       0.97      1.00      0.98       269

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139


Confusion Matrix
 [[862   8]
 [  1 268]]
Accuracy: 0.9920983318700615
