# DETECTION OF EMAIL SPAM WITH MACHINE LEARNING
Each of us has received spam emails at some point. A sort of email known as spam mail, sometimes known as junk mail, is one that is sent to a large number of people at once and typically includes cryptic messages, con games, or—most dangerously—phishing content.
In this project, I created an email spam detector using Python. I then trained the spam detector using machine learning to identify and categorize emails as spam or not.

In [1]:
#Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


In [2]:
#Loading the dataset
data = pd.read_csv("/kaggle/input/sms-spam-collection-dataset/spam.csv", encoding='Latin-1')
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:
stop_words=stopwords.words('english')
print(stop_words)

NameError: name 'stopwords' is not defined

In [None]:
data.head()

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
data.isnull().sum()

In [None]:
data = data[['v1', 'v2']]
data.columns = ['type', 'message']
data.head()

In [None]:
data.groupby('type').size()

# Data Vizualisation

In [None]:
data['type'].value_counts().plot(kind='barh', color= 'purple')

## Natural Language Processing

## Text Preprocessing and Stemming

In [None]:
#Importing the libraries
import re
import nltk
#Download the Stopwords from nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
#Using PorterStemmer for Stemming the Words
from nltk.stem.porter import PorterStemmer

In [None]:
stem1 = PorterStemmer()
corpus = []
for i in range(0, len(data)):
    show = re.sub('[^a-zA-Z]', ' ', data['message'][i])
    show = show.lower()
    show = show.split()
    show = [stem1.stem(word) for word in show if not word in stopwords.words('english')]
    show = ' '.join(show)
    corpus.append(show)

#Printing the first 6 values in the corpus list
corpus[1:7]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
count1 = CountVectorizer(max_features = 4000)
X = count1.fit_transform(corpus).toarray()
Y = pd.get_dummies(data['type'])
Y = Y.iloc[:, 1].values
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20, random_state=42)



### DecisionTreeClassifier()

In [None]:
# Model 1 - DecisionTreeClassifier
dtcmodel= DecisionTreeClassifier()
dtcmodel.fit(x_train, y_train)

In [None]:
# Model 2 - RandomForestClassifier
rfcmodel= RandomForestClassifier()
rfcmodel.fit(x_train, y_train)
RandomForestClassifier()

In [None]:
# Model 3 - Multinomial Naïve Bayes
mnbmodel= MultinomialNB()
mnbmodel.fit(x_train, y_train)
MultinomialNB()

In [None]:
# Model 4 - SVC
svcmodel= SVC()
svcmodel.fit(x_train, y_train)

In [None]:
pre1 = dtcmodel.predict(x_test)
pre2 = rfcmodel.predict(x_test)
pre3 = mnbmodel.predict(x_test)
pre4 = svcmodel.predict(x_test)

In [None]:
# Model1 - DTC
print("Decision Tree Classifier")
print("Confusion Matrix: ")
print(confusion_matrix(y_test, pre1))
print("Accuracy: ", accuracy_score(y_test, pre1)*100)
print("-------------**---------------")

In [None]:
# Model2 - RFC
print("Random Forest Classifier")
print("Confusion Matrix: ")
print(confusion_matrix(y_test, pre2))
print("Accuracy: ", accuracy_score(y_test, pre2)*100)
print("-------------**---------------")

In [None]:
# Model3 - Multinomial Naïve Bayes
print("Multinomial Naïve Bayes")
print("Confusion Matrix: ")
print(confusion_matrix(y_test, pre3))
print("Accuracy: ", accuracy_score(y_test, pre3)*100)
print("-------------**---------------")

In [None]:
# Model4 - SVC
print("Support Vector Classifier")
print("Confusion Matrix: ")
print(confusion_matrix(y_test, pre4))
print("Accuracy: ", accuracy_score(y_test, pre4)*100)
print("-------------**---------------")

In [None]:
from sklearn.metrics import confusion_matrix
email = confusion_matrix(y_test, pre2)
import seaborn as sns
sns.heatmap(email, annot=True,cmap='Reds')

In [None]:
from sklearn.metrics import classification_report

rfcreport = classification_report(y_test, pre1)
print("Classification Report of RFC \n", rfcreport)
dtcreport = classification_report(y_test, pre2)
print("Classification Report of DTC \n", dtcreport)
mnbreport = classification_report(y_test, pre3)
print("Classification Report of MNB \n", mnbreport)
svcreport = classification_report(y_test, pre4)
print("Classification Report of SVC \n", svcreport)

In [None]:
import pickle


In [None]:

filename = "RFC.pkl"
pickle.dump(rfcmodel, open(filename, 'wb'))
filename = "DTC.pkl"
pickle.dump(dtcmodel, open(filename, 'wb'))
filename = "MNB.pkl"
pickle.dump(mnbmodel, open(filename, 'wb'))
filename = "SVM.pkl"
pickle.dump(svcmodel, open(filename, 'wb'))
print("Saved all Models")