## EMAIL SPAM DETECTION WITH MACHINE LEARNING

In [1]:
# importing required libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Read the raw spam corpus from disk. The file is decoded as latin-1
# (rather than pandas' default UTF-8), as requested via `encoding`.
df = pd.read_csv(filepath_or_buffer="spam.csv", encoding="latin-1")
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:
# Discard the three 'Unnamed' columns, leaving the label (v1) and text (v2).
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# makes this cell safe to re-run: a second execution is a no-op rather than
# a KeyError on columns that were already removed.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Label encoding
#
# Convert the string labels in `v1` to integers. LabelEncoder assigns codes
# in sorted class order, so "ham" -> 0 and "spam" -> 1 (see the output below).

from sklearn.preprocessing import LabelEncoder

# Only fit when `v1` still holds the original string labels. Re-running this
# cell on an already-encoded column would otherwise refit a new encoder on
# the integers 0/1, and `le.inverse_transform` later in the notebook would
# then return 0/1 instead of "ham"/"spam".
if not pd.api.types.is_numeric_dtype(df['v1']):
    le = LabelEncoder()
    df['v1'] = le.fit_transform(df['v1'])
df

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [5]:
# Class balance: number of rows per encoded label in `v1`.
df.loc[:, 'v1'].value_counts()

v1
0    4825
1     747
Name: count, dtype: int64

In [6]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.

X, y = df['v2'], df['v1']  # features: message text, target: encoded label

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=34,
)

In [7]:
# Turn each message into a vector of token counts.
# The vocabulary is fitted on the training text only; the same fitted
# vectorizer is then applied to both the training and the test text.

vect = CountVectorizer().fit(X_train)
X_train_vect = vect.transform(X_train)
X_test_vect = vect.transform(X_test)

#### **Model Training : Naive Bayes** 
We'll use the Multinomial Naive Bayes algorithm, which works well for text classification: it uses how often each word appears in an email to estimate whether the email is more likely to be spam or ham.


In [8]:
# Train a multinomial Naive Bayes classifier on the count-vectorized
# training messages. `fit` returns the estimator itself, so it can be
# chained onto the constructor.

NB_classifier = MultinomialNB().fit(X_train_vect, y_train)
NB_classifier


In [9]:
# Predict an encoded label for every held-out message.
y_pred = NB_classifier.predict(X=X_test_vect)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [10]:
# Sanity check: the test inputs and the predictions should have equal length.
for arr in (X_test, y_pred):
    print(arr.shape)

(1393,)
(1393,)


In [11]:
# Pair every test message with its predicted label, decoded back to the
# original string form ("ham"/"spam") via the fitted encoder.
# The shuffled index of X_test is replaced by a fresh 0..n-1 index;
# `drop=True` is the documented boolean form (the previous drop="Index"
# only worked because a non-empty string is truthy).
result = pd.DataFrame(
    {
        "Email": X_test.reset_index(drop=True),
        "Prediction": le.inverse_transform(y_pred),
    }
)
print(result)

                                                  Email Prediction
0     Every King Was Once A Crying Baby And Every Gr...        ham
1                    Hey check it da. I have listed da.        ham
2     No. On the way home. So if not for the long dr...        ham
3     You're not sure that I'm not trying to make xa...        ham
4                         Oh is it? Send me the address        ham
...                                                 ...        ...
1388            There is no sense in my foot and penis.        ham
1389                        So dont use hook up any how        ham
1390  I got a call from a landline number. . . I am ...        ham
1391  Am surfing online store. For offers do you wan...        ham
1392  wiskey Brandy Rum Gin Beer Vodka Scotch Shampa...        ham

[1393 rows x 2 columns]


In [12]:
# Fraction of test messages whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy :", accuracy)

Accuracy : 0.9856424982053122


In [13]:
# Per-class precision / recall / F1 on the test set. target_names supplies
# the display names for the encoded labels 0 and 1, in that order.
report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=["ham", "spam"],
)
print(report)

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1208
        spam       0.97      0.92      0.94       185

    accuracy                           0.99      1393
   macro avg       0.98      0.96      0.97      1393
weighted avg       0.99      0.99      0.99      1393

