# E-MAIL SPAM CLASSIFIER

In [1]:
# For importing neccessary libraries:-
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import nltk

In [3]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\RISHABH
[nltk_data]     JOHRI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# For creating dataframe and fetching csv file:-
data=pd.read_csv("spam.csv",encoding='ISO-8859-1')
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


# Exploratory Data Analysis(EDA):-

In [6]:
# For fetching shape of the dataset:-
data.shape

(5572, 5)

In [7]:
# For checking null values in the dataset:-
data.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [8]:
# For checking info about the dataset:-
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


Here, as observed there are 3 columns Unnamed: 2,3,4 which are not required as it contains all null values
which are not requitred for model building.So we are dropping these columns.

In [10]:
# For dropping 3 columns which is not required for further processes:-
data.drop(["Unnamed: 2","Unnamed: 3","Unnamed: 4"],axis=1,inplace=True)

In [11]:
# For checking whether the columns has been dropped or not\:-
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Now, there are 2 columns left in this dataset.

In [13]:
# For segmenting spam and ham into 1 and 0 respectively:-
data["spam"] = data["v1"].map({"spam":1, "ham":0}).astype(int)

In [14]:
# For checking above effect:-
data.head()

Unnamed: 0,v1,v2,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


# Tokenisation:-

In [15]:
# For defining tokeniser for v2 :-
def tokenizer(text):
    return text.split()

In [16]:
# For applying tokenisation for seperating words:-
data["v2"]= data["v2"].apply(tokenizer)

In [17]:
# For checking tokenisation effect:-
data

Unnamed: 0,v1,v2,spam
0,ham,"[Go, until, jurong, point,, crazy.., Available...",0
1,ham,"[Ok, lar..., Joking, wif, u, oni...]",0
2,spam,"[Free, entry, in, 2, a, wkly, comp, to, win, F...",1
3,ham,"[U, dun, say, so, early, hor..., U, c, already...",0
4,ham,"[Nah, I, don't, think, he, goes, to, usf,, he,...",0
...,...,...,...
5567,spam,"[This, is, the, 2nd, time, we, have, tried, 2,...",1
5568,ham,"[Will, Ì_, b, going, to, esplanade, fr, home?]",0
5569,ham,"[Pity,, *, was, in, mood, for, that., So...any...",0
5570,ham,"[The, guy, did, some, bitching, but, I, acted,...",0


# Stemming :-

In [18]:
from nltk.stem import SnowballStemmer
porter = SnowballStemmer("english",ignore_stopwords = False)

In [19]:
# For defining stemming :-
def stem(text):
    return [porter.stem(word) for word in text]

In [20]:
data["v2"]= data["v2"].apply(stem)

In [21]:
# For checking above stemming effect:-
data

Unnamed: 0,v1,v2,spam
0,ham,"[go, until, jurong, point,, crazy.., avail, on...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, in, 2, a, wkli, comp, to, win, f...",1
3,ham,"[u, dun, say, so, earli, hor..., u, c, alreadi...",0
4,ham,"[nah, i, don't, think, he, goe, to, usf,, he, ...",0
...,...,...,...
5567,spam,"[this, is, the, 2nd, time, we, have, tri, 2, c...",1
5568,ham,"[will, ì_, b, go, to, esplanad, fr, home?]",0
5569,ham,"[pity,, *, was, in, mood, for, that., so...ani...",0
5570,ham,"[the, guy, did, some, bitch, but, i, act, like...",0


# Lemmatisation:-

In [22]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [23]:
def lemmit(text):
    return [lemmatizer.lemmatize(word, pos='a') for word in text]

In [24]:
data["v2"]= data["v2"].apply(lemmit)

In [25]:
data

Unnamed: 0,v1,v2,spam
0,ham,"[go, until, jurong, point,, crazy.., avail, on...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, in, 2, a, wkli, comp, to, win, f...",1
3,ham,"[u, dun, say, so, earli, hor..., u, c, alreadi...",0
4,ham,"[nah, i, don't, think, he, goe, to, usf,, he, ...",0
...,...,...,...
5567,spam,"[this, is, the, 2nd, time, we, have, tri, 2, c...",1
5568,ham,"[will, ì_, b, go, to, esplanad, fr, home?]",0
5569,ham,"[pity,, *, was, in, mood, for, that., so...ani...",0
5570,ham,"[the, guy, did, some, bitch, but, i, act, like...",0


# For Removing Stopwords:-

In [26]:
# For downloading stopwords:-
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to C:\Users\RISHABH
[nltk_data]     JOHRI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
from nltk.corpus import stopwords
stop_words = stopwords.words("english")

In [28]:
def stop(text):
    review = [word for word in text if word not in stop_words]
    return review

In [30]:
# For applying stopwords on v2:-
data["v2"]= data["v2"].apply(stop)

In [31]:
# For checking the above effect:-
data.head()

Unnamed: 0,v1,v2,spam
0,ham,"[go, jurong, point,, crazy.., avail, onli, bug...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",1
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa...",0
4,ham,"[nah, think, goe, usf,, live, around, though]",0


In [32]:
data["v2"] = data["v2"].apply(" ".join)

In [33]:
# For checking the dataset again:-
data.head()

Unnamed: 0,v1,v2,spam
0,ham,"go jurong point, crazy.. avail onli bugi n gre...",0
1,ham,ok lar... joke wif u oni...,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,1
3,ham,u dun say earli hor... u c alreadi say...,0
4,ham,"nah think goe usf, live around though",0


# Vectorisation:-

In [34]:
# For importing sklearn
import sklearn
sklearn.__version__

'1.1.2'

In [35]:
# For importing TFID Vectoriser:-
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfobj = TfidfVectorizer()
x = tfidfobj.fit_transform(data["v2"])

In [36]:
x

<5572x8122 sparse matrix of type '<class 'numpy.float64'>'
	with 50267 stored elements in Compressed Sparse Row format>

In [37]:
y = data.spam.values

In [38]:
y

array([0, 0, 1, ..., 0, 0, 0])

# Training Phase:-

In [39]:
from sklearn.model_selection import train_test_split as split

In [40]:
x_train, x_test, y_train, y_test = split(x, y, test_size = 0.33, random_state = 42)

In [42]:
# For executing x_train:-
x_train

<3733x8122 sparse matrix of type '<class 'numpy.float64'>'
	with 33505 stored elements in Compressed Sparse Row format>

In [43]:
# For executing y_train:-
y_train

array([0, 0, 0, ..., 0, 0, 0])

In [44]:
# For testing with LogesticRegression:-
from sklearn.linear_model import LogisticRegression as lgr

In [45]:
lrobj = lgr()
lrobj.fit(x_train, y_train)

In [46]:
# For  getting predictions:- 
y_pred = lrobj.predict(x_test)

In [47]:
y_pred

array([0, 0, 0, ..., 0, 0, 1])

In [49]:
Result = pd.DataFrame({"Actual Values":y_test, "Predicted Values":y_pred})
Result

Unnamed: 0,Actual Values,Predicted Values
0,0,0
1,0,0
2,1,0
3,0,0
4,1,1
...,...,...
1834,0,0
1835,0,0
1836,0,0
1837,0,0


Here, as seen above the Final predictions have above values with 1839 rows and 2 columns.

In [50]:
# For printing accuracy scores:-
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, y_pred)

In [51]:
"Accuracy Score : {}".format(score)

'Accuracy Score : 0.9467101685698749'

# Saving Model:-

In [None]:
# For importing required libraries:-
import pickle
Name='Email spam'
pickle.dump(Result,open(Name,'wb'))

So, here the Accuracy score is coming 94.6 % which is very good.

Thanks