# OASIS INFOBYTE (TASK-2) ----- EMAIL SPAM DETECTION WITH ML

### IMPORTING PACKAGES

In [1]:



import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB



### READING DATASET

In [2]:

df = pd.read_csv("spam.csv")

### DETAIL PREPROCESSING

In [3]:

df


Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
df.columns

Index(['v1', 'v2'], dtype='object')

In [5]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.tail()

Unnamed: 0,v1,v2
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [7]:
df.shape

(5572, 2)

In [8]:
df.describe()

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


### INSPECTING DATA

In [9]:
df.groupby('v1').describe()

Unnamed: 0_level_0,v2,v2,v2,v2
Unnamed: 0_level_1,count,unique,top,freq
v1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


### CONVERSION INTO NUMERICAL VALUES AND PUTTING THEM INTO NEW COLUMN 'SPAM'

In [10]:

df['spam']=df['v1'].apply(lambda x:1 if x=='spam' else 0)

In [11]:
df

Unnamed: 0,v1,v2,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


### SPLITTING TEST SET AND TRAIN SET BY 0.25:0.75

In [12]:
x_train, x_test, y_train, y_test = train_test_split(df.v2, df.spam, test_size=0.25)

In [13]:
x_train.describe()

count                       4179
unique                      3932
top       Sorry, I'll call later
freq                          21
Name: v2, dtype: object

In [14]:
y_train.describe()

count    4179.000000
mean        0.131371
std         0.337846
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: spam, dtype: float64

### WORD COUNT IN MATRIX

In [15]:

cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train.values)

In [16]:
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### TRAINING THE MODEL

In [17]:

model = MultinomialNB()
model.fit(x_train_count, y_train)

### PRETESTING HAM MESSAGES

In [18]:

email_ham = ["lets go and party"]
email_ham_count = cv.transform(email_ham)
model.predict(email_ham_count)

array([0], dtype=int64)

### PRETESTING SPAM MESSAGES

In [19]:

email_spam = ['click to gain reward']
email_spam_count = cv.transform(email_spam)
model.predict(email_spam_count)

array([1], dtype=int64)

### TRAINING FINAL MODEL AND CHECKING ACCURACY

In [20]:

x_test_count = cv.transform(x_test)
model.score(x_test_count, y_test)

0.9899497487437185

### SAVING THE MODEL

In [21]:
import joblib as jb

In [22]:
jb.dump(model, 'email_spam_detection')

['email_spam_detection']

In [23]:
model = jb.load('email_spam_detection')

In [24]:
df.columns

Index(['v1', 'v2', 'spam'], dtype='object')