# Spam Email detection

# Problem Statement:
We’ve all been recipients of spam emails before. Spam mail, or junk mail, is a type of email
that is sent to a massive number of users at one time, frequently containing cryptic
messages, scams, or, most dangerously, phishing content. The goal here is to build a
classifier that labels each message as spam or ham (legitimate).

# Importing necessary libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split as tts
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# Suppress all warnings for the rest of the session so the cell outputs stay uncluttered.
# NOTE(review): this also hides deprecation warnings from the libraries imported above;
# consider filtering specific categories instead.
warnings.filterwarnings('ignore')

# Loading the dataset

In [2]:
# Read the raw messages from the CSV file in the working directory.
# In the preview, 'v1' holds the ham/spam label and 'v2' the message text.
data = pd.read_csv(filepath_or_buffer='spam1.csv')

# Show the first five rows as a quick sanity check of the load.
data.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


# Data Analysis

In [3]:
# (rows, columns) of the raw frame, before any cleaning.
data.shape

(5570, 5)

In [4]:
# Count the missing values in each column to spot columns that carry no information.
data.isnull().sum()

v1               0
v2               0
Unnamed: 2    5520
Unnamed: 3    5558
Unnamed: 4    5564
dtype: int64

In [5]:
# Class balance of the label column: number of ham vs. spam messages.
data['v1'].value_counts()

ham     4823
spam     747
Name: v1, dtype: int64

In [6]:
# The labels are binary (ham/spam) and the features will be word counts taken from the
# message text, so a Multinomial Naive Bayes classifier is a natural choice for the model.

# Dropping redundant columns

In [7]:
# The three trailing 'Unnamed' columns are almost entirely null, so remove them in place.
data.drop(
    ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    axis='columns',
    inplace=True,
)

# Confirm that only the label ('v1') and text ('v2') columns remain.
data.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Preprocessing

In [8]:
# Replace the string labels in 'v1' with integer codes so the classifier can consume them.
# In the preview that follows, 'ham' is encoded as 0 and 'spam' as 1.
encoder=LabelEncoder()
data['v1']=encoder.fit_transform(data['v1'])
# Preview the frame to verify the encoded label column.
data.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


# Splitting the dataset

In [9]:
# Features are the raw message texts ('v2'); targets are the encoded labels ('v1').
X, Y = data['v2'], data['v1']

In [10]:
# Hold out 30% of the messages for evaluation; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = tts(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)


# Turning mails into numeric values

In [11]:
# Bag-of-words encoder: every message becomes a sparse row of token counts.
cv = CountVectorizer()

# Learn the vocabulary from the training messages only and encode them ...
xtrain_values = cv.fit_transform(raw_documents=xtrain.values)
# ... then encode the test messages against that same fitted vocabulary.
xtest_values = cv.transform(raw_documents=xtest.values)

In [19]:
# Inspect the encoded training matrix (rows are training messages, columns are vocabulary tokens).
xtrain_values

<3899x7229 sparse matrix of type '<class 'numpy.int64'>'
	with 52279 stored elements in Compressed Sparse Row format>

# Training the model

In [13]:
# Multinomial Naive Bayes with default settings, suited to the count features built above.
classifier = MultinomialNB()

# Fit on the vectorized training messages and their encoded labels.
classifier.fit(X=xtrain_values, y=ytrain)

MultinomialNB()

# Performance

In [14]:
# Mean accuracy on the held-out test split.
# NOTE(review): the labels are imbalanced (4823 ham vs. 747 spam in the full dataset), so
# always predicting ham would already score about 87%; per-class precision/recall for spam
# would be a more telling metric.
classifier.score(xtest_values,ytest)

0.9850388988629563

# Testing

In [15]:
# Sanity check on one hand-written message that reads like spam.
spam1 = ['Reward money free']

# Reuse the already-fitted vectorizer so the columns match what the model was trained on.
spam1_count = cv.transform(spam1)

# Because of the label encoding, an output of 1 means 'spam' and an output of 0 means 'ham'.
classifier.predict(spam1_count)

array([1])

In [16]:
# Hence the trained model classifies this sample message as spam, as expected; together
# with the test-set accuracy above, this suggests it separates spam from ham mails well.