# Email Spam Filtering

# Importing Libraries

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import string
import matplotlib.pyplot as plt

# Reading Dataset

In [0]:
# Load the tab-separated corpus; column names are supplied explicitly, so the
# first row of the file is read as data rather than as a header.
dataset = pd.read_csv(
    '/content/hamspam.tsv',
    sep='\t',
    names=['Output', 'Message'],
)

# Exploratory Data Analysis

In [4]:
# Preview the first rows to confirm the label and message columns parsed correctly.
dataset.head()

Unnamed: 0,Output,Message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [5]:
# Report column dtypes and non-null counts to check for missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5568 entries, 0 to 5567
Data columns (total 2 columns):
Output     5568 non-null object
Message    5568 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [6]:
# Summary of the object columns: count, number of unique values, most frequent value.
dataset.describe()

Unnamed: 0,Output,Message
count,5568,5568
unique,2,5165
top,ham,"Sorry, I'll call later"
freq,4822,30


In [0]:
# Character count of each raw message, stored as a new column.
dataset['Length'] = dataset['Message'].map(len)

In [8]:
# Confirm the Length column was added.
dataset.head()

Unnamed: 0,Output,Message,Length
0,ham,I've been searching for the right words to tha...,196
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,ham,"Nah I don't think he goes to usf, he lives aro...",61
3,ham,Even my brother is not like to speak with me. ...,77
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,35


In [9]:
# Count rows per class label to see how balanced the two classes are.
dataset.groupby('Output').count()

Unnamed: 0_level_0,Message,Length
Output,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,4822,4822
spam,746,746


In [10]:
# Distribution of message lengths (in characters).
dataset['Length'].describe()

count    5568.000000
mean       80.487428
std        59.950961
min         2.000000
25%        36.000000
50%        62.000000
75%       122.000000
max       910.000000
Name: Length, dtype: float64

# Data Preprocessing

In [0]:
# Snapshot of the raw string labels ('ham'/'spam'); y is reassigned later once
# the labels have been encoded numerically.
y = dataset['Output'].values

In [12]:
# Display the label array.
y

array(['ham', 'spam', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

## Convert Value Of Ham = 1 & Spam = 0

In [0]:
# Encode the "ham" class as 1.
is_ham = dataset['Output'].eq("ham")
dataset.loc[is_ham, "Output"] = 1

In [0]:
# Encode the "spam" class as 0.
is_spam = dataset['Output'].eq("spam")
dataset.loc[is_spam, "Output"] = 0

In [15]:
# Verify that the Output column now holds 1/0 instead of ham/spam.
dataset.head()

Unnamed: 0,Output,Message,Length
0,1,I've been searching for the right words to tha...,196
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,1,"Nah I don't think he goes to usf, he lives aro...",61
3,1,Even my brother is not like to speak with me. ...,77
4,1,I HAVE A DATE ON SUNDAY WITH WILL!!,35


## Process On Messages

In [0]:
def cleanMessage(message):
    """Return ``message`` with all ASCII punctuation characters removed.

    Every character found in ``string.punctuation`` is dropped; all other
    characters (letters, digits, whitespace) are kept in their original order.
    """
    kept = filter(lambda ch: ch not in string.punctuation, message)
    return "".join(kept)

In [0]:
# Strip punctuation from every message before vectorizing.
dataset['Message'] = dataset['Message'].map(cleanMessage)

In [18]:
# Check that punctuation was removed; Length still reflects the original text.
dataset.head()

Unnamed: 0,Output,Message,Length
0,1,Ive been searching for the right words to than...,196
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,1,Nah I dont think he goes to usf he lives aroun...,61
3,1,Even my brother is not like to speak with me T...,77
4,1,I HAVE A DATE ON SUNDAY WITH WILL,35


In [0]:
# Bag-of-words vectorizer configured with scikit-learn's built-in English stop-word list.
CV = CountVectorizer(stop_words="english")

## Independent And Dependent Data Organization

In [0]:
# Independent variable: the punctuation-stripped message text.
X = dataset['Message'].values

In [0]:
# Dependent variable: encoded labels (1 = ham, 0 = spam).
# Writing scalars into the string column via .loc can leave it with object
# dtype, and scikit-learn rejects object-dtype non-string targets as an
# unknown label type, so cast to a plain integer array.
y = dataset['Output'].astype(int).values

## Train And Test Data

In [0]:
# Hold out 20% of the messages for testing.
# random_state fixes the shuffle so the split, and the accuracy reported
# below, is reproducible across runs. stratify keeps the ham/spam ratio
# the same in both partitions, which matters because the classes are imbalanced.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

## Data Munging

### Data munging (or data wrangling) means preparing your data for a dedicated purpose: taking the data from its raw state and transforming and mapping it into another format, normally for use beyond its original intent.

In [0]:
# Fit the vectorizer on the training messages only and encode them in one step.
X_train_CV = CV.fit_transform(X_train)

# 1. Naive Bayes :- Multinomial

In [0]:
# Multinomial Naive Bayes classifier with default hyperparameters.
NB = MultinomialNB()

In [25]:
# Train the classifier on the vectorized training messages.
NB.fit(X_train_CV,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
# Encode the test messages with the already-fitted vectorizer (transform only, no refit).
X_test_CV = CV.transform(X_test)

In [0]:
# Predict a label for every held-out message.
Y_predict = NB.predict(X_test_CV)

In [0]:
# Fraction of test messages whose predicted label matches the true label.
result = accuracy_score(y_test,Y_predict)

In [29]:
# Report the accuracy as a percentage.
print("Accuracy Of Prediction :-",result*100)

Accuracy Of Prediction :- 98.02513464991023


## Realtime Application Of Spam Filtering :- 

In [34]:
# The address is collected for the demo only; the classifier does not use it.
email = input("Enter Email :- ")
body = input("Enter Body Of Content :- ")
# Apply the same punctuation stripping that was applied to the training
# messages, so the vectorizer sees tokens in the form it was fitted on.
bodyInput = CV.transform([cleanMessage(body)])
# NOTE: this reuses the name `result`, replacing the accuracy value computed earlier.
result = NB.predict(bodyInput)
# Labels were encoded as ham = 1, spam = 0.
if result[0] == 0:
    print("This Is Spam Mail")
else:
    print("Email Sent")

Enter Email :- saurabh.cegian@gmail.com
Enter Body Of Content :- you won lottery today.
This Is Spam Mail
