## **Author : Rohith Krishna S**

## Title : Email Spam Detection with Machine Learning

**Objective:**
Build an email spam detector in Python, using machine learning to train a classifier that recognizes emails and labels them as spam or non-spam (ham).

In [None]:
import numpy as np
import pandas as pd
import os
from google.colab import files

In [None]:
# Read the raw dataset from the working directory, decoding it as
# ISO-8859-1, then preview the first rows.
csv_path = 'spam.csv'
data = pd.read_csv(csv_path, encoding="ISO-8859-1")
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Discard the three trailing "Unnamed" columns, keeping only v1 (label) and
# v2 (message text).
# Dropping by name into a new frame (instead of by position with inplace=True)
# keeps this cell safe to re-run: errors='ignore' skips columns that are
# already gone, whereas the positional in-place drop raises IndexError the
# second time it is executed.
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# List the column labels that remain in the frame.
data.columns

Index(['v1', 'v2'], dtype='object')

In [None]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [None]:
# Count missing values in each column.
data.isna().sum()

v1    0
v2    0
dtype: int64

In [None]:
# Encode the label as a binary target: 1 where v1 is 'spam', 0 otherwise.
# A vectorised comparison replaces the per-row Python lambda; the resulting
# column holds the same values with the same int64 dtype.
data['spam'] = (data['v1'] == 'spam').astype('int64')
data.head(5)

Unnamed: 0,v1,v2,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the messages for evaluation.
# random_state pins the shuffle so the split, and therefore the score computed
# later in the notebook, is reproducible after a kernel restart.
# stratify keeps the spam/ham proportion the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    data.v2,
    data.spam,
    test_size=0.25,
    random_state=42,
    stratify=data.spam,
)

In [None]:
# CountVectorizer converts the text into a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
from sklearn.pipeline import Pipeline

# Chain the two stages so raw text can be passed straight to fit/predict:
# the vectorizer turns each message into token counts, which the multinomial
# naive Bayes classifier then consumes.
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
]
clf = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the whole pipeline (vectorizer, then classifier) on the training split.
clf.fit(X_train,y_train)

In [None]:
# Two sample messages used to spot-check the classifier's predictions.
emails = [
    'Free entry in 2 a wkly comp to win FA Cup final.',
    'The cat is on the wall, this is not a mail. Aprl fool and chilen day.',
]

In [None]:
# Predict a label for each sample message (1 = spam, 0 = not spam).
clf.predict(emails)

array([1, 0])

In [None]:
# Mean accuracy of the fitted pipeline on the held-out test split.
# NOTE(review): accuracy alone can look optimistic if one class dominates the
# data; consider also reporting precision/recall for the spam class.
clf.score(X_test,y_test)

0.9892318736539842