# **Email Spam Detection**

### Load Dataset

In [1]:
import pandas as pd
# Read the labelled messages from 'spam.csv' (resolved relative to the
# current working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='spam.csv')

# Preview the first five rows.
df.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Data Preprocessing 

In [2]:
# Per-class summary of the data: count, number of unique values, most
# frequent value and its frequency, grouped by the 'Category' label.
df.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [3]:
# Encode the text label as an integer target column: ham -> 0, spam -> 1.
df['Spam'] = df['Category'].map(
    {
        'ham': 0,
        'spam': 1,
    }
)

# Preview the frame with the new 'Spam' column appended.
df.head(5)

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Hold out 30% of the messages for evaluation; the fixed random_state keeps
# the shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['Message'],
    df['Spam'],
    test_size=0.3,
    random_state=42,
)

In [6]:
# Number of messages in the training and test partitions, respectively.
(len(X_train), len(X_test))

(3900, 1672)

### Converting Text to an Integer Count Matrix

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Learn the vocabulary from the training messages and encode every message
# as a row of token counts. fit_transform returns a SciPy sparse matrix.
# The learned vocabulary can be inspected with
# vectorizer.get_feature_names_out() when needed.
vectorizer = CountVectorizer()
X_train_count = vectorizer.fit_transform(X_train.values)

# Preview the first three encoded messages. Slice the sparse matrix BEFORE
# calling toarray() so only those three rows are densified, rather than
# materialising the whole (messages x vocabulary) matrix in memory.
X_train_count[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Training Model using **Multinomial Naive Bayes**

In [9]:
from sklearn.naive_bayes import MultinomialNB

In [10]:
# Fit a multinomial Naive Bayes classifier on the token-count features of
# the training messages, using the 0/1 'Spam' labels as the target.
model = MultinomialNB()
model.fit(X=X_train_count, y=y_train)

In [11]:
# Two hand-picked messages used as a quick sanity check of the classifier.
emails = [
    'I m gonna be home soon and i dont want to talk about this stuff anymore tonight, k? I ve cried enough today',
    'SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info',
]

# Encode with the vocabulary already learned from the training data
# (transform, not fit_transform), then predict: 0 = ham, 1 = spam.
emails_count = vectorizer.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

In [12]:
# Encode the held-out messages with the fitted vocabulary, then report the
# classifier's mean accuracy on them.
X_test_count = vectorizer.transform(X_test)
model.score(X=X_test_count, y=y_test)

0.9904306220095693

### Training the Model Using a **Pipeline** *(Shortcut)*

In [13]:
from sklearn.pipeline import Pipeline

In [14]:
# Chain vectorisation and classification into a single estimator so raw
# text can be passed straight to fit / score / predict.
clf = Pipeline(
    steps=[
        ('Vectorizer', CountVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

In [15]:
# Fit both pipeline steps on the raw training messages and their labels.
clf.fit(X=X_train, y=y_train)

In [16]:
# Mean accuracy of the pipeline on the raw held-out messages.
clf.score(X=X_test, y=y_test)

0.9904306220095693

In [17]:
# Classify the two sample messages directly from raw text (0 = ham, 1 = spam).
clf.predict(X=emails)

array([0, 1], dtype=int64)