### Multinomial Naive Bayes - detecting email spam

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the labelled messages from 'spam.csv' (path is relative to the
# working directory) into a DataFrame and preview the first rows.
# Later cells rely on its 'Category' and 'Message' columns.
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Class balance: how many messages carry each label.
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [9]:
# Add a binary target column: 1 where Category is 'spam', 0 for any other value.
# A vectorized comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the column dtype identical on every platform.
df['spam'] = df.Category.eq('spam').astype('int64')
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
# Target vector: the 0/1 'spam' indicator column.
y = df['spam']
y.head()

0    0
1    0
2    1
3    0
4    0
Name: spam, dtype: int64

In [20]:
# Feature source: the raw text of each message.
x = df['Message']
x.head()

5572

In [32]:
# Convert the raw text into a sparse document-term matrix of word counts.
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()
# NOTE(review): the vocabulary is learned from all messages, including the
# ones that later end up in the test split. The Pipeline section further
# down avoids this by fitting the vectorizer on the training split only.
x_counts = v.fit_transform(x)     # word count for each unique word, one row per message
# Read the number of rows from the sparse matrix's shape instead of
# densifying the whole matrix with toarray() just to call len() on it.
x_counts.shape[0]

5572

In [33]:
# Preview the first five rows. Slice the sparse matrix first so that only
# those five rows are converted to a dense array, not the whole matrix.
x_counts[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [34]:
# split data -> train (80%), test (20%)
# random_state pins the shuffle so the split, and the score computed from it,
# is the same on every run. stratify=y keeps the ham/spam ratio equal in both
# parts, which matters because spam is the minority class.
x_train, x_test, y_train, y_test = train_test_split(
    x_counts, y, test_size=0.2, random_state=42, stratify=y
)

In [35]:
# Fit a multinomial Naive Bayes classifier on the training word counts.
# fit() hands back the estimator, so construction and training are chained;
# the trailing bare name keeps the fitted model as the cell's displayed output.
model = MultinomialNB().fit(x_train, y_train)
model

MultinomialNB()

In [36]:
# Score the fitted model on the held-out test split.
model.score(x_test, y_test)

0.9811659192825112

In [39]:
# Sanity check on two hand-written messages. They go through the already
# fitted vectorizer `v` (transform only, no re-fitting) so the feature columns
# match the ones the model was trained on.
email = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]
email_counts = v.transform(email)
model.predict(email_counts)

array([0, 1], dtype=int64)

_Using a scikit-learn Pipeline (vectorizer and classifier combined into one estimator)_

In [42]:
# Bundle the count vectorizer and the Naive Bayes classifier into a single
# estimator that is fitted on, and predicts from, raw text.
from sklearn.pipeline import Pipeline

cv_nb = Pipeline(steps=[
    ('countvec', CountVectorizer()),
    ('nb', MultinomialNB()),
])

In [43]:
# Split the raw text this time (not the count matrix); the pipeline does the
# vectorizing itself. NOTE: this rebinds x_train/x_test/y_train/y_test from
# the earlier count-matrix split, so those names now hold raw-text data.
# random_state makes the split reproducible; stratify=y keeps the ham/spam
# ratio equal in the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [44]:
# Fit the whole pipeline on the raw training text and its labels.
cv_nb.fit(x_train, y_train)

Pipeline(steps=[('countvec', CountVectorizer()), ('nb', MultinomialNB())])

In [45]:
# Score the fitted pipeline on the held-out raw-text test split.
cv_nb.score(x_test, y_test)

0.9865470852017937

In [46]:
# Classify the two sample messages in `email`; the pipeline takes the raw
# strings directly, so no separate transform step is needed here.
cv_nb.predict(email)

array([0, 1], dtype=int64)