# SMS Spam Classification Using Naive Bayes Classification

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [19]:
df = pd.read_csv('spam.csv', encoding='latin-1', usecols=['Category', 'v2'])
df.head()

Unnamed: 0,Category,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
df.groupby('Category').describe()

Unnamed: 0_level_0,v2,v2,v2,v2
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4824,4510,"Sorry, I'll call later",30
spam,747,658,Please call our customer service representativ...,4


In [21]:
df['spam']=df['Category'].apply(lambda x: 1 if x=='spam' else 0)
df.head()

Unnamed: 0,Category,v2,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test = train_test_split(df.Category,df.spam,test_size=0.25)

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
X_train_count.toarray()[:3]

array([[1, 0],
       [1, 0],
       [1, 0]])

In [26]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_count, y_train)

MultinomialNB()

In [30]:
emails = ['Hey mohan, can we get together to have a date tomorrow?',
          'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!']
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 0])

In [32]:
X_test_count = v.transform(X_test)
model.score(X_test_count, y_test)

1.0

In [33]:
from sklearn.pipeline import Pipeline
clf = Pipeline([('vectorizer', CountVectorizer()),
                ('nb', MultinomialNB())])

In [34]:
clf.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [35]:
clf.score(X_test, y_test)

1.0