# Spam Detection

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
data = pd.read_csv("spam.csv", encoding= 'latin-1')
data.head()

Unnamed: 0,class,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


#### Class and message is needed to train a machine learning model for spam detection and hence selecting these 2 columns 

In [4]:
data = data[['class','message']]
data.head()

Unnamed: 0,class,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data.isnull().sum()

class      0
message    0
dtype: int64

In [6]:
data['class'].value_counts()

ham     4825
spam     747
Name: class, dtype: int64

In [7]:
data['class_num'] = data['class'].map({'ham':0,'spam':1})
data.head()

Unnamed: 0,class,message,class_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
X = data.message
y = data.class_num
print(X.shape)
print(y.shape)

(5572,)
(5572,)


In [9]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4179,)
(1393,)
(4179,)
(1393,)


In [10]:
cv = CountVectorizer()
X_train_dtm = cv.fit_transform(X_train)
X_train_dtm

<4179x7496 sparse matrix of type '<class 'numpy.int64'>'
	with 55614 stored elements in Compressed Sparse Row format>

In [11]:
X_test_dtm = cv.transform(X_test)
X_test_dtm

<1393x7496 sparse matrix of type '<class 'numpy.int64'>'
	with 17010 stored elements in Compressed Sparse Row format>

In [12]:
nb = MultinomialNB()

In [13]:
nb.fit(X_train_dtm,y_train)

MultinomialNB()

In [14]:
y_pred_class = nb.predict(X_test_dtm)
y_pred_class

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [15]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test,y_pred_class)

0.9856424982053122

In [16]:
# calculate AUC
metrics.roc_auc_score(y_test,y_pred_class)

0.9633690574333608

In [18]:
sample = input('Enter a message:')
dt = cv.transform([sample]).toarray()
print(nb.predict(dt))

Enter a message:Come to school by 9AM
[0]
