# Spam Filter using Naive Bayes Classifier

### Read the data

In [1]:
import pandas as pd

In [2]:
sms = pd.read_table("sms.tsv", names=['label', 'msg'])
sms.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### convert label to a numerical values

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
sms.label = LabelEncoder().fit_transform(sms.label)
sms.head()

Unnamed: 0,label,msg
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### Split the data into x and y

In [5]:
x = sms.msg
y = sms.label

## Feature Extraction

### Convert messages to numerical values 

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
v = CountVectorizer()

#### Example to show you how CountVectorizer works

In [8]:
z = v.fit_transform(["Hello", "This Is DS610", "My name is Ahmed Ali Ahmed",
                    "Hi What's up", "My number is 444-444", "123-567"])

In [9]:
v.get_feature_names()

['123',
 '444',
 '567',
 'ahmed',
 'ali',
 'ds610',
 'hello',
 'hi',
 'is',
 'my',
 'name',
 'number',
 'this',
 'up',
 'what']

In [10]:
z.toarray()

array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1],
       [0, 2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0],
       [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

## Back to our data
### extract featues from sms data 

In [11]:
x = v.fit_transform(x)

In [12]:
v.get_feature_names()

['00',
 '000',
 '000pes',
 '008704050406',
 '0089',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07008009200',
 '07046744435',
 '07090201529',
 '07090298926',
 '07099833605',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '078498',
 '07880867867',
 '0789xxxxxxx',
 '07946746291',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '084

### split the data into training set and test set

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.25)

### Apply Multinomial Naive Bayes Classifier on the extracted features

In [15]:
from sklearn.naive_bayes import MultinomialNB

In [16]:
model = MultinomialNB()

In [17]:
model.fit(x_train, y_train)

MultinomialNB()

In [18]:
y_predict = model.predict(x_test)

### Evaluation

In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [20]:
accuracy_score(y_test, y_predict)

0.9820531227566404

In [21]:
confusion_matrix(y_test, y_predict)

array([[1197,   13],
       [  12,  171]])

In [22]:
precision_score(y_test, y_predict)

0.9293478260869565

In [23]:
recall_score(y_test, y_predict)

0.9344262295081968

In [24]:
f1_score(y_test, y_predict)

0.9318801089918256

### Cross Validation

In [25]:
from sklearn.model_selection import cross_validate

In [26]:
score = cross_validate(model, x, y, cv= 10)

In [27]:
score

{'fit_time': array([0.00505781, 0.00443697, 0.00302362, 0.00278473, 0.00288057,
        0.00282717, 0.00277615, 0.00266981, 0.00270152, 0.00294542]),
 'score_time': array([0.00077248, 0.00058699, 0.00053692, 0.0004971 , 0.00069594,
        0.00051856, 0.00053525, 0.00049114, 0.00049186, 0.00051403]),
 'test_score': array([0.98207885, 0.97849462, 0.97845601, 0.98025135, 0.97845601,
        0.97845601, 0.98204668, 0.98384201, 0.97127469, 0.99102334])}

### Test our model

In [28]:
test_msg = v.transform(["click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL"])

In [29]:
test_y = model.predict(test_msg)

In [30]:
test_y

array([1])