In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

Load and describe the dataset

In [2]:
# Read the tab-separated SMS file into a two-column frame. The file carries no
# header row, so the column names are supplied explicitly.
# The path is a raw string: in a normal literal, '\d' and '\S' are invalid
# escape sequences (a warning today, an error in a future Python).
data = pd.read_table(r'E:\datasets\SMSSpamCollection', sep='\t', names=['label', 'content'])
# Only the final expression of a cell is rendered; the two calls below are
# evaluated but their results are not shown.
data.head(5)
data.isnull().sum()
data['label']

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object

Preprocessing

In [3]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
# The mapping is applied only while the column is still non-numeric. Mapping
# an already-encoded column would look up 0/1 in the dict, find nothing, and
# replace every label with NaN -- which is what happens if this cell is
# simply re-run without the guard.
if not pd.api.types.is_numeric_dtype(data['label']):
    data['label'] = data['label'].map({'ham': 0, 'spam': 1})
# Lower-case the message text with the vectorised string accessor.
data['content'] = data['content'].str.lower()
data['label']

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: label, Length: 5572, dtype: int64

We split the data into a training set and a test set

In [4]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    data.content,
    data.label,
    random_state=1,
    test_size=0.2,
)
# Only the last expression is rendered as the cell output.
X_train.head()
Y_train.head()

1642    0
2899    0
480     0
3485    0
157     0
Name: label, dtype: int64

We turn the text data into word-count matrices

In [5]:
# Learn the vocabulary (English stop words removed) from the training
# messages only, then encode both splits against that same vocabulary.
count_vector = CountVectorizer(stop_words='english')
count_vector.fit(X_train)
train_data, test_data = (
    count_vector.transform(messages) for messages in (X_train, X_test)
)

Fit the naive bayes model

In [6]:
# fit() returns the estimator itself, so construction and training can be
# chained; the bare name on the last line keeps the estimator repr as the
# cell output.
naive_bayes = MultinomialNB().fit(train_data, Y_train)
naive_bayes

MultinomialNB()

Evaluation
    We use accuracy, precision, recall, and F1 score to measure the model's performance.

In [7]:
# Predict on the held-out messages and print one line per metric.
predictions = naive_bayes.predict(test_data)
metric_table = (
    ('Accuracy score: ', accuracy_score),
    ('Precision score: ', precision_score),
    ('Recall score: ', recall_score),
    ('F1 score: ', f1_score),
)
for caption, scorer in metric_table:
    print(caption, format(scorer(Y_test, predictions)))

Accuracy score:  0.9910313901345291
Precision score:  0.9790209790209791
Recall score:  0.9523809523809523
F1 score:  0.9655172413793104
