In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report

In [8]:
messages = pd.read_csv('SMSSpamCollection.csv', sep='\t',
                           names=["label", "message"])

In [9]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [50]:
# split data
msg_train, msg_test, label_train, label_test = \
train_test_split(messages['message'], messages['label'], test_size=0.2)

In [51]:
vectorizer = CountVectorizer()
vectorizer_mess = vectorizer.fit_transform(msg_train)

In [52]:
label_detect_model = MultinomialNB()

In [53]:
label_train

5080     ham
2044    spam
3608     ham
1492    spam
395      ham
        ... 
3202     ham
1889     ham
3620    spam
2270     ham
2914     ham
Name: label, Length: 4457, dtype: object

In [54]:
label_detect_model.fit(vectorizer_mess, label_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [58]:
# test with test dataset
example_counts = vectorizer.transform(msg_test)
predictions = label_detect_model.predict(example_counts)

In [60]:
print(confusion_matrix(predictions, label_test))

[[966   6]
 [  4 139]]


In [61]:
print(classification_report(predictions, label_test))

              precision    recall  f1-score   support

         ham       1.00      0.99      0.99       972
        spam       0.96      0.97      0.97       143

    accuracy                           0.99      1115
   macro avg       0.98      0.98      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [67]:
# test with own message
examples = ['Hello John', "I'm so sorry", "Free bitcoin"]
ex_counts = vectorizer.transform(examples)
predict = label_detect_model.predict(ex_counts)

print(predict)

['ham' 'ham' 'spam']
