In [1]:
# We will use a collection of SMS spam messages as data.
# get data at https://archive.ics.uci.edu/ml/datasets/sms+spam+collection
import pandas as pd
df = pd.read_table('data/SMSSpamCollection.tsv', sep='\t', header=None,
				   names=['label', 'message'])

In [2]:
## Looks like that
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
## nltk is a text-processing library
import nltk
# it has models for different languages, here we download one
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/ao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
from nltk.stem import PorterStemmer

## CountVectorizer counts entries of every token in the collection
from sklearn.feature_extraction.text import CountVectorizer

def extract_features(messages: pd.Series, vocabulary=None):
	stemmer = PorterStemmer()

	## Let's clean messages from non-text characters
	messages = messages.map(lambda x: x.lower().replace('[^\w\s]', ''))
	## transform free text into a collection of words(list of strings)
	messages = messages.apply(nltk.word_tokenize)
	## the idea is that spam sms has high likelihood for certain words. We use
	## stemming to exclude different word forms and count only word stems
	messages = messages.apply(lambda x: [stemmer.stem(y) for y in x])
	messages = messages.apply(lambda x: ' '.join(x))

	params = {}
	if vocabulary:
		params['vocabulary'] = vocabulary
	## CountVectorizer counts entries of every token in the collection
	count_vect = CountVectorizer(**params)
	counts = count_vect.fit_transform(messages)

	return count_vect, counts

In [5]:
## We need to transform categories into machine-readable values(probabilities)
df['label'] = df.label.map({'ham': 0, 'spam': 1})

In [6]:
count_vect, counts = extract_features(df['message'])

In [7]:
## Inside is the list of all stems and their counts
print(count_vect.get_feature_names()[1500:1520])
print(counts.toarray())

['board', 'boat', 'boatin', 'bob', 'bodi', 'boggi', 'bognor', 'bold', 'bold2', 'bollox', 'boltblu', 'bomb', 'bone', 'bong', 'bonu', 'boo', 'boob', 'book', 'bookedth', 'bookmark']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [8]:
from sklearn.model_selection import train_test_split
## we split some part of our dataset to validate trained model later
X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.1, random_state=69)

In [9]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB().fit(X_train, y_train)

In [10]:
import numpy as np

predicted = model.predict(X_test)

## it better be close to 1 as possible, means we trained model well
print(np.mean(predicted == y_test))

0.985663082437276


In [11]:
from sklearn.metrics import confusion_matrix
## show number of false positives and negatives
print(confusion_matrix(y_test, predicted))

[[479   3]
 [  5  71]]


In [12]:
### Let's experiment with random string outside the dataset!
text_to_check = [
	'I bet you win tonight haha',
	'Signup via the link and receive a free bonus! http://bit.ly/XdgfA',
]
new_count_vect, new_counts = extract_features(
	pd.Series(text_to_check), count_vect.get_feature_names())


In [13]:
print('There are model\'s predictions for strings:')
list(zip(map(bool, model.predict(new_counts)), text_to_check))

There are model's predictions for strings:


[(False, 'I bet you win tonight haha'),
 (True, 'Signup via the link and receive a free bonus! http://bit.ly/XdgfA')]