In [24]:
# This notebook uses a training data set, available via the book's GitHub repository, to train an LDA (linear discriminant analysis) classifier that detects whether an SMS is spam or not.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import plot_confusion_matrix, confusion_matrix
# Load the labelled SMS corpus into a DataFrame; the first CSV column becomes the row index.
sms = pd.read_csv('sms-spam.csv', index_col=0)

# Make sure the label is a 0/1 integer so it can be summed and used as a repeat count below.
sms['spam'] = sms.spam.astype(int)
# Relabel the rows as "sms<N>", appending "!" to spam rows so they stand out when printed.
index = ['sms{}{}'.format(row_number, '!' * is_spam) for row_number, is_spam in enumerate(sms.spam)]
# Assign the new index in place: this keeps the column dtypes intact, so the frame does not
# have to be rebuilt from `.values` (which merges the int and text columns into one object
# array) and the label does not have to be cast to int a second time.
sms.index = index
print(sms.head())

       spam                                               text
sms0      0  Go until jurong point, crazy.. Available only ...
sms1      0                      Ok lar... Joking wif u oni...
sms2!     1  Free entry in 2 a wkly comp to win FA Cup fina...
sms3      0  U dun say so early hor... U c already then say...
sms4      0  Nah I don't think he goes to usf, he lives aro...


In [25]:
# Tokenize every message with NLTK's casual tokenizer and turn the corpus into dense TF-IDF row vectors
tfidf_model = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf_model.fit_transform(sms['text']).toarray()
# one row per message, one column per TF-IDF feature
print(tfidf_docs.shape)
# number of messages labelled as spam (labels are 0/1)
print(sms['spam'].sum())

(4837, 9232)
638


In [26]:
# First step of the LDA: find the centroid (mean TF-IDF vector) of the spam and the non-spam messages
mask = sms['spam'].to_numpy().astype(bool)
# Average along axis=0: the TF-IDF vectors are rows, and each column has to be averaged on its own,
# so every centroid ends up with one entry per column
ham_centroid = tfidf_docs[~mask].mean(axis=0)
spam_centroid = tfidf_docs[mask].mean(axis=0)


In [27]:
# Subtract one centroid from the other to get the line between them, then take the dot product
# of every TF-IDF row vector with that difference to get one raw score per message
spamminess_score = tfidf_docs.dot(spam_centroid - ham_centroid)
print(spamminess_score.round(2))
# one score per message
print(spamminess_score.shape[0])

[-0.01 -0.02  0.04 ... -0.01 -0.    0.  ]
4837


In [28]:
# Rescale the raw scores with MinMaxScaler (fed a single-column 2-D array via reshape),
# then predict spam for every message whose rescaled score is above 0.5
sms['lda_score'] = MinMaxScaler().fit_transform(spamminess_score.reshape(-1, 1))
sms['lda_predict'] = sms['lda_score'].gt(0.5).astype(int)
# Show label, prediction and score side by side for the first few messages
sms[['spam', 'lda_predict', 'lda_score']].round(2).head(6)

Unnamed: 0,spam,lda_predict,lda_score
sms0,0,0,0.23
sms1,0,0,0.18
sms2!,1,1,0.72
sms3,0,0,0.18
sms4,0,0,0.29
sms5!,1,1,0.55


In [29]:
# Accuracy on the training set: one minus the fraction of messages whose prediction differs from the label.
# WARNING: this is NOT indicative of the actual performance of the model. The book is clear on that;
# the note is repeated here for anyone who only sees this example and not the accompanying text.
(1. - sms['spam'].sub(sms['lda_predict']).abs().sum() / len(sms)).round(3)

0.977

In [30]:
# Confusion matrix of true labels vs. LDA predictions (again: training set only)
cm = confusion_matrix(y_true=sms['spam'], y_pred=sms['lda_predict'])
print(cm)

[[4135   64]
 [  45  593]]


In [None]:
# So we got 64 false positives and 45 false negatives. Quite good! But this is the training set only, so the performance is expected to be good.