In [1]:
# This notebook uses a training data set, available via the book's GitHub repository, to train an LDA (linear discriminant analysis) classifier that detects whether an SMS message is spam.
import numpy as np
import pandas as pd
from nltk.tokenize.casual import casual_tokenize
from sklearn.decomposition import PCA, TruncatedSVD, LatentDirichletAllocation as LDiA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split



# Load the labelled SMS corpus; the first CSV column becomes the row index.
sms = pd.read_csv('sms-spam.csv', index_col=0)

# Cast the label column to int: it is used as a repeat count just below and
# as the classification target later in the notebook.
sms['spam'] = sms.spam.astype(int)

# Relabel rows as "sms<position>", with a trailing "!" on spam rows ('!' * 1)
# and nothing on non-spam rows ('!' * 0), so spam stands out in printed tables.
index = ['sms{}{}'.format(i, '!' * is_spam) for i, is_spam in enumerate(sms.spam)]

# Assign the new labels directly instead of rebuilding the frame from
# `sms.values`: that round trip goes through a single object array, loses the
# integer dtype of `spam`, and forces a second astype(int).
sms.index = index
print(sms.head())

       spam                                               text
sms0      0  Go until jurong point, crazy.. Available only ...
sms1      0                      Ok lar... Joking wif u oni...
sms2!     1  Free entry in 2 a wkly comp to win FA Cup fina...
sms3      0  U dun say so early hor... U c already then say...
sms4      0  Nah I don't think he goes to usf, he lives aro...


In [2]:
# Build one TF-IDF vector per SMS, tokenising with NLTK's casual tokenizer,
# and densify the result so it can be wrapped in a DataFrame afterwards.
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(sms['text'])
tfidf_docs = tfidf_docs.toarray()
# Vocabulary size == number of TF-IDF columns.
len(tfidf.vocabulary_)

9232

In [3]:
# Wrap the dense TF-IDF matrix in a DataFrame and centre every term column on
# zero by subtracting that column's mean over all messages.
tfidf_docs = pd.DataFrame(tfidf_docs)
tfidf_docs = tfidf_docs.sub(tfidf_docs.mean(), axis='columns')
print(tfidf_docs.shape)

(4837, 9232)


In [4]:
# Try PCA first: reduce the centred TF-IDF matrix to 16 "topic" dimensions.
# For a matrix this large with so few components, PCA's default 'auto' solver
# may select the randomized SVD, so pin random_state to keep re-runs reproducible.
pca = PCA(n_components=16, random_state=42)
pca = pca.fit(tfidf_docs)
pca_topic_vectors = pca.transform(tfidf_docs)
# Topic column labels; reused by later cells for the SVD and LDiA topic vectors.
columns = ['topic{}'.format(i) for i in range(pca.n_components)]
pca_topic_vectors = pd.DataFrame(pca_topic_vectors, columns=columns, index=index)
pca_topic_vectors.round(3).head(6)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
sms0,0.201,0.003,0.037,0.011,-0.019,-0.053,0.039,-0.065,0.011,-0.084,0.008,-0.0,-0.007,-0.041,-0.013,0.025
sms1,0.404,-0.094,-0.078,0.051,0.1,0.047,0.023,0.065,0.023,-0.024,-0.005,0.037,-0.049,-0.007,0.052,-0.038
sms2!,-0.03,-0.048,0.09,-0.067,0.091,-0.044,0.0,-0.002,-0.056,0.051,0.124,0.031,-0.028,-0.009,-0.044,0.048
sms3,0.329,-0.033,-0.035,-0.016,0.052,0.056,-0.167,-0.072,0.061,-0.108,0.021,0.017,-0.076,-0.037,0.023,-0.064
sms4,0.002,0.031,0.038,0.034,-0.075,-0.093,-0.044,0.061,-0.045,0.03,0.028,-0.006,-0.02,0.038,-0.086,-0.025
sms5!,-0.016,0.059,0.014,-0.006,0.122,-0.04,0.005,0.166,-0.022,0.063,0.043,0.049,0.056,0.061,0.006,0.005


In [5]:
# Recover the TF-IDF vocabulary ordered by column position:
# `column_nums` holds the sorted column indices and `terms[k]` is the token
# mapped to the k-th of those indices.
column_nums = tuple(sorted(tfidf.vocabulary_.values()))
terms = tuple(sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get))


In [6]:
# Table of PCA component weights: one row per topic, one column per vocabulary term.
# The number of topic rows is taken from the fitted components themselves rather
# than a hard-coded 16, so this cell keeps working if n_components is changed.
weights = pd.DataFrame(
    pca.components_,
    columns=terms,
    index=['topic{}'.format(i) for i in range(len(pca.components_))],
)
print(weights.head(4).round(3))

            !      "      #   #150  #5000      $      %      &      '      (  \
topic0 -0.071  0.008 -0.001 -0.000 -0.001  0.003 -0.000 -0.012 -0.007 -0.005   
topic1  0.063  0.008  0.000 -0.000 -0.001 -0.001 -0.002 -0.016 -0.016  0.001   
topic2  0.071  0.027  0.000  0.001  0.002  0.000  0.001  0.059  0.008  0.019   
topic3 -0.059 -0.032 -0.001 -0.000 -0.001  0.001 -0.003 -0.028  0.001 -0.010   

        ...   ü'll      –    —      ‘      ’      “      …      ┾    〨ud  \
topic0  ...  0.003 -0.000 -0.0 -0.004 -0.001 -0.001 -0.002  0.001  0.001   
topic1  ...  0.002  0.001 -0.0  0.004 -0.001 -0.001  0.003  0.001  0.001   
topic2  ...  0.000  0.001 -0.0  0.002  0.000  0.001  0.002 -0.001 -0.001   
topic3  ... -0.001 -0.001  0.0  0.000 -0.000 -0.000  0.001  0.001  0.001   

            鈥  
topic0  0.001  
topic1  0.001  
topic2 -0.001  
topic3  0.001  

[4 rows x 9232 columns]


In [7]:
# Look at the topic weights of a handful of tokens one would expect in
# discount-style spam, rounded to 3 decimals and scaled by 100 for readability.
deals = weights.loc[:, '! ;) :) half off free crazy deal only $ 80 %'.split()].round(3).mul(100)
print(deals)

            !   ;)    :)  half  off  free  crazy  deal  only    $   80    %
topic0   -7.1  0.1  -0.5  -0.0 -0.4  -2.0   -0.0  -0.1  -2.2  0.3 -0.0 -0.0
topic1    6.3  0.0   7.4   0.1  0.4  -2.3   -0.2  -0.1  -3.8 -0.1 -0.0 -0.2
topic2    7.1  0.2  -0.1   0.0  0.3   4.4    0.1  -0.1   0.7  0.0  0.0  0.1
topic3   -5.9 -0.3  -7.1   0.2  0.3  -0.2    0.0   0.1  -2.3  0.1 -0.1 -0.3
topic4   38.1 -0.1 -12.4  -0.1 -0.2   9.9    0.1  -0.2   3.0  0.3  0.1 -0.1
topic5  -26.4  0.1  -1.5  -0.4 -0.7  -1.4   -0.6  -0.2  -1.8 -0.9  0.0  0.0
topic6  -11.1 -0.5  19.8  -0.4 -0.9  -0.6   -0.2  -0.1  -1.4 -0.0 -0.0 -0.1
topic7   16.6  0.1 -17.8   0.7  0.8  -2.9    0.0   0.1  -1.9 -0.3  0.0 -0.1
topic8   34.0  0.1   5.3  -0.4 -0.6   0.1   -0.4  -0.4   3.2 -0.6 -0.0 -0.2
topic9    7.5 -0.3  16.4   1.4 -0.9   6.3   -0.5  -0.4   3.2 -0.5 -0.0  0.0
topic10 -32.2 -0.2 -10.3   0.1  0.1  12.3    0.1   0.0   0.3 -0.0 -0.0 -0.2
topic11  19.0  0.4  29.6   0.5  1.3  -4.9    0.1   0.2   0.0 -0.5 -0.0 -0.4
topic12  26.

In [8]:
# Total weight of the selected "deal" tokens within each topic (row-wise sum).
print(deals.sum(axis=1))

topic0    -11.9
topic1      7.5
topic2     12.7
topic3    -15.5
topic4     38.4
topic5    -33.8
topic6      4.5
topic7     -4.7
topic8     40.1
topic9     32.2
topic10   -30.0
topic11    45.3
topic12    -5.3
topic13    49.3
topic14    24.4
topic15    -4.8
dtype: float64


In [9]:
# Try truncated SVD instead of PCA for the same 16-topic reduction.
# TruncatedSVD's default solver is randomized, so fix random_state to get the
# same topic vectors on every run.
svd = TruncatedSVD(n_components=16, n_iter=100, random_state=42)
svd_topic_vectors = svd.fit_transform(tfidf_docs.values)
svd_topic_vectors = pd.DataFrame(svd_topic_vectors, columns=columns, index=index)
svd_topic_vectors.round(3).head(6)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
sms0,0.201,0.003,0.037,0.011,-0.019,-0.053,0.039,-0.066,0.012,-0.083,0.007,-0.007,0.002,-0.036,-0.014,0.037
sms1,0.404,-0.094,-0.078,0.051,0.1,0.047,0.023,0.065,0.023,-0.024,-0.004,0.036,0.043,-0.021,0.051,-0.042
sms2!,-0.03,-0.048,0.09,-0.067,0.091,-0.043,-0.0,-0.001,-0.057,0.051,0.125,0.023,0.026,-0.02,-0.042,0.052
sms3,0.329,-0.033,-0.035,-0.016,0.052,0.056,-0.166,-0.074,0.063,-0.108,0.022,0.023,0.073,-0.046,0.022,-0.07
sms4,0.002,0.031,0.038,0.034,-0.075,-0.093,-0.044,0.061,-0.045,0.029,0.028,-0.009,0.027,0.034,-0.083,-0.021
sms5!,-0.016,0.059,0.014,-0.006,0.122,-0.04,0.005,0.167,-0.023,0.064,0.041,0.055,-0.037,0.075,-0.001,0.02


In [10]:
# Move on to LDiA, which is fitted on the bag-of-words counts built here.
# Seed NumPy's global random generator first so the stochastic steps that
# follow are repeatable.
np.random.seed(42)

# Count token occurrences per SMS, using the same casual tokenizer as before.
counter = CountVectorizer(tokenizer=casual_tokenize)
bow_docs = pd.DataFrame(counter.fit_transform(sms['text']).toarray(), index=index)

# Label the count columns with their tokens, ordered by column position.
column_nums = tuple(sorted(counter.vocabulary_.values()))
terms = tuple(sorted(counter.vocabulary_, key=counter.vocabulary_.get))
bow_docs.columns = terms

# Sanity check: show the first message and the first few tokens counted in it.
print(sms.loc['sms0', 'text'])
print(bow_docs.loc['sms0'].loc[lambda counts: counts > 0].head())

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
,            1
..           1
...          2
amore        1
available    1
Name: sms0, dtype: int64


In [11]:
# Fit LDiA on the bag-of-words counts to obtain 16 "topics" (dimensions).
# The seed is pinned on the estimator itself (same value, 42, as the global
# np.random.seed call earlier) so that re-running this cell on its own yields
# the same topics instead of depending on the state of NumPy's global generator.
ldia = LDiA(n_components=16, learning_method='batch', random_state=42)
ldia = ldia.fit(bow_docs)
# One row per topic, one column per vocabulary term.
print(ldia.components_.shape)

(16, 9232)


In [12]:
# Term-by-topic view of the LDiA components: rows are vocabulary terms,
# columns are topics.
components = pd.DataFrame(ldia.components_, index=columns, columns=terms).T
print(components.head(3).round(2))

   topic0  topic1  topic2  topic3  topic4  topic5  topic6  topic7  topic8  \
!  184.03   15.00   72.22  394.95   45.48   36.14    9.55   44.81    0.43   
"    0.68    4.22    2.41    0.06  152.35    0.06    0.06    0.06    0.45   
#    0.06    0.06    0.06    0.06    0.06    2.07    0.06    0.06    0.06   

   topic9  topic10  topic11  topic12  topic13  topic14  topic15  
!   90.23    37.42    44.18    64.40   297.29    41.16    11.70  
"    0.68     8.42    11.42     0.07    62.72    12.27     0.06  
#    0.06     0.06     0.06     1.07     4.05     0.06     0.06  


In [13]:
# Ten terms with the largest weight in topic3.
print(components['topic3'].sort_values(ascending=False).head(10))

!       394.952246
.       218.049724
to      119.533134
u       118.857546
call    111.948541
£       107.358914
,        96.954384
*        90.314783
your     90.215961
is       75.750037
Name: topic3, dtype: float64


In [14]:
# Project every SMS onto the 16 LDiA topics; one row per message, one column per topic.
ldia16_topic_vectors = pd.DataFrame(ldia.transform(bow_docs), index=index, columns=columns)
print(ldia16_topic_vectors.head(3).round(2))

       topic0  topic1  topic2  topic3  topic4  topic5  topic6  topic7  topic8  \
sms0     0.00    0.62    0.00    0.00    0.00    0.00    0.00    0.00    0.34   
sms1     0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.78   
sms2!    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00   

       topic9  topic10  topic11  topic12  topic13  topic14  topic15  
sms0     0.00     0.00     0.00     0.00     0.00     0.00     0.00  
sms1     0.01     0.01     0.12     0.01     0.01     0.01     0.01  
sms2!    0.98     0.00     0.00     0.00     0.00     0.00     0.00  


In [17]:
# Train an LDA (linear discriminant analysis) spam classifier on the 16 LDiA
# topic vectors, holding out half of the messages for testing.
# LDA is imported with the other dependencies at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    ldia16_topic_vectors, sms.spam, test_size=0.5, random_state=271828)
lda = LDA(n_components=1)
lda = lda.fit(X_train, y_train)
# Keep the prediction for every message (training and test rows alike) next to its label.
sms['ldia16_spam'] = lda.predict(ldia16_topic_vectors)
# Accuracy on the held-out half only.
print(round(float(lda.score(X_test, y_test)), 2))

0.94


In [18]:
# Baseline for comparison with the LDiA-based classifier (0.94 test accuracy):
# fit LDA directly on the centred TF-IDF vectors, using the same split settings.
# Both training and test accuracy are printed so any gap between them is visible.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_docs, sms['spam'].values, test_size=0.5, random_state=271828)
lda = LDA(n_components=1).fit(X_train, y_train)
train_accuracy = float(lda.score(X_train, y_train))
print(round(train_accuracy, 3))
test_accuracy = float(lda.score(X_test, y_test))
print(round(test_accuracy, 3))

1.0
0.748
