In [17]:
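# This notebook uses a training data set available via the book's GitHub repository to train an LDA (linear discriminant analysis) classifier that detects whether an SMS message is spam.
# Note: LDA (linear discriminant analysis) is not the same as LDiA (latent Dirichlet allocation), which is imported below under the alias `LDiA`.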
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize.casual import casual_tokenize
from sklearn.decomposition import PCA, TruncatedSVD, LatentDirichletAllocation as LDiA



# Load the labelled SMS messages into a DataFrame; the first CSV column is used as the row index.
sms = pd.read_csv('sms-spam.csv', index_col=0)

# Cast the label to int so it can serve as the repeat count for '!' in the row labels below.
sms['spam'] = sms.spam.astype(int)
# Row labels are 'sms<position>' with one '!' appended for spam rows (e.g. 'sms0', 'sms2!').
index = ['sms{}{}'.format(i, '!'*j) for (i,j) in zip(range(len(sms)), sms.spam)]
# Relabel the rows directly. Rebuilding the frame from `sms.values` would upcast every
# column to object dtype and require casting `spam` back to int a second time.
sms.index = index
print(sms.head())

       spam                                               text
sms0      0  Go until jurong point, crazy.. Available only ...
sms1      0                      Ok lar... Joking wif u oni...
sms2!     1  Free entry in 2 a wkly comp to win FA Cup fina...
sms3      0  U dun say so early hor... U c already then say...
sms4      0  Nah I don't think he goes to usf, he lives aro...


In [2]:
# Turn every SMS text into a dense TF-IDF row vector, tokenizing with nltk's casual tokenizer.
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(sms['text']).toarray()
# Number of distinct tokens the vectorizer learned.
len(tfidf.vocabulary_)

9232

In [3]:
# Wrap the TF-IDF matrix in a DataFrame and subtract each column's mean from that column.
tfidf_docs = pd.DataFrame(tfidf_docs)
tfidf_docs = tfidf_docs.sub(tfidf_docs.mean(), axis='columns')
print(tfidf_docs.shape)

(4837, 9232)


In [4]:
# try PCA for dimension reduction first
# Project the centered TF-IDF vectors onto 16 principal components ("topics").
# random_state is pinned so the projection is reproducible on re-run.
# NOTE(review): this assumes scikit-learn selects a randomized SVD solver for a matrix
# of this size -- confirm against the installed version's `svd_solver='auto'` rules.
pca = PCA(n_components=16, random_state=42)
pca = pca.fit(tfidf_docs)
pca_topic_vectors = pca.transform(tfidf_docs)
# One column label per component: 'topic0' ... 'topic15'.
columns = ['topic{}'.format(i) for i in range(pca.n_components)]
pca_topic_vectors = pd.DataFrame(pca_topic_vectors, columns=columns, index=index)
pca_topic_vectors.round(3).head(6)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
sms0,0.201,0.003,0.037,0.011,-0.019,-0.053,0.039,-0.065,0.014,-0.083,0.009,-0.008,-0.027,-0.014,-0.006,0.036
sms1,0.404,-0.094,-0.078,0.051,0.1,0.047,0.023,0.066,0.021,-0.023,-0.008,0.052,-0.031,-0.018,0.035,-0.009
sms2!,-0.03,-0.048,0.09,-0.067,0.09,-0.044,-0.0,-0.003,-0.054,0.051,0.129,0.01,-0.042,0.021,-0.003,0.061
sms3,0.329,-0.033,-0.035,-0.016,0.052,0.056,-0.166,-0.072,0.061,-0.107,0.019,0.043,-0.072,-0.016,0.008,-0.074
sms4,0.002,0.031,0.038,0.034,-0.075,-0.093,-0.044,0.062,-0.046,0.03,0.027,-0.003,-0.008,0.061,-0.081,0.016
sms5!,-0.016,0.059,0.014,-0.006,0.122,-0.04,0.005,0.166,-0.024,0.062,0.041,0.05,0.082,0.036,-0.002,-0.047


In [6]:
# Pair each vocabulary term with its column number, sort by column number, and split the
# sorted pairs back into two parallel tuples so `terms` lines up with the matrix columns.
column_nums, terms = zip(*sorted((col, term) for term, col in tfidf.vocabulary_.items()))


In [7]:
# Term weights of every PCA component: one row per topic, one column per vocabulary term.
# Row labels are derived from the fitted model's component count rather than a hard-coded
# 16, so this cell keeps working if `n_components` is changed where `pca` is created.
topic_labels = ['topic{}'.format(i) for i in range(pca.n_components)]
weights = pd.DataFrame(pca.components_, columns=terms, index=topic_labels)
print(weights.head(4).round(3))

            !      "      #   #150  #5000      $      %      &      '      (  \
topic0 -0.071  0.008 -0.001 -0.000 -0.001  0.003 -0.000 -0.012 -0.007 -0.005   
topic1  0.064  0.008  0.000 -0.000 -0.001 -0.001 -0.002 -0.016 -0.016  0.001   
topic2  0.071  0.027  0.000  0.001  0.002  0.000  0.001  0.059  0.008  0.019   
topic3 -0.059 -0.032 -0.001 -0.000 -0.001  0.001 -0.003 -0.028  0.001 -0.010   

        ...   ü'll      –    —      ‘      ’      “      …      ┾    〨ud  \
topic0  ...  0.003 -0.000 -0.0 -0.004 -0.001 -0.001 -0.002  0.001  0.001   
topic1  ...  0.002  0.001 -0.0  0.004 -0.001 -0.001  0.003  0.001  0.001   
topic2  ...  0.000  0.001 -0.0  0.002  0.000  0.001  0.002 -0.001 -0.001   
topic3  ... -0.001 -0.001  0.0  0.000 -0.000 -0.000  0.001  0.001  0.001   

            鈥  
topic0  0.001  
topic1  0.001  
topic2 -0.001  
topic3  0.001  

[4 rows x 9232 columns]


In [8]:
# Look at the component weights of a handful of tokens one might expect in "deal"-style spam.
deal_terms = '! ;) :) half off free crazy deal only $ 80 %'.split()
# Round to three decimals, then scale by 100 to make the table easier to read.
deals = weights[deal_terms].round(3) * 100
print(deals)

            !   ;)    :)  half  off  free  crazy  deal  only    $   80    %
topic0   -7.1  0.1  -0.5  -0.0 -0.4  -2.0   -0.0  -0.1  -2.2  0.3 -0.0 -0.0
topic1    6.4  0.0   7.4   0.1  0.4  -2.3   -0.2  -0.1  -3.8 -0.1 -0.0 -0.2
topic2    7.1  0.2  -0.1   0.0  0.3   4.4    0.1  -0.1   0.7  0.0  0.0  0.1
topic3   -5.9 -0.3  -7.1   0.2  0.3  -0.3    0.0   0.1  -2.3  0.1 -0.1 -0.3
topic4   38.0 -0.1 -12.4  -0.1 -0.2   9.9    0.1  -0.2   3.1  0.3  0.1 -0.1
topic5  -26.6  0.1  -1.6  -0.3 -0.7  -1.4   -0.6  -0.2  -1.8 -0.9  0.0 -0.0
topic6  -10.9 -0.5  19.9  -0.4 -0.9  -0.5   -0.2  -0.1  -1.4 -0.0 -0.0 -0.1
topic7   16.0  0.1 -17.4   0.7  0.8  -2.8    0.0   0.0  -1.7 -0.3  0.0 -0.1
topic8   35.1  0.2   5.4  -0.4 -0.6  -0.0   -0.4  -0.4   3.1 -0.6 -0.0 -0.2
topic9    6.1 -0.3  16.6   1.4 -0.9   6.6   -0.5  -0.4   3.2 -0.5 -0.0 -0.0
topic10 -30.4 -0.2 -10.2   0.1  0.1  12.1    0.1   0.0  -0.0 -0.0 -0.0 -0.1
topic11  16.3  0.4  42.2   0.4  1.5  -3.7    0.1   0.0   1.7 -0.4 -0.0 -0.4
topic12  28.

In [9]:
# Total weight of the selected "deal" tokens for each topic (one value per topic).
deal_totals = deals.T.sum(axis=0)
print(deal_totals)

topic0    -11.9
topic1      7.6
topic2     12.7
topic3    -15.6
topic4     38.4
topic5    -34.0
topic6      4.9
topic7     -4.7
topic8     41.2
topic9     31.3
topic10   -28.5
topic11    58.1
topic12    25.5
topic13    29.3
topic14    23.4
topic15   -20.2
dtype: float64


In [11]:
# try out truncated svd instead
# Reduce the (already mean-centered) TF-IDF matrix to 16 dimensions with TruncatedSVD.
# random_state is pinned so the randomized solver produces the same topic vectors on
# every run; without it the result depends on the state of the global NumPy RNG.
svd = TruncatedSVD(n_components=16, n_iter=100, random_state=42)
svd_topic_vectors = svd.fit_transform(tfidf_docs.values)
# Reuse the topic column labels and the SMS row labels built earlier in the notebook.
svd_topic_vectors = pd.DataFrame(svd_topic_vectors, columns=columns, index=index)
svd_topic_vectors.round(3).head(6)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
sms0,0.201,0.003,0.037,0.011,-0.019,-0.053,0.039,-0.066,0.012,-0.083,0.007,-0.007,0.002,-0.036,-0.014,0.037
sms1,0.404,-0.094,-0.078,0.051,0.1,0.047,0.023,0.065,0.023,-0.024,-0.004,0.036,0.043,-0.021,0.051,-0.042
sms2!,-0.03,-0.048,0.09,-0.067,0.091,-0.043,-0.0,-0.001,-0.057,0.051,0.125,0.023,0.026,-0.02,-0.042,0.052
sms3,0.329,-0.033,-0.035,-0.016,0.052,0.056,-0.166,-0.074,0.063,-0.108,0.022,0.023,0.073,-0.046,0.022,-0.07
sms4,0.002,0.031,0.038,0.034,-0.075,-0.093,-0.044,0.061,-0.045,0.029,0.028,-0.009,0.027,0.034,-0.083,-0.021
sms5!,-0.016,0.059,0.014,-0.006,0.122,-0.04,0.005,0.167,-0.023,0.064,0.041,0.055,-0.037,0.075,-0.001,0.02


In [16]:
# use LDiA now
# LDiA is fitted on raw token counts, so build a bag-of-words (BOW) matrix for the messages first.
np.random.seed(42)

counter = CountVectorizer(tokenizer=casual_tokenize)
bow_counts = counter.fit_transform(sms['text']).toarray()
bow_docs = pd.DataFrame(bow_counts, index=index)
# Order the vocabulary by column number so the term labels line up with the count columns.
column_nums, terms = zip(*sorted((col, term) for term, col in counter.vocabulary_.items()))
bow_docs.columns = terms

# Sanity check: show the first message and the first few tokens it actually contains.
sms0_counts = bow_docs.loc['sms0']
print(sms.loc['sms0'].text)
print(sms0_counts[sms0_counts > 0].head())

       !  "  #  #150  #5000  $  %  &  '  (  ...  ü'll  –  —  ‘  ’  “  …  ┾  \
sms0   0  0  0     0      0  0  0  0  0  0  ...     0  0  0  0  0  0  0  0   
sms1   0  0  0     0      0  0  0  0  0  0  ...     0  0  0  0  0  0  0  0   
sms2!  0  0  0     0      0  0  0  1  1  1  ...     0  0  0  0  0  0  0  0   
sms3   0  0  0     0      0  0  0  0  0  0  ...     0  0  0  0  0  0  0  0   
sms4   0  0  0     0      0  0  0  0  0  0  ...     0  0  0  0  0  0  0  0   

       〨ud  鈥  
sms0     0  0  
sms1     0  0  
sms2!    0  0  
sms3     0  0  
sms4     0  0  

[5 rows x 9232 columns]
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
,            1
..           1
...          2
amore        1
available    1
Name: sms0, dtype: int64


In [19]:
# Fit a 16-topic LDiA model on the bag-of-words counts.
# random_state is set on the estimator itself so that re-running this cell on its own
# reproduces the same topics, instead of depending on a global `np.random.seed` call
# made in a different cell.
ldia = LDiA(n_components=16, learning_method='batch', random_state=42)
# now we fit that model to our data, which will give us 16 "topics" or dimensions
ldia = ldia.fit(bow_docs)
# One row of term weights per topic.
print(ldia.components_.shape)

(16, 9232)


In [20]:
# Transpose the topic-by-term weight matrix so each row is a term and each column a topic.
topic_term_weights = ldia.components_
components = pd.DataFrame(topic_term_weights.T, index=terms, columns=columns)
print(components.head(3).round(2))

   topic0  topic1  topic2  topic3  topic4  topic5  topic6  topic7  topic8  \
!   20.03   20.44    0.85   52.68    0.06  204.72   36.35  115.52   11.72   
"    0.06    9.00    0.06    0.06    0.12    0.06   94.83    2.48    0.06   
#    0.06    0.06    0.06    0.06    0.06    0.06    0.06    0.06    2.06   

   topic9  topic10  topic11  topic12  topic13  topic14  topic15  
!   31.97   134.04    38.21   100.30   135.82   481.08     5.19  
"   21.29     3.80     0.06     7.41     0.06    92.34    24.30  
#    0.06     0.06     0.06     0.06     0.06     5.06     0.06  


In [23]:
# Ten terms with the largest weight in topic3, highest first.
print(components['topic3'].sort_values(ascending=False).head(10))

to      174.492758
the     132.447021
and     103.139835
.        95.918658
your     64.338532
from     62.993171
for      59.914714
of       59.226303
free     59.122778
call     55.558074
Name: topic3, dtype: float64


In [None]:
# Map every message's bag-of-words counts to its LDiA topic vector and label the result
# with the SMS row labels and the topic column names.
ldia16_topic_vectors = pd.DataFrame(
    ldia.transform(bow_docs),
    index=index,
    columns=columns,
)