# basic LSA/LDA clustering with k-means

See the accompanying slides:

https://docs.google.com/presentation/d/1yyCyEBH3m8V8IrrXeDkAidKv18chxkXyVY179OaFXTc/edit?usp=sharing

In [52]:
import codecs
from gensim_wrapper import LdaTransformer, LsiTransformer

In [53]:
# read in files
# Each corpus is opened in a context manager so the handle is closed as soon
# as its lines are in memory, instead of staying open for the kernel's lifetime.
# Every line is stripped of surrounding whitespace (including the newline).
# presumably line i of brown_topics.txt is the label of line i of
# brown_sents.txt -- TODO confirm against the dataset
with codecs.open('datasets/brown_sents.txt', 'rb', encoding='utf8') as f_sents:
    sents = [sent.strip() for sent in f_sents.readlines()]
with codecs.open('datasets/brown_topics.txt', 'rb', encoding='utf8') as f_classes:
    labels = [label.strip() for label in f_classes.readlines()]

In [54]:
# whitespace-tokenize every sentence for the LDA/LSA step:
# sentlist[i] is the list of tokens of sents[i]
sentlist = [sentence.split() for sentence in sents]

In [55]:
# turn string labels into integer indices
from sklearn.preprocessing import LabelEncoder
# the encoder object is kept in its own variable so it stays available after fitting
encoder = LabelEncoder()
# fit_transform learns the distinct values in `labels` and returns one integer index per entry
indexed_labels = encoder.fit_transform(labels)

In [56]:
# get our LSI vectors
# NOTE(review): despite the `lda*` variable names, this cell instantiates
# LsiTransformer (LSI/LSA), not LdaTransformer. The names are left unchanged
# because later cells refer to `lda_model` and `lda_vectors`.
lda = LsiTransformer()
# fit() comes from the project-local gensim_wrapper; presumably it returns the
# fitted transformer itself (sklearn convention) -- TODO confirm
lda_model = lda.fit(sentlist)
# transform the same tokenized sentences the model was fitted on
lda_vectors = lda.transform(sentlist)

In [57]:
# cool stuff for seeing the most informative words in each topic
# `.model` is an attribute of the wrapper; presumably the underlying gensim
# model -- TODO confirm in gensim_wrapper
mod = lda_model.model
# show 6 topics, each rendered as a sum of weight*"word" terms
mod.print_topics(6)

[(0,
  '0.187*"i" + 0.185*"he" + 0.157*"you" + 0.150*"she" + 0.141*"her" + 0.132*"had" + 0.129*"was" + 0.127*"his" + 0.123*"it" + 0.119*"that"'),
 (1,
  '0.297*"she" + 0.259*"her" + 0.246*"i" + 0.185*"you" + -0.148*"####" + -0.137*"##" + 0.118*"had" + -0.118*"is" + -0.113*"state" + -0.113*"#"'),
 (2,
  '-0.418*"you" + 0.377*"her" + 0.373*"she" + -0.200*"we" + -0.172*"i" + -0.133*"god" + 0.124*"had" + -0.120*"your" + -0.118*"do" + 0.111*"##"'),
 (3,
  '-0.428*"he" + 0.363*"she" + -0.317*"his" + 0.266*"her" + 0.182*"you" + -0.164*"him" + 0.151*"i" + -0.111*"had" + 0.090*"are" + -0.088*"mr"'),
 (4,
  '-0.341*"you" + -0.260*"##" + 0.257*"we" + -0.206*"####" + -0.165*"i" + 0.133*"god" + 0.124*"her" + -0.119*"year" + 0.119*"she" + 0.119*"our"'),
 (5,
  '-0.550*"i" + 0.394*"you" + -0.297*"my" + 0.190*"he" + 0.186*"she" + 0.146*"your" + -0.137*"me" + 0.112*"him" + 0.095*"her" + 0.084*"tax"')]

In [79]:
# cluster data (k-means is unsupervised: this is clustering, not classification)
from sklearn.cluster import KMeans

# make the clusterer - change n_clusters to the correct number of clusters.
# random_state pins the centroid initialisation so the cluster assignments are
# reproducible across kernel restarts; without it every run can differ.
kmeans = KMeans(n_clusters=6, random_state=0)
# "train" the clusterer on our LSI output.
# KMeans ignores any `y` argument, so the integer labels are not passed here;
# passing them only suggested that the fit was supervised.
# fit_transform fits the model and returns, for every sample, its distance to
# each of the cluster centres (one column per cluster).
kmeans.fit_transform(lda_vectors)

array([[ 0.38770238,  0.38740793,  0.393154  ,  0.38222051,  0.43140659,
         0.33605692],
       [ 0.33033741,  0.3485736 ,  0.34906188,  0.32162806,  0.37720841,
         0.28489727],
       [ 0.36888334,  0.37988293,  0.39054674,  0.36213094,  0.41698736,
         0.33133754],
       ..., 
       [ 0.29822472,  0.33774039,  0.32552159,  0.28063631,  0.34698865,
         0.29263365],
       [ 0.34573966,  0.36404228,  0.36251995,  0.32580787,  0.38437554,
         0.32297012],
       [ 0.4637244 ,  0.49308723,  0.365383  ,  0.43671274,  0.46379215,
         0.46721956]], dtype=float32)

In [80]:
# get cluster indices from our data
# NOTE(review): accuracy_score is imported here but is not used in any of the
# visible cells
from sklearn.metrics import accuracy_score
preds = kmeans.predict(lda_vectors)   # index of the closest fitted cluster centre for each vector

In [81]:
# get the cluster centers
# displayed as the cell output: one row per cluster, with one coordinate per
# dimension of the vectors k-means was fitted on
kmeans.cluster_centers_

array([[  1.00013241e-01,  -4.93013337e-02,   3.21811140e-02,
         -2.37658694e-02,  -3.02435886e-02,  -1.30312406e-02,
          4.44982201e-02,  -7.34035508e-04,   1.42463474e-02,
          2.73123966e-03,   6.97068730e-03,  -4.78965417e-03,
         -1.86554692e-03,  -5.82366623e-03,  -1.46581689e-02,
          2.31503160e-03,   8.13646708e-04,   9.09469649e-03,
          1.15037151e-02,   1.63518416e-04,   3.37289274e-03,
         -2.93183886e-03,   1.82498095e-03,   1.33361369e-02,
          5.55813126e-03,   5.81832882e-03,  -2.39139819e-03,
          1.81832048e-03,  -4.61177016e-03,   4.79786796e-03,
         -6.74909214e-03,  -4.84608579e-03,  -1.35461311e-03,
          2.81274132e-03,  -1.86032522e-03,  -8.42650130e-04,
         -1.48801046e-04,  -6.66506123e-04,   1.74419838e-04,
         -4.33627138e-04,  -4.65157733e-04,  -1.28406181e-03,
          4.51450422e-03,   8.08688288e-04,  -3.29837482e-03,
         -1.06099434e-03,   3.48760327e-03,   4.62140073e-04,
        

In [84]:
# reduce data to 3 principal components for graphing purposes
# NOTE(review): n_components=3 yields 3-D points; set n_components=2 instead if
# a flat 2-D scatter plot is what is wanted
from sklearn.decomposition import PCA
reduced_lda = PCA(n_components=3).fit_transform(lda_vectors)

In [82]:
# TODO: plot the reduced_lda data against the predicted clusters and the
# cluster centers -- not yet clear how easy this will be