In [10]:
documents = ["This little kitty came to play when I was eating at a restaurant.",
             "Merley has the best squooshy kitten belly.",
             "Google Translate app is incredible.",
             "If you open 100 tab in google you get a smiley face.",
             "Best cat photo I've ever taken.",
             "Climbing ninja cat.",
             "Impressed with google map feedback.",
             "Key promoter extension for Google Chrome."]

## K-Means Clustering

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

In [3]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

In [4]:
true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=2, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [5]:
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print

Top terms per cluster:
Cluster 0:
 google
 feedback
 map
 app
 impressed
 incredible
 translate
 key
 extension
 chrome
Cluster 1:
 cat
 best
 climbing
 ninja
 ve
 photo
 taken
 belly
 merley
 kitten


In [6]:
print("Prediction")
Y = vectorizer.transform(["chrome browser to open."])
prediction = model.predict(Y)
print(prediction)

Prediction
[0]


In [7]:
print("Prediction")
Y = vectorizer.transform(["My cat is hungry."])
prediction = model.predict(Y)
print(prediction)

Prediction
[1]


## Latent Dirichlet Allocation (LDA)

In [17]:
import gensim
import re
from nltk.tokenize import word_tokenize

plainDoc = []
lcDoc = []
tokens = []

for doc in documents:
    doc = re.sub("[^\w\s]", "", doc)
    plainDoc.append(doc)

for doc in plainDoc:
    doc = doc.lower()
    lcDoc.append(doc)
    
for doc in lcDoc:
    words = word_tokenize(doc)
    tokens.append(words)
    
print (tokens)
print()

[['this', 'little', 'kitty', 'came', 'to', 'play', 'when', 'i', 'was', 'eating', 'at', 'a', 'restaurant'], ['merley', 'has', 'the', 'best', 'squooshy', 'kitten', 'belly'], ['google', 'translate', 'app', 'is', 'incredible'], ['if', 'you', 'open', '100', 'tab', 'in', 'google', 'you', 'get', 'a', 'smiley', 'face'], ['best', 'cat', 'photo', 'ive', 'ever', 'taken'], ['climbing', 'ninja', 'cat'], ['impressed', 'with', 'google', 'map', 'feedback'], ['key', 'promoter', 'extension', 'for', 'google', 'chrome']]



In [37]:
id2word = gensim.corpora.Dictionary(tokens)
count = 0
for k, v in id2word.iteritems():
    print(k, v)
    count += 1


0 a
1 at
2 came
3 eating
4 i
5 kitty
6 little
7 play
8 restaurant
9 this
10 to
11 was
12 when
13 belly
14 best
15 has
16 kitten
17 merley
18 squooshy
19 the
20 app
21 google
22 incredible
23 is
24 translate
25 100
26 face
27 get
28 if
29 in
30 open
31 smiley
32 tab
33 you
34 cat
35 ever
36 ive
37 photo
38 taken
39 climbing
40 ninja
41 feedback
42 impressed
43 map
44 with
45 chrome
46 extension
47 for
48 key
49 promoter


In [38]:
bow_corpus = [id2word.doc2bow(doc) for doc in tokens]
for i in range(len(bow_corpus[0])):
    print("Word {} is \"{}\"".format(bow_corpus[0][i][0], 
                                id2word[bow_corpus[0][i][0]]))


Word 0 is "a"
Word 1 is "at"
Word 2 is "came"
Word 3 is "eating"
Word 4 is "i"
Word 5 is "kitty"
Word 6 is "little"
Word 7 is "play"
Word 8 is "restaurant"
Word 9 is "this"
Word 10 is "to"
Word 11 is "was"
Word 12 is "when"


In [22]:
tfidf = gensim.models.TfidfModel(dictionary=id2word, normalize=True)
tfidf_vectors = [tfidf[id2word.doc2bow(doc)] for doc in tokens]
print (tfidf_vectors)

[[(0, 0.1889822365046136), (1, 0.2834733547569204), (2, 0.2834733547569204), (3, 0.2834733547569204), (4, 0.2834733547569204), (5, 0.2834733547569204), (6, 0.2834733547569204), (7, 0.2834733547569204), (8, 0.2834733547569204), (9, 0.2834733547569204), (10, 0.2834733547569204), (11, 0.2834733547569204), (12, 0.2834733547569204)], [(13, 0.39391929857916763), (14, 0.2626128657194451), (15, 0.39391929857916763), (16, 0.39391929857916763), (17, 0.39391929857916763), (18, 0.39391929857916763), (19, 0.39391929857916763)], [(20, 0.4931969619160719), (21, 0.1643989873053573), (22, 0.4931969619160719), (23, 0.4931969619160719), (24, 0.4931969619160719)], [(0, 0.18814417367671946), (21, 0.09407208683835973), (25, 0.2822162605150792), (26, 0.2822162605150792), (27, 0.2822162605150792), (28, 0.2822162605150792), (29, 0.2822162605150792), (30, 0.2822162605150792), (31, 0.2822162605150792), (32, 0.2822162605150792), (33, 0.5644325210301584)], [(14, 0.30151134457776363), (34, 0.30151134457776363), (35

In [35]:
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                       num_topics=2, 
                                       id2word=id2word, 
                                       passes=2, 
                                       workers=2)

In [36]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.047*"you" + 0.047*"a" + 0.032*"google" + 0.028*"when" + 0.028*"in" + 0.028*"100" + 0.028*"tab" + 0.028*"if" + 0.028*"smiley" + 0.028*"this"
Topic: 1 
Words: 0.061*"google" + 0.041*"best" + 0.040*"cat" + 0.027*"chrome" + 0.027*"the" + 0.027*"squooshy" + 0.027*"promoter" + 0.027*"key" + 0.027*"merley" + 0.027*"kitten"
