In [1]:
#Text clustering with K-means
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

In [2]:
#Create document list
# Small toy corpus: eight short texts, most mentioning google and two about cats.
documents = [
    "google plus",
    "google mail",
    "Google Translate app",
    "If you open 100 tab in google you get a smiley face.",
    "Best cat photo",
    "Climbing ninja cat.",
    "Impressed with google map feedback.",
    "Key promoter extension for Google Chrome.",
]

In [3]:
#Transform the data
# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')
# Learn the vocabulary from `documents` and encode them in a single step.
# `vectorizer` is reused later to encode new text, and X is what the model is fitted on.
X = vectorizer.fit_transform(documents)
# Printing X lists only its non-zero entries, as "(document index, term index)  weight".
print(X)

  (0, 17)	0.8945306593491452
  (0, 9)	0.4470065989271116
  (1, 12)	0.8945306593491452
  (1, 9)	0.4470065989271116
  (2, 1)	0.666709508226001
  (2, 21)	0.666709508226001
  (2, 9)	0.33316191751472424
  (3, 7)	0.4364477996888222
  (3, 19)	0.4364477996888222
  (3, 20)	0.4364477996888222
  (3, 0)	0.4364477996888222
  (3, 15)	0.4364477996888222
  (3, 9)	0.21809766329314145
  (4, 16)	0.6083131546128155
  (4, 3)	0.5098138992318764
  (4, 2)	0.6083131546128155
  (5, 14)	0.6083131546128155
  (5, 5)	0.6083131546128155
  (5, 3)	0.5098138992318764
  (6, 8)	0.5547248737231761
  (6, 13)	0.5547248737231761
  (6, 10)	0.5547248737231761
  (6, 9)	0.2772019902858183
  (7, 4)	0.4850877520486855
  (7, 6)	0.4850877520486855
  (7, 18)	0.4850877520486855
  (7, 11)	0.4850877520486855
  (7, 9)	0.24240357103271387


In [4]:
#Build the clusters
# Number of clusters to look for in the corpus.
true_k = 3
# k-means++ seeding is randomized and n_init=1 means only a single initialization
# is tried, so without a fixed random_state every Restart & Run All could produce
# a different clustering (and different cluster numbers) than the saved outputs.
# Pinning random_state makes the notebook reproducible.
model = KMeans(
    n_clusters=true_k,
    init='k-means++',
    max_iter=100,
    n_init=1,
    random_state=42,
)
model.fit(X)


In [6]:
#Profile the clusters
print("Top terms per cluster:")
# argsort sorts ascending; reversing each row puts the indices of the
# highest-weighted centroid features first.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# Maps a feature (column) index back to the term it represents.
terms = vectorizer.get_feature_names_out()
for i in range(true_k):
    # The original used the Python 2 trailing-comma form `print(...),`, which in
    # Python 3 just builds a throwaway tuple and still emits a newline. Use
    # end='' so each cluster label and its terms share one line.
    print("Cluster %d:" % i, end='')
    # Show the ten highest-weighted terms for this cluster.
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    # A bare `print` is a no-op expression in Python 3; call it to end the line.
    print()

Top terms per cluster:
Cluster 0:
 google
 plus
 mail
 translate
 app
 map
 feedback
 impressed
 key
 promoter
Cluster 1:
 cat
 best
 photo
 ninja
 climbing
 translate
 google
 app
 chrome
 extension
Cluster 2:
 100
 smiley
 open
 face
 tab
 google
 feedback
 app
 best
 cat


In [7]:
#Use model for prediction
print("Prediction")
 
# Encode the new text with the already-fitted vectorizer (transform only, no
# re-fit) so its columns match the features the model was trained on.
Y = vectorizer.transform(["google to open."])
# predict returns one cluster index per input row. The index is just the label
# assigned during fitting, not a fixed category id.
prediction = model.predict(Y)
print(prediction)
 

Prediction
[0]


In [8]:
# Classify an unseen sentence; of its words, only "cat" occurs in the training documents.
Y = vectorizer.transform(["My cat is hungry."])
# One cluster index per input row; the numbering is whatever the fitted model assigned.
prediction = model.predict(Y)
print(prediction)

[1]


In [9]:
# Classify a short query made of two words that both occur in the training documents.
Y = vectorizer.transform(["google feedback"])
# One cluster index per input row; the numbering is whatever the fitted model assigned.
prediction = model.predict(Y)
print(prediction)

[0]
