In [5]:
#https://datascience.stackexchange.com/questions/23969/sentence-similarity-prediction
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import numpy

# Small example corpus to cluster by topic.
texts = ["This first text talks about houses and dogs",
        "This is about airplanes and airlines",
        "This is about dogs and houses too, but also about trees",
        "Trees and dogs are main characters in this story",
        "This story is about batman and superman fighting each other", 
        "Nothing better than another story talking about airplanes, airlines and birds",
        "Superman defeats batman in the last round"]

# Turn every text into a TF-IDF vector, ignoring English stop words.
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(texts)
print(X)

# Vocabulary terms: the axes of the vector space (column j of X <-> words[j]).
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old method on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    # The new method returns an array; keep `words` a plain list as before.
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()
print("words", words)

n_clusters = 3
number_of_seeds_to_try = 10  # KMeans restarts; the run with the lowest inertia is kept
max_iter = 300
# Kept for any later cell that reads it, but no longer passed to KMeans:
# the `n_jobs` argument was deprecated in scikit-learn 0.23 and removed in 1.0.
number_of_process = 2
# Fixed seed so cluster numbering is reproducible across Restart & Run All.
random_seed = 0

model = KMeans(
    n_clusters=n_clusters,
    max_iter=max_iter,
    n_init=number_of_seeds_to_try,
    random_state=random_seed,
).fit(X)
labels = model.labels_

# For each cluster, vocabulary indices sorted by descending centroid weight,
# i.e. the most characteristic words of the cluster come first.
ordered_words = model.cluster_centers_.argsort()[:, ::-1]
ordered_words

  (0, 16)	0.5596754170004217
  (0, 15)	0.5596754170004217
  (0, 9)	0.4645786606417216
  (0, 7)	0.3971064382343257
  (1, 1)	0.7071067811865475
  (1, 0)	0.7071067811865475
  (2, 9)	0.60515811332262
  (2, 7)	0.5172690941469574
  (2, 17)	0.60515811332262
  (3, 7)	0.3690711741302499
  (3, 17)	0.43177993416898963
  (3, 10)	0.5201629673964019
  (3, 5)	0.5201629673964019
  (3, 12)	0.3690711741302499
  (4, 12)	0.41798437105722464
  (4, 2)	0.4890039560636677
  (4, 13)	0.4890039560636677
  (4, 8)	0.5891004391952713
  (5, 1)	0.3757037997205483
  (5, 0)	0.3757037997205483
  (5, 12)	0.32113915333960336
  (5, 3)	0.4526083494381602
  (5, 14)	0.4526083494381602
  (5, 4)	0.4526083494381602
  (6, 2)	0.4516351457444982
  (6, 13)	0.4516351457444982
  (6, 6)	0.544082434129559
  (6, 11)	0.544082434129559
words ['airlines', 'airplanes', 'batman', 'better', 'birds', 'characters', 'defeats', 'dogs', 'fighting', 'houses', 'main', 'round', 'story', 'superman', 'talking', 'talks', 'text', 'trees']
l [0 1 0 0 2 1 2

array([[ 7,  9, 17, 15, 16,  5, 10, 12,  1,  2,  3,  4,  8,  6, 11, 13,
        14,  0],
       [ 0,  1, 14,  3,  4, 12,  7,  2,  5,  6, 17, 16,  9, 10, 11, 13,
        15,  8],
       [ 2, 13,  8,  6, 11, 12,  1,  3,  4,  5, 17,  7, 16,  9, 10, 14,
        15,  0]])

In [7]:
def print_cluster_summary(cluster_id, n_texts, top_n=10):
    """Print one cluster's id, its text count and its highest-weighted terms.

    Args:
        cluster_id: Row index into ``ordered_words`` identifying the cluster.
        n_texts: Number of texts counted for this cluster (printed as an int).
        top_n: How many of the cluster's top-ranked terms to list.
    """
    print("Cluster:", cluster_id, "texts:", int(n_texts))
    for term in ordered_words[cluster_id, :top_n]:
        print("\t" + words[term])


print("centers:", model.cluster_centers_)
print("labels", labels)
print("inertia:", model.inertia_)

# Number of training texts assigned to each cluster; minlength guarantees one
# slot per cluster even if some cluster received no text.
texts_per_cluster = numpy.bincount(labels, minlength=n_clusters)

print("Top words per cluster:")
for i_cluster in range(n_clusters):
    print_cluster_summary(i_cluster, texts_per_cluster[i_cluster])

print("\n")
print("Prediction")

text_to_predict = "Why batman was defeated  by superman so easy?"
# Reuse the fitted vectorizer so the new text lands in the same feature space.
Y = vectorizer.transform([text_to_predict])
predicted_cluster = model.predict(Y)[0]
# Count the new text as a member of the cluster it was assigned to.
texts_per_cluster[predicted_cluster] += 1

print(text_to_predict)
print_cluster_summary(predicted_cluster, texts_per_cluster[predicted_cluster])

centers: [[0.         0.         0.         0.         0.         0.17338766
  0.         0.42781557 0.         0.35657892 0.17338766 0.
  0.12302372 0.         0.         0.18655847 0.18655847 0.34564602]
 [0.54140529 0.54140529 0.         0.22630417 0.22630417 0.
  0.         0.         0.         0.         0.         0.
  0.16056958 0.         0.22630417 0.         0.         0.        ]
 [0.         0.         0.47031955 0.         0.         0.
  0.27204122 0.         0.29455022 0.         0.         0.27204122
  0.20899219 0.47031955 0.         0.         0.         0.        ]]
labels [0 1 0 0 2 1 2]
intertia: 2.303425701724866
Top words per cluster:
Cluster: 0 texts: 3
	dogs
	houses
	trees
	talks
	text
	characters
	main
	story
	airplanes
	batman
Cluster: 1 texts: 2
	airlines
	airplanes
	talking
	better
	birds
	story
	dogs
	batman
	characters
	defeats
Cluster: 2 texts: 2
	batman
	superman
	fighting
	defeats
	round
	story
	airplanes
	better
	birds
	characters


Prediction
Why ba