In [1]:
import numpy as np
from sklearn.cluster import KMeans
from collections import defaultdict

# Largest number of clusters to try; train_kmeans fits every k from 2 up to
# this value (inclusive).
max_cluster = 20

# Pre-computed matrices stored as .npy files (paths are relative to the
# working directory). Further down, rows of doc_word are labelled with
# title_list and its columns with word_list; word_doc is used the other way
# round (rows labelled with word_list, columns with title_list).
doc_word = np.load("data/science2k-doc-word.npy")
word_doc = np.load("data/science2k-word-doc.npy")

def read_text(filename):
    """Read a text file and return its lines with surrounding whitespace removed.

    Parameters
    ----------
    filename : str
        Path of the file to read. It is opened in text mode with the
        platform's default encoding.

    Returns
    -------
    list of str
        One entry per line of the file, in file order, each passed through
        ``str.strip()``. Blank lines are kept as empty strings.
    """
    # The context manager closes the handle even if reading fails part-way,
    # which the previous manual open()/close() pair did not guarantee.
    with open(filename, 'r') as f:
        return [line.strip() for line in f]

# Label lists, one entry per line of each file; they are used below to turn
# row/column indices of the matrices into readable words and titles.
word_list = read_text("data/science2k-vocab.txt")
title_list = read_text("data/science2k-titles.txt")

In [2]:
def print_top10_col(col_list, centers, mean_vector, fname):
    """Write the 10 most distinctive column labels of every cluster centre.

    For each clustering in ``centers`` one text file is written. Inside it,
    every cluster centre contributes the labels of the 10 columns where
    ``center - mean_vector`` is largest (largest first), one label per line,
    followed by a blank line.

    Parameters
    ----------
    col_list : sequence of str
        Label for each column of the centres; indexed by column position.
    centers : sequence of 2-D arrays
        ``centers[i]`` holds the cluster centres (one per row) of the i-th
        clustering. The i-th entry is labelled as having ``i + 2`` clusters,
        matching the order in which ``train_kmeans`` produces them.
    mean_vector : 1-D array
        Vector subtracted from every centre before ranking its columns.
    fname : str
        Output path prefix; clustering ``i`` goes to ``<fname>_c<i+2>.txt``.

    Returns
    -------
    None
        The function only writes files.
    """
    for num_clusters, cluster_centers in enumerate(centers, start=2):
        # ``with`` guarantees the file is closed even if a write or lookup fails.
        with open(fname + '_c' + str(num_clusters) + '.txt', 'w') as f:
            f.write('number of clusters:' + str(num_clusters) + '\n\n')
            for center in cluster_centers:
                deviation = center - mean_vector
                # argsort is ascending: keep the last 10 and reverse them so
                # the largest deviation is written first.
                top_idx = deviation.argsort()[-10:][::-1]
                for j in top_idx:
                    f.write(col_list[j] + '\n')
                f.write('\n')

def print_top10_row(data, row_list, centers, predicts, fname):
    """Write the 10 rows closest to each cluster centre, per clustering.

    For each clustering one text file is written. Inside it, every cluster
    contributes the labels of the (up to) 10 rows assigned to it whose
    Euclidean distance to the cluster centre is smallest (closest first),
    one label per line, followed by a blank line. Ties in distance are
    broken by row index.

    Parameters
    ----------
    data : 2-D array
        Matrix whose rows were clustered.
    row_list : sequence of str
        Label for each row of ``data``; indexed by row position.
    centers : sequence of 2-D arrays
        ``centers[i]`` holds the cluster centres (one per row) of the i-th
        clustering.
    predicts : sequence of 1-D integer arrays
        ``predicts[i][j]`` is the cluster index assigned to row ``j`` of
        ``data`` in the i-th clustering. The i-th clustering is treated as
        having ``i + 2`` clusters, matching the order in which
        ``train_kmeans`` produces them.
    fname : str
        Output path prefix; clustering ``i`` goes to ``<fname>_c<i+2>.txt``.

    Returns
    -------
    None
        The function only writes files.
    """
    for i, labels in enumerate(predicts):
        num_clusters = i + 2
        cluster_centers = centers[i]

        # Group (distance-to-own-centre, row index) pairs by assigned cluster.
        by_cluster = defaultdict(list)
        for row_idx, label in enumerate(labels):
            dist = np.linalg.norm(data[row_idx] - cluster_centers[label])
            by_cluster[label].append((dist, row_idx))

        # ``with`` guarantees the file is closed even if a write or lookup fails.
        with open(fname + '_c' + str(num_clusters) + '.txt', 'w') as f:
            f.write('number of clusters:' + str(num_clusters) + '\n\n')
            # Iterate over exactly ``num_clusters`` clusters. The counter used
            # to be incremented before this loop, which emitted one extra,
            # always-empty cluster section per file.
            for cluster_id in range(num_clusters):
                # Keep 10 rows (the slice was previously [:11], i.e. 11 rows).
                closest = sorted(by_cluster[cluster_id])[:10]
                for _, row_idx in closest:
                    f.write(row_list[row_idx] + '\n')
                f.write('\n')

def train_kmeans(data):
    """Fit K-Means on ``data`` for every cluster count from 2 to ``max_cluster``.

    Parameters
    ----------
    data : 2-D array
        Matrix whose rows are clustered.

    Returns
    -------
    centers : list of 2-D arrays
        ``cluster_centers_`` of each fitted model, ordered by cluster count
        (index 0 corresponds to 2 clusters).
    predicts : list of 1-D arrays
        Cluster index predicted for every row of ``data`` by each model, in
        the same order as ``centers``.
    mean_vector : 1-D array
        Column-wise mean of ``data``.
    """
    # A fixed random_state keeps every fit reproducible between runs.
    fitted_models = [
        KMeans(n_clusters=n_clusters, random_state=42).fit(data)
        for n_clusters in range(2, max_cluster + 1)
    ]
    centers = [model.cluster_centers_ for model in fitted_models]
    predicts = [model.predict(data) for model in fitted_models]
    return centers, predicts, np.mean(data, axis=0)

In [3]:
# Cluster the rows of doc_word for every cluster count from 2 to max_cluster;
# also keeps the column-wise mean of doc_word for ranking columns later.
doc_centers, doc_predicts, doc_mean = train_kmeans(doc_word)

In [5]:
# Write one text file per cluster count (suffix _c<k>.txt after the prefix):
# first the titles of the rows nearest to each cluster centre, then the words
# whose centre value exceeds the overall mean by the largest margin.
print_top10_row(doc_word, title_list, doc_centers, doc_predicts, 'doc_K-Mean_top_documents')
print_top10_col(word_list, doc_centers, doc_mean, 'doc_K-Mean_top_words')

In [None]:
# Same procedure on word_doc: cluster its rows for every cluster count from
# 2 to max_cluster and keep its column-wise mean.
word_centers, word_predicts, word_mean = train_kmeans(word_doc)

In [None]:
# Write one text file per cluster count (suffix _c<k>.txt after the prefix):
# first the words of the rows nearest to each cluster centre, then the titles
# whose centre value exceeds the overall mean by the largest margin.
print_top10_row(word_doc, word_list, word_centers, word_predicts, 'word_K-Mean_top_words')
print_top10_col(title_list, word_centers, word_mean, 'word_K-Mean_top_documents')