# Clustering exercises

This is a modified, more compact version of some of the code from the notebook on clustering. You may wish to use this as a starting point for doing some of the exercises.

(Import the necessary libraries.)

In [None]:
from gensim.models import KeyedVectors
import sklearn.cluster
from collections import defaultdict
from pandas import DataFrame

## Setting up data

Note that the word vector files are found on the server, so you need to either work there or make copies.

Feel free to add your own words here, but note that the code only loads vectors for the most common words (the `limit` argument to `load_word2vec_format`).

In [None]:
# Locations of the binary word2vec files on the course server
# (EN = English, FI = Finnish).
EN_WV_PATH = "/course_data/textmine/wordvecs/GoogleNews-vectors-negative300.bin"
FI_WV_PATH = "/course_data/textmine/wordvecs/pb34_wf_200_v2.bin"

# Load the Finnish vectors. `limit=20000` caps how many vectors are read from
# the file, so only the first 20000 entries end up in `wv`.
wv = KeyedVectors.load_word2vec_format(FI_WV_PATH, binary=True, limit=20000)
# The words whose cluster assignments are displayed at the end of the notebook.
words = [
    "koira", "kissa", "tietokone", "näyttö", "syödä", "juoda", "kävellä", "juosta", "yksi", "kaksi", "1", "2",
    "hyvä", "paha", "ruma", "onnellinen", "onneton", "ja", "tai", "mitä", "että", "oikein", "todella","hyvin"
]

# If you prefer to work with English, uncomment the following lines:
# wv = KeyedVectors.load_word2vec_format(EN_WV_PATH, binary=True, limit=20000)
# words= [
#     "dog", "cat", "computer", "monitor", "eat", "drink", "walk", "run", "one", "two", "1", "2",
#     "good", "bad", "ugly", "happy", "unhappy", "or", "what", "that", "right", "very", "well"
# ]

# Must run before clustering so that the vectors used below are the
# normalized ones.
# NOTE(review): `init_sims` and `wv.vocab` are the gensim 3.x API; confirm the
# installed gensim version before running.
wv.init_sims(replace=True)    # normalize (replace=True: overwrite the stored vectors)

# Index of each selected word in the loaded vocabulary, in the same order as
# `words`. A word that was not loaded (e.g. it falls outside `limit`) will
# presumably fail this lookup -- check this when adding your own words.
word_indices = [wv.vocab[w].index for w in words]

## K-means clustering

In [None]:
# Cluster every loaded word vector (not just the selected `words`) into 20
# groups with mini-batch k-means. The fixed random_state keeps the run
# repeatable.
k = sklearn.cluster.MiniBatchKMeans(
    n_clusters=20,
    batch_size=5000,
    random_state=1234,
)

# Fit the model and keep the transformed data; after this call the per-vector
# cluster assignments are available as `k.labels_`.
distances = k.fit_transform(wv.vectors)

## Visualization

In [None]:
def group_by_label(words, labels):
    """Collect words into buckets keyed by their label.

    `words` and `labels` are walked in lockstep, so pairing stops at the
    shorter of the two.

    Returns a `defaultdict(list)` mapping each label to the list of words
    that carry it. Labels appear in first-seen order and words keep their
    input order within each bucket.
    """
    clusters = defaultdict(list)
    for member, cluster_id in zip(words, labels):
        members = clusters[cluster_id]
        members.append(member)
    return clusters


def show_grouped(words, labels):
    """Build a displayable table of words grouped by label.

    The words are grouped with `group_by_label`, and the groups are laid out
    one per row, sorted by label, in a two-column frame (`label`, `words`).

    Returns the frame's pandas `Styler` with the index column hidden, so
    that a notebook renders only the two data columns.
    """
    grouped = group_by_label(words, labels)
    table = DataFrame(sorted(grouped.items()), columns=['label', 'words'])
    styler = table.style

    # Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0 in
    # favour of Styler.hide(axis="index"). Prefer the new call and fall back
    # to the old one so the notebook runs on either side of that change.
    if hasattr(styler, "hide"):
        return styler.hide(axis="index")
    return styler.hide_index()


# Show which cluster each selected word landed in: `word_indices` picks the
# labels for `words` out of the labels assigned during the k-means fit.
show_grouped(words, k.labels_[word_indices])