In [None]:
%pip install numpy scikit-learn gensim
%pip install --no-cache-dir --force-reinstall https://dm.cs.tu-dortmund.de/nats/nats25_07_01_word2vec-0.1-py3-none-any.whl
import nats25_07_01_word2vec

# Explore pre-trained word2vec embeddings

In [None]:
import hashlib
def md5_checksum(fname):
  'MD5 checksum function to validate that the large files were downloaded correctly'
  hash_md5 = hashlib.md5()
  with open(fname, "rb") as f:
    for chunk in iter(lambda: f.read(4096), b""):
      hash_md5.update(chunk)
  return hash_md5.hexdigest()

In [None]:
### Load the input data - do not modify
# `urllib.request` is imported explicitly: a bare `import urllib` does not
# guarantee that the `request` submodule has been loaded.
import json, gzip, urllib.request
import numpy as np
file_path, _ = urllib.request.urlretrieve("https://dm.cs.tu-dortmund.de/nats/data/minecraft-articles.json.gz")
# Use a context manager so the gzip handle is closed once the JSON is parsed.
with gzip.open(file_path, "rt", encoding="utf-8") as f:
    raw = json.load(f)
# Parallel lists: entry i of each list belongs to article i of `raw`.
titles, texts, classes = [x["title"] for x in raw], [x["text"] for x in raw], [x["heuristic"] for x in raw]

In [None]:
### Load the pretrained word2vec model from Google
# These files are large (approx. 200MB in total)! It will take some time.
# Each entry pairs a download URL with the MD5 checksum expected for that file.
# The vocabulary file is listed first, followed by the vector chunks in order.
w2v_downloads = (
    ("https://dm.cs.tu-dortmund.de/nats/data/w2v-google-news.wordvectors.got_subset.words.csv.gz", "de96f3aa4dee24c6905b79e9a5c6eeb8"),
    ("https://dm.cs.tu-dortmund.de/nats/data/w2v-google-news.wordvectors.got_subset.vectors00.npy", "045af08301542a0ddd560f85d07a198f"),
    ("https://dm.cs.tu-dortmund.de/nats/data/w2v-google-news.wordvectors.got_subset.vectors01.npy", "51c147e4bdee65c95cb47d96dd818a7b"),
    ("https://dm.cs.tu-dortmund.de/nats/data/w2v-google-news.wordvectors.got_subset.vectors02.npy", "7fcc8314539a9e7b83b7703fb97ce890"),
    ("https://dm.cs.tu-dortmund.de/nats/data/w2v-google-news.wordvectors.got_subset.vectors03.npy", "b68f15aca97993103852c62ce5cf49a5"),
    ("https://dm.cs.tu-dortmund.de/nats/data/w2v-google-news.wordvectors.got_subset.vectors04.npy", "8a39594c9e9aa3cf07f2aa624de16d96"),
    ("https://dm.cs.tu-dortmund.de/nats/data/w2v-google-news.wordvectors.got_subset.vectors05.npy", "96cc1f43c6865d700f7df479b5bb30c5"),
)
all_paths = []
for url, checksum in w2v_downloads:
    file_path, _ = urllib.request.urlretrieve(url)
    # Refuse to continue with a truncated or corrupted download.
    assert md5_checksum(file_path) == checksum, f"Corrupted file '{file_path}'. Please delete manually and restart this cell.\nIf the error persists, try reloading this page."
    all_paths.append(file_path)
# The first path is the vocabulary file, the remaining ones are the vector chunks.
words_path, *vec_chunk_paths = all_paths

In [None]:
# Import the vocabulary and vector representations into a KeyedVectors Gensim model
from gensim.models import KeyedVectors
# Decode the vocabulary explicitly as UTF-8 (as is done for the article data)
# instead of relying on the platform default encoding, which is not UTF-8
# everywhere. NOTE(review): assumes the vocabulary file is UTF-8 encoded.
with gzip.open(words_path, "rt", encoding="utf-8") as f:
    words = f.read().split("\n")
# Stack the chunks row-wise in download order to obtain one matrix of vectors.
vecs = np.concatenate([
    np.load(chunk_path)
    for chunk_path in vec_chunk_paths
], axis=0)
# Size and dtype of the model are taken from the loaded matrix.
model = KeyedVectors(count=vecs.shape[0], vector_size=vecs.shape[1], dtype=vecs.dtype)
model.add_vectors(words, vecs)
model.fill_norms()

In [None]:
# Find the 10 most similar words to "Stone"
most_stone = None # words only
pass # Your solution here

In [None]:
nats25_07_01_word2vec.hidden_tests_6_0(most_stone)

## Verify the classic king-queen example

Verify that "King - Man + Woman = Queen", using the built-in function for this.

In [None]:
most_kmw = None # 10 nearest words to "king-man+woman" using the gensim API
pass # Your solution here

In [None]:
nats25_07_01_word2vec.hidden_tests_9_0(most_kmw)

## Try using Euclidean geometry

Get the vectors for king, man, queen, and woman.

Compute king - man + woman, and compute the Euclidean distance from the result to each of the above four words. Which word is closest?

In [None]:
king, man, queen, woman = None, None, None, None # get the word vectors
pass # Your solution here

In [None]:
# Euclidean distance from the analogy vector to each of the four word vectors.
target = king - man + woman
candidates = {"king": king, "man": man, "woman": woman, "queen": queen}
for word, vec in candidates.items():
    difference = target - vec
    score = np.sqrt((difference**2).sum())
    print("distance(king - man + woman, %s) = %.5f" % (word, score))

In [None]:
nats25_07_01_word2vec.hidden_tests_13_0(queen, man, model, king, woman)

## Document representations

Represent each document as the average word2vec vector of all words present in the model. Do not normalize.

In [None]:
# One row per document, 300 columns each, initialised to zero.
# NOTE(review): 300 presumably has to match the dimensionality of the loaded
# word vectors (vecs.shape[1]) - confirm.
document_vectors = np.zeros((len(titles), 300))
from gensim.utils import tokenize
for i, (title, text) in enumerate(zip(titles, texts)):
    # Title and body are tokenized together as a single text.
    # NOTE(review): gensim's tokenize presumably returns a lazy iterator, so
    # `tokens` could only be consumed once - confirm before iterating twice.
    tokens = tokenize(title + "\n" + text)
    pass # Your solution here

In [None]:
nats25_07_01_word2vec.hidden_tests_16_0(titles, document_vectors)

## Find the document with the shortest vector

Note: this likely will be one of the longer documents.

In [None]:
shortest = None # Document number of the document with the shortest vector
pass # Your solution here
print(titles[shortest], len(texts[shortest]))

In [None]:
nats25_07_01_word2vec.hidden_tests_19_0(shortest, titles, document_vectors)

## Find the two most similar documents

Compute a similarity matrix, and find the pair of two different articles with the largest similarity.

Do *not* use nested for loops, as this will time out (use the "Validate" option).

In [None]:
# Compute a similarity matrix
similarity_matrix = None
pass # Your solution here

In [None]:
nats25_07_01_word2vec.hidden_tests_22_0(similarity_matrix)

In [None]:
most_similar = None # Pair of two different documents
pass # Your solution here
print(titles[most_similar[0]], " and ", titles[most_similar[1]])
print(len(texts[most_similar[0]]), " and ", len(texts[most_similar[1]]))

In [None]:
nats25_07_01_word2vec.hidden_tests_24_0(similarity_matrix, most_similar)

## Find the two most similar longer documents

Now only consider documents that have at least 10000 characters in the body!

In [None]:
most_similar = None # Pair of two different documents
pass # Your solution here
print(titles[most_similar[0]], " and ", titles[most_similar[1]])
print("Lengths:", len(texts[most_similar[0]]), " and ", len(texts[most_similar[1]]))

In [None]:
nats25_07_01_word2vec.hidden_tests_27_0(similarity_matrix, most_similar, titles, texts)


## Run k-means and spherical k-means

Cluster the document vectors (*not* the similarity matrix) with regular k-means and with spherical k-means.

Use k=10, and a fixed random seed of 42.

Recall the assumptions of our spherical k-means implementation!

In [None]:
kcent = None # Compute the k-means cluster centers
kassi = None # Compute the k-means cluster assignment
from sklearn.cluster import KMeans
pass # Your solution here

In [None]:
# Minimalistic implementation for spherical k-means, so we use the same version in this assignment
# This is NOT meant as an example of good code, but to be short.
def initial_centers(X, k, seed):
    """Pick `k` distinct rows of `X` as starting centers, reproducibly for a given `seed`."""
    rng = np.random.default_rng(seed=seed)
    # Sample row indices without replacement so that no row is picked twice.
    chosen_rows = rng.choice(X.shape[0], k, replace=False)
    return X[chosen_rows]

def sphericalkmeans(X, centers, max_iter=100):
    """Cluster the rows of `X` with spherical k-means.

    Every row is assigned to the center with the largest dot product; each
    center is then replaced by the normalized sum of the rows assigned to it.
    This repeats until the assignment stops changing or `max_iter` rounds
    have been performed.

    Args:
        X: data matrix with one data point per row. The squared entries must
            sum to `len(X)`, as they do when every row has unit length.
        centers: initial centers, one per row (e.g. from `initial_centers`).
            The input is not modified.
        max_iter: maximum number of assignment/update rounds.

    Returns:
        A tuple `(centers, assignment)`: the final centers as a float array
        and a 1-d array holding the index of the center chosen for each row.

    Raises:
        AssertionError: if the squared entries of `X` do not sum to `len(X)`.
    """
    assert abs((X**2).sum()-len(X)) < 1e-7, "Improper input for spherical k-means!"
    last_assignment = None
    for _ in range(max_iter):
        # ravel() instead of squeeze(): the result stays 1-d even when X has
        # only a single row (squeeze() would collapse it to a 0-d array).
        assignment = np.asarray((X @ centers.T).argmax(axis=1)).ravel()
        if last_assignment is not None and np.array_equal(assignment, last_assignment):
            break
        last_assignment = assignment
        # Start from a float copy of the previous centers, so that a cluster
        # without points keeps its center instead of becoming a zero vector.
        previous = centers.toarray() if hasattr(centers, "toarray") else np.asarray(centers)
        updated = previous.astype(float)
        for i in range(updated.shape[0]):
            members = assignment == i
            if np.sum(members) == 0:
                # If no points were assigned, do not move the center
                continue
            c = np.sum(X[members, :], axis=0)
            updated[i] = c / np.linalg.norm(c)
        centers = updated
    return centers, assignment

In [None]:
scent = None # Compute the spherical k-means cluster centers
sassi = None # Compute the spherical k-means cluster assignment
pass # Your solution here

## Explore your result

Explore the result: write a function to determine the most central (most relevant) documents of each cluster, and a function that summarizes each cluster.

In [None]:
def most_central(tfidf, centers, assignment, i, k=5):
    """Find the most central documents of cluster i

    Exercise stub: the implementation is intentionally left open.

    Args:
        tfidf: document representations - presumably one row per document;
            TODO confirm against the caller.
        centers: cluster centers - presumably one row per cluster.
        assignment: presumably the cluster index assigned to each document.
        i: index of the cluster to inspect.
        k: presumably the number of documents to report (default 5).
    """
    pass # Your solution here

def explain(tfidf, titles, classes, centers, assignment):
    """Explain the clusters: print
    (1) relative size of each cluster
    (2) three most frequent classes of each cluster
    (3) five most central documents of each cluster
    (4) ARI of the entire clustering

    Exercise stub: the implementation is intentionally left open.

    Note: despite its name, `tfidf` receives the `document_vectors` array in
    every call made in this notebook. `centers` and `assignment` are the
    outputs of the clustering being explained (k-means or spherical k-means).
    The ARI is presumably meant to compare `assignment` with `classes` -
    TODO confirm."""
    # Imported locally for use by the solution: ARI computation and counting.
    from sklearn.metrics import adjusted_rand_score
    from collections import Counter
    pass # Your solution here

In [None]:
print("Regular k-means clustering:")
explain(document_vectors, titles, classes, kcent, kassi)

In [None]:
# Note: in case of poor performance, revisit your code above!
print("Spherical k-means clustering:")
explain(document_vectors, titles, classes, scent, sassi)

In [None]:
nats25_07_01_word2vec.hidden_tests_36_0(document_vectors, scent, sassi, titles, classes, explain)