# Evaluating Word Representations
Matthew van Rijn - 10779353 <br />
Ruben gerritse - 10760326

In [4]:
from gensim.models import KeyedVectors
from operator import itemgetter
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.palettes import d3
from bokeh.io import output_notebook
from bokeh.layouts import column

In [3]:
# Load the word embeddings. This may take a while.
# NOTE: The number of words and embedding dimension must be added to the
# first line before loading.
embeddings = ['deps', 'bow2', 'bow5']
models = {x: KeyedVectors.load_word2vec_format('{}.words'.format(x)) for x in embeddings}

### Word Similarity Task

In [11]:
# Remove extra data from the simlex dataset to allow it to be loaded by GenSim
with open('SimLex-999/SimLex-999.txt', 'r') as f:
    data = f.read().strip().split('\n')
    data_sim = [itemgetter(*[0, 1, 3])(line.split()) for line in data][1:]

with open('SimLex-999/SimLex-converted.txt', 'w+') as f:
    for line in data_sim:
        f.write(' '.join(line) + '\n')

In [12]:
# Get the Pearson and Spearman correlation coefficients
datasets = ['MEN/MEN_dataset_natural_form_full', 'SimLex-999/SimLex-converted.txt']

for model_name, model in models.items():
    print('Evaluating with {}'.format(model_name))
    for dataset in datasets:
        ((pc, pp),(sc, sp),_) = model.evaluate_word_pairs(dataset, delimiter=' ')
        print('Pearson:  {:.4f}/{:.2E}\nSpearman: {:.4f}/{:.2E} ({})\n'.format(pc, pp, sc, sp, dataset))

Evaluating with deps
Pearson:  0.5974/1.02E-289
Spearman: 0.6178/2.37E-315 (MEN/MEN_dataset_natural_form_full)

Pearson:  0.4619/6.84E-54
Spearman: 0.4456/7.41E-50 (SimLex-999/SimLex-converted.txt)

Evaluating with bow2
Pearson:  0.6777/0.00E+00
Spearman: 0.6999/0.00E+00 (MEN/MEN_dataset_natural_form_full)

Pearson:  0.4285/7.99E-46
Spearman: 0.4141/1.23E-42 (SimLex-999/SimLex-converted.txt)

Evaluating with bow5
Pearson:  0.7082/0.00E+00
Spearman: 0.7232/0.00E+00 (MEN/MEN_dataset_natural_form_full)

Pearson:  0.3756/8.61E-35
Spearman: 0.3674/2.98E-33 (SimLex-999/SimLex-converted.txt)



### Word Analogy Task

In [13]:
with open('questions-words.txt') as f:
    data = [category.split('\n')[:-1] for category in f.read().strip().split(': ')[1:]]
    data = {category[0]: [tuple(word.lower().split()) for word in category[1:]] for category in data}

In [14]:
for model_name, model in models.items():
    print('Evaluating model {}:'.format(model_name))
    total_acc = []
    total_mrr = []
    total_len = 0
    for category in data:
        acc = []
        mrr = []
        total = len(data[category])
        for i, (w1, w2, w3, w4) in enumerate(data[category]):
            if w4 in model:
                try:
                    ranking = model.most_similar(positive=[w2, w3], negative=[w1], topn=1000)
                    ranking = [word for word, _ in ranking]

                    # Collect stats
                    acc.append(ranking[0] == w4)
                    try:
                        mrr.append(1/(ranking.index(w4)+1))
                    except ValueError:
                        mrr.append(0)
                except KeyError:
                    continue

            # Print progress bar
            if i % 10 == 0 or i+1 == total:
                print('\r{:05d}/{:05d}'.format(i+1, total), end='')

        total_acc += acc
        total_mrr += mrr
        total_len += total

        print(' Accuracy: {:.4f} MRR: {:.4f} ({})'.format(sum(acc)/len(acc), sum(mrr)/len(mrr), category))
    print('{0:05d}/{0:05d} Accuracy: {1:.4f} MRR: {2:.4f} (overall)\n'.format(total_len, sum(total_acc)/len(total_acc), sum(total_mrr)/len(total_mrr)))

Evaluating model deps:
00506/00506 Accuracy: 0.3518 MRR: 0.4939 (capital-common-countries)
04524/04524 Accuracy: 0.1121 MRR: 0.2035 (capital-world)
00866/00866 Accuracy: 0.0638 MRR: 0.0958 (currency)
02467/02467 Accuracy: 0.1228 MRR: 0.2208 (city-in-state)
00506/00506 Accuracy: 0.8162 MRR: 0.8541 (family)
00992/00992 Accuracy: 0.0343 MRR: 0.0668 (gram1-adjective-to-adverb)
00812/00812 Accuracy: 0.4002 MRR: 0.4763 (gram2-opposite)
01332/01332 Accuracy: 0.8011 MRR: 0.8535 (gram3-comparative)
01122/01122 Accuracy: 0.5606 MRR: 0.6372 (gram4-superlative)
01056/01056 Accuracy: 0.6468 MRR: 0.7402 (gram5-present-participle)
01599/01599 Accuracy: 0.1213 MRR: 0.2199 (gram6-nationality-adjective)
01560/01560 Accuracy: 0.6590 MRR: 0.7320 (gram7-past-tense)
01332/01332 Accuracy: 0.6757 MRR: 0.7478 (gram8-plural)
00869/00869 Accuracy: 0.9091 MRR: 0.9447 (gram9-plural-verbs)
19543/19543 Accuracy: 0.3672 MRR: 0.4456 (overall)

Evaluating model bow2:
00506/00506 Accuracy: 0.8360 MRR: 0.8817 (capital-co

### Clustering Word Vectors
The results are plotted in the output html file

In [15]:
output_file("cluster-output.html")

def get_model_plot(name, model, words, N=20, perplexity=30.0):
    word_list = []
    vectors = []
    for word in words:
        if word in model.vocab:
            word_list.append(word)
            vectors.append(model[word])
    
    kmeans = KMeans(n_clusters=N)
    kmeans.fit(vectors)
    clusters = kmeans.labels_

    tsne = TSNE(n_components=2, perplexity=perplexity)
    tsne_vectors = tsne.fit_transform(vectors)
    
    p = figure(tools="pan,wheel_zoom,reset,save",
               toolbar_location="above",
               title="T-SNE for 2000 nouns with {}.words".format(name))

    colormap = d3['Category20'][N]
    colors = [colormap[i%20] for i in clusters]

    source = ColumnDataSource(data=dict(x1=tsne_vectors[:,0],
                                        x2=tsne_vectors[:,1],
                                        names=word_list,
                                        colors=colors))

    p.scatter(x="x1", y="x2", size=8, source=source, color='colors')

    labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                      text_font_size="8pt", text_color="#555555",
                      source=source, text_align='center')
    p.add_layout(labels)

    return p

N = 20

with open("2000_nouns_sorted.txt") as file: 
    words = [x.strip() for x in file.readlines()] 

p_list = []
for name, model in models.items():
    p_list.append(get_model_plot(name, model, words))
show(column(p_list))