In [1]:
#This cell imports all the needed modules
import nltk
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
import os
import pickle as pkl
import itertools
from tqdm import tqdm_notebook as tqdm
import numpy as np
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
from scipy import spatial
from operator import itemgetter
import random
import json
import read_ap

[nltk_data] Downloading package stopwords to /home/kim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/kim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# This cell sets the parameters for the rest of the code
FREQUENCY_CUT_OFF = 50
CONTEXT_WINDOW_SIZE = 5
DIMENSIONS = 400
EPOCHS = 4
LR = 0.001
N_FAKE_PAIRS = 10
SUBSET_SIZE = 1000

In [3]:
# This cell loads in the data file
with open ('./pickles/processed_docs.pkl', 'rb') as infile:
    dat = pkl.load(infile)

In [4]:
# creates a subset of all the docs
counter = 0
data = {}
for key, value in dat.items():
    data[key] = value
    counter += 1
    if counter == SUBSET_SIZE:
        break

In [5]:
#This cell creates a dictionary with all the words that occurre more than FREQUENCY_CUT_OFF times in the corpus and a vocabulary
texts = list(data.values())
flat_list = list(itertools.chain(*texts))

frequency_dict = Counter(flat_list)
for word in frequency_dict.copy():
    if frequency_dict[word] < FREQUENCY_CUT_OFF:
        del frequency_dict[word]

In [6]:
# This cell removes all the words that do not occurre more than 50 times from the texts and creates a vocabulary
for doc_id, text in data.items():
    text = [word for word in text if word in frequency_dict]
    data[doc_id] = text
    
vocab = [word for word in frequency_dict]
print(len(vocab))

1148


In [7]:
# Creates pairs fot the centre word and the context and adds them to training_data
pairs = []
for doc_id, text in tqdm(data.items()):
    for cntr_word_index, cntr_word in enumerate(text):
        if cntr_word_index < CONTEXT_WINDOW_SIZE:
            context_words_index = [vocab.index(word) for word in text[ : cntr_word_index + CONTEXT_WINDOW_SIZE + 1] 
                                   if word != cntr_word]
            for i in context_words_index:
                pairs.append([cntr_word_index, i])
        else:
            context_words_index = [vocab.index(word) for word in text[cntr_word_index - (CONTEXT_WINDOW_SIZE) 
                                                       : cntr_word_index + (CONTEXT_WINDOW_SIZE)] if word != cntr_word]
            for i in context_words_index:
                pairs.append([cntr_word_index, i])
pairs = np.asarray(pairs)
print(pairs.shape)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


(1570628, 2)


In [8]:
# Function for getting one hot vector
def get_input(word_index, vocab):
    tens = torch.zeros(len(vocab)).float()
    tens[word_index] = 0.999999
    return tens  

In [9]:
# function for manually changeing the gradient
def change_grad(grad, target_word):
    samples = random.sample(range(len(vocab)), N_FAKE_PAIRS)
    values = [grad[sample] for sample in samples]
    new_grad = torch.zeros(len(grad))
    for i in range(len(samples)) :
        new_grad[samples[i]] = values[i]
    return new_grad

In [None]:
# Skim gram with negative sampling implemented

W1_ns = (torch.randn(DIMENSIONS, len(vocab), requires_grad=True).float())
W2_ns = (torch.randn(len(vocab), DIMENSIONS, requires_grad=True).float())

counter = 0
for epoch in range(EPOCHS):
    loss_value = 0
    for iteration, pair in enumerate(tqdm(pairs)):
        target = get_input(pair[0],vocab)
        output = get_input(pair[1],vocab).long()
        
        p1 = torch.matmul(W1_ns, target)
        p2 = torch.matmul(W2_ns, p1)
        
        p2.register_hook(lambda x: change_grad(x, pair[0]))
        
        log_softmax = F.log_softmax(p2, dim=0)
        
        loss = F.nll_loss(log_softmax.view(-1, 1), output)
        loss_value += loss.item()
        
        loss.backward()
        W1_ns.data -= LR * W1_ns.grad.data
        W2_ns.data -= LR * W2_ns.grad.data
        
        W1_ns.grad.data.zero_()
        W2_ns.grad.data.zero_()
        
        if iteration % 10000 == 0:
            print(loss.item())

In [10]:
# save model in pkl
# don't overwrite this with a "wrong model"
model_file_path = './pickles/optimal_word2vec_vectsize=400_window=5_subset=1000_epochs=10'
# with open(path, "wb") as writer:
#     pkl.dump(W2_ns, writer)

In [11]:
# load model from pkl
model = pkl.load(open(model_file_path, 'rb'))

In [12]:
# Creates a matrix with all the documents represented as a vector of length 50 for the negative sampling word
first_doc = True
doc_id_list_ns = []
for doc_id, text in tqdm(data.items()):
    doc_id_list_ns.append(doc_id)
    first_word = True
    for word in text:
        word_vector = np.asarray(model[vocab.index(word)].detach())
        if first_word:
            word_embeddings = word_vector
            first_word = False
        else:
            word_embeddings = np.vstack((word_embeddings, word_vector))
    if first_doc:
        doc_vecs_ns = np.mean([embedding for embedding in word_embeddings], axis=0)
        first_doc = False
    else:
        doc_vecs_ns = np.vstack((doc_vecs_ns, np.mean([embedding for embedding in word_embeddings], axis=0)))
print(len(doc_vecs_ns))
print(len(doc_id_list_ns))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


1000
1000


In [13]:
# loads the queries and places them in a dict
qrels, queries = read_ap.read_qrels()
query_dict = {}
for qid in queries:
    query_dict[qid] = read_ap.process_text(queries[qid])

In [14]:
first_query = True
for query_id, query in query_dict.items():
    first_word = True
    for word in query:
        if word in vocab:
            word_vector = np.asarray(model[vocab.index(word)].detach())
        else:
            word_vector = np.zeros(DIMENSIONS)
        if first_word:
            word_embeddings = word_vector
            first_word = False
        else:
            word_embeddings = np.vstack((word_embeddings, word_vector))
    if first_query:
        query_vecs_ns = np.mean([embedding for embedding in word_embeddings], axis=0)
        first_query = False
    else:
        if word_embeddings.shape[0] == DIMENSIONS:
            word_embeddings = np.reshape(word_embeddings, (1, -1))
        query_vecs_ns = np.vstack((query_vecs_ns, np.mean([embedding for embedding in word_embeddings], axis=0)))

In [15]:
def rank_docs(query_vec, doc_id_list_ns):
    rel_docs_ns = []
    smalles_sim = 1
    for i in range(len(doc_vecs_ns)):
        cosin_sim = abs(1 - spatial.distance.cosine(query_vec, doc_vecs_ns[i]))
        rel_docs_ns.append(tuple((doc_id_list_ns[i], cosin_sim)))
        rel_docs_ns = sorted(rel_docs_ns, key=itemgetter(1))
    return rel_docs_ns

rank_ns = rank_docs(query_vecs_ns[0], doc_id_list_ns)

In [16]:
dict_rank_ns = dict(rank_ns)
# print(dict_rank_ns)

In [17]:
import pytrec_eval
import logging
import helpers
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [18]:
def rank_all_queries(query_vecs_ns, doc_id_list_ns, queries, qrels, valid_set):
    params = "vecsize={}_window={}_subset={}_epochs={}".format(DIMENSIONS, CONTEXT_WINDOW_SIZE, SUBSET_SIZE, EPOCHS)
    path = f"./results/optimal_word2vec_{params}.json"
    overall_ser = {}
    print("Running {} Benchmark".format("Word2Vec"))
    overall_ser = {}
    for i, qid in enumerate(tqdm(qrels)):
        if qid not in valid_set:
            sims = rank_docs(query_vecs_ns[i], doc_id_list_ns)
            overall_ser[qid] = dict(sims)
    
    helpers.format_results(overall_ser, "word2vec_tuned", f"word2vec_{params}")

    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
    metrics = evaluator.evaluate(overall_ser)

    # get average score from MAP and NDCG
    avg_scores = helpers.get_average_score(metrics)
    print(avg_scores)

    # # dump this to JSON
    # # *Not* Optional - This is submitted in the assignment!
    with open(path, "w") as writer:
        json.dump(metrics, writer, indent=1)
    return avg_scores

In [19]:
# VERGEET NIET EERST DIE CODE TE FIXEN IN DE CELL WAAR JE QUERY_VECS_NS AANMAAKT WANT DIT KLOPT NU NIET
# run for default params first
# avg_scores = rank_all_queries(query_vecs_ns, doc_id_list_ns, queries, qrels)

# with open("./results/word2vec_avgscores_vecsize={}_window={}_subset={}_epochs={}.json".format(DIMENSIONS, CONTEXT_WINDOW_SIZE, SUBSET_SIZE, EPOCHS), "w") as writer:
#         json.dump(avg_scores, writer, indent=1)

# run for tuned params second (change params dimensions, window, vocab_size etc.)
validation_set = np.arange(76, 101)
validation_set = list(map(str, validation_set))
avg_scores = rank_all_queries(query_vecs_ns, doc_id_list_ns, queries, qrels, validation_set)

with open("./results/optimal_word2vec_avgscores_vecsize={}_window={}_subset={}_epochs={}.json".format(DIMENSIONS, CONTEXT_WINDOW_SIZE, SUBSET_SIZE, EPOCHS), "w") as writer:
        json.dump(avg_scores, writer, indent=1)

Running Word2Vec Benchmark


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=149.0), HTML(value='')))


{'map': 0.0016260101730512012, 'ndcg': 0.009751739821710152}


  dist = 1.0 - uv / np.sqrt(uu * vv)
