# Assignment 1


In [1]:
import numpy as np
np.random.seed(13) #TODO Check if this is used for sgd
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape, Lambda
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import model_to_dot
from keras.preprocessing import sequence
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors as nn
from matplotlib import pylab
from __future__ import division

Using TensorFlow backend.


In [3]:
# DO NOT Modify the lines in this cell
# Read the first 700 lines of the book.
path = 'alice.txt'
corpus = open(path).readlines()[0:700]

# Keep only lines that contain at least two spaces.
corpus = [sentence for sentence in corpus if sentence.count(" ") >= 2]

# The filter string lists the punctuation characters (apostrophe included) handed to the Tokenizer.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'+"'")
tokenizer.fit_on_texts(corpus)
# From here on `corpus` holds sequences of integer word ids instead of text.
corpus = tokenizer.texts_to_sequences(corpus)
# Total number of tokens over all kept lines.
nb_samples = sum(len(s) for s in corpus)
# One more than the number of distinct words; the code below stores word id k at position k-1.
V = len(tokenizer.word_index) + 1

# Is this something they need to change?
dim = 100
window_size = 2 #use this window size for Skipgram, CBOW, and the model with the additional hidden layer
window_size_corpus = 4 #use this window size for the co-occurrence matrix

## Question 1

### Co-occurrence Matrix
Use the provided code to load the "Alice in Wonderland" text document. 
1. Implement the word-word co-occurrence matrix for “Alice in Wonderland”
2. Normalize the words such that every value lies within a range of 0 and 1
3. Compute the cosine distance between the given words:
    - Alice 
    - Dinah
    - Rabbit
4. List the 5 closest words to 'Alice'. Discuss the results.
5. Discuss the main drawbacks of a term-term co-occurrence matrix solution.


In [None]:
#create co-occurrence matrix
#print(tokenizer)
#print(corpus)
#print(nb_samples)
#print(V)
#print("\n")

def create_co_occurrence(crps, win_size, voc_size):
    """Count how often two words appear within `win_size` positions of each other.

    crps     -- iterable of sentences, each a sequence of integer word ids
                (ids are expected to start at 1)
    win_size -- number of neighbours counted on EACH side of the centre word,
                the same convention as the CBOW/Skipgram data generators
    voc_size -- vocabulary size including the unused id 0 (i.e. `V`)

    Returns an integer array of shape (voc_size-1, voc_size-1); the entry at
    [a-1, b-1] is the number of times word id b occurred inside the window
    around word id a. Windows never cross sentence boundaries, and the
    matrix is symmetric.
    """
    # Word ids start at 1, so id k is stored at row/column k-1.
    co_occurrence_mat = np.zeros((voc_size-1, voc_size-1), int)
    for sentence in crps:
        # Sliding window inside of sentence
        for i, center_word in enumerate(sentence):
            # win_size words to the left and to the right, clipped to the sentence.
            # (The earlier bounds i-win_size+1 .. i+win_size covered one word
            # fewer per side, so win_size=1 counted nothing at all.)
            i_min = max(0, i - win_size)
            i_max = min(len(sentence), i + win_size + 1)

            # Increment co occurence of words in sliding window
            for j in range(i_min, i_max):
                if i != j:
                    co_word = sentence[j]
                    co_occurrence_mat[center_word-1, co_word-1] += 1

    return co_occurrence_mat

# The setup cell reserves window_size_corpus for the co-occurrence matrix;
# window_size is meant for the CBOW/Skipgram models only.
co_occurence = create_co_occurrence(corpus, window_size_corpus, V)

co_occurence

In [None]:
#find cosine similarity to Alice, Dinah and Rabbit
# spatial.distance.cosine is what cosine_sim below relies on.
from scipy import spatial
# Prints the complete word -> id mapping (one entry per vocabulary word, so the output is long).
print(tokenizer.word_index)

def cosine_sim(word_1, word_2, tknzr, matrix):
    """Return the cosine similarity between the matrix columns of two words.

    word_1, word_2 -- words that must be keys of `tknzr.word_index`
                      (an unknown word raises KeyError)
    tknzr          -- object exposing a `word_index` dict (word -> id, ids from 1)
    matrix         -- 2-D array in which word id k owns column k-1

    Returns 1 - cosine distance. If either word has an all-zero vector the
    similarity is undefined; 0.0 is returned instead of NaN so that ranking
    the results with max()/sorted() keeps working.
    """
    # Word ids start at 1 while matrix columns start at 0.
    word_1_vec = matrix[:, tknzr.word_index[word_1] - 1]
    word_2_vec = matrix[:, tknzr.word_index[word_2] - 1]

    # A zero vector has no direction: the cosine distance would divide by zero.
    if not np.any(word_1_vec) or not np.any(word_2_vec):
        return 0.0

    similarity = 1 - spatial.distance.cosine(word_1_vec, word_2_vec)

    return similarity
    
    
# Report the pairwise cosine similarities of the three requested words.
for label, first_word, second_word in (
        ('similarity of Alice and Dinah', "alice", "dinah"),
        ('similarity of Alice and Rabbit', "alice", "rabbit"),
        ('similarity of Dinah and Rabbit', "dinah", "rabbit")):
    print(label, str(cosine_sim(first_word, second_word, tokenizer, co_occurence)))


In [None]:
#find the closest words to Alice

# Freeze the vocabulary order once so that words and similarities stay aligned
# (the position in a dict iteration is not guaranteed to equal the word id).
vocab_words = list(tokenizer.word_index)
similarities = [cosine_sim("alice", i, tokenizer, co_occurence) if "alice"!= i else 0.0 for i in vocab_words]

# Rank from most to least similar; an undefined (NaN) similarity is pushed to the end.
ranked_words = sorted(zip(vocab_words, similarities),
                      key=lambda pair: -np.inf if np.isnan(pair[1]) else pair[1],
                      reverse=True)
closest_to_alice = ranked_words[:5]

# Look the id up by word instead of deriving it from a list position.
most_similar_ind = tokenizer.word_index[closest_to_alice[0][0]]

print("5 closest words to alice", closest_to_alice)
print("Most similar word", closest_to_alice[0][0], most_similar_ind)

Discussion of the drawbacks:

One of the major drawbacks of a co-occurrence matrix is that it becomes very large very quickly as the vocabulary grows. Computing results from it then requires a powerful machine; this is of course doable, but there are more efficient ways to obtain word representations. Another drawback is that a considerable amount of memory is needed to hold the whole matrix in memory. Of course this can be optimized, but it remains quite costly.

In [None]:
#Save all the vector representations of your word embeddings in this way
#Change when necessary the sizes of the vocabulary/embedding dimension

#vectors = your word co-occurrence matrix
vectors = co_occurence

# `with` closes the file even if writing fails part-way.
with open('vectors_co_occurrence.txt',"w") as f:
    # Header line: <number of vectors> <vector length>
    f.write(" ".join([str(V-1),str(V-1)]))
    f.write("\n")

    for word, i in tokenizer.word_index.items():
        f.write(word)
        f.write(" ")
        # Word ids start at 1 but the matrix rows start at 0, hence i-1.
        f.write(" ".join(map(str, list(vectors[i-1,:]))))
        f.write("\n")

In [None]:
#reopen your file as follows
# Loads the text file 'vectors_co_occurrence.txt' as gensim KeyedVectors.
# Mind the spelling: this object is `co_occurrence` (double r), while the raw
# count matrix built earlier is `co_occurence` (single r) -- two different names.
co_occurrence = KeyedVectors.load_word2vec_format('./vectors_co_occurrence.txt', binary=False)

## Question 2

### Word embeddings
Build embeddings with a keras implementation where the embedding vector is of length 50, 150 and 300. Use the Alice in Wonderland text book for training.
1. Using the CBOW model
2. Using Skipgram model
3. Add an extra hidden dense layer to the CBOW and Skipgram implementations. Choose an activation function for that layer and justify your answer.
4. Analyze the four different word embeddings
    - Implement your own function to perform the analogy task. Do not use existing libraries such as Gensim for this task. Your function should be able to answer whether an analogy, as in the example given in the pdf-file, is true.
    - Compare the performance on the analogy task between the word embeddings that you have trained in 2.1, 2.2 and 2.3.  
    - Visualize your results and interpret your results
5. Use the word co-occurrence matrix from Question 1. Compare the performance on the analogy task with the performance of your trained word embeddings.  
6. Discuss:
    - What are the main advantages of CBOW and Skipgram?
    - What is the advantage of negative sampling?
    - What are the main drawbacks of CBOW and Skipgram?
7. Load pre-trained embeddings trained on large corpora (see the pdf file). You only have to consider the word embeddings with an embedding size of 300
    - Compare performance on the analogy task with your own trained embeddings from "Alice in Wonderland". You can limit yourself to the vocabulary of Alice in Wonderland. Visualize the pre-trained word embeddings and compare these with the results of your own trained word embeddings. 


In [18]:
def generate_data_cbow(corpus, window_size, V):
    """Yield one CBOW training sample per word of the corpus.

    For every position in every sentence, the surrounding words (at most
    `window_size` on each side, centre word excluded) form the input and
    the centre word is the label.

    corpus      -- iterable of sentences, each a sequence of word ids
    window_size -- number of context positions looked at on each side
    V           -- number of classes handed to to_categorical

    Yields (x, y): x is the context run through pad_sequences with
    maxlen = 2 * window_size, y is to_categorical of the centre word.
    """
    context_len = 2 * window_size
    for sentence in corpus:
        n_words = len(sentence)
        for centre_pos, centre_word in enumerate(sentence):
            # Clamp the window to the sentence rather than filtering every index.
            first = max(centre_pos - window_size, 0)
            stop = min(centre_pos + window_size + 1, n_words)
            context = [sentence[k] for k in range(first, stop) if k != centre_pos]

            # Both helpers are given a batch of exactly one sample.
            x = sequence.pad_sequences([context], maxlen=context_len)
            y = np_utils.to_categorical([centre_word], V)
            yield (x, y)

In [4]:
# CBOW network: embed the window_size*2 context ids, average the embeddings,
# then predict the centre word with a softmax over the V vocabulary ids.
cbow = Sequential([
    Embedding(input_dim=V, output_dim=dim, input_length=window_size*2),
    # Mean over the context axis -> one dim-sized vector per sample.
    Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim,)),
    Dense(V, activation='softmax'),
])

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [5]:
# Train against the one-hot targets with categorical cross-entropy, optimised by Adadelta.
cbow.compile(loss='categorical_crossentropy', optimizer='adadelta')

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [6]:
# Ten passes over the corpus; every (x, y) pair from the generator is trained as its own batch.
# Note: x and y are notebook-level names and are overwritten by this loop.
for ite in range(10):
    loss = 0.  # sum of the batch losses of this pass
    for x, y in generate_data_cbow(corpus, window_size, V):
        loss += cbow.train_on_batch(x, y)

    print(ite, loss)

0 42314.977483
1 38594.8979433
2 38768.2048302
3 38873.7228633
4 38952.0608069
5 39006.2323149
6 39026.7875525
7 39042.6164962
8 39066.4479563
9 39093.9196522


In [4]:
def generate_data_skipgram(corpus, window_size, V):
    """Build all Skipgram training pairs of the corpus in memory.

    corpus      -- iterable of sentences, each a sequence of word ids
    window_size -- number of context positions looked at on each side
    V           -- number of classes handed to to_categorical

    Returns (all_in, all_out), two parallel lists with one entry per centre
    word that has at least one neighbour: all_in[k] is an int32 array of
    shape (n_context, 1) repeating the centre word id, all_out[k] is
    to_categorical of its n_context neighbouring word ids.
    """
    all_in, all_out = [], []
    for sentence in corpus:
        n_words = len(sentence)
        for centre_pos, centre_word in enumerate(sentence):
            # Clamp the window to the sentence boundaries.
            first = max(centre_pos - window_size, 0)
            stop = min(centre_pos + window_size + 1, n_words)
            labels = [sentence[k] for k in range(first, stop) if k != centre_pos]
            if not labels:
                # A word without any neighbour produces no training pair.
                continue
            # The centre word is repeated once per context word.
            in_words = [[centre_word] for _ in labels]
            all_in.append(np.array(in_words, dtype=np.int32))
            all_out.append(np_utils.to_categorical(labels, V))
    return (all_in, all_out)

In [5]:
# Precompute every Skipgram training pair once: x holds the centre-word id arrays,
# y the matching one-hot context labels (one entry per centre word).
x,y = generate_data_skipgram(corpus,window_size,V)


In [6]:
# Save the preprocessed Skipgram data, one line per centre word:
#   "<centre ids separated by spaces>,<flattened one-hot labels separated by spaces>"
# `with` closes the file even if writing fails, and the loop variables no
# longer shadow the builtin `input` for the rest of the notebook.
with open('data_skipgram.txt' ,'w') as f:
    for center_ids, onehot_labels in zip(x,y):
        f.write(" ".join(map(str, np.concatenate(center_ids))))
        f.write(",")
        f.write(" ".join(map(str, np.concatenate(onehot_labels))))
        f.write("\n")

In [7]:
#load the preprocessed Skipgram data
def generate_data_skipgram_from_file(path='data_skipgram.txt'):
    """Yield (inputs, outputs) training batches read back from the Skipgram data file.

    path -- text file with one line per centre word in the form
            "<ids separated by spaces>,<flattened one-hot labels separated by spaces>"

    Yields, per non-empty line, `inputs` as an int array of shape (n, 1) and
    `outputs` as a float array of shape (n, len(labels) / n). A ValueError is
    raised if a line does not split into exactly two comma-separated parts or
    if the labels cannot be divided evenly over the n inputs.
    """
    # The context manager closes the file once the generator is exhausted or
    # discarded; previously the handle was never closed.
    with open(path, 'r') as f:
        for row in f:
            row = row.strip()
            if not row:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            inputs, outputs = row.split(",")
            # One centre-word id per row.
            inputs = np.fromstring(inputs, dtype=int, sep=' ').reshape(-1, 1)
            # One one-hot label row per centre-word id.
            outputs = np.fromstring(outputs, dtype=float, sep=' ')
            outputs = outputs.reshape(len(inputs), -1)
            yield (inputs, outputs)

In [11]:
# Skipgram network: embed the single centre-word id, drop the length-1 sequence
# axis, then predict a context word with a softmax over the V vocabulary ids.
skipgram = Sequential([
    Embedding(input_dim=V, output_dim=dim, embeddings_initializer='glorot_uniform', input_length=1),
    Reshape((dim, )),
    Dense(input_dim=dim, units=V, kernel_initializer='uniform', activation='softmax'),
])

In [12]:
# Train against the one-hot context labels with categorical cross-entropy, optimised by Adadelta.
skipgram.compile(loss='categorical_crossentropy', optimizer='adadelta')

In [13]:
# Ten passes over the saved Skipgram data; each line of the file is one batch.
# Note: this loop overwrites the notebook-level x and y returned by generate_data_skipgram.
for ite in range(10):
    loss = 0.  # sum of the batch losses of this pass
    for x, y in generate_data_skipgram_from_file():
        loss += skipgram.train_on_batch(x, y)

    print(ite, loss)

0 42004.6379657
1 38352.4237516
2 38925.4058473
3 39342.3078592
4 39507.9859626
5 39673.5729046
6 39836.0326133
7 40010.7845144
8 40190.7613418
9 40370.2071753


## Activation for the dense layer

As can be seen below, we tried several different activation functions for the extra dense layer. After some research online we decided that we wanted to test at least ELU and ReLU. Apart from these two we wanted to try one more, which became Sigmoid: since ELU and ReLU are both good at dealing with vanishing gradients, we wanted a third activation whose strong point is not dealing with vanishing gradients. Even though Sigmoid and Softmax are similar, we wanted to try it to confirm that it would not be the best solution. As can be seen below, it was indeed not the best choice for the extra dense layer.

Because we did not know which one would perform best, we decided to try all three. We placed the extra dense layer before the Softmax layer, since Softmax is usually used as the final (output) layer of a neural network: it normalizes the outputs into a probability distribution, which is what the cross-entropy (negative log-likelihood) loss between the predictions and the actual outcome is computed on.

In [32]:
#create CBOW model with additional dense layer
# Same as the plain CBOW model, with one extra V-unit dense layer in front of
# the softmax output. Hidden-layer variants that were swapped in by hand:
#   Dense(V, activation='elu')
#   Dense(V, activation='sigmoid')
dcbow = Sequential([
    Embedding(input_dim=V, output_dim=dim, input_length=window_size*2),
    Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim,)),
    Dense(V, activation='relu'),
    Dense(V, activation='softmax'),
])



In [33]:
# Configure training for CBOW + dense: categorical cross-entropy loss, Adadelta optimizer.
dcbow.compile(loss='categorical_crossentropy', optimizer='adadelta')

In [24]:
#train model for CBOW + dense elu activation
# Build a fresh model with an ELU hidden layer for this run. The notebook-level
# `dcbow` is otherwise defined only once (with ReLU), so on a clean
# Restart & Run All this cell would train that ReLU model and it would then be
# saved under the ELU file name.
dcbow = Sequential()
dcbow.add(Embedding(input_dim=V, output_dim=dim, input_length=window_size*2))
dcbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim,)))
dcbow.add(Dense(V, activation='elu'))
dcbow.add(Dense(V, activation='softmax'))
dcbow.compile(loss='categorical_crossentropy', optimizer='adadelta')

# Ten passes over the corpus; every (x, y) pair is trained as its own batch.
for ite in range(10):
    loss = 0.
    for x, y in generate_data_cbow(corpus, window_size, V):
        loss += dcbow.train_on_batch(x, y)

    print(ite, loss)

0 39889.3704163
1 37455.47041
2 36582.0113094
3 36487.9203506
4 36671.233816
5 36554.0726204
6 36225.0750958
7 35911.4977573
8 35988.226946
9 36281.5206115


In [25]:
# Save the current `dcbow` under the ELU name; it must have been built with an
# ELU hidden layer for the file to contain what its name says.
dcbow.save('dcbow_elu.h5')

In [30]:
#train model for CBOW + dense sigmoid activation
# Build a fresh model with a sigmoid hidden layer for this run. Without this,
# the cell keeps training whatever `dcbow` the previous cells left behind
# (a different activation, already trained) and saves it under the Sigmoid name.
dcbow = Sequential()
dcbow.add(Embedding(input_dim=V, output_dim=dim, input_length=window_size*2))
dcbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim,)))
dcbow.add(Dense(V, activation='sigmoid'))
dcbow.add(Dense(V, activation='softmax'))
dcbow.compile(loss='categorical_crossentropy', optimizer='adadelta')

# Ten passes over the corpus; every (x, y) pair is trained as its own batch.
for ite in range(10):
    loss = 0.
    for x, y in generate_data_cbow(corpus, window_size, V):
        loss += dcbow.train_on_batch(x, y)

    print(ite, loss)

0 42573.9224706
1 40779.2916886
2 40212.4141967
3 40963.4117401
4 42315.4382861
5 43557.8121278
6 44557.614516
7 45347.3085792
8 45826.0357494
9 45717.4792516


In [31]:
# Save the current `dcbow` under the Sigmoid name; it must have been built with
# a sigmoid hidden layer for the file to contain what its name says.
dcbow.save('dcbow_Sigmoid.h5')

In [34]:
#train model for CBOW + dense relu activation
# Build a fresh, untrained model with a ReLU hidden layer so this run starts
# from scratch instead of continuing to train the `dcbow` object that the
# earlier training cells already updated.
dcbow = Sequential()
dcbow.add(Embedding(input_dim=V, output_dim=dim, input_length=window_size*2))
dcbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim,)))
dcbow.add(Dense(V, activation='relu'))
dcbow.add(Dense(V, activation='softmax'))
dcbow.compile(loss='categorical_crossentropy', optimizer='adadelta')

# Ten passes over the corpus; every (x, y) pair is trained as its own batch.
for ite in range(10):
    loss = 0.
    for x, y in generate_data_cbow(corpus, window_size, V):
        loss += dcbow.train_on_batch(x, y)

    print(ite, loss)

0 40066.6369147
1 38122.0871696
2 37527.2364669
3 37265.8932388
4 37098.9213078
5 36907.566846
6 36689.9979763
7 36465.4231402
8 36250.071865
9 36031.9165194


In [35]:
# Save the current `dcbow` under the ReLU name.
dcbow.save('dcbow_Relu.h5')

In [9]:
#create Skipgram with additional dense layer (ELU hidden activation)
# Skipgram model with one extra V-unit dense layer in front of the softmax output.
# NOTE(review): input_dim=dim is also passed to the Dense layers, which are not
# the first layer of the model -- confirm Keras ignores it there.
dskipgram = Sequential([
    Embedding(input_dim=V, output_dim=dim, embeddings_initializer='glorot_uniform', input_length=1),
    Reshape((dim, )),
    Dense(input_dim=dim, units=V, kernel_initializer='uniform', activation='elu'),
    Dense(input_dim=dim, units=V, kernel_initializer='uniform', activation='softmax'),
])


In [10]:
# Configure training for Skipgram + dense: categorical cross-entropy loss, Adadelta optimizer.
dskipgram.compile(loss='categorical_crossentropy', optimizer='adadelta')

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [11]:
#train model for Skipgram + dense
# Ten passes over the saved Skipgram data; each line of the file is one batch.
# Trains whichever `dskipgram` model was built and compiled most recently.
for ite in range(10):
    loss = 0.  # sum of the batch losses of this pass
    for x, y in generate_data_skipgram_from_file():
        loss += dskipgram.train_on_batch(x, y)

    print(ite, loss)

0 39171.4094963
1 37593.4716475
2 37049.7866344
3 36863.5996245
4 36989.3757836
5 37082.103549
6 37015.1577585
7 36882.7768184
8 36745.6724875
9 36615.3105249


In [12]:
# Save the current `dskipgram` model under the ELU name.
dskipgram.save('dskipgram_elu.h5')

In [13]:
#create Skipgram with additional dense layer (sigmoid hidden activation)
# Rebinds `dskipgram` to a new, untrained model; the previous one is only kept
# if it was saved to disk.
# NOTE(review): input_dim=dim is also passed to the Dense layers, which are not
# the first layer of the model -- confirm Keras ignores it there.
dskipgram = Sequential([
    Embedding(input_dim=V, output_dim=dim, embeddings_initializer='glorot_uniform', input_length=1),
    Reshape((dim, )),
    Dense(input_dim=dim, units=V, kernel_initializer='uniform', activation='sigmoid'),
    Dense(input_dim=dim, units=V, kernel_initializer='uniform', activation='softmax'),
])

In [14]:
# Configure training for Skipgram + dense: categorical cross-entropy loss, Adadelta optimizer.
dskipgram.compile(loss='categorical_crossentropy', optimizer='adadelta')

In [15]:
#train model for Skipgram + dense
# Ten passes over the saved Skipgram data; each line of the file is one batch.
# Trains whichever `dskipgram` model was built and compiled most recently.
for ite in range(10):
    loss = 0.  # sum of the batch losses of this pass
    for x, y in generate_data_skipgram_from_file():
        loss += dskipgram.train_on_batch(x, y)

    print(ite, loss)

0 39699.7591234
1 39342.0153301
2 39097.7367924
3 38975.9978857
4 39085.1981041
5 39359.1998932
6 39351.1995022
7 39203.5751615
8 39015.3584349
9 38764.1542244


In [19]:
# Save the current `dskipgram` model under the sigmoid name.
dskipgram.save('dskipgram_sigmoid.h5')

In [20]:
#create Skipgram with additional dense layer (ReLU hidden activation)
# Rebinds `dskipgram` to a new, untrained model; the previous one is only kept
# if it was saved to disk.
# NOTE(review): input_dim=dim is also passed to the Dense layers, which are not
# the first layer of the model -- confirm Keras ignores it there.
dskipgram = Sequential([
    Embedding(input_dim=V, output_dim=dim, embeddings_initializer='glorot_uniform', input_length=1),
    Reshape((dim, )),
    Dense(input_dim=dim, units=V, kernel_initializer='uniform', activation='relu'),
    Dense(input_dim=dim, units=V, kernel_initializer='uniform', activation='softmax'),
])

In [21]:
# Configure training for Skipgram + dense: categorical cross-entropy loss, Adadelta optimizer.
dskipgram.compile(loss='categorical_crossentropy', optimizer='adadelta')

In [22]:
#train model for Skipgram + dense
# Ten passes over the saved Skipgram data; each line of the file is one batch.
# Trains whichever `dskipgram` model was built and compiled most recently.
for ite in range(10):
    loss = 0.  # sum of the batch losses of this pass
    for x, y in generate_data_skipgram_from_file():
        loss += dskipgram.train_on_batch(x, y)

    print(ite, loss)

0 39437.579227
1 38211.8743823
2 37820.7792289
3 37606.3476986
4 37546.8119706
5 37517.1473264
6 37465.703705
7 37408.8727797
8 37353.1536354
9 37295.5879925


In [23]:
# Save the current `dskipgram` model under the ReLU name.
dskipgram.save('dskipgram_Relu.h5')

In [None]:
#Implement your own analogy function


Comparison performance:

In [None]:
#Visualization results trained word embeddings


Interpretation results of the visualization

Compare the results of the trained word embeddings with the word-word co-occurrence matrix

Discussion of the advantages of CBOW and Skipgram, the advantages of negative sampling and drawbacks of CBOW and Skipgram

In [None]:
#load pretrained word embeddings of word2vec

# Placeholder path: replace "your path " with the folder that holds the GoogleNews file.
path_word2vec = "your path /GoogleNews-vectors-negative300.bin"

# binary=True: the file is read as word2vec's binary format rather than text.
word2vec = KeyedVectors.load_word2vec_format(path_word2vec, binary=True)

In [None]:
#load pretrained word embeddings of Glove
# Imported here because the notebook's import cell only brings in KeyedVectors;
# the bare name `gensim` that this cell used before was never defined.
from gensim.scripts.glove2word2vec import glove2word2vec

# Placeholder path: replace "your path " with the folder that holds the GloVe file.
path = "your path /glove.6B/glove.6B.300d_converted.txt"
glove_word2vec_path = "glove_converted.txt"

#convert GloVe into word2vec format
# Writes a copy of the GloVe vectors in word2vec text format to glove_word2vec_path.
glove2word2vec(path, glove_word2vec_path)

# Load the converted copy; the file at `path` is the conversion input, not
# the word2vec-format output.
glove = KeyedVectors.load_word2vec_format(glove_word2vec_path, binary=False)

In [None]:
#Visualize the pre-trained word embeddings

Comparison performance with your own trained word embeddings