In [1]:
import pandas as pd
import numpy as np
from keras.utils import np_utils, pad_sequences
from keras.preprocessing.text import Tokenizer

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Lambda
from keras.optimizers import SGD, Adam, Adadelta
import keras.backend as K

In [3]:
data = open('chernobyl.txt','r')
text = [text for text in data]

In [4]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
word2id = tokenizer.word_index
wids = tokenizer.texts_to_sequences(text)

word2id['PAD'] = 0
vocablen = len(word2id)

id2word = {v:k for k,v in word2id.items()}

embeddingsize = 100
windowsize =2

In [5]:
def generate_context_words(corpus, windowsize, vocablen):
    windowlen = windowsize*2
    for words in corpus:
        wordlen = len(words)
        for index, word in enumerate(words):
            context = []
            labels = []

            start = index - windowsize
            end = index +  windowsize +1

            context.append([words[i] for i in range(start,end) if 0<= i < windowlen & i!= index])
            labels.append(word)

            x = pad_sequences(context,maxlen=windowlen)
            y = np_utils.to_categorical(labels, vocablen)

            yield(x,y)

In [6]:
cbow = Sequential()
cbow.add(Embedding(input_dim=vocablen, output_dim=embeddingsize, input_length=windowsize*2))
cbow.add(Lambda(lambda x: K.mean(x,axis=1), output_shape=(embeddingsize, )))
cbow.add(Dense(vocablen, activation='softmax'))

In [7]:
cbow.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 100)            23600     
                                                                 
 lambda (Lambda)             (None, 100)               0         
                                                                 
 dense (Dense)               (None, 236)               23836     
                                                                 
Total params: 47,436
Trainable params: 47,436
Non-trainable params: 0
_________________________________________________________________


In [8]:
cbow.compile(loss='categorical_crossentropy', optimizer='adadelta')

In [9]:
for epoch in range(10):
    loss =0
    for x,y in generate_context_words(wids, windowsize, vocablen):
        loss += cbow.train_on_batch(x,y)
    
    print(f'Epoch: {epoch+1} Loss: {loss}')

Epoch: 1 Loss: 2043.1896319389343
Epoch: 2 Loss: 2043.1250319480896
Epoch: 3 Loss: 2043.0783619880676
Epoch: 4 Loss: 2043.0311675071716
Epoch: 5 Loss: 2042.9836502075195
Epoch: 6 Loss: 2042.935945034027
Epoch: 7 Loss: 2042.8880977630615
Epoch: 8 Loss: 2042.8401670455933
Epoch: 9 Loss: 2042.7921466827393
Epoch: 10 Loss: 2042.744077205658


In [10]:
weights = cbow.get_weights()[0]

In [11]:
print(weights.shape)

(236, 100)


In [12]:
from sklearn.metrics.pairwise import euclidean_distances

In [13]:
distmat = euclidean_distances(weights)
distmat.shape

(236, 236)

In [14]:
similar_words = {search_term: [ id2word[idx] for idx in distmat[word2id[search_term]-1].argsort()[1:6] +1]
    for search_term in ['chernobyl']}

similar_words

{'chernobyl': ['zone', 'russian', 'be', 'despite', 'still']}