In [1]:
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.utils import to_categorical

In [2]:
# Toy corpus: eight short text fragments about deep learning.
# Each list element is treated as one document by the cells below.
data = [
    "Deep learning also known as deep structured learning",
    "is part of a broader family of machine learning methods based",
    "on artificial neural networks with representation learning",
    "Learning can be supervised, semi-supervised or unsupervised",
    "Deep-learning architectures such as deep neural networks",
    "deep belief networks, deep reinforcement learning",
    "recurrent neural networks, convolutional neural networks and Transformers have been applied to fields including computer vision  speech recognition, natural language processing, machine translation",
    "where they have produced results comparable to and in some cases surpassing human expert performance",
]

In [3]:
# Fit a Keras tokenizer on the corpus to obtain a word -> integer id mapping.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(data)

# build vocabulary of unique words
# Work on a copy: `tokenizer.word_index` is the tokenizer's own dict, so adding
# the PAD entry to it directly would silently change the tokenizer's state.
word2id = dict(tokenizer.word_index)
word2id['PAD'] = 0  # id 0 is used for the padding token
id2word = {v: k for k, v in word2id.items()}

# Encode every document as a list of word ids.
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in data]

vocab_size = len(word2id)  # counts the PAD entry as well
embed_size = 100           # dimensionality of each word vector
window_size = 2 # context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 62
Vocabulary Sample: [('learning', 1), ('deep', 2), ('networks', 3), ('neural', 4), ('as', 5), ('of', 6), ('machine', 7), ('supervised', 8), ('and', 9), ('have', 10)]


In [4]:
from keras.utils import pad_sequences
def generate_context_word_pairs(corpus, window_size, vocab_size):
    """Yield one (context, target) training pair per word of the corpus.

    Args:
        corpus: iterable of sentences, each an indexable sequence of word ids.
        window_size: number of neighbours taken on each side of the target.
        vocab_size: number of classes passed to ``to_categorical``.

    Yields:
        Tuples ``(x, y)`` where ``x`` is the result of ``pad_sequences`` on the
        single list of neighbouring word ids (``maxlen = 2 * window_size``) and
        ``y`` is the result of ``to_categorical`` on the single target word id.
    """
    padded_width = 2 * window_size
    for sentence in corpus:
        n_tokens = len(sentence)
        for position, target in enumerate(sentence):
            # Clamp the window to the sentence so no out-of-range index is visited.
            first = max(position - window_size, 0)
            stop = min(position + window_size + 1, n_tokens)
            neighbours = [sentence[j] for j in range(first, stop) if j != position]

            # Both helpers receive a batch of exactly one sample.
            x = pad_sequences([neighbours], maxlen=padded_width)
            y = to_categorical([target], vocab_size)
            yield (x, y)

In [5]:
import numpy as np
# Preview the generator: print every raw (x, y) pair, and decode back to words
# only those pairs whose context contains no padding id (0).
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    print(x, y)
    if 0 in x[0]:
        # Padded context (window ran past a sentence edge): skip decoding.
        continue

    context_tokens = [id2word[w] for w in x[0]]
    target_token = id2word[np.argwhere(y[0])[0][0]]  # position of the single 1 in the one-hot row
    print('Context (X):', context_tokens, '-> Target (Y):', target_token)

    # The counter only advances on decoded pairs; stop once it has reached 10.
    if i == 10:
        break
    i += 1

[[ 0  0  1 12]] [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[ 0  2 12 13]] [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[ 2  1 13  5]] [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Context (X): ['deep', 'learning', 'known', 'as'] -> Target (Y): also
[[ 1 12  5  2]] [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Context (X): ['learning', 'also', 'as', 'deep'] -> Target (Y): known
[[12 13  2 14]] [[0. 0

In [10]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

# build CBOW architecture, one layer at a time
cbow = Sequential()
# Look up an embed_size-dimensional vector for every input word id.
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size))
# Average over axis 1 (presumably the context-word axis of a (batch, context, embed) tensor).
cbow.add(Lambda(lambda embedded: tf.reduce_mean(embedded, axis=1), output_shape=(embed_size,)))
# Softmax over the whole vocabulary to score the target word.
cbow.add(Dense(vocab_size, activation='softmax'))

# view model summary
cbow.summary()

In [11]:
# Targets are one-hot rows, hence categorical cross-entropy.
cbow.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [7]:
# Train for two passes over the corpus, one (context, target) pair per batch,
# and report the loss summed over all batches of each epoch.
for epoch in range(2):
    epoch_loss = 0.
    pair_stream = generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size)
    for x, y in pair_stream:
        epoch_loss += cbow.train_on_batch(x, y)

    # Same output as a summary line followed by an empty line.
    print('Epoch:', epoch, '\tLoss:', epoch_loss, end='\n\n')

Epoch: 0 	Loss: 354.513475894928

Epoch: 1 	Loss: 353.2334475517273



In [8]:
# First weight array of the model (presumably the Embedding matrix, since the
# Embedding layer is added first), with row 0 dropped because id 0 is PAD.
weights = cbow.get_weights()[0][1:]

In [9]:
from sklearn.metrics.pairwise import euclidean_distances

# compute pairwise distance matrix between all rows of `weights`
distance_matrix = euclidean_distances(weights)

# view contextually similar words
# `weights` has the PAD row removed, so word id k lives in row k - 1 and
# row r maps back to word id r + 1.
similar_words = {}
for query in ['deep', 'unsupervised']:
    distances_to_query = distance_matrix[word2id[query] - 1]
    # Skip the first entry of the ascending sort (presumably the query itself
    # at distance 0) and keep the next five.
    nearest_rows = distances_to_query.argsort()[1:6]
    similar_words[query] = [id2word[row + 1] for row in nearest_rows]

similar_words

{'deep': ['reinforcement', 'based', 'known', 'neural', 'be'],
 'unsupervised': ['language',
  'representation',
  'based',
  'natural',
  'computer']}