In [None]:
# Toy corpus: two short sentences that the rest of the notebook tokenizes and trains on.
# NOTE(review): the second sentence contains a double space between 'a' and 'beautiful'.
corpus = ['king is a strong man', 
          'queen is a  beautiful woman']

In [None]:
import tensorflow as tf
# Display the version of the imported TensorFlow package.
tf.__version__

'1.15.0'

In [None]:
def remove_stop_words(corpus, stop_words=None):
    """Strip stop words from every sentence in ``corpus``.

    Each sentence is tokenized on whitespace, every token that is a stop
    word is dropped (all occurrences, not just the first), and the remaining
    tokens are re-joined with single spaces.

    Args:
        corpus: Iterable of sentence strings.
        stop_words: Optional iterable of words to drop. Defaults to
            ``['is', 'a']``.

    Returns:
        A list of cleaned sentence strings, in the same order as ``corpus``.
    """
    if stop_words is None:
        stop_words = ['is', 'a']
    blocked = set(stop_words)

    results = []
    for text in corpus:
        # split() with no argument collapses runs of whitespace, so doubled
        # spaces do not leave empty-string tokens behind.
        kept = [token for token in text.split() if token not in blocked]
        results.append(" ".join(kept))

    return results

In [None]:
# Replace each sentence in the corpus with the result of remove_stop_words.
corpus = remove_stop_words(corpus)

In [None]:
# Collect the vocabulary: the set of distinct tokens across all sentences.
# split() with no argument tokenizes on any run of whitespace, so doubled
# spaces in a sentence do not add an empty-string '' entry to the vocabulary.
words = []
for text in corpus:
    words.extend(text.split())

words = set(words)
words

{'', 'beautiful', 'king', 'man', 'queen', 'strong', 'woman'}

In [None]:
# Assign every vocabulary entry a unique integer index.
word2int = {token: index for index, token in enumerate(words)}

# Tokenize each sentence on whitespace.
sentences = [line.split() for line in corpus]

# Number of neighbors considered on each side of the center word.
WINDOW_SIZE = 2

# Build [center, neighbor] training pairs: for each word, pair it with every
# different word found within WINDOW_SIZE positions in the same sentence.
data = []
for tokens in sentences:
    for center_pos, center in enumerate(tokens):
        window_start = max(center_pos - WINDOW_SIZE, 0)
        window_stop = min(center_pos + WINDOW_SIZE, len(tokens)) + 1
        data.extend(
            [center, context]
            for context in tokens[window_start:window_stop]
            if context != center
        )

In [None]:
import pandas as pd
# Print each sentence in the corpus.
for text in corpus:
    print(text)

# Tabulate the training pairs: one row per [input word, neighbor word] entry in `data`.
df = pd.DataFrame(data, columns = ['input', 'label'])

# Preview the first ten pairs.
df.head(10)

king strong man
queen  beautiful woman


Unnamed: 0,input,label
0,king,strong
1,king,man
2,strong,king
3,strong,man
4,man,king
5,man,strong
6,queen,beautiful
7,queen,woman
8,beautiful,queen
9,beautiful,woman


In [None]:
# Inspect the word -> integer index mapping.
word2int

{'': 0,
 'beautiful': 2,
 'king': 3,
 'man': 6,
 'queen': 1,
 'strong': 5,
 'woman': 4}

# Define the TensorFlow Graph

In [None]:
import tensorflow as tf
import numpy as np

# Length of each one-hot vector: one slot per vocabulary entry.
ONE_HOT_DIM = len(words)

# function to convert numbers to one hot vectors
def to_one_hot_encoding(data_point_index, dim=None):
    """Return a one-hot vector with a 1 at ``data_point_index``.

    Args:
        data_point_index: Position in the vector to set to 1.
        dim: Length of the returned vector. When omitted, the module-level
            ``ONE_HOT_DIM`` is used, which keeps existing one-argument
            calls working unchanged.

    Returns:
        A 1-D NumPy array of zeros of length ``dim`` with a single 1.
    """
    # Resolve the global lazily so the function is usable on its own when
    # an explicit dimension is supplied.
    size = ONE_HOT_DIM if dim is None else dim
    one_hot_encoding = np.zeros(size)
    one_hot_encoding[data_point_index] = 1
    return one_hot_encoding

# One-hot encode both columns of the pair table.
X = [to_one_hot_encoding(word2int[center]) for center in df['input']]   # input word
Y = [to_one_hot_encoding(word2int[context]) for context in df['label']]  # target word

# Stack the per-pair vectors into 2-D NumPy arrays (one row per training pair).
X_train = np.asarray(X)
Y_train = np.asarray(Y)

# Placeholders fed with batches of one-hot input words (x) and one-hot
# neighbor words (y_label); the batch dimension is left open.
x = tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))
y_label = tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))

# word embedding will be 2 dimension for 2d visualization
EMBEDDING_DIM = 2

# hidden layer: which represents word vector eventually
# NOTE: tf.random_normal is called without a seed, so the initial weights
# (and therefore the learned vectors) differ from run to run.
W1 = tf.Variable(tf.random_normal([ONE_HOT_DIM, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([1]))  # single bias value broadcast over the embedding
hidden_layer = tf.add(tf.matmul(x, W1), b1)

# output layer: project the embedding back onto the vocabulary
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, ONE_HOT_DIM]))
b2 = tf.Variable(tf.random_normal([1]))
logits = tf.add(tf.matmul(hidden_layer, W2), b2)
prediction = tf.nn.softmax(logits)

# loss function: cross entropy
# Computed from the logits with the fused op instead of log(softmax(...)),
# which yields -inf/NaN as soon as a predicted probability underflows to 0.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_label, logits=logits)
)

# training operation: plain gradient descent with learning rate 0.05
train_op = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

In [None]:
!pip install tensorflow==1.15

Collecting tensorflow==1.15
[?25l  Downloading https://files.pythonhosted.org/packages/3f/98/5a99af92fb911d7a88a0005ad55005f35b4c1ba8d75fba02df726cd936e6/tensorflow-1.15.0-cp36-cp36m-manylinux2010_x86_64.whl (412.3MB)
[K     |████████████████████████████████| 412.3MB 27kB/s 
Collecting tensorboard<1.16.0,>=1.15.0
[?25l  Downloading https://files.pythonhosted.org/packages/1e/e9/d3d747a97f7188f48aa5eda486907f3b345cd409f0a0850468ba867db246/tensorboard-1.15.0-py3-none-any.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 31.2MB/s 
Collecting tensorflow-estimator==1.15.1
[?25l  Downloading https://files.pythonhosted.org/packages/de/62/2ee9cd74c9fa2fa450877847ba560b260f5d0fb70ee0595203082dafcc9d/tensorflow_estimator-1.15.1-py2.py3-none-any.whl (503kB)
[K     |████████████████████████████████| 512kB 38.9MB/s 
[?25hCollecting gast==0.2.2
  Downloading https://files.pythonhosted.org/packages/4e/35/11749bf99b2d4e3cceb4d55ca22590b0d7c2c62b9de38ac4a4a7f4687421/gast-0.2.2.tar.gz
Bu

In [None]:
# Open a session and initialize every variable in the graph.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

iteration = 30000

# Every step trains on the full set of pairs:
# input is X_train (one-hot encoded word), label is Y_train (one-hot encoded neighbor word).
train_feed = {x: X_train, y_label: Y_train}

for step in range(iteration):
    sess.run(train_op, feed_dict=train_feed)
    # Only report the loss on every 3000th step.
    if step % 3000 != 0:
        continue
    print('iteration '+str(step)+' loss is : ', sess.run(loss, feed_dict=train_feed))

iteration 0 loss is :  2.7752602
iteration 3000 loss is :  0.91535395
iteration 6000 loss is :  0.8927775
iteration 9000 loss is :  0.87948054
iteration 12000 loss is :  0.87096715
iteration 15000 loss is :  0.8649745
iteration 18000 loss is :  0.8604762
iteration 21000 loss is :  0.8569436
iteration 24000 loss is :  0.85407585
iteration 27000 loss is :  0.8516879


In [None]:
# The hidden layer is x·W1 + b1, so for a one-hot input with index i its output is
# row i of W1 plus the bias b1. W1 + b1 is therefore the word lookup table:
# one embedding row per vocabulary index.
vectors = sess.run(W1 + b1)
print(vectors)

[[ 0.3042987  -0.5721961 ]
 [ 3.2355022  -1.1452657 ]
 [ 0.4157862  -7.2609396 ]
 [ 2.2080314   2.9899757 ]
 [-0.10864949 -0.6752644 ]
 [-3.7633772   6.6740255 ]
 [-0.46691036  0.5139273 ]]


In [None]:
# Label each embedding row with its word. Row i of `vectors` belongs to the word
# whose index in word2int is i, so order the words by that index explicitly
# instead of relying on the iteration order of the `words` set (an unordered
# set is also not accepted as column data by every pandas version).
ordered_words = sorted(word2int, key=word2int.get)

w2v_df = pd.DataFrame(vectors, columns = ['x1', 'x2'])
w2v_df['word'] = ordered_words
# Put the word label first for readability.
w2v_df = w2v_df[['word', 'x1', 'x2']]
w2v_df

Unnamed: 0,word,x1,x2
0,,0.304299,-0.572196
1,queen,3.235502,-1.145266
2,beautiful,0.415786,-7.26094
3,king,2.208031,2.989976
4,woman,-0.108649,-0.675264
5,strong,-3.763377,6.674026
6,man,-0.46691,0.513927
