# Word2Vec


In [1]:
import collections
import math
import os
import errno
import random
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange 
import tensorflow as tf

In [2]:
data_dir = "word2vec_data/words"
data_url = 'http://mattmahoney.net/dc/text8.zip'

In [3]:
def fetch_words_data(url=data_url, words_data=data_dir):
    
    # Make the Dir if it does not exist
    os.makedirs(words_data, exist_ok=True)
    
    # Path to zip file 
    zip_path = os.path.join(words_data, "words.zip")
    
    # If the zip file isn't there, download it from the data url
    if not os.path.exists(zip_path):
        urllib.request.urlretrieve(url, zip_path)
        
    # Now that the zip file is there, get the data from it
    with zipfile.ZipFile(zip_path) as f:
        data = f.read(f.namelist()[0])
    
    # Return a list of all the words in the data source.
    return data.decode("ascii").split()

In [4]:
words = fetch_words_data()

In [5]:
len(words)

17005207

In [6]:
words[9000:9040]

['feelings',
 'and',
 'the',
 'auditory',
 'system',
 'of',
 'a',
 'person',
 'without',
 'autism',
 'often',
 'cannot',
 'sense',
 'the',
 'fluctuations',
 'what',
 'seems',
 'to',
 'non',
 'autistic',
 'people',
 'like',
 'a',
 'high',
 'pitched',
 'sing',
 'song',
 'or',
 'flat',
 'robot',
 'like',
 'voice',
 'is',
 'common',
 'in',
 'autistic',
 'children',
 'some',
 'autistic',
 'children']

In [7]:
for w in words[9000:9040]:
    print(w,end=' ')

feelings and the auditory system of a person without autism often cannot sense the fluctuations what seems to non autistic people like a high pitched sing song or flat robot like voice is common in autistic children some autistic children 

In [8]:
from collections import Counter

In [9]:
my_list = ['one', 'two', 'two']

In [10]:
Counter(my_list)

Counter({'one': 1, 'two': 2})

In [12]:
Counter(my_list).most_common(1)

[('two', 2)]

In [13]:
def create_counts(vocab_size=50000):
    
    vocab = [] + Counter(words).most_common(vocab_size)
    
    vocab = np.array([word for word, _ in vocab])
    
    dictionary = {word:code for code, word in enumerate(vocab)}
    
    data = np.array([dictionary.get(word,0) for word in words])
    
    return data,vocab 

In [14]:
data , vocabulary = create_counts()

In [15]:
data.shape

(17005207,)

In [16]:
vocabulary.shape

(50000,)

In [18]:
words[100]

'interpretations'

In [19]:
data[100]

4194

In [23]:
len(data)

17005207

In [22]:
vocabulary

array(['the', 'of', 'and', ..., 'chamada', 'pnv', 'ayckbourn'],
      dtype='<U28')

In [32]:
test_vocab = [] + Counter(['one', 'two', 'two', 'three', 'three', 'three']).most_common(2)
test_vocab

[('three', 3), ('two', 2)]

In [31]:
test_vocab1 = np.array([word for word, _ in test_vocab])
test_vocab1

array(['three', 'two'],
      dtype='<U5')

In [33]:
enumerate(test_vocab1)

<enumerate at 0x7f60122829d8>

In [34]:
test_dictionary = {word:code for code, word in enumerate(test_vocab1)}
test_dictionary

{'three': 0, 'two': 1}

In [36]:
test_data = np.array([test_dictionary.get(word,0) for word in ['one', 'two', 'two', 'three', 'three', 'three']])
test_data

array([0, 1, 1, 0, 0, 0])

In [37]:
test_data[2]

1

In [38]:
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
    if data_index == len(data):
        buffer[:] = data[:span]
        data_index = span
    else:
        buffer.append(data[data_index])
        data_index += 1
  # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

## Constants

In [70]:
batch_size = 128

embedding_size = 150

skip_window = 1

num_skips = 2

In [40]:
valid_size = 16

In [41]:
valid_window = 100
valid_examples = np.random.choice(valid_window,valid_size,replace=False)

In [42]:
num_sampled = 64

In [43]:
learning_rate = 0.01

In [50]:
vocabulary_size = 5000

In [44]:
tf.reset_default_graph()

In [45]:
train_inputs = tf.placeholder(tf.int32, shape=[None])

In [46]:
train_labels = tf.placeholder(tf.int32, shape=[batchsize, 1])

In [47]:
valid_dataset = tf.constant(valid_examples,dtype=tf.int32)

In [53]:
init_embeds = tf.random_uniform([vocabulary_size,embedding_size], -1.0,1.0)

In [54]:
embeddings = tf.Variable(init_embeds)

In [55]:
embed = tf.nn.embedding_lookup(embeddings,train_inputs)

In [57]:
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0/ np.sqrt(embedding_size)))

In [58]:
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

In [61]:
loss = tf.reduce_mean(tf.nn.nce_loss(nce_weights,nce_biases,train_labels,embed,num_sampled, vocabulary_size))

In [62]:
optimizer = tf.train.AdamOptimizer(learning_rate=1.0)
trainer = optimizer.minimize(loss)

In [63]:
# Compute the cosine similarity between minibatch examples and all embeddings.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), axis=1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [64]:
data_index = 0

In [65]:
init = tf.global_variables_initializer()

In [68]:
num_steps = 200


In [72]:
with tf.Session() as sess:
    
    sess.run(init)
    average_loss = 0
    
    for step in range(num_steps):
        
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels:batch_labels}
        
        _,loss_val = sess.run([trainer,loss], feed_dict=feed_dict)
        
        average_loss += loss_val
        
        if step % 1000 -- 0 :
            if step > 0:
                average_loss = average_loss / 1000
            print("Average Loss at step ", step, " is ", average_loss)
            average_losss = 0
            
        final_embeddings = normalized_embeddings.eval()

Average Loss at step  1  is  0.452315093994
Average Loss at step  2  is  0.247335371735


TypeError: sequence index must be integer, not 'slice'