In [2]:
#################### Implementing tensorflow version of word2vec line by line, to understand the nitty-gritties #############

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [5]:
import argparse
import collections
import math
import os
import random
import sys
from tempfile import gettempdir
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange
import tensorflow as tf

from tensorflow.contrib.tensorboard.plugins import projector

In [18]:
# this function downloads the given file from http://mattmahoney.net/dc/ (unless it is already present locally) and verifies its size
def maybe_download(filename, expected_bytes, dirname):
    """Download `filename` into `dirname` if it is not already there, then check its size.

    Args:
        filename: name of the file; appended to the base URL
            'http://mattmahoney.net/dc/' and joined onto `dirname`.
        expected_bytes: size in bytes that the local file must have.
        dirname: directory in which the file is looked up and stored.

    Returns:
        The path of the local file.

    Raises:
        Exception: if the size of the local file differs from `expected_bytes`.
    """
    url = 'http://mattmahoney.net/dc/'
    local_filename = os.path.join(dirname, filename)
    if not os.path.exists(local_filename):
        # download the given file name to the path specified
        local_filename, _ = urllib.request.urlretrieve(url + filename, local_filename)

    actual_bytes = os.stat(local_filename).st_size
    if actual_bytes != expected_bytes:
        # Report both sizes in the error itself so the failure is self-explanatory.
        raise Exception(
            'Failed to verify ' + local_filename +
            ' (expected ' + str(expected_bytes) + ' bytes, found ' + str(actual_bytes) + ')')
    print('Found and verified', filename)
    return local_filename

In [54]:
# this function reads the data in the given filename(.zip) into a list of string
def read_data(filename):
    """Return the whitespace-separated words of the first member of a zip archive."""
    with zipfile.ZipFile(filename) as archive:
        # only the first entry listed in the archive is read
        first_member = archive.namelist()[0]
        raw_content = archive.read(first_member)
        # tf.compat.as_str() turns the content read from the archive into a string
        text = tf.compat.as_str(raw_content)
        return text.split()

In [78]:
# this function takes in the data (as a list of words) and retrieves the most frequent ones based on their frequency
# data -> ids of all the words. If the word is not chosen then its id = 0
# counts -> key: word, value -> frequency
# reversed_dictionary -> key: id, value -> word (contains only the most frequent words)
def build_dataset(words, n_words):
    """Encode a list of words as integer ids, keeping only the most frequent words.

    Args:
        words: the raw text as a list of words.
        n_words: size of the vocabulary, including the 'UNK' entry.

    Returns:
        A tuple (data, counts, dictionary, reversed_dictionary):
        data is `words` with every word replaced by its id (0 for words outside
        the vocabulary); counts is a list whose first entry is ['UNK', <number of
        unknown words>] followed by (word, frequency) pairs; dictionary maps
        word -> id; reversed_dictionary maps id -> word.
    """
    # slot 0 is reserved for 'UNK'; its count is filled in after encoding
    most_frequent = collections.Counter(words).most_common(n_words - 1)
    counts = [['UNK', -1]] + most_frequent

    # ids are handed out in order of appearance in `counts`
    dictionary = {}
    for entry in counts:
        dictionary[entry[0]] = len(dictionary)

    # words missing from the dictionary fall back to id 0 (unknown)
    data = [dictionary.get(token, 0) for token in words]
    counts[0][1] = data.count(0)

    reversed_dictionary = {word_id: token for token, word_id in dictionary.items()}
    return data, counts, dictionary, reversed_dictionary

In [132]:
# this function generates a random batch of given size for the skip-gram model
# skip_window -> window size (i.e. how many words to consider left and right )
# num_skips -> how many times to reuse an input to generate a label 
# i.e. the same center (input) word is paired with num_skips different context words from its window, each used as a label
def generate_batch(data,batch_size, num_skips, skip_window):
    """Generate one skip-gram training batch from `data`.

    A window of `2 * skip_window + 1` consecutive ids slides over `data`. For
    each window position the center id is written `num_skips` times into
    `batch`, each time paired in `labels` with a different, randomly chosen
    context id from the same window.

    Reads and updates the module-level cursor `data_index`, so consecutive
    calls continue where the previous batch stopped.

    Args:
        data: list of word ids.
        batch_size: number of (input, label) pairs; must be a multiple of `num_skips`.
        num_skips: number of context words sampled per center word;
            at most `2 * skip_window`.
        skip_window: number of words considered on each side of the center word.

    Returns:
        A tuple (batch, labels): an int32 array of shape (batch_size,) holding
        the center ids and an int32 array of shape (batch_size, 1) holding the
        context ids.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2*skip_window
    batch = np.ndarray(shape=(batch_size), dtype = np.int32)
    labels = np.ndarray(shape=(batch_size,1), dtype = np.int32)
    span = 2 * skip_window + 1 # [skip_window, target, skip_window]
    # sliding window over `data`: with maxlen=span, appending a new id drops the oldest one
    buffer = collections.deque(maxlen = span)
    # restart from the beginning if a full window no longer fits after data_index
    if data_index + span > len(data):
        data_index = 0
    # fill the window with the first `span` ids starting at data_index
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    # window positions of the context words (everything except the center); loop-invariant
    context_word_ids = [w for w in range(span) if w != skip_window]
    for i in range(batch_size // num_skips):
        words_to_use = random.sample(context_word_ids, num_skips)
        for j, context_word_id in enumerate(words_to_use):
            batch[i*num_skips + j] = buffer[skip_window] # center word of the window is the input
            labels[i*num_skips+j,0] = buffer[context_word_id] # sampled context word is the label
        # if end of the words reached, then starting over again
        if data_index == len(data):
            buffer.extend(data[0:span])
            data_index = span
        # adding the next word
        else:
            buffer.append(data[data_index])
            data_index += 1
    # back track a little bit to avoid skipping words in the end of the batch.
    # This has to be a modulo (wrap around the data length); a bitwise AND here
    # would move the cursor to an unrelated position.
    data_index = (data_index + len(data) - span) % len(data)
    # debug output of the final window, kept from the original notebook
    print("Buffer: ",buffer)
    return batch,labels

In [133]:
# this function implements the vanilla version of word2vec
# log_dir -> path to a log directory to save the tensorboard summaries
def word2vec_basic(log_dir, data_dir='/Users/ghai7c/Desktop/Hands_on_ML'):
    """Prepare the text8 corpus for word2vec and print one sample skip-gram batch.

    Args:
        log_dir: path of a log directory for the TensorBoard summaries; it is
            created if it does not exist.
        data_dir: directory where 'text8.zip' is looked up and, if missing,
            downloaded to. The default is the location used when this notebook
            was written; pass your own directory on any other machine.
    """
    # create the directory for TensorBoard variables if it does not exist
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # downloading file from http://mattmahoney.net/dc/ (skipped when already present in data_dir)
    filename = maybe_download('text8.zip', 31344016, data_dir)

    # reading the content of the first file in the zip archive as a list of words
    data = read_data(filename)
    print('Data Size:', len(data))

    vocab_size = 50000 # size of the vocabulary i.e most frequent words
    # data -> list of ids (integers from 0 to vocab_size - 1). Original text where words are replaced by their ids
    # counts -> list of [word, frequency] entries, 'UNK' first
    # dictionary -> map of words (strings) to their ids (integers)
    # reversed_dictionary -> map of ids (integers) to their words (strings)
    data, counts, dictionary, reversed_dictionary = build_dataset(data, vocab_size)
    print('Most common words (including UNK): ',counts[:5])
    print('Sample Data: ',data[:10], [reversed_dictionary[i] for i in data[:10]])

    # generate one small batch and print every (input -> label) pair, as ids and as words
    batch_size = 16
    batch, labels = generate_batch(data, batch_size = batch_size, num_skips = 2, skip_window = 1)
    for i in range(batch_size):
        print(batch[i], reversed_dictionary[batch[i]], '->', labels[i,0], reversed_dictionary[labels[i,0]])

In [134]:
# Run word2vec_basic.
# data_index is the module-level cursor that generate_batch declares `global` and
# advances on every call, so it has to be defined before the first batch is generated.
data_index = 0 # maintains a running index of the words traversed for generating the training batches
word2vec_basic('word2vec_logs')

Found and verified text8.zip
Data Size: 17005207
Most common words (including UNK):  [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample Data:  [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
Buffer:  deque([59, 156, 128], maxlen=3)
3081 originated -> 12 as
3081 originated -> 5234 anarchism
12 as -> 3081 originated
12 as -> 6 a
6 a -> 12 as
6 a -> 195 term
195 term -> 2 of
195 term -> 6 a
2 of -> 195 term
2 of -> 3134 abuse
3134 abuse -> 2 of
3134 abuse -> 46 first
46 first -> 3134 abuse
46 first -> 59 used
59 used -> 156 against
59 used -> 46 first


In [135]:
# Scratch check of the bounded deque used as the sliding window in generate_batch:
# appending to a full deque(maxlen=n) discards the item at the opposite end.
test = collections.deque(maxlen = 2) 
test.extend([1,2])
print(test)  # deque([1, 2], maxlen=2)
test.append(3)
print(test)  # deque([2, 3], maxlen=2) -- the oldest item (1) was dropped