In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

In [3]:
url = 'http://mattmahoney.net/dc/'


def maybe_download(filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

filename = maybe_download('text8.zip', 31344016)
print (filename)

Found and verified text8.zip
text8.zip


In [5]:
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words"""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

words = read_data(filename)
print('Data size', len(words))


Data size 17005207


In [27]:
# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000

def build_dataset(words):
    count = [['UNK', -1]]
    # Given a vacabulary size the below code will collect the first 50000 words that have the hige
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
#     print (count)
    dictionary = dict()
    for no, (word, _) in enumerate(count):
        if 1==1:
#             print (word)
#             print (len(dictionary))
            dictionary[word] = len(dictionary)   # Just providing index to every word
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index) # Data is a list of IDs of the words in the same order as they appear in word 
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary
data, count, dictionary, reverse_dictionary = build_dataset(words)

print (data[0:10])
print (len(data))  #Data is a list of IDs of the words in the same order as they appear in word 
print (len(count))
print (len(dictionary))
print (len(reverse_dictionary))

[5244, 3081, 12, 6, 195, 2, 3136, 46, 59, 156]
17005207
50000
50000
50000


In [31]:
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    print (batch)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    print (labels)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    print(buffer)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    print (buffer)   # Just contains the first three data from the list [data]
    print(data_index)
    
    for i in range(batch_size // num_skips):
        print (i)
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels

batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
print (batch)
print ('')
print(labels)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]],
        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

[0 0 0 0 0 0 0 0]
[[0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]]
deque([], maxlen=3)
deque([5244, 3081, 12], maxlen=3)
3
0
1
2
3
[3081 3081   12   12    6    6  195  195]

[[  12]
 [5244]
 [   6]
 [3081]
 [ 195]
 [  12]
 [   6]
 [   2]]
3081 originated -> 12 as
3081 originated -> 5244 anarchism
12 as -> 6 a
12 as -> 3081 originated
6 a -> 195 term
6 a -> 12 as
195 term -> 6 a
195 term -> 2 of


In [None]:
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.