In [1]:
import time

import numpy as np
import tensorflow as tf

import utils

In [2]:
from urllib.request import urlretrieve
from os.path import isfile, isdir
from tqdm import tqdm
import zipfile

dataset_folder_path = 'data'
dataset_filename = 'text8.zip'
dataset_name = 'Text8 Dataset'

class DLProgress(tqdm):
    """tqdm progress bar whose `hook` method is passed to `urlretrieve` as its callback."""

    # Block index seen on the previous `hook` call; starts at zero.
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        """Advance the bar by the data received since the previous call.

        :param block_num: running count of blocks reported so far.
        :param block_size: size of a single block.
        :param total_size: overall size; stored as the bar's total.
        """
        new_blocks = block_num - self.last_block
        self.total = total_size
        self.update(new_blocks * block_size)
        # Remember where we are so the next call only adds the difference.
        self.last_block = block_num

# Location of the extracted corpus, derived from the configured folder so the
# extraction target and the file that is read can never drift apart.
text8_path = dataset_folder_path + '/text8'

# Download the archive only when it is not already present.
if not isfile(dataset_filename):
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc=dataset_name) as pbar:
        urlretrieve(
            'http://mattmahoney.net/dc/text8.zip',
            dataset_filename,
            pbar.hook)

# Extract whenever the corpus file itself is missing. Checking the file rather
# than the folder means an existing but empty/incomplete folder is repaired
# instead of making the read below fail.
if not isfile(text8_path):
    with zipfile.ZipFile(dataset_filename) as zip_ref:
        zip_ref.extractall(dataset_folder_path)

# Load the whole corpus into memory as a single string.
with open(text8_path) as f:
    text = f.read()

Text8 Dataset: 31.4MB [02:33, 204kB/s]                                         


In [3]:
# Run the project's preprocessing helper over the raw corpus string.
words = utils.preprocess(text)

# Show the first 30 entries as a quick sanity check.
print(words[0:30])

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst']


In [4]:
# Corpus size and vocabulary size after preprocessing.
n_total_words = len(words)
n_unique_words = len(set(words))

print("Total words: {}".format(n_total_words))
print("Unique words: {}".format(n_unique_words))

Total words: 16680599
Unique words: 63641


In [5]:
# Build the word <-> integer id mappings with the project helper.
vocab_to_int, int_to_vocab = utils.create_lookup_tables(words)

# Encode the corpus: replace every word by its integer id, keeping order.
int_words = list(map(vocab_to_int.__getitem__, words))

In [6]:
from collections import Counter
import random

# Subsampling: each occurrence of a word is discarded with probability
#   p_drop = 1 - sqrt(threshold / frequency)
# so the more frequent a word is, the more of its occurrences are removed.
threshold = 1e-5

word_counts = Counter(int_words)
total_count = len(int_words)

# Relative frequency of every word id in the corpus.
freqs = {
    word: count/total_count
    for word, count in word_counts.items()
}

# Per-word drop probability derived from its frequency.
p_drop = {
    word: 1 - np.sqrt(threshold/freqs[word])
    for word in word_counts
}

# Keep an occurrence when a fresh uniform draw falls below its keep probability.
train_words = [
    word
    for word in int_words
    if random.random() < (1 - p_drop[word])
]

In [7]:
def get_target(words, idx, window_size=5):
    ''' Get a list of words in a window around an index.

    A radius R is drawn uniformly from 1..window_size (inclusive) on every
    call; the result holds the distinct items found up to R positions before
    and up to R positions after `idx`, excluding position `idx` itself.

    :param words: sliceable sequence of items (list, tuple, NumPy array, ...).
    :param idx: position of the centre item inside `words`.
    :param window_size: largest radius that may be drawn.
    :return: list of distinct context items, in no particular order.
    '''

    R = np.random.randint(1, window_size+1)
    # Clamp the left edge so a window near the start does not wrap around.
    start = max(idx - R, 0)
    stop = idx + R

    # Convert both halves to lists before joining them: for a NumPy array
    # `+` would add the halves element-wise instead of concatenating them.
    before = list(words[start:idx])
    after = list(words[idx+1:stop+1])

    return list(set(before + after))

In [8]:
def get_batches(words, batch_size, window_size=5):
    ''' Generate (inputs, targets) pairs, one pair per full batch of words.

    For every word in a batch, `get_target` supplies its context words from
    that same batch; the word is repeated once per context word so both
    returned lists have equal length. Trailing words that do not fill a
    complete batch are dropped.
    '''

    # Length of the longest prefix made of complete batches only.
    usable = (len(words)//batch_size) * batch_size
    words = words[:usable]

    for begin in range(0, usable, batch_size):
        batch = words[begin:begin+batch_size]
        x, y = [], []
        for position, center in enumerate(batch):
            context = get_target(batch, position, window_size)
            # One copy of the centre word per context word keeps x and y aligned.
            x.extend([center]*len(context))
            y.extend(context)
        yield x, y

In [9]:
# Dedicated graph that all following model cells add their ops to.
train_graph = tf.Graph()
with train_graph.as_default():
    # Rank-1 int32 feed for the input ids.
    inputs = tf.placeholder(dtype=tf.int32, shape=[None], name='inputs')
    # Rank-2 int32 feed for the target ids; both dimensions are left unspecified.
    labels = tf.placeholder(dtype=tf.int32, shape=[None, None], name='labels')

In [10]:
n_vocab = len(int_to_vocab)
n_embedding = 200 # Number of embedding features
with train_graph.as_default():
    # Embedding matrix with one row per vocabulary entry, initialised uniformly in [-1, 1).
    embedding = tf.Variable(
        tf.random_uniform(shape=(n_vocab, n_embedding), minval=-1, maxval=1))
    # Select the rows of the matrix that correspond to the fed input ids.
    embed = tf.nn.embedding_lookup(params=embedding, ids=inputs)

In [11]:
n_sampled = 100
with train_graph.as_default():
    # Output-layer parameters: one weight row and one bias per vocabulary entry.
    softmax_w = tf.Variable(
        tf.truncated_normal(shape=(n_vocab, n_embedding), stddev=0.1))
    softmax_b = tf.Variable(tf.zeros(shape=n_vocab))

    # Sampled softmax loss: each example is scored against `n_sampled`
    # sampled classes rather than all `n_vocab` classes.
    loss = tf.nn.sampled_softmax_loss(weights=softmax_w,
                                      biases=softmax_b,
                                      labels=labels,
                                      inputs=embed,
                                      num_sampled=n_sampled,
                                      num_classes=n_vocab)

    # Average over the batch and minimise with Adam at its default settings.
    cost = tf.reduce_mean(loss)
    optimizer = tf.train.AdamOptimizer().minimize(cost)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



In [12]:
with train_graph.as_default():
    ## From Thushan Ganegedara's implementation
    valid_size = 16 # Random set of words to evaluate similarity on.
    valid_window = 100
    # pick 8 samples from (0,100) and (1000,1100) each ranges. lower id implies more frequent 
    valid_examples = np.array(random.sample(range(valid_window), valid_size//2))
    valid_examples = np.append(valid_examples, 
                               random.sample(range(1000,1000+valid_window), valid_size//2))

    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    # We use the cosine distance: scale every embedding row to unit length so
    # the matrix product below yields cosine similarities.
    # `keepdims` replaces the deprecated `keep_dims` argument; keeping the
    # reduced axis lets the division broadcast row-wise.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
    normalized_embedding = embedding / norm
    valid_embedding = tf.nn.embedding_lookup(normalized_embedding, valid_dataset)
    # One row per validation word, one column per vocabulary entry.
    similarity = tf.matmul(valid_embedding, tf.transpose(normalized_embedding))

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [None]:
import os

epochs = 10
batch_size = 1000
window_size = 10

# Create the checkpoint folder up front so the save at the very end of the
# (long) training run cannot fail because the folder is missing.
os.makedirs("checkpoints", exist_ok=True)

with train_graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=train_graph) as sess:
    iteration = 1
    # Accumulates the batch costs between two progress reports. Kept under its
    # own name so the graph's `loss` tensor is not overwritten by a number.
    running_loss = 0
    sess.run(tf.global_variables_initializer())

    for e in range(1, epochs+1):
        batches = get_batches(train_words, batch_size, window_size)
        start = time.time()
        for x, y in batches:
            
            # `labels` is a rank-2 placeholder, so the flat target list is fed
            # as a column vector.
            feed = {inputs: x,
                    labels: np.array(y)[:, None]}
            train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)
            
            running_loss += train_loss
            
            # Report the mean cost and time per batch every 100 iterations.
            if iteration % 100 == 0: 
                end = time.time()
                print("Epoch {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Avg. Training loss: {:.4f}".format(running_loss/100),
                      "{:.4f} sec/batch".format((end-start)/100))
                running_loss = 0
                start = time.time()
            
            # Every 1000 iterations print the nearest neighbours of the validation words.
            if iteration % 1000 == 0:
                # note that this is expensive (~20% slowdown if computed every 500 steps)
                sim = similarity.eval()
                top_k = 8 # number of nearest neighbors
                for i in range(valid_size):
                    valid_word = int_to_vocab[valid_examples[i]]
                    # Sort by descending similarity and skip the first entry,
                    # which is normally the word itself.
                    nearest = (-sim[i, :]).argsort()[1:top_k+1]
                    log = 'Nearest to %s:' % valid_word
                    for k in range(top_k):
                        close_word = int_to_vocab[nearest[k]]
                        log = '%s %s,' % (log, close_word)
                    print(log)
            
            iteration += 1
    # Persist the trained variables and keep the unit-length embeddings in memory.
    save_path = saver.save(sess, "checkpoints/text8.ckpt")
    embed_mat = sess.run(normalized_embedding)

Epoch 1/10 Iteration: 100 Avg. Training loss: 5.6609 2.0420 sec/batch
Epoch 1/10 Iteration: 200 Avg. Training loss: 5.6205 1.9677 sec/batch
Epoch 1/10 Iteration: 400 Avg. Training loss: 5.5830 1.9493 sec/batch
Epoch 1/10 Iteration: 500 Avg. Training loss: 5.5011 1.9630 sec/batch
Epoch 1/10 Iteration: 600 Avg. Training loss: 5.5541 1.9625 sec/batch
Epoch 1/10 Iteration: 700 Avg. Training loss: 5.5555 1.9512 sec/batch
Epoch 1/10 Iteration: 800 Avg. Training loss: 5.5697 1.9570 sec/batch
Epoch 1/10 Iteration: 900 Avg. Training loss: 5.4876 1.9549 sec/batch
Epoch 1/10 Iteration: 1000 Avg. Training loss: 5.4274 1.9394 sec/batch
Nearest to it: laden, speculate, rhotic, bulges, chlorofluorocarbons, copyrights, smart, notably,
Nearest to four: forcefully, wb, bat, kilowatts, lesbianism, musk, okinawa, carrel,
Nearest to called: pug, supers, quebec, north, angiotensin, germaine, pashtunistan, unchanged,
Nearest to th: scholarly, instrumentalists, cnd, primality, inspired, stipulation, synergist

Nearest to construction: crawling, remembrance, kimberley, protected, southside, motorway, disappear, instituted,
Nearest to except: encapsulation, rafael, nat, pieces, succesor, keyword, grimwood, introduces,
Nearest to shows: johannesburg, martial, learned, manslaughter, modules, dresden, customer, integumentary,
Nearest to articles: gosh, balto, williams, manly, tested, basemen, expiration, phoenix,
Nearest to grand: firings, catatonia, tirpitz, brodie, altercation, scandal, leverage, accolades,
Nearest to units: lengths, watt, ineffective, alamogordo, raincoat, asimo, dalton, growing,
Nearest to resources: rewrote, supplies, fiercest, posteriori, ou, reasonably, quorum, firenze,
Epoch 1/10 Iteration: 4100 Avg. Training loss: 4.6552 1.9598 sec/batch
Epoch 1/10 Iteration: 4200 Avg. Training loss: 4.6691 1.9276 sec/batch
Epoch 1/10 Iteration: 4300 Avg. Training loss: 4.6365 1.9566 sec/batch
Epoch 1/10 Iteration: 4400 Avg. Training loss: 4.6425 1.9323 sec/batch
Epoch 1/10 Iteration: 45

Nearest to four: eight, six, seven, wb, five, nine, okinawa, forcefully,
Nearest to called: isolation, supers, pashtunistan, quebec, garfunkel, decompositions, germaine, pug,
Nearest to th: century, sizable, scholarly, manzikert, dionysius, instrumentalists, survived, waxed,
Nearest to years: female, weasel, rocker, quelling, recalcitrant, successive, percent, cluttered,
Nearest to were: montevideo, garagiola, eec, great, questioning, last, cavalli, horton,
Nearest to their: deterioration, malted, metron, gradually, fermionic, suda, either, cantilever,
Nearest to other: usually, sofa, vitro, nhu, dialling, lightsaber, fornication, constructionism,
Nearest to file: computing, device, software, editors, reboot, encode, zseries, windows,
Nearest to construction: southside, motorway, protected, crawling, kimberley, appreciably, talc, disasters,
Nearest to except: encapsulation, seiy, earnest, succesor, rafael, keyword, pieces, mps,
Nearest to shows: manslaughter, modules, adb, johannesburg

Epoch 3/10 Iteration: 11300 Avg. Training loss: 4.1823 1.9351 sec/batch
Epoch 3/10 Iteration: 11400 Avg. Training loss: 4.1453 1.9543 sec/batch
Epoch 3/10 Iteration: 11500 Avg. Training loss: 4.1597 1.9248 sec/batch
Epoch 3/10 Iteration: 11600 Avg. Training loss: 4.1746 1.9437 sec/batch
Epoch 3/10 Iteration: 11700 Avg. Training loss: 4.2027 1.9558 sec/batch
Epoch 3/10 Iteration: 11800 Avg. Training loss: 4.1802 1.9241 sec/batch
Epoch 3/10 Iteration: 11900 Avg. Training loss: 4.1471 1.9552 sec/batch
Epoch 3/10 Iteration: 12000 Avg. Training loss: 4.1943 1.9492 sec/batch
Nearest to it: seemed, edif, bundled, chlorofluorocarbons, unvoiced, speculate, combinations, above,
Nearest to four: eight, five, six, seven, two, nine, one, wb,
Nearest to called: supers, isolation, garfunkel, generic, quebec, decompositions, pashtunistan, chromatids,
Nearest to th: century, manzikert, scholarly, dionysius, sizable, ringway, waxed, survived,
Nearest to years: female, male, age, quelling, cluttered, suc

Nearest to shows: borkou, pontificia, scrabble, fan, cfsp, johannesburg, martial, modules,
Nearest to articles: balto, takla, subscriptions, merino, publications, com, comprehensive, oren,
Nearest to grand: milestone, firings, lithuanian, chiefs, polis, marching, ostpolitik, chlorus,
Nearest to units: quicksand, triode, density, gulag, alamogordo, conifer, mi, initialization,
Nearest to resources: cacao, rewrote, fiercest, sticker, supplies, tempura, cois, ou,
Epoch 4/10 Iteration: 15100 Avg. Training loss: 4.0226 1.9720 sec/batch
Epoch 4/10 Iteration: 15200 Avg. Training loss: 4.0267 1.9326 sec/batch
Epoch 4/10 Iteration: 15300 Avg. Training loss: 4.0168 1.9373 sec/batch
Epoch 4/10 Iteration: 15400 Avg. Training loss: 4.0508 1.9487 sec/batch
Epoch 4/10 Iteration: 15500 Avg. Training loss: 4.0786 1.9294 sec/batch
Epoch 4/10 Iteration: 15600 Avg. Training loss: 4.0589 1.9580 sec/batch
Epoch 4/10 Iteration: 15700 Avg. Training loss: 4.0578 1.9398 sec/batch
Epoch 4/10 Iteration: 15800 Avg

Nearest to other: usually, sofa, lightsaber, use, nhu, are, dialling, constructionism,
Nearest to file: nix, encode, files, reboot, zseries, delete, nostra, formats,
Nearest to construction: southside, motorway, building, appreciably, disasters, design, chennai, talc,
Nearest to except: introduces, encapsulation, seiy, dek, earnest, emaciated, osu, nat,
Nearest to shows: borkou, pontificia, scrabble, cfsp, modules, show, fan, schlei,
Nearest to articles: takla, balto, revise, expiration, merino, publications, comprehensive, oren,
Nearest to grand: lithuanian, milestone, polis, firings, chiefs, ostpolitik, chlorus, marching,
Nearest to units: quicksand, triode, mi, density, conifer, gulag, makeup, airfoil,
Nearest to resources: ore, cacao, rewrote, fiercest, notebooks, tempura, sticker, envoy,
Epoch 5/10 Iteration: 19100 Avg. Training loss: 4.0013 1.9608 sec/batch
Epoch 5/10 Iteration: 19200 Avg. Training loss: 3.9490 1.9522 sec/batch
Epoch 5/10 Iteration: 19300 Avg. Training loss: 4.02

Nearest to years: female, year, age, zero, male, five, ago, mortality,
Nearest to were: machines, workstations, montevideo, garagiola, symbionts, intermediate, pleats, picaresque,
Nearest to their: either, most, metron, the, multiple, made, bitnet, with,
Nearest to other: usually, use, are, as, sofa, lightsaber, such, to,
Nearest to file: files, nix, rivaled, delete, reboot, encode, nostra, formats,
Nearest to construction: southside, motorway, building, appreciably, design, engineering, disasters, chennai,
Nearest to except: introduces, dek, encapsulation, seiy, osu, masse, nat, pieces,
Nearest to shows: show, borkou, cfsp, modules, scrabble, pontificia, schlei, gramophone,
Nearest to articles: takla, revise, publications, balto, merino, oren, expiration, comprehensive,
Nearest to grand: lithuanian, milestone, ostpolitik, polis, vilnius, chiefs, chlorus, tempelhof,
Nearest to units: quicksand, mi, conifer, density, triode, makeup, residing, gulag,
Nearest to resources: ore, envoy, tem

Nearest to called: smp, fulani, generic, pashtunistan, kiswahili, br, multiplicity, supers,
Nearest to th: century, manzikert, mtsho, waxed, st, nd, england, brigade,
Nearest to years: female, year, age, days, zero, ago, five, due,
Nearest to were: epidemic, symbionts, picaresque, machines, pleats, was, excluded, last,
Nearest to their: the, either, with, metron, made, most, multiple, bitnet,
Nearest to other: usually, use, as, are, such, lightsaber, to, notably,
Nearest to file: files, delete, nostra, nix, reboot, rivaled, finalised, carrel,
Nearest to construction: southside, design, building, motorway, engineering, appreciably, ieoh, disasters,
Nearest to except: introduces, dek, proviso, seiy, pieces, nat, masse, asparagus,
Nearest to shows: show, borkou, modules, pontificia, scrabble, cfsp, fanpage, snatcher,
Nearest to articles: revise, takla, publications, merino, oren, expiration, balto, portals,
Nearest to grand: lithuanian, polis, vilnius, ostpolitik, freemasonic, tempelhof, 

Nearest to four: five, one, two, eight, seven, six, three, zero,
Nearest to called: smp, multiplicity, fulani, kiswahili, pashtunistan, unity, inner, sultanate,
Nearest to th: century, mtsho, manzikert, waxed, st, nd, england, six,
Nearest to years: female, year, age, days, zero, ago, due, five,
Nearest to were: excluded, last, epidemic, previously, symbionts, was, various, picaresque,
Nearest to their: either, they, multiple, the, with, most, metron, motorcyclists,
Nearest to other: as, usually, are, use, such, to, associated, many,
Nearest to file: files, finalised, carrel, encode, nostra, formats, nix, delete,
Nearest to construction: design, southside, building, motorway, engineering, dam, appreciably, ieoh,
Nearest to except: introduces, seiy, dek, omitted, masse, qsort, pieces, proviso,
Nearest to shows: show, borkou, pontificia, modules, cfsp, scrabble, aired, according,
Nearest to articles: revise, takla, merino, expiration, oren, publications, balto, tuileries,
Nearest to gran

Nearest to called: smp, such, unity, known, multiplicity, forming, sent, quebec,
Nearest to th: century, mtsho, waxed, st, nd, manzikert, bealtaine, stoicism,
Nearest to years: female, year, days, age, zero, due, five, male,
Nearest to were: was, being, previously, last, excluded, in, ostia, machines,
Nearest to their: they, either, the, multiple, with, most, for, more,
Nearest to other: are, as, use, usually, in, to, such, be,
Nearest to file: files, encode, delete, finalised, carrel, nostra, admixture, nix,
Nearest to construction: design, motorway, southside, engineering, building, ieoh, dam, kimberley,
Nearest to except: introduces, omitted, unaccompanied, qsort, quer, dek, proviso, councilor,
Nearest to shows: show, borkou, aired, pontificia, modules, schlei, according, distinguished,
Nearest to articles: revise, oren, takla, expiration, publications, online, merino, quotes,
Nearest to grand: lithuanian, prix, surfing, vilnius, destabilizing, tempelhof, ostpolitik, freemasonic,
Ne

Nearest to th: century, st, mtsho, nd, waxed, manzikert, rd, six,
Nearest to years: female, year, days, age, zero, five, two, ago,
Nearest to were: was, previously, later, in, many, however, various, last,
Nearest to their: they, the, with, either, most, to, from, multiple,
Nearest to other: are, as, such, use, and, be, usually, these,
Nearest to file: files, encode, delete, finalised, nix, formats, carrel, nostra,
Nearest to construction: building, ieoh, motorway, design, dam, southside, engineering, shea,
Nearest to except: introduces, unaccompanied, quer, omitted, ectocervix, qsort, kuni, replenished,
Nearest to shows: show, aired, borkou, modules, wedding, according, special, pontificia,
Nearest to articles: revise, publications, oren, expiration, dess, takla, information, page,
Nearest to grand: lithuanian, prix, vilnius, tempelhof, freemasonic, surfing, ostpolitik, destabilizing,
Nearest to units: makeup, housing, unit, mi, residing, quicksand, conifer, density,
Nearest to resour

Nearest to their: they, the, for, to, with, from, multiple, either,
Nearest to other: as, are, such, use, and, many, to, with,
Nearest to file: files, encode, delete, finalised, carrel, nix, nostra, formats,
Nearest to construction: building, dam, constructed, ieoh, motorway, design, engineering, talc,
Nearest to except: introduces, quer, unaccompanied, omitted, councilor, ectocervix, kuni, dek,
Nearest to shows: show, borkou, aired, according, wedding, modules, pontificia, special,
Nearest to articles: publications, revise, page, dess, online, information, oren, expiration,
Nearest to grand: lithuanian, prix, tempelhof, vilnius, freemasonic, destabilizing, surfing, ostpolitik,
Nearest to units: makeup, housing, mi, unit, residing, quicksand, conifer, density,
Nearest to resources: tempura, notebooks, extremes, arable, ore, society, cacao, envoy,
Epoch 10/10 Iteration: 43100 Avg. Training loss: 3.7719 1.9606 sec/batch
Epoch 10/10 Iteration: 43200 Avg. Training loss: 3.8361 1.9692 sec/b

#### Очень долго обучается, так что пока как-то так ((

In [None]:
with train_graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=train_graph) as sess:
    # Load the most recent checkpoint found in the 'checkpoints' folder.
    saver.restore(sess=sess,
                  save_path=tf.train.latest_checkpoint('checkpoints'))
    # Fetch the raw (un-normalised) embedding matrix as a NumPy array.
    embed_mat = embedding.eval(session=sess)

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [None]:
# Number of leading embedding rows to project and plot.
viz_words = 500

# Project the first `viz_words` rows with t-SNE at its default settings.
tsne = TSNE()
embed_tsne = tsne.fit_transform(embed_mat[:viz_words])

In [None]:
fig, ax = plt.subplots(figsize=(14, 14))

# Draw all points with a single scatter call on the explicit axes instead of
# creating one artist per point through the pyplot state machine.
ax.scatter(embed_tsne[:viz_words, 0], embed_tsne[:viz_words, 1], color='steelblue')

# Label every point with the word that belongs to its row index.
for idx in range(viz_words):
    ax.annotate(int_to_vocab[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)