<a href="https://colab.research.google.com/github/Xiongfeng-Jin/WordEmbedding/blob/master/CBOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%matplotlib inline
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import bz2
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import nltk # standard preprocessing
import operator # sorting items in dictionary by value
nltk.download('punkt') #tokenizers/punkt/PY3/english.pickle
from math import ceil

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
# Run TensorFlow eagerly: later cells call .numpy() on tensors and pass a
# Python callable to optimizer.minimize.
tf.enable_eager_execution()

Prepare Dataset

In [0]:
url = 'http://www.evanjones.ca/software/'

def download_if_needed(filename,expected_bytes):
  """Fetch `filename` from the module-level `url` unless it already exists, then verify its size.

  Args:
    filename: Name of the file; appended to `url` for the download and used
      as the local path.
    expected_bytes: Exact size in bytes the local file must have.

  Returns:
    The local filename once its size has been verified.

  Raises:
    Exception: If the size of the local file differs from `expected_bytes`.
  """
  if not os.path.exists(filename):
    print("Downloading file...")
    filename,_ = urlretrieve(url+filename,filename)
  actual_bytes = os.stat(filename).st_size
  if actual_bytes != expected_bytes:
    # Put both sizes in the message so a truncated or stale file is easy to diagnose.
    raise Exception(
        "Failed to verify %s: expected %d bytes, found %d bytes"
        % (filename, expected_bytes, actual_bytes))
  print("Found and verified %s" % filename)
  return filename

# The second argument is the exact size in bytes the archive must have.
filename = download_if_needed('wikipedia2text-extracted.txt.bz2', 18377035)

Found and verified wikipedia2text-extracted.txt.bz2


In [0]:
def read_data(filename):
  """Read a bz2-compressed UTF-8 text file and return its tokens as a list.

  The whole file is decompressed and decoded in memory, then split into
  tokens with nltk.word_tokenize.
  """
  with bz2.BZ2File(filename) as compressed:
    text = compressed.read().decode('utf-8')
  tokens = nltk.word_tokenize(text)
  return list(tokens)

# Tokenise the whole corpus once and show its size and first few tokens.
words = read_data(filename)
print("data size %d" % len(words))
print("Example words: %s" % words[:10])

data size 11631723
Example words: ['Propaganda', 'is', 'a', 'concerted', 'set', 'of', 'messages', 'aimed', 'at', 'influencing']


# Build word dictionary

In [0]:
vocabulary_size = 50000

def build_dataset(words):
  """Encode a token list as integer ids over a fixed-size vocabulary.

  The vocabulary is 'UNK' (id 0) followed by the `vocabulary_size - 1` most
  common tokens in descending frequency order. Tokens outside the vocabulary
  are encoded as 0.

  Args:
    words: List of tokens.

  Returns:
    A tuple (data, count, dictionary, reverse_dictionary):
      data: the id of every token in `words`, in order.
      count: ['UNK', number of out-of-vocabulary tokens] followed by
        (token, frequency) pairs.
      dictionary: token -> id.
      reverse_dictionary: id -> token.

  Raises:
    AssertionError: If the dictionary does not end up with exactly
      `vocabulary_size` entries.
  """
  count = [['UNK',-1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))

  # Ids follow the order of `count`, so smaller ids mean more frequent tokens.
  dictionary = {}
  for position,(token,_) in enumerate(count):
    dictionary[token] = position

  data = []
  unk_count = 0
  lookup = dictionary.get
  for token in words:
    token_id = lookup(token)
    if token_id is None:
      # Out-of-vocabulary token: fold it into the 'UNK' bucket.
      token_id = 0
      unk_count += 1
    data.append(token_id)

  count[0][1] = unk_count
  reverse_dictionary = {token_id: token for token,token_id in dictionary.items()}
  assert len(dictionary) == vocabulary_size
  return data,count,dictionary,reverse_dictionary

# Encode the corpus and show the most frequent entries plus the first few ids.
data,count,dictionary,reverse_dictionary = build_dataset(words)
print("most common words",count[:5])
print("sample data",data[:10])
# Optional: uncomment to release the raw token list once it has been encoded.
# del words

most common words [['UNK', 388121], ('the', 690296), (',', 632179), ('.', 439932), ('of', 402970)]
sample data [18392, 9, 8, 19083, 221, 4, 6436, 3769, 30, 12058]


## Generating Batches of Data for CBOW

In [0]:
data_index = 0
def generate_batch_cbow(batch_size,window_size):
  """Generate one CBOW mini-batch from the module-level `data` list.

  A window of `2*window_size + 1` consecutive ids slides over `data`, wrapping
  around at the end. For each window the middle id is the label and the
  remaining ids, in order, are the inputs. The module-level `data_index`
  cursor is advanced so that the next call continues with the window
  immediately after the last one returned.

  Args:
    batch_size: Number of (context, target) examples to produce.
    window_size: Number of context ids taken on each side of the target.

  Returns:
    A tuple (batch, labels):
      batch: int32 array of shape (batch_size, 2*window_size) with context ids.
      labels: int32 array of shape (batch_size, 1) with target ids.
  """
  global data_index

  span = 2*window_size + 1  # context on both sides plus the target in the middle

  batch = np.empty(shape=(batch_size,span-1),dtype=np.int32)
  labels = np.empty(shape=(batch_size,1),dtype=np.int32)

  # The deque drops its oldest id on every append once it holds `span` ids.
  buffer = collections.deque(maxlen=span)
  dataLen = len(data)
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % dataLen

  for i in range(batch_size):
    window = list(buffer)
    batch[i,:] = window[:window_size] + window[window_size+1:]
    labels[i,0] = window[window_size]

    buffer.append(data[data_index])
    data_index = (data_index + 1) % dataLen

  # The cursor is now `span` ids past the start of the first window that was
  # not emitted. Step back so the next call begins with that window instead
  # of skipping it.
  data_index = (data_index - span) % dataLen

  return batch,labels

# Sanity check: print one small batch for each window size.
# Note that the loop variable rebinds the module-level name `window_size`.
for window_size in [1,2]:
  # Restart from the beginning of the corpus so both runs show the same text.
  data_index = 0
  batch,labels = generate_batch_cbow(batch_size=8,window_size=window_size)
  print("window size:",window_size)
  # Map ids back to tokens for readability.
  print("batch:",[[reverse_dictionary[bii] for bii in bi] for bi in batch])
  # reshape(8) matches the batch_size=8 used above.
  print("labels:", [reverse_dictionary[li] for li in labels.reshape(8)])

window size: 1
batch: [['Propaganda', 'a'], ['is', 'concerted'], ['a', 'set'], ['concerted', 'of'], ['set', 'messages'], ['of', 'aimed'], ['messages', 'at'], ['aimed', 'influencing']]
labels: ['is', 'a', 'concerted', 'set', 'of', 'messages', 'aimed', 'at']
window size: 2
batch: [['Propaganda', 'is', 'concerted', 'set'], ['is', 'a', 'set', 'of'], ['a', 'concerted', 'of', 'messages'], ['concerted', 'set', 'messages', 'aimed'], ['set', 'of', 'aimed', 'at'], ['of', 'messages', 'at', 'influencing'], ['messages', 'aimed', 'influencing', 'the'], ['aimed', 'at', 'the', 'opinions']]
labels: ['a', 'concerted', 'set', 'of', 'messages', 'aimed', 'at', 'influencing']


In [0]:
# Training and evaluation hyperparameters.
batch_size = 128  # examples per training batch
embedding_size = 128  # width of each word vector
window_size = 2  # context words taken on each side of the target
valid_size = 16  # ids drawn from each of the two id ranges below
valid_window = 50  # width of each id range the validation ids are drawn from
# Validation ids: valid_size from [0, valid_window) followed by valid_size from
# [1000, 1000 + valid_window), i.e. 2 * valid_size ids in total. Ids are
# assigned in descending frequency order, so the first group is very common words.
# NOTE(review): `random` is not seeded in the visible cells, so the sampled
# ids differ between runs.
valid_examples = np.array(random.sample(range(valid_window),valid_size))
valid_examples = np.append(valid_examples,random.sample(range(1000,1000+valid_window),valid_size),axis=0)
num_sampled = 32  # passed to the sampled softmax loss as num_sampled

In [0]:
# Input embedding table: one row of embedding_size values per vocabulary id,
# initialised uniformly between -1.0 and 1.0.
embeddings = tf.Variable(tf.random_uniform([vocabulary_size,embedding_size],-1.0,1.0))
# Weights and biases handed to the sampled softmax loss.
softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size,embedding_size],stddev=0.5/math.sqrt(embedding_size)))
softmax_biases = tf.Variable(tf.random_uniform([vocabulary_size],0.0,0.01))

In [0]:
def stacked_embedings(train_dataset):
  """Look up the embedding of every context position and stack them on a new last axis.

  Args:
    train_dataset: Integer tensor of word ids indexed as
      [example, context position]; columns 0 .. 2*window_size - 1 are read.

  Returns:
    A tensor holding one embedding lookup per context position, stacked along
    axis 2: [examples, embedding width, 2*window_size] for a 2-D input.
  """
  # Collect every position first and stack once. An accumulator that is reset
  # inside the loop would keep only the last context position.
  context_embeddings = [
      tf.nn.embedding_lookup(embeddings, train_dataset[:,i])
      for i in range(2*window_size)
  ]
  return tf.stack(context_embeddings, axis=2)

def mean_embeddings(x):
  """Average the output of stacked_embedings(x) over axis 2, dropping that axis."""
  stacked = stacked_embedings(x)
  return tf.reduce_mean(stacked,axis=2,keepdims=False)

In [0]:
def loss(x,y):
  """Mean sampled-softmax loss of one batch.

  Args:
    x: Context word ids, forwarded to mean_embeddings to build the inputs.
    y: Target word ids, passed to the loss as `labels`.

  Returns:
    A scalar tensor: the sampled softmax loss averaged over the batch.
  """
  hidden = mean_embeddings(x)
  per_example_loss = tf.nn.sampled_softmax_loss(
      weights=softmax_weights,
      biases=softmax_biases,
      labels=y,
      inputs=hidden,
      num_sampled=num_sampled,
      num_classes=vocabulary_size)
  return tf.reduce_mean(per_example_loss)

### Calculating word similarities

In [0]:
def get_normalized_embeddings():
  """Return `embeddings` with every row divided by its L2 norm."""
  squared_row_sums = tf.reduce_sum(tf.square(embeddings),1,keepdims=True)
  return embeddings / tf.sqrt(squared_row_sums)

def get_similarity():
  """Cosine similarity between each validation word and every vocabulary word.

  Returns:
    A tensor whose row i holds the dot products between the normalized
    embedding of valid_examples[i] and all normalized embeddings.
  """
  unit_embeddings = get_normalized_embeddings()
  valid_ids = tf.constant(valid_examples, dtype=tf.int32)
  valid_vectors = tf.nn.embedding_lookup(unit_embeddings,valid_ids)
  return tf.matmul(valid_vectors,tf.transpose(unit_embeddings))

In [0]:
optimizer = tf.train.AdagradOptimizer(1.0)
def train(x,y):
  """Apply one Adagrad update for the batch (x, y).

  The loss is handed to the optimizer as a zero-argument callable, and only
  the embedding table and the softmax weights and biases are updated.
  """
  trainable = [embeddings,softmax_weights,softmax_biases]
  def batch_loss():
    return loss(x,y)
  optimizer.minimize(batch_loss,var_list=trainable)

In [0]:
num_steps = 100001
top_k = 8  # number of nearest neighbours printed per validation word
# NOTE(review): nothing in this cell appends to skip_losses; it is kept in
# case later cells reference it.
skip_losses = []
for step in range(num_steps):
  batch_data,batch_labels = generate_batch_cbow(batch_size,window_size)
  batch_data = tf.constant(batch_data,dtype=tf.int32)
  batch_labels = tf.constant(batch_labels,dtype=tf.int32)
  train(batch_data,batch_labels)
  if (step+1) % 10000 == 0:
    # Convert once per report instead of once per validation word.
    sim = get_similarity().numpy()
    # valid_examples holds two groups of valid_size ids; report all of them.
    for i in range(len(valid_examples)):
      valid_word = reverse_dictionary[valid_examples[i]]
      # Sort by descending similarity; position 0 is skipped because it is
      # expected to be the word itself.
      nearest = (-sim[i,:]).argsort()[1:top_k+1]
      log = "Nearest to %s: " % valid_word
      for k in range(top_k):
        close_word = reverse_dictionary[nearest[k]]
        log = "%s %s," % (log,close_word)
      print(log)
    print("-"*100)
      


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
Nearest to are:  were, is, have, has, was, CDMA, can, 21.3,
Nearest to at:  in, on, with, to, ., pharaoh, through, Superboy,
Nearest to on:  in, with, at, to, for, from, by, Jacques,
Nearest to by:  to, with, as, from, on, in, allied, synod,
Nearest to 's:  day, Barnes, difference, lover, ratings, name, fisherman, Tao,
Nearest to have:  be, are, outlook, 1655, journal, Eduard, Minoan, Wait,
Nearest to but:  In, The, 420,000, which, exuberant, Bangladeshis, including, reflective,
Nearest to a:  an, the, this, his, any, one, Brand, rewrites,
Nearest to which:  The, In, and, but, Chaffee, Dares, It, patience,
Nearest to has:  is, was, are, genre, Assent, emerged, supposedly, prefect,
Nearest to with:  to, by, on, for, at, in, Bien, Oahu,
Nearest to were:  are, may, was, breakage, will, Wax, would, Automatic,
Nearest to h