<a href="https://colab.research.google.com/github/Xiongfeng-Jin/WordEmbedding/blob/master/Skip_Gram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%matplotlib inline
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import bz2
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import nltk # standard preprocessing
import operator # sorting items in dictionary by value
nltk.download('punkt') #tokenizers/punkt/PY3/english.pickle
from math import ceil
import csv

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
# Enable eager execution (TensorFlow 1.x API); the training cell further down
# calls .numpy() on tensors, which relies on eager mode being active.
tf.enable_eager_execution()

Prepare Dataset

In [0]:
# Base URL of the dataset host; download_if_needed appends the file name to it.
url = 'http://www.evanjones.ca/software/'

def download_if_needed(filename,expected_bytes):
  """Download `filename` unless it already exists locally, then verify its size.

  The remote location is the module-level `url` with `filename` appended.

  Args:
    filename: Name of the file; used both as the local path and as the suffix
      appended to `url`.
    expected_bytes: Exact size in bytes the local file must have.

  Returns:
    The local path of the verified file.

  Raises:
    Exception: If the size on disk differs from `expected_bytes`.
  """
  if not os.path.exists(filename):
    print("Downloading file...")
    filename,_ = urlretrieve(url+filename,filename)
  actual_bytes = os.stat(filename).st_size
  if actual_bytes != expected_bytes:
    # Carry both sizes in the error itself so a failed check can be diagnosed
    # from the traceback alone
    raise Exception(
        "Failed to verify %s: expected %s bytes, found %s bytes"
        % (filename,expected_bytes,actual_bytes))
  print("Found and verified %s" % filename)
  return filename

# Fetch (or reuse) the bz2 archive; the second argument is the size in bytes
# the file on disk must match.
filename = download_if_needed('wikipedia2text-extracted.txt.bz2', 18377035)

Downloading file...
Found and verified wikipedia2text-extracted.txt.bz2


In [0]:
def read_data(filename):
  """Decompress a bz2 file, decode it as UTF-8 and split it into tokens.

  Args:
    filename: Path of the bz2-compressed text file.

  Returns:
    A list with the tokens nltk.word_tokenize produces for the whole file.
  """
  with bz2.BZ2File(filename) as archive:
    text = archive.read().decode('utf-8')
    tokens = nltk.word_tokenize(text)
  # Hand back a fresh list built from the tokenizer output
  return list(tokens)

# Tokenize the whole corpus once, then print its length and the first ten
# tokens as a quick sanity check.
words = read_data(filename)
print("data size %d" % len(words))
print("Example words: %s" % words[:10])

data size 11631723
Example words: ['Propaganda', 'is', 'a', 'concerted', 'set', 'of', 'messages', 'aimed', 'at', 'influencing']


# Build word dictionary

In [0]:
# Total number of word ids, counting id 0 which build_dataset assigns to 'UNK'.
vocabulary_size = 50000

def build_dataset(words):
  """Map a token sequence to integer ids over the most frequent tokens.

  Id 0 belongs to 'UNK'; the `vocabulary_size - 1` most common tokens receive
  the following ids in the order Counter.most_common returns them, and any
  token outside that set is encoded as 0.

  Args:
    words: Sequence of string tokens.

  Returns:
    A tuple (data, count, dictionary, reverse_dictionary): `data` holds the id
    of every token in `words`; `count` is a list of token/frequency pairs
    headed by ['UNK', <number of out-of-vocabulary tokens>]; `dictionary` maps
    token -> id and `reverse_dictionary` maps id -> token.

  Raises:
    AssertionError: If the dictionary does not end up with exactly
      `vocabulary_size` entries.
  """
  frequent = collections.Counter(words).most_common(vocabulary_size - 1)
  # The UNK entry is a mutable list so its placeholder tally can be set below
  count = [['UNK',-1]] + frequent
  dictionary = {token: idx for idx,(token,_) in enumerate(count)}

  data = []
  unk_count = 0
  for token in words:
    idx = dictionary.get(token)
    if idx is None:
      # Out-of-vocabulary token: encode as UNK and tally it
      idx = 0
      unk_count += 1
    data.append(idx)

  count[0][1] = unk_count
  reverse_dictionary = {idx: token for token,idx in dictionary.items()}
  assert len(dictionary) == vocabulary_size
  return data,count,dictionary,reverse_dictionary

# Encode the corpus as ids, then show the five most frequent entries and the
# ids of the first ten tokens.
data,count,dictionary,reverse_dictionary = build_dataset(words)
print("most common words",count[:5])
print("sample data",data[:10])
# del words

most common words [['UNK', 388121], ('the', 690296), (',', 632179), ('.', 439932), ('of', 402970)]
sample data [18392, 9, 8, 19083, 221, 4, 6436, 3769, 30, 12058]


## Generating Batches of Data for Skip-Gram

In [0]:
data_index = 0
def generate_batch_skip_gram(batch_size,window_size):
  """Produce one batch of (target, context) id pairs for skip-gram training.

  Reads the module-level `data` sequence starting at the module-level cursor
  `data_index`, wrapping around at the end, and advances that cursor. Each
  target id is emitted `2*window_size` times, once for every other position in
  the window centred on it.

  Args:
    batch_size: Number of pairs to produce; must be a multiple of
      `2*window_size`, otherwise part of the output could never be filled.
    window_size: Number of context positions taken on each side of the target;
      must be at least 1.

  Returns:
    A tuple (batch, labels): `batch` is an int32 array of shape (batch_size,)
    with target ids, `labels` is an int32 array of shape (batch_size, 1) with
    the matching context ids.

  Raises:
    ValueError: If `window_size` is below 1 or `batch_size` is not a multiple
      of `2*window_size`. The cursor is left untouched in that case.
  """
  global data_index

  # Validate before moving the cursor so a rejected call has no side effects
  if window_size < 1:
    raise ValueError("window_size must be >= 1, got %s" % (window_size,))
  num_samples = 2 * window_size
  if batch_size % num_samples != 0:
    # A remainder would leave trailing entries of the uninitialised output
    # arrays unwritten, i.e. arbitrary memory returned as word ids
    raise ValueError(
        "batch_size (%s) must be a multiple of 2*window_size (%s)"
        % (batch_size,num_samples))

  batch = np.empty(batch_size,dtype=np.int32)
  labels = np.empty((batch_size,1),dtype=np.int32)

  span = 2*window_size + 1
  buffer = collections.deque(maxlen=span)
  data_len = len(data)
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % data_len

  # Every window position except the centre, which holds the target itself
  context_positions = [j for j in range(span) if j != window_size]

  for i in range(batch_size // num_samples):
    start = i*num_samples
    batch[start:start + num_samples] = buffer[window_size]
    for k,j in enumerate(context_positions):
      labels[start + k,0] = buffer[j]
    # Slide the window one token to the right (deque drops the oldest entry)
    buffer.append(data[data_index])
    data_index = (data_index + 1) % data_len

  return batch,labels

# Demo: for each window size, rewind the cursor to the start of `data`, draw
# one batch of 8 pairs and print targets and contexts decoded back to words.
# Note that this loop rebinds the globals window_size, batch and labels.
for window_size in [1,2]:
  data_index = 0
  batch,labels = generate_batch_skip_gram(batch_size=8,window_size=window_size)
  print("window size:",window_size)
  print("batch:",[reverse_dictionary[b] for b in batch])
  print("labels:",[reverse_dictionary[i] for i in labels.reshape(8)])

window size: 1
batch: ['is', 'is', 'a', 'a', 'concerted', 'concerted', 'set', 'set']
labels: ['Propaganda', 'a', 'is', 'concerted', 'a', 'set', 'concerted', 'of']
window size: 2
batch: ['a', 'a', 'a', 'a', 'concerted', 'concerted', 'concerted', 'concerted']
labels: ['Propaganda', 'is', 'concerted', 'set', 'is', 'a', 'set', 'of']


In [0]:
# Training and evaluation settings.
batch_size = 128  # (target, context) pairs per training step
embedding_size = 128  # width of each row of the embedding matrix
window_size = 4  # context positions taken on each side of the target
valid_size = 16  # ids sampled from each of the two id ranges below
valid_window = 50  # width of each id range the validation ids are drawn from
# valid_size distinct ids from [0, valid_window) followed by valid_size
# distinct ids from [1000, 1000+valid_window): 2*valid_size ids in total.
# NOTE(review): `random` is not seeded in the visible cells, so this selection
# presumably differs between runs.
valid_examples = np.array(random.sample(range(valid_window),valid_size))
valid_examples = np.append(valid_examples,random.sample(range(1000,1000+valid_window),valid_size),axis=0)
num_sampled = 32  # passed as num_sampled to tf.nn.sampled_softmax_loss

In [0]:
# Trainable parameters: the embedding matrix read by embed(), plus the weights
# and biases handed to tf.nn.sampled_softmax_loss in loss().
embeddings = tf.Variable(tf.random_uniform([vocabulary_size,embedding_size],-1.0,1.0))
softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size,embedding_size],stddev=0.5/math.sqrt(embedding_size)))
softmax_biases = tf.Variable(tf.random_uniform([vocabulary_size],0.0,0.01))

In [0]:
def embed(train_dataset):
  """Look up rows of the module-level `embeddings` matrix for the given ids.

  Args:
    train_dataset: Tensor of integer word ids.

  Returns:
    The result of tf.nn.embedding_lookup for those ids.
  """
  looked_up = tf.nn.embedding_lookup(params=embeddings,ids=train_dataset)
  return looked_up

In [0]:
def loss(x,y):
  """Sampled-softmax loss for a batch of (target, context) id pairs.

  Args:
    x: Target word ids; embedded through embed() and used as `inputs`.
    y: Context word ids, passed through as `labels`.

  Returns:
    The value tf.nn.sampled_softmax_loss returns for the batch; no further
    reduction is applied here.
  """
  embedded_inputs = embed(x)
  return tf.nn.sampled_softmax_loss(
      softmax_weights,
      softmax_biases,
      labels=y,
      inputs=embedded_inputs,
      num_sampled=num_sampled,
      num_classes=vocabulary_size,
  )

### Calculating word similarities

In [0]:
def get_normalized_embeddings():
  """Return the module-level `embeddings` with each row divided by its L2 norm."""
  squared_row_sums = tf.reduce_sum(tf.square(embeddings),1,keepdims=True)
  row_norms = tf.sqrt(squared_row_sums)
  return embeddings / row_norms

def get_similarity():
  """Cosine similarity between every validation word and every vocabulary word.

  Returns:
    A tensor with one row per entry of `valid_examples` and one column per row
    of `embeddings`, holding dot products of the row-normalised embeddings.
  """
  unit_embeddings = get_normalized_embeddings()
  valid_ids = tf.constant(valid_examples, dtype=tf.int32)
  valid_unit = tf.nn.embedding_lookup(unit_embeddings,valid_ids)
  all_unit_transposed = tf.transpose(unit_embeddings)
  return tf.matmul(valid_unit,all_unit_transposed)

In [0]:
# Single Adagrad optimizer (learning rate 1.0) reused by every train() call.
optimizer = tf.train.AdagradOptimizer(1.0)
def train(x,y):
  """Run one optimizer step on the loss of the batch (x, y).

  Args:
    x: Target word ids of the batch.
    y: Context word ids of the batch.
  """
  def batch_loss():
    # Zero-argument callable, as optimizer.minimize expects in eager mode
    return loss(x,y)

  trainable = [embeddings,softmax_weights,softmax_biases]
  optimizer.minimize(batch_loss,var_list=trainable)

In [0]:
# Training loop: one batch and one optimizer step per iteration; every 10000
# steps the nearest neighbours of the validation words are printed.
num_steps = 100001
# NOTE(review): skip_losses and average_loss are initialised here but never
# updated inside this cell.
skip_losses = []
average_loss = 0
for step in range(num_steps):
  batch_data,batch_labels = generate_batch_skip_gram(batch_size,window_size)
  batch_data = tf.constant(batch_data,dtype=tf.int32)
  batch_labels = tf.constant(batch_labels,dtype=tf.int32)
  train(batch_data,batch_labels)
  if (step+1) % 10000 == 0:
    sim = get_similarity()
    # NOTE(review): valid_examples holds 2*valid_size ids, but only the first
    # valid_size of them are reported by this loop.
    for i in range(valid_size):
      valid_word = reverse_dictionary[valid_examples[i]]
      top_k = 8
      # Sort by descending similarity; the slice drops rank 0, presumably the
      # query word itself — confirm.
      nearest = (-sim[i,:]).numpy().argsort()[1:top_k+1]
      log = "Nearest to %s: " % valid_word
      for k in range(top_k):
        close_word = reverse_dictionary[nearest[k]]
        log = "%s %s," % (log,close_word)
      print(log)
    print("-"*100)
      


Nearest to on:  the, ., ,, of, The, a, and, is,
Nearest to are:  to, ,, ., of, the, as, in, UNK,
Nearest to is:  ., and, ,, the, of, to, in, a,
Nearest to also:  is, of, '', ., ,, to, which, for,
Nearest to which:  in, was, ,, to, ., the, of, is,
Nearest to the:  of, ., ,, in, and, a, to, was,
Nearest to has:  for, ., of, the, is, in, on, (,
Nearest to of:  the, ,, ., and, in, to, a, is,
Nearest to (:  ., ,, the, of, to, a, The, in,
Nearest to have:  in, ., a, to, the, with, that, ,,
Nearest to his:  in, ., of, ,, '', as, and, the,
Nearest to by:  ,, in, the, of, a, ., and, to,
Nearest to or:  and, of, to, is, (, in, the, UNK,
Nearest to that:  ., the, and, of, to, ,, is, a,
Nearest to to:  of, ,, ., the, in, as, and, a,
Nearest to first:  ,, and, ., of, in, the, is, for,
----------------------------------------------------------------------------------------------------
Nearest to on:  the, in, of, a, UNK, ., and, to,
Nearest to are:  the, by, ,, UNK, of, and, ., to,
Nearest to is:  t