## import

In [1]:
""" Word2Vec.

Implement Word2Vec algorithm to compute vector representations of words.
This example is using a small chunk of Wikipedia articles to train from.

References:
    - Mikolov, Tomas et al. "Efficient Estimation of Word Representations
    in Vector Space.", 2013.

Links:
    - [Word2Vec] https://arxiv.org/pdf/1301.3781.pdf

Author: Aymeric Damien
Project: https://github.com/aymericdamien/TensorFlow-Examples/
"""
from __future__ import division, print_function, absolute_import

import collections
import os
import random
import urllib.request
import zipfile

import numpy as np
import tensorflow as tf

## 參數設定

In [0]:
# Training Parameters
learning_rate = 0.1
batch_size = 128
num_steps = 100000 # 3000000
display_step = 10000
eval_step = 50000 # 200000

# Evaluation Parameters
word_to_compare = 'funny'
words = [
    'hilarious',
    'amusing',
    'entertaining',
    'humorous',
    'fun',
    'laugh',
    'good',
    'apple',
    'banana',
    'orange',
]

# Word2Vec Parameters
embedding_size = 200 # Dimension of the embedding vector
max_vocabulary_size = 50000 # Total number of different words in the vocabulary
min_occurrence = 10 # Remove all words that does not appears at least n times
skip_window = 3 # How many words to consider left and right
num_skips = 2 # How many times to reuse an input to generate a label
num_sampled = 64 # Number of negative examples to sample

## 下載IMDB資料集

In [0]:
import urllib.request
import os
import tarfile

In [0]:
if not os.path.exists('data/'):
    os.makedirs('data/')

url="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filepath="data/aclImdb_v1.tar.gz"
if not os.path.isfile(filepath):
    result=urllib.request.urlretrieve(url,filepath)
    print('downloaded:',result)

In [0]:
if not os.path.exists("data/aclImdb"):
    tfile = tarfile.open("data/aclImdb_v1.tar.gz", 'r:gz')
    result=tfile.extractall('data/')

## 讀取IMDB

In [0]:
import re
def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)

In [0]:
import os
def read_files(filetype):
    path = "data/aclImdb/"
    file_list=[]

    positive_path=path + filetype+"/pos/"
    for f in os.listdir(positive_path):
        file_list+=[positive_path+f]
    
    negative_path=path + filetype+"/neg/"
    for f in os.listdir(negative_path):
        file_list+=[negative_path+f]
        
    print('read',filetype, 'files:',len(file_list))
       
    all_labels = ([1] * 12500 + [0] * 12500) 
    
    all_texts  = []
    for fi in file_list:
        with open(fi,encoding='utf8') as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))]
            
    return all_labels,all_texts

In [8]:
y_train,train_text=read_files("train")

read train files: 25000


In [9]:
y_test,test_text=read_files("test")

read test files: 25000


## 處理IMDB資料
將資料集文字移除標點符號，合併成一個list

In [0]:
import string
table = str.maketrans('', '', string.punctuation)
text_words = []

for line in train_text + test_text:
    line = line.lower()
    line = line.translate(table)
    line = re.sub(r'  +', ' ', line)
    line = line.strip().split(' ')
    for word in line:
        text_words.append(word.encode())

In [11]:
print(text_words[:100])

[b'the', b'jokes', b'are', b'obvious', b'the', b'gags', b'are', b'corny', b'and', b'the', b'characters', b'are', b'walking', b'characatures', b'but', b'i', b'couldnt', b'stop', b'from', b'laughing', b'at', b'his', b'highly', b'entertaining', b'movie', b'no', b'matter', b'how', b'many', b'times', b'i', b'see', b'it', b'i', b'still', b'get', b'a', b'kick', b'out', b'of', b'this', b'one', b'and', b'i', b'recommend', b'it', b'highly', b'for', b'all', b'lovers', b'of', b'mindless', b'entertainment', b'it', b'contains', b'many', b'quotable', b'moments', b'and', b'some', b'of', b'the', b'best', b'sightgags', b'ive', b'seen', b'to', b'this', b'day', b'if', b'youve', b'had', b'a', b'bad', b'week', b'and', b'you', b'need', b'a', b'chuckle', b'rent', b'this', b'one', b'on', b'your', b'way', b'home', b'friday', b'night', b'to', b'give', b'your', b'weekend', b'a', b'good', b'start', b'i', b'myself', b'feel', b'this']


In [0]:
# Build the dictionary and replace rare words with UNK token
count = [('UNK', -1)]
# Retrieve the most common words
count.extend(collections.Counter(text_words).most_common(max_vocabulary_size - 1))
# Remove samples with less than 'min_occurrence' occurrences
for i in range(len(count) - 1, -1, -1):
    if count[i][1] < min_occurrence:
        count.pop(i)
    else:
        # The collection is ordered, so stop when 'min_occurrence' is reached
        break
# Compute the vocabulary size
vocabulary_size = len(count)
# Assign an id to each word
word2id = dict()
for i, (word, _)in enumerate(count):
    #print(word)
    #print(i)
    word2id[word] = i

In [13]:
data = list()
unk_count = 0
for word in text_words:
    # Retrieve a word id, or assign it index 0 ('UNK') if not in dictionary
    index = word2id.get(word, 0)
    #print(word)
    #print(index)
    if index == 0:
        unk_count += 1
    data.append(index)
count[0] = ('UNK', unk_count)
id2word = dict(zip(word2id.values(), word2id.keys()))

print("Words count:", len(text_words))
print("Unique words:", len(set(text_words)))
print("Vocabulary size:", vocabulary_size)
print("Most common words:", count[:10])

Words count: 11312464
Unique words: 223865
Vocabulary size: 29880
Most common words: [('UNK', 336976), (b'the', 650537), (b'and', 319334), (b'a', 319204), (b'of', 288060), (b'to', 266275), (b'is', 210043), (b'in', 183111), (b'it', 151223), (b'i', 145450)]


In [0]:
data_index = 0
# Generate training batch for the skip-gram model
def next_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    # get window size (words left and right + current one)
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
        if data_index == len(data):
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

In [15]:
# Input data
X = tf.placeholder(tf.int32, shape=[None])
# Input label
Y = tf.placeholder(tf.int32, shape=[None, 1])

# Ensure the following ops & var are assigned on CPU
# (some ops are not compatible on GPU)
with tf.device('/cpu:0'):
    # Create the embedding variable (each row represent a word embedding vector)
    embedding = tf.Variable(tf.random_normal([vocabulary_size, embedding_size]))
    # Lookup the corresponding embedding vectors for each sample in X
    X_embed = tf.nn.embedding_lookup(embedding, X)

    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(tf.random_normal([vocabulary_size, embedding_size]))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Compute the average NCE loss for the batch
loss_op = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights,
                   biases=nce_biases,
                   labels=Y,
                   inputs=X_embed,
                   num_sampled=num_sampled,
                   num_classes=vocabulary_size))

# Define the optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_op = optimizer.minimize(loss_op)

# Evaluation
# Compute the cosine similarity between input data embedding and every embedding vectors
X_embed_norm = X_embed / tf.sqrt(tf.reduce_sum(tf.square(X_embed)))
embedding_norm = embedding / tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
cosine_sim_op = tf.matmul(X_embed_norm, embedding_norm, transpose_b=True)

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

with tf.Session() as sess:

    # Run the initializer
    sess.run(init)

    # Testing data
    x_test = np.array([word2id[w.encode()] for w in [word_to_compare]])

    average_loss = 0
    for step in range(1, num_steps + 1):
        # Get a new batch of data
        batch_x, batch_y = next_batch(batch_size, num_skips, skip_window)
        # Run training op
        _, loss = sess.run([train_op, loss_op], feed_dict={X: batch_x, Y: batch_y})
        average_loss += loss

        if step % display_step == 0 or step == 1:
            if step > 1:
                average_loss /= display_step
            print("Step " + str(step) + ", Average Loss= " + \
                  "{:.4f}".format(average_loss))
            average_loss = 0

        # Evaluation
        if step % eval_step == 0 or step == 1:
            print("Evaluation...")
            sim = sess.run(cosine_sim_op, feed_dict={X: x_test})
            for word in words:
                print('  distance between "{}" and "{}" is {:.2f}'.format(
                    word_to_compare,
                    word,
                    sim[0][word2id[word.encode()]]
                ))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Step 1, Average Loss= 491.5267
Evaluation...
  distance between "funny" and "hilarious" is -0.05
  distance between "funny" and "amusing" is -0.01
  distance between "funny" and "entertaining" is 0.03
  distance between "funny" and "humorous" is -0.09
  distance between "funny" and "fun" is 0.08
  distance between "funny" and "laugh" is -0.02
  distance between "funny" and "good" is -0.01
  distance between "funny" and "apple" is -0.10
  distance between "funny" and "banana" is -0.06
  distance between "funny" and "orange" is 0.06
Step 10000, Average Loss= 161.4916
Step 20000, Average Loss= 64.9322
Step 30000, Average Loss= 44.1460
Step 40000, Average Loss= 34.7992
Step 50000, Average Loss= 29.2630
Evaluation...
  distance between "funny" and "hilarious" is 0.59
  distance between "funny" and "amusing" is 0.46
  distance between "funny" and "entertaining" is 0.67
  distance between "funny" and