<a href="https://colab.research.google.com/github/ThierrySt-Arnaud/wiki-reading/blob/colab-conversion/jupyter/wiki_reading_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Write `get_data.sh` to disk as a script file (necessary because the `%%bash` magic does not show the script's output in real time)

In [0]:
%%writefile get_data.sh
#!/usr/bin/env bash
# Downloads the English WikiReading TFRecord archives and vocabulary files
# into ${DATA_FOLDER}. Safe to re-run: existing folders and already-fetched
# files are reused instead of causing errors or duplicate copies.
echo "Downloading English WikiReading TensorFlow Records..."

CLOUD_STORAGE=https://storage.googleapis.com/wikireading

DATA_FOLDER=data

# Fetches archive $1 from ${CLOUD_STORAGE}, unpacks it into ${DATA_FOLDER},
# then deletes the archive. Each step runs only if the previous one succeeded,
# so a partial download is kept on disk and `wget -c` can resume it next time.
downloadlExtractDelete(){
  wget -c "${CLOUD_STORAGE}/${1}" &&
    tar xvzf "${1}" -C "${DATA_FOLDER}" --skip-old-files &&
    rm "${1}"
}

# -p: do not fail when the folder is left over from a previous run.
mkdir -p "${DATA_FOLDER}"

# The three archives are processed in parallel background jobs.
downloadlExtractDelete "train.tar.gz" &
downloadlExtractDelete "validation.tar.gz" &
downloadlExtractDelete "test.tar.gz" &

# -N: overwrite in place on re-run instead of creating README.md.1 etc.
# The raw URL is used so the Markdown file itself is saved, not GitHub's HTML page.
wget -N -P "${DATA_FOLDER}" https://raw.githubusercontent.com/google-research-datasets/wiki-reading/master/README.md
wget -N -P "${DATA_FOLDER}" "${CLOUD_STORAGE}/answer.vocab"
wget -N -P "${DATA_FOLDER}" "${CLOUD_STORAGE}/document.vocab"
wget -N -P "${DATA_FOLDER}" "${CLOUD_STORAGE}/raw_answer.vocab"
wget -N -P "${DATA_FOLDER}" "${CLOUD_STORAGE}/type.vocab"
wget -N -P "${DATA_FOLDER}" "${CLOUD_STORAGE}/character.vocab"

# Block until every background archive job has finished.
wait

echo "Done."


Add execute permission to the script and run it

In [0]:
# Make get_data.sh executable, then run it through the shell.
!chmod +x get_data.sh
!./get_data.sh

Conversion of utils.py into a Jupyter notebook cell, using TensorFlow r1.15

In [0]:
# Copyright 2016 The Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Utils for all models."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

%tensorflow_version 1.x
import tensorflow as tf


def resize_axis(tensor, axis, new_size, fill_value=0):
  """Truncates or pads a tensor to new_size on a given axis.

  Truncate or extend tensor such that tensor.shape[axis] == new_size. If the
  size increases, the padding will be performed at the end, using fill_value.

  Args:
    tensor: The tensor to be resized. Its rank must be statically known.
    axis: An integer representing the dimension to be sliced.
    new_size: An integer or 0d tensor representing the new value for
      tensor.shape[axis].
    fill_value: Value to use to fill any new entries in the tensor. Will be
      cast to the type of tensor.

  Returns:
    The resized tensor. Its static shape has new_size on `axis` when new_size
    is a Python integer, and an unknown dimension there when new_size is a
    tensor.
  """
  tensor = tf.convert_to_tensor(tensor)
  shape = tf.unstack(tf.shape(tensor))

  # Padding block: same extent as the input everywhere except `axis`, where it
  # covers whatever is missing to reach new_size (nothing when truncating).
  pad_shape = shape[:]
  pad_shape[axis] = tf.maximum(0, new_size - shape[axis])

  # Extent of the slice kept from the input: at most new_size along `axis`.
  shape[axis] = tf.minimum(shape[axis], new_size)
  shape = tf.stack(shape)

  resized = tf.concat(axis=axis,
                      values=[
      tf.slice(tensor, tf.zeros_like(shape), shape),
      tf.fill(tf.stack(pad_shape), tf.cast(fill_value, tensor.dtype))
  ])

  # Update shape.
  new_shape = tensor.get_shape().as_list()  # A copy is being made.
  # A tensor-valued new_size cannot be used as a static dimension; leave that
  # dimension unknown instead of letting set_shape raise.
  new_shape[axis] = None if isinstance(new_size, tf.Tensor) else new_size
  resized.set_shape(new_shape)
  return resized


def prune_out_of_vocab_ids(sparse_ids, vocab_size):
  """Drops the entries of a SparseTensor whose id is not below vocab_size.

  Only the upper bound is checked: an entry is kept exactly when its value is
  strictly less than vocab_size.

  Args:
    sparse_ids: SparseTensor whose values are the ids to filter.
    vocab_size: Exclusive upper bound for ids that are kept.

  Returns:
    The result of retaining only the in-vocabulary entries of sparse_ids.
  """
  in_vocab_mask = tf.less(x=sparse_ids.values, y=vocab_size)
  pruned_ids = tf.sparse_retain(sp_input=sparse_ids, to_retain=in_vocab_mask)
  return pruned_ids


Conversion of bow.py into a Jupyter notebook cell, using TensorFlow r1.15

In [0]:
# Copyright 2016 The Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Bag of embeddings model."""
import tensorflow.estimator as learn
from tensorflow.data.experimental import AUTOTUNE
from tensorflow.contrib import layers

# Model and input-pipeline configuration for the bag-of-embeddings model.

# Ids at or above this value are pruned from the document/question features;
# also the number of rows of the 'embeddings' variable.
VOCAB_SIZE = 10000
# Number of columns of the 'embeddings' variable.
EMBED_DIM = 20
# Number of rows of 'answer_embeddings'. Twice EMBED_DIM because bow_model
# concatenates the document and question encodings along axis 1.
ANSWER_DIM = 2 * EMBED_DIM
# Depth of the one-hot labels and number of columns of 'answer_embeddings'.
ANSWER_NUM = 5000
# Number of elements per batch in input_fn (also used to size its shuffle
# buffer).
BATCH_SIZE = 128
# Learning rate passed to the Adam optimizer in bow_model.
LEARNING_RATE = 0.01
# NOTE(review): not referenced by any code visible in this notebook.
HIDDEN_SIZE = 128
# Feature keys parsed as variable-length int64 features and read by bow_model.
SPARSE_FEATURES = ['document_sequence', 'question_sequence']
# Glob pattern of the TFRecord files listed by input_fn.
filename = "data/train-*"


def input_fn():
  """Builds the tf.data pipeline feeding bow_model.

  Reads the TFRecord files matching `filename`, shuffles individual records,
  groups them into batches of BATCH_SIZE and parses each batch in one call.
  Parsing after batching yields rank-2 sparse features and lets the label
  tensor be resized on axis 1, which is what bow_model relies on when it
  concatenates the encodings on axis 1 and squeezes axis 1 of the labels.

  NOTE(review): records are parsed as tf.Example protos; confirm against the
  dataset's README.

  Returns:
    A tf.data.Dataset of (features, labels) pairs, where features is a dict of
    SparseTensors keyed by SPARSE_FEATURES and labels is a dense int64 tensor
    holding the first answer id of every example (0 when an example has none).
  """
  features = {k: tf.VarLenFeature(dtype=tf.int64) for k in SPARSE_FEATURES}
  features['answer_ids'] = tf.VarLenFeature(dtype=tf.int64)
  files = tf.data.Dataset.list_files(file_pattern=filename)
  dataset = files.interleave(tf.data.TFRecordDataset,
                             cycle_length=AUTOTUNE,
                             num_parallel_calls=AUTOTUNE)

  def parse_fn(serialized):
    """Parses a batch of serialized examples into (features, labels)."""
    example = tf.io.parse_example(serialized=serialized, features=features)
    labels = example.pop('answer_ids')
    # Keep exactly one answer id per example: truncate extra answers, pad
    # examples without any answer with 0.
    labels = resize_axis(tf.sparse_tensor_to_dense(labels), 1, 1)
    return example, labels

  # Shuffle single records before batching so that the composition of each
  # batch varies; shuffling after batching would only permute whole batches.
  dataset = dataset.shuffle(buffer_size=10 * BATCH_SIZE)
  dataset = dataset.batch(batch_size=BATCH_SIZE)
  dataset = dataset.map(map_func=parse_fn, num_parallel_calls=AUTOTUNE)
  dataset = dataset.prefetch(buffer_size=AUTOTUNE)
  return dataset


def bow_model(features, labels):
  """Builds the bag-of-embeddings graph: predictions, loss and training op.

  Document and question ids are pruned to the vocabulary, embedded with a
  shared embedding table and summed per example. The two encodings are
  concatenated and projected linearly onto ANSWER_NUM answer classes.

  Args:
    features: Dict holding the sparse id features 'document_sequence' and
      'question_sequence'.
    labels: Integer tensor of answer ids; axis 1 must have size 1 because it
      is squeezed away after one-hot encoding.

  Returns:
    A tuple (softmax, loss, train_op): the class probabilities, the softmax
    cross-entropy loss against the one-hot labels, and the Adam update op.
  """
  document = prune_out_of_vocab_ids(features['document_sequence'], VOCAB_SIZE)
  question = prune_out_of_vocab_ids(features['question_sequence'], VOCAB_SIZE)
  answers = tf.squeeze(tf.one_hot(labels, ANSWER_NUM, 1.0, 0.0),
                       axis=[1])
  embeddings = tf.get_variable('embeddings', [VOCAB_SIZE, EMBED_DIM])
  doc_enc = layers.safe_embedding_lookup_sparse(
      [embeddings], document, None, combiner='sum')
  question_enc = layers.safe_embedding_lookup_sparse(
      [embeddings], question, None, combiner='sum')
  joint_enc = tf.concat(axis=1, values=[doc_enc, question_enc])
  answer_embeddings = tf.get_variable(
      'answer_embeddings', [ANSWER_DIM, ANSWER_NUM])
  answer_biases = tf.get_variable('answer_biases', [ANSWER_NUM])

  # tf.nn.softmax only normalizes logits: it takes no labels or weights and
  # returns a single tensor. The linear projection and the loss are therefore
  # computed explicitly.
  logits = tf.matmul(joint_enc, answer_embeddings) + answer_biases
  loss = tf.losses.softmax_cross_entropy(onehot_labels=answers, logits=logits)
  softmax = tf.nn.softmax(logits)
  # get_or_create avoids handing optimize_loss a missing global step when the
  # graph was not prepared by an Estimator.
  train_op = layers.optimize_loss(
      loss, tf.train.get_or_create_global_step(),
      learning_rate=LEARNING_RATE,
      optimizer='Adam')
  return softmax, loss, train_op


def _bow_model_fn(features, labels, mode):
  """Adapts bow_model's tuple return to the EstimatorSpec Estimator requires."""
  softmax, loss, train_op = bow_model(features, labels)
  return learn.EstimatorSpec(
      mode=mode, predictions=softmax, loss=loss, train_op=train_op)


def main():
  """Evaluates the bag-of-embeddings model on the data produced by input_fn.

  Checkpoints and summaries are read from / written to "results/bow/".
  """
  tf.logging.set_verbosity(tf.logging.INFO)
  estimator = learn.Estimator(
      # Estimator rejects a model_fn that returns a plain tuple, so bow_model
      # is wrapped rather than passed directly.
      model_fn=_bow_model_fn,
      model_dir="results/bow/",
  )
  # NOTE(review): only evaluation is run here, although input_fn reads the
  # training files; confirm whether estimator.train(...) was intended.
  estimator.evaluate(input_fn=input_fn, steps=10000)


if __name__ == "__main__":
  main()
