<a href="https://colab.research.google.com/github/ThierrySt-Arnaud/wiki-reading/blob/colab-conversion/colab/wiki_fast_reading_keras_en.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Create a script file from get_data.sh (necessary because %%bash magic will not update in real time)

In [0]:
%%writefile get_data.sh
echo "Downloading English WikiReading TensorFlow Records..."

CLOUD_STORAGE=https://storage.googleapis.com/wikireading

DATA_FOLDER=data

downloadlExtractDelete(){
  wget -c ${CLOUD_STORAGE}/${1}
  tar xvzf ${1} -C ${DATA_FOLDER} --skip-old-files
  rm ${1}
}

mkdir ${DATA_FOLDER}
#downloadlExtractDelete "train.json.tar.gz" &
#downloadlExtractDelete "validation.json.tar.gz" &
downloadlExtractDelete "test.json.tar.gz" &
wget -P ${DATA_FOLDER} https://github.com/google-research-datasets/wiki-reading/blob/master/README.md
wget -P ${DATA_FOLDER} ${CLOUD_STORAGE}/answer.vocab
wget -P ${DATA_FOLDER} ${CLOUD_STORAGE}/document.vocab
wget -P ${DATA_FOLDER} ${CLOUD_STORAGE}/raw_answer.vocab
wget -P ${DATA_FOLDER} ${CLOUD_STORAGE}/type.vocab 
wget -P ${DATA_FOLDER} ${CLOUD_STORAGE}/character.vocab
wait

echo "Done."


Writing get_data.sh


Add execution permission and execute bash file

In [0]:
!chmod +x get_data.sh
!./get_data.sh

Downloading English WikiReading TensorFlow Records...
--2019-11-20 22:45:09--  https://github.com/google-research-datasets/wiki-reading/blob/master/README.md
--2019-11-20 22:45:09--  https://storage.googleapis.com/wikireading/test.json.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... 74.125.199.128, 2607:f8b0:400e:c09::80
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.199.128|:443... connected.
HTTP request sent, awaiting response... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘data/README.md’

README.md               [<=>                 ]       0  --.-KB/s               200 OK
Length: 2934382808 (2.7G) [application/gzip]
Saving to: ‘test.json.tar.gz’

README.md               [ <=>                ]  76.55K  --.-KB/s    in 0.1s    

2019-11-20 22:45:10 (604 KB/s) - ‘data/README.md’ 

Python import statements

In [0]:
from __future__ import absolute_import, division, print_function
import io, json, glob, random
import numpy as np
import gc
from collections import deque
from itertools import islice
from multiprocessing import Pool

%tensorflow_version 2.x
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import Sequence


TensorFlow 2.x selected.


Define global constants and variables

In [0]:
VOCAB_SIZE = 20000
NGRAM_VOCAB = 20000
STOPWORDS = tuple(range(80)+[86, 88, 90, 93, 100, 102, 110, ])
NGRAMS = 3
EMBED_DIM = 32
BATCH_SIZE = 128
LEARNING_RATE = 0.01
FEATURE = 'document_sequence'
LABEL = 'answer_ids'
PATH_PREFIX = "data"
TRAIN_FILE = "train"
VALIDATIION_FILE = "validation"
TEST_FILE = "test"
MAX_LENGTH = 512
MAX_NGRAMS = 512
MAX_ANSWERS = 128

ngram_indices = {}
total_samples = -1


Define ngram creation utilities

In [0]:
ngram_freq = 

class NgramExtractor(object):
    def __init__(self, feature=FEATURE,
                 vocab_size=VOCAB_SIZE, ngram_range=NGRAMS):
        self.feature = feature
        self.vocab_size = vocab_size
        self.ngram_range = ngram_range

    def __call__(self, filename):
      print(f"Getting ngram set from {filename}")
      with open(filename, 'r') as file:
        sequences = [json.loads(line)[self.feature] for line in file]

      sequences = prune_lists(sequences, self.vocab_size)

      gc.collect()

      fileset = set()
      for sequence in sequences:
        for i in range(2, self.ngram_range + 1):
            new_set = set(zip(*[sequence[j:] for j in range(i)]))
            fileset.update(new_set)
      del sequences
      gc.collect()

      print(f"Extracted {len(fileset)} ngrams from {filename}")
      return fileset


def create_ngram_set(filenames, feature=FEATURE,
                     vocab_size=VOCAB_SIZE, ngrams=NGRAMS):
  """
  Extract a set of n-grams from a list of files containing feature where
  feature is a list of ints.

  >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
  {(4, 9), (4, 1), (1, 4), (9, 4)}

  >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
  [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
  """
  # Create set of unique n-grams from the training set.
  ngram_set = set()
  with Pool(2) as pool:
    new_sets = pool.map(NgramExtractor(feature, vocab_size, ngrams), filenames)

  for new_set in new_sets:
    ngram_set.update(new_set)
  print(f"Found {len(ngram_set)} unique n-grams (n <= {ngrams})")

  # Dictionary mapping n-gram token to a unique integer.
  # Integer values are greater than max_features in order
  # to avoid collision with existing features.
  start_index = vocab_size + 1
  token_indice = {v: k for k, v in enumerate(start_index, ngram_set)}
  return token_indice


def prune_lists(sequences, vocab_size=VOCAB_SIZE):
  """Remove all values that do not fit in the vocabulary
  """
  return [[x for x in sequence if x <= vocab_size] for sequence in sequences]


def get_ngrams(sequences, token_indice=ngram_indices,
               ngram_range=NGRAMS, max_ngrams=MAX_NGRAMS):
  """
  Returns the list of ngrams of token_indice
  found in each sequence of sequences.

  Example: adding bi-gram
  >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
  >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
  >>> add_ngram(sequences, token_indice, ngram_range=2)
  [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]

  Example: adding tri-gram
  >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
  >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
  >>> add_ngram(sequences, token_indice, ngram_range=3)
  [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 2018, 1337, 42]]
  """
  new_sequences = []
  for input_list in sequences:
    new_list = []
    for ngram_value in range(ngram_range+1, 2, -1):
      if len(new_list) < max_ngrams:
        for i in range(len(input_list[:]) - ngram_value + 1):
          ngram = tuple(input_list[i:i + ngram_value])
          if ngram in token_indice:
            new_list.append(token_indice[ngram])
            if len(new_list) == max_ngrams:
              break
      else:
        break
    new_sequences.append(new_list)

  return new_sequences
    

Definition of the DataGenerator for wiki-reading + fasttext

In [0]:
class DataGenerator(Sequence):
  """Generates data for Keras
  Sequence based data generator. Suitable for
  building data generator for training.
  """
  def __init__(self, feature=FEATURE, max_length=MAX_LENGTH,
               label=LABEL, path_prefix=PATH_PREFIX,
               file_prefix=TRAIN_FILE, ngrams=NGRAMS,
               max_ngrams=MAX_NGRAMS, batch_size=BATCH_SIZE, 
               vocab_size=VOCAB_SIZE, max_answers=MAX_ANSWERS,
               shuffle=True):

    """Initialization
    :param features: features to use for classification
    :param labels: labels to use for training or validation
    :param file_prefix: type of files to extract from
    :param batch_size: batch size at each iteration
    :param vocab_size: max vocab id used
    :param shuffle: True to shuffle label indexes after every epoch
    """

    self.feature = feature
    self.label = label
    self.max_length = max_length
    self.path_prefix = path_prefix
    self.ngrams = ngrams
    self.max_ngrams = 0
    self.max_answers = max_answers
    self.batch_size = batch_size
    self.vocab_size = vocab_size
    self.shuffle = shuffle
    self.filenames = glob.glob1(path_prefix,f"{file_prefix}*.json")
     
    if self.shuffle:
      random.shuffle(self.filenames)
    self.file_list = deque(
        [f"{path_prefix}/{file}" for file in self.filenames])
    global total_samples
    if total_samples < 0:
      self.total_samples = self._get_total_samples()
      total_samples = self.total_samples
    else:
      print(f"Reusing previous value of total_samples: {total_samples}")
      self.total_samples = total_samples

    global ngram_indices
    if ngrams > 1:
      self.max_ngrams = max_ngrams
      if ngram_indices:
        print(f"Reusing previously generated set of n-grams")
      else:
        ngram_indices = create_ngram_set(
            self.file_list, feature=feature,
            vocab_size=vocab_size, ngrams=ngrams)
  
    self.current_file = open(self.file_list.popleft())


  def __len__(self):
    """Denotes the number of batches per epoch
    :return: number of batches per epoch
    """
    return self.total_samples // self.batch_size

  def __getitem__(self, index):
    """Generate one batch of data
    :param index: index of the batch
    :return: x (feature), y (labels)
    """
    batch = self._generate_sample()

    if self.shuffle:
      random.shuffle(batch)

    # Extract selected features and labels from json object
    x, y = zip(*[[prune_lists(sample[self.feature], self.vocab_size),
                  sample[self.label]] for sample in batch])
    
    x_num = np.zeros((self.batch_size, self.max_length+self.max_ngrams),
                     dtype=int)
    y_num = np.zeros((self.batch_size, self.max_answers), dtype=int)

    for i in range(self.batch_size):
      actual_length = min(self.max_length, len(x[i]))
      actual_answers = min(self.max_answers, len(y[i]))
      x_num[i][:actual_length] = x[i][:self.max_length]
      y_num[i][:actual_answers] = y[i][:self.max_answers]

    if self.ngrams > 1:
      ngrams = get_ngrams(x, ngram_indices, ngram_range, self.max_ngrams)
      for i in range(self.batch_size):
        actual_ngrams = min(self.max_ngrams, len(ngrams[i]))
        x_num[i][max_length:actual_ngrams] = ngrams[i][:self.max_ngrams]

    return x_num, y_num

  def on_epoch_end(self):
    self.current_file.close()
    if self.shuffle:
      random.shuffle(self.filenames)
    self.file_list = deque(
        [f"{self.path_prefix}/{file}" for file in self.filenames])

  def _generate_sample(self):
    """Generates data containing batch_size images
    :param list_IDs_temp: list of label ids to load
    :return: batch of images
    """
    batch = []
    for _ in range(self.batch_size):
      try:
        batch.append(json.loads(next(self.current_file)))
      except (StopIteration, ValueError):
        if self.file_list:
          self.current_file.close()
          self.current_file = open(self.file_list.popleft())
          batch.append(json.loads(next(self.current_file)))
    return batch

  def _get_total_samples(self):
    print(f"Getting number of samples from all files")

    # Count all samples in specified files
    with Pool(4) as pool:
      samples_per_file = pool.map(rawgencount, self.file_list)
    return sum(samples_per_file)

def _make_gen(reader):
  b = reader(1024 * 1024)
  while b:
    yield b
    b = reader(1024*1024)

def rawgencount(filename):
  print(f"Opening {filename}")
  with open(filename, 'rb') as f:
    f_gen = _make_gen(f.raw.read)
    return sum( buf.count(b'\n') for buf in f_gen )


Conversion of bow.py to a Jupyter notebook using Keras + Tensorflow 2.x

In [0]:
"""FastText Model."""

def main():
  inputs = keras.Input(shape=(1,), name='input')
  embedding = layers.Embedding(VOCAB_SIZE+len(ngram_indices),
                               EMBED_DIM, mask_zero=True)(inputs)
  average = layers.GlobalAveragePooling1D()(embedding)
  outputs = layers.Dense(1, use_bias=True, activation=keras.activations.sigmoid,
                         name='predictions')(average)
  model = keras.Model(inputs=inputs, outputs=outputs)

  model.compile(optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
                loss=keras.losses.binary_crossentropy,
                metrics=['acc'])
  test_generator = DataGenerator(file_prefix='test')
  model.fit_generator(test_generator, steps_per_epoch=100, verbose=2)
if __name__ == "__main__":
  main()


Getting number of samples from all files
Opening data/test-00014-of-00015.json
Opening data/test-00012-of-00015.json
Opening data/test-00002-of-00015.json
Opening data/test-00006-of-00015.json
Opening data/test-00013-of-00015.json
Opening data/test-00008-of-00015.json
Opening data/test-00010-of-00015.json
Opening data/test-00011-of-00015.json
Opening data/test-00003-of-00015.json
Opening data/test-00009-of-00015.json
Opening data/test-00005-of-00015.json
Opening data/test-00007-of-00015.json
Opening data/test-00000-of-00015.json
Opening data/test-00001-of-00015.json
Opening data/test-00004-of-00015.json
Getting ngram set from data/test-00014-of-00015.json
Getting ngram set from data/test-00002-of-00015.json
Extracted 17080088 ngrams from data/test-00002-of-00015.json
Getting ngram set from data/test-00006-of-00015.json
Extracted 17141610 ngrams from data/test-00014-of-00015.json
Getting ngram set from data/test-00012-of-00015.json
Extracted 17076344 ngrams from data/test-00006-of-00015

KeyboardInterrupt: ignored