<a href="https://colab.research.google.com/github/ThierrySt-Arnaud/wiki-reading/blob/colab-conversion/colab/wiki_reading_training_keras_en.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Write get_data.sh to disk as a script file (necessary because the %%bash magic does not display its output in real time)

In [0]:
%%writefile get_data.sh
# Fetch the English WikiReading JSON archives and the auxiliary vocabulary /
# stopword / n-gram files into DATA_FOLDER.
echo "Downloading English WikiReading TensorFlow Records..."

CLOUD_STORAGE=https://storage.googleapis.com/wikireading

DATA_FOLDER=data

# Download archive $1 from CLOUD_STORAGE (resuming a partial download),
# extract it into DATA_FOLDER, then delete the archive.
# The steps are chained with && so the archive is only removed after a
# successful extraction; a failed run leaves it in place for `wget -c`.
downloadlExtractDelete(){
  wget -c "${CLOUD_STORAGE}/${1}" &&
    tar xvzf "${1}" -C "${DATA_FOLDER}" --skip-old-files &&
    rm "${1}"
}

# -p: do not fail when the folder already exists from a previous run.
mkdir -p "${DATA_FOLDER}"
#downloadlExtractDelete "train.json.tar.gz" &
#downloadlExtractDelete "validation.json.tar.gz" &
# The archive is processed in the background while the small files download.
downloadlExtractDelete "test.json.tar.gz" &
# Use the raw file URL: the github.com/.../blob/... address returns an HTML page.
wget -P "${DATA_FOLDER}" https://raw.githubusercontent.com/google-research-datasets/wiki-reading/master/README.md
wget -P "${DATA_FOLDER}" https://raw.githubusercontent.com/ThierrySt-Arnaud/wiki-reading/colab-conversion/data/stopwords.json
wget -P "${DATA_FOLDER}" https://raw.githubusercontent.com/ThierrySt-Arnaud/wiki-reading/colab-conversion/data/3-grams-test.pkl
wget -P "${DATA_FOLDER}" "${CLOUD_STORAGE}/answer.vocab"
wget -P "${DATA_FOLDER}" "${CLOUD_STORAGE}/document.vocab"
wget -P "${DATA_FOLDER}" "${CLOUD_STORAGE}/raw_answer.vocab"
wget -P "${DATA_FOLDER}" "${CLOUD_STORAGE}/type.vocab"
wget -P "${DATA_FOLDER}" "${CLOUD_STORAGE}/character.vocab"
# Block until the background archive job has finished.
wait

echo "Done."


Add execute permission to the script and run it

In [0]:
# Mark get_data.sh as executable, then run it through the shell escape.
!chmod +x get_data.sh
!./get_data.sh

Python import statements

In [0]:
from __future__ import absolute_import, division, print_function
import io, json, glob, random, gc, pickle
import numpy as np
from os import path
from collections import deque, Counter
from multiprocessing import Pool

%tensorflow_version 2.x
%load_ext tensorboard
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import Sequence


Define global constants

In [0]:
# --- Dataset layout -------------------------------------------------------
PATH_PREFIX = "data"            # folder holding the JSON shards and aux files
TRAIN_FILE = "train"            # file-name prefixes of the three splits
VALIDATION_FILE = "validation"
TEST_FILE = "test"
TARGET = TEST_FILE              # split whose n-gram file is looked up below
FEATURE = 'document_sequence'   # JSON key read as the model input
LABEL = 'answer_ids'            # JSON key read as the training target

# --- Vocabulary sizes -----------------------------------------------------
VOCAB_SIZE = 2 ** 15            # 32768 base token ids
NGRAM_VOCAB = 2 ** 14           # 16384 entries kept from the n-gram list
ANSWER_VOCAB = 2 ** 13          # 8192 answer ids (width of the label vector)

# --- Input shaping --------------------------------------------------------
NGRAMS = 3                      # highest n-gram order; 1 disables n-grams
MAX_LENGTH = 2 ** 13            # 8192 token slots per sample
MAX_NGRAMS = 2 ** 11            # 2048 n-gram slots appended per sample

# --- Model / optimisation -------------------------------------------------
EMBED_DIM = 32
BATCH_SIZE = 32
LEARNING_RATE = 0.001

# --- Derived paths --------------------------------------------------------
LOG_DIR = "logs/fit/"
STOPWORD_FILE = f"{PATH_PREFIX}/stopwords.json"
NGRAM_FILE = f"{PATH_PREFIX}/{NGRAMS}-grams-{TARGET}.pkl"


Define global variables

In [0]:
# Stopword ids, keeping only those that fall inside the base vocabulary.
with open(STOPWORD_FILE) as stopword_file:
  stopwords = [token_id for token_id in json.load(stopword_file)
               if token_id < VOCAB_SIZE]

# n-gram tuple -> id mapping; starts empty and is rebuilt by DataGenerator.
ngram_indices = dict()
# Ranked n-gram list, loaded from the cached pickle when one is available.
ngram_list = list()
if NGRAMS > 1 and path.isfile(NGRAM_FILE):
  # NOTE(review): pickle.load can execute arbitrary code; only load n-gram
  # files that come from a trusted source.
  with open(NGRAM_FILE, 'rb') as ngram_file:
    ngram_list = pickle.load(ngram_file)[:NGRAM_VOCAB]
elif NGRAMS > 1:
  print(f"{NGRAM_FILE} not found. N-grams will need to be generated")

# Shared sample-count cache: a negative value makes DataGenerator count the
# samples itself on first construction.
total_samples = -1 # 941280 for test files

Define ngram creation utilities

In [0]:
class NgramExtractor(object):
    """Callable that counts the stopword-free n-grams of one JSON-lines file.

    Instances are passed to ``Pool.map`` by ``create_ngram_set``, one call per
    file.

    :param feature: JSON key whose value (a list of token ids) is scanned
    :param ngram_range: highest n-gram order; orders 2..ngram_range are counted
    :param max_per_file: number of most frequent n-grams returned per file
    """
    def __init__(self, feature=FEATURE, ngram_range=NGRAMS,
                 max_per_file=250000):
        self.feature = feature
        self.ngram_range = ngram_range
        self.max_per_file = max_per_file

    def __call__(self, filename):
        """Count n-grams in ``filename``.

        :param filename: path of a file with one JSON object per line
        :return: list of ``(ngram_tuple, count)`` pairs, most frequent first,
                 at most ``max_per_file`` long
        """
        print(f"Getting ngram set from {filename}")

        # A set gives constant-time membership tests; the module-level
        # `stopwords` list would be scanned once per token otherwise.
        stopword_set = set(stopwords)

        fileset = Counter()
        # Stream the file line by line instead of materialising every
        # sequence first; the counting order is unchanged.
        with open(filename, 'r') as file:
            for line in file:
                sequence = json.loads(line)[self.feature]
                for i in range(2, self.ngram_range + 1):
                    # zip over i shifted views yields every window of size i
                    for ngram in zip(*[sequence[j:] for j in range(i)]):
                        # keep only n-grams containing no stopword
                        if stopword_set.isdisjoint(ngram):
                            fileset[ngram] += 1

        print(f"Extracted {len(fileset)} ngrams from {filename}")
        most_common = fileset.most_common(self.max_per_file)
        # Release the full counter before returning.
        del fileset
        gc.collect()
        return most_common


def create_ngram_set(filenames, feature=FEATURE, ngrams=NGRAMS):
    """
    Build a frequency-ranked list of n-grams from several JSON-lines files.

    Each file is handed to an ``NgramExtractor`` in a worker process; the
    per-file ``(ngram, count)`` lists are then summed into a single counter.

    :param filenames: iterable of file paths to scan
    :param feature: JSON key holding the list of token ids
    :param ngrams: highest n-gram order to extract
    :return: list of n-gram tuples, most frequent first (at most 500000)
    """
    extractor = NgramExtractor(feature, ngrams)

    # One extractor call per file, spread over the worker pool.
    with Pool() as pool:
        per_file_counts = pool.map(extractor, filenames)

    gc.collect()

    # Merge the per-file counts into global totals.
    totals = Counter()
    for file_counts in per_file_counts:
        for ngram, count in file_counts:
            totals[ngram] += count

    del per_file_counts
    gc.collect()

    ranked = [ngram for ngram, _ in totals.most_common(500000)]

    del totals
    gc.collect()

    return ranked


def get_ngrams(sequences, token_indice=ngram_indices,
               ngram_range=NGRAMS, max_ngrams=MAX_NGRAMS):
  """
  Return, for each sequence, the ids of its n-grams that are in token_indice.

  N-gram orders are scanned from ngram_range down to 2 (longest first), each
  from left to right, and at most max_ngrams ids are kept per sequence. Only
  the n-gram ids are returned; the original tokens are not included.

  NOTE: the default for token_indice is the dict bound to ``ngram_indices``
  when this function is defined; pass the mapping explicitly if that global
  is reassigned afterwards.

  :param sequences: iterable of token-id sequences
  :param token_indice: mapping from n-gram tuple to id
  :param ngram_range: highest n-gram order to look up
  :param max_ngrams: maximum number of ids returned per sequence
  :return: list with one list of n-gram ids per input sequence

  Example: bi-grams
  >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
  >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
  >>> get_ngrams(sequences, token_indice, ngram_range=2)
  [[1337, 2017], [1337, 42]]

  Example: tri-grams
  >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
  >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
  >>> get_ngrams(sequences, token_indice, ngram_range=3)
  [[1337, 2017], [2018, 1337, 42]]
  """
  new_sequences = []
  for input_list in sequences:
    new_list = []
    # Orders ngram_range, ngram_range-1, ..., 2. The previous bounds
    # (ngram_range+1 down to 3) skipped bi-grams and probed an order that is
    # never extracted.
    for ngram_value in range(ngram_range, 1, -1):
      if len(new_list) >= max_ngrams:
        break
      for i in range(len(input_list) - ngram_value + 1):
        ngram = tuple(input_list[i:i + ngram_value])
        if ngram in token_indice:
          new_list.append(token_indice[ngram])
          # Stop scanning this order once the per-sequence budget is used up.
          if len(new_list) == max_ngrams:
            break
    new_sequences.append(new_list)

  return new_sequences
    

Definition of the DataGenerator for wiki-reading + fasttext

In [0]:
class DataGenerator(Sequence):
  """Keras ``Sequence`` that streams batches from JSON-lines files.

  Files matching ``{path_prefix}/{file_prefix}*.json`` are read sequentially,
  one JSON object per line. Each batch is returned as a fixed-width integer
  matrix (token ids followed by n-gram ids, zero padded) and a multi-hot
  boolean label matrix.
  """
  def __init__(self, feature=FEATURE, max_length=MAX_LENGTH,
               label=LABEL, path_prefix=PATH_PREFIX,
               file_prefix=TRAIN_FILE, ngrams=NGRAMS,
               max_ngrams=MAX_NGRAMS, batch_size=BATCH_SIZE, 
               vocab_size=VOCAB_SIZE, answer_vocab=ANSWER_VOCAB,
               shuffle=True):

    """Initialization
    :param feature: JSON key of the token-id sequence used as input
    :param max_length: number of token slots per sample
    :param label: JSON key of the answer ids used as target
    :param path_prefix: folder containing the data files
    :param file_prefix: file-name prefix of the split to read
    :param ngrams: highest n-gram order; values > 1 enable n-gram features
    :param max_ngrams: number of n-gram slots per sample (when enabled)
    :param batch_size: batch size at each iteration
    :param vocab_size: base vocabulary size; len(stopwords) is added to it
    :param answer_vocab: width of the label vector
    :param shuffle: True to shuffle the file order and each batch
    :raises FileNotFoundError: if no file matches the prefix
    """

    self.feature = feature
    self.label = label
    self.max_length = max_length
    self.path_prefix = path_prefix
    self.ngrams = ngrams
    self.max_ngrams = 0
    self.answer_vocab = answer_vocab
    self.batch_size = batch_size
    # Stopwords are pruned from every sample, so the id limit is extended by
    # their count.
    self.vocab_size = vocab_size + len(stopwords)
    self.shuffle = shuffle
    self.filenames = glob.glob1(path_prefix,f"{file_prefix}*.json")
    self.file_list = deque(
        [f"{path_prefix}/{file}" for file in self.filenames])
    # Fail early with a clear message instead of an IndexError further down
    # (and before a zero sample count is cached in the global).
    if not self.file_list:
      raise FileNotFoundError(
          f"No files matching {path_prefix}/{file_prefix}*.json")
    global total_samples
    if total_samples < 0:
      self.total_samples = self._get_total_samples()
      total_samples = self.total_samples
    else:
      print(f"Reusing previous value of total_samples: {total_samples}")
      self.total_samples = total_samples

    global ngram_indices, ngram_list
    if ngrams > 1:
      self.max_ngrams = max_ngrams
      if not ngram_list:
        # Was `create_ngram_sets`, a name that is not defined anywhere.
        ngram_list = create_ngram_set(self.file_list, feature=feature,
                                      ngrams=ngrams)
      # n-gram ids start right after the token id range.
      ngram_indices = {k: v for (v, k) in enumerate(ngram_list,
                                                    self.vocab_size)}
  
    if self.shuffle:
      random.shuffle(self.file_list)
    self.current_file = open(self.file_list[0])
    self.file_list.rotate()


  def __len__(self):
    """Denotes the number of batches per epoch
    :return: number of batches per epoch
    """
    return self.total_samples // self.batch_size

  def __getitem__(self, index):
    """Generate one batch of data
    :param index: index of the batch (unused: batches are read sequentially
                  from the currently open file)
    :return: x (feature), y (labels)
    """
    batch = self._generate_sample()

    if self.shuffle:
      random.shuffle(batch)

    # Extract selected features and labels from json object
    x, y = zip(*[[self.prune_sample(sample[self.feature])[:self.max_length],
                  self.prune_label(sample[self.label])] for sample in batch])
    
    # Columns [0, max_length) hold tokens, the rest hold n-gram ids;
    # unused slots stay 0.
    x_num = np.zeros((self.batch_size, self.max_length+self.max_ngrams),
                     dtype=int)
    y_num = np.zeros((self.batch_size, self.answer_vocab), dtype=bool)

    for i in range(self.batch_size):
      actual_length = min(self.max_length, len(x[i]))
      x_num[i][:actual_length] = x[i]
      for a in y[i]:
        # NOTE(review): assumes answer ids start at 1; an id of 0 would
        # wrap around to the last column - confirm against the data.
        y_num[i][a-1] = True

    if self.ngrams > 1:
      sample_ngrams = get_ngrams(x, ngram_indices, self.ngrams, self.max_ngrams)
      for i in range(self.batch_size):
        actual_ngrams = min(self.max_ngrams, len(sample_ngrams[i]))
        x_num[i][self.max_length:self.max_length+actual_ngrams]\
          = sample_ngrams[i][:self.max_ngrams]

    return x_num, y_num

  def on_epoch_end(self):
    """Close the current file, optionally reshuffle the file order, and open
    the file now at the front of the list."""
    self.current_file.close()
    if self.shuffle:
      random.shuffle(self.file_list)
    self.current_file = open(self.file_list[0])
    self.file_list.rotate()
    

  def _generate_sample(self):
    """Read batch_size JSON objects from the current file.

    When the current file is exhausted (or a line cannot be read/parsed), the
    next file in the rotation is opened and reading continues from it.
    :return: list of batch_size decoded samples
    """
    batch = []
    for _ in range(self.batch_size):
      try:
        batch.append(json.loads(next(self.current_file)))
      except (StopIteration, ValueError):
        self.current_file.close()
        self.current_file = open(self.file_list[0])
        self.file_list.rotate()
        batch.append(json.loads(next(self.current_file)))
    return batch

  def _get_total_samples(self):
    """Count the lines of every file in file_list using a pool of 4 workers.
    :return: total number of newline characters across all files
    """
    print("Getting number of samples from all files")

    # Count all samples in specified files
    with Pool(4) as pool:
      samples_per_file = pool.map(rawgencount, self.file_list)
    return sum(samples_per_file)

  def prune_sample(self, sample):
    """Remove stopwords and all ids that do not fit in the vocabulary.

    Ids must be strictly below self.vocab_size: that value is the first
    n-gram id assigned in __init__, so allowing it would make a token
    collide with an n-gram.
    """
    return [x for x in sample if x < self.vocab_size and x not in stopwords]

  def prune_label(self, label):
    """Keep only the answer ids that are at most answer_vocab."""
    return [x for x in label if x <= self.answer_vocab]
  

def _make_gen(reader):
  """Yield successive chunks of up to 1 MiB obtained from ``reader`` until it
  returns an empty result."""
  chunk_size = 1024 * 1024
  while True:
    chunk = reader(chunk_size)
    if not chunk:
      return
    yield chunk

def rawgencount(filename):
  """Return the number of newline bytes in ``filename``.

  The file is read unbuffered in binary mode, chunk by chunk, so it is never
  loaded into memory as a whole.
  """
  print(f"Opening {filename}")
  newline_total = 0
  with open(filename, 'rb') as f:
    for chunk in _make_gen(f.raw.read):
      newline_total += chunk.count(b'\n')
  return newline_total


Conversion of bow.py to a Jupyter notebook using Keras + TensorFlow 2.x

In [0]:
"""FastText Model."""


# TensorBoard callback writing training logs (with histograms every epoch)
# to LOG_DIR.
tbcallback = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=1)

def main():
  """Build the embedding + LSTM + average-pooling model and fit it briefly on
  the test split, logging to TensorBoard."""
  # Create the generator first: when no cached n-gram list was loaded it
  # generates `ngram_list`, and the embedding below is sized from that list.
  test_generator = DataGenerator(file_prefix='test')

  # Token ids, extra ids allowed for pruned stopwords, then n-gram ids.
  input_dim = VOCAB_SIZE+len(stopwords)+len(ngram_list)

  inputs = keras.Input(shape=(MAX_LENGTH+MAX_NGRAMS,), name='input')
  # mask_zero: id 0 is the padding value written by DataGenerator.
  embedding = layers.Embedding(input_dim,
                               EMBED_DIM, mask_zero=True)(inputs)
  # NOTE(review): input_shape is presumably ignored when the layer is called
  # on a tensor - confirm before removing.
  lstm = layers.LSTM(8, activation='linear',
                     input_shape=(
                         EMBED_DIM, input_dim),
                     return_sequences=True)(embedding)
  average = layers.GlobalAveragePooling1D()(lstm)
  # One sigmoid unit per answer id (multi-label output).
  outputs = layers.Dense(ANSWER_VOCAB,
                         activation=keras.activations.sigmoid)(average)
  model = keras.Model(inputs=inputs, outputs=outputs)

  model.compile(optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
                loss=keras.losses.binary_crossentropy,
                metrics=['categorical_accuracy'])
  # Model.fit accepts a keras.utils.Sequence directly (fit_generator is
  # deprecated). The TensorBoard callback was previously created but never
  # passed to training, so nothing was logged to LOG_DIR.
  model.fit(test_generator, steps_per_epoch=10,
            epochs=5, verbose=2, callbacks=[tbcallback])
if __name__ == "__main__":
  main()


In [0]:
"""Generate Ngrams"""

def main():
  """Generate the ranked n-gram list for the TARGET split and cache it.

  The list is pickled to NGRAM_FILE, the path and format read back by the
  global-variables cell. Generation is skipped when that file already exists,
  because scanning every data file is expensive.

  NOTE: this definition replaces the `main` defined in the model cell.
  """
  if path.isfile(NGRAM_FILE):
    print(f"{NGRAM_FILE} already exists. Skipping n-gram generation")
    return

  filenames = [f"{PATH_PREFIX}/{file}" for file in glob.glob1(
      PATH_PREFIX,f"{TARGET}*.json")]
  # create_ngram_set returns a single list of n-gram tuples (most frequent
  # first); it was previously unpacked into two names, raising ValueError.
  sorted_ngrams = create_ngram_set(filenames, ngrams=NGRAMS)
  with open(NGRAM_FILE, 'wb') as savefile:
    pickle.dump(sorted_ngrams, savefile)

if __name__ == "__main__":
  main()

In [0]:
  # Show TensorBoard inline for the training logs (same folder as LOG_DIR).
  %tensorboard --logdir logs/fit