In [1]:
import json
import csv
import random

import numpy as np
from keras.utils.np_utils import to_categorical

# Seed Python's `random` module so that the batch sampling done with
# `random.sample` in `Data.generator` is repeatable between runs.
# Only the stdlib RNG is seeded here; numpy's RNG is not.
random.seed(1984)

# Sequence-length constants.
# NOTE(review): neither constant is referenced in the cells visible in this
# notebook -- the Vocabulary objects further down are built with the literals
# padding=50 and padding=12. OUTPUT_PADDING (100) does not match that 12;
# confirm which value is intended before wiring these constants in.
INPUT_PADDING = 50
OUTPUT_PADDING = 100

Using TensorFlow backend.
  from ._conv import register_converters as _register_converters


In [2]:
class Vocabulary(object):
    """
        Character-level vocabulary backed by a JSON file.

        The file holds a single JSON object mapping each token (a single
        character, or a special token such as '<unk>' or '<eot>') to its
        integer id.
    """

    def __init__(self, vocabulary_file, padding=None):
        """
            Creates a vocabulary from a file
            :param vocabulary_file: the path to the JSON vocabulary
            :param padding: fixed length of every encoded sequence;
                            None (the default) disables truncation
                            and padding
        """
        self.vocabulary_file = vocabulary_file
        with open(vocabulary_file, 'r') as f:
            self.vocabulary = json.load(f)

        self.padding = padding
        # id -> token lookup, used by int_to_string
        self.reverse_vocabulary = {v: k for k, v in self.vocabulary.items()}

    def size(self):
        """
            returns the size of the vocabulary
        """
        return len(self.vocabulary)

    def string_to_int(self, text):
        """
            Converts a string into its character integer
            representation, terminated by the '<eot>' token.

            When a padding length is configured the text is truncated
            to padding - 1 characters (leaving room for '<eot>') and the
            result is right-padded with the '<unk>' id, so the returned
            list always has exactly `padding` entries. Without padding
            the result has len(text) + 1 entries.

            :param text: text to convert
            :return: list of integer ids
            :raises KeyError: if an unknown character (or padding) is
                              needed and the vocabulary has no '<unk>'
            :raises AttributeError: if a padding length is configured
                                    but the encoded length differs
                                    from it (e.g. padding <= 0)
        """
        # get the characters
        characters = list(text)

        # truncate if too long, keeping one slot for '<eot>'
        if self.padding and len(characters) >= self.padding:
            characters = characters[:self.padding - 1]

        # append special characters
        characters.append('<eot>')

        # '<unk>' is looked up lazily so a vocabulary without it still
        # works as long as every token is known; '<eot>' goes through
        # the same lookup as ordinary characters
        integers = [
            self.vocabulary[c] if c in self.vocabulary
            else self.vocabulary['<unk>']
            for c in characters
        ]

        # pad the data if it is shorter
        if self.padding and len(integers) < self.padding:
            integers.extend([self.vocabulary['<unk>']]
                            * (self.padding - len(integers)))

        # The length can only be checked when a padding length was
        # requested; comparing against None would reject every input
        # of a vocabulary created with the default padding=None.
        if self.padding is not None and len(integers) != self.padding:
            print(text)
            raise AttributeError('Length of text was not padding.')
        return integers

    def int_to_string(self, integers):
        """
            Decodes a list of integers into the corresponding list of
            tokens (the tokens are not joined into a single string).
            :param integers: iterable of integer ids
            :return: list of tokens
            :raises KeyError: if an id is not in the vocabulary
        """
        return [self.reverse_vocabulary[i] for i in integers]



In [3]:
class Data(object):
    """
        Reads (input, target) string pairs from a CSV file, encodes
        them with the supplied vocabularies and serves random batches.

        Expected call order: load(), then transform(), then generator().
    """

    def __init__(self, file_name, input_vocabulary, output_vocabulary):
        """
            Creates an object that gets data from a file
            :param file_name: name of the CSV file to read from
            :param input_vocabulary: the Vocabulary object used to
                                     encode the first CSV column
            :param output_vocabulary: the Vocabulary object used to
                                      encode the second CSV column
        """
        self.input_vocabulary = input_vocabulary
        self.output_vocabulary = output_vocabulary
        self.file_name = file_name

    def load(self):
        """
            Loads data from a file.

            Column 0 of every row goes to self.inputs and column 1 to
            self.targets (both lists of strings). Blank lines are
            skipped.
            :raises IndexError: if a non-empty row has fewer than two
                                columns
        """
        self.inputs = []
        self.targets = []

        # newline='' is what the csv module requires so that it can
        # interpret line endings (and quoted newlines) itself
        with open(self.file_name, 'r', newline='') as f:
            for row in csv.reader(f):
                if not row:
                    # csv.reader yields [] for a blank line
                    continue
                self.inputs.append(row[0])
                self.targets.append(row[1])

    def transform(self):
        """
            Transforms the loaded strings into numpy arrays.

            self.inputs becomes a 2-D array of integer ids and
            self.targets a 3-D array holding a one-hot vector for every
            target position. Both attributes are overwritten in place.
        """
        # @TODO: use `pool.map_async` here?
        num_classes = self.output_vocabulary.size()

        # convert string to int
        self.inputs = np.array(
            [self.input_vocabulary.string_to_int(text)
             for text in self.inputs])

        # output is a seq of integers - each integer is represented by
        # its one-hot encoding
        self.targets = np.array(
            [to_categorical(self.output_vocabulary.string_to_int(text),
                            num_classes=num_classes)
             for text in self.targets])

        # Sanity checks: the arrays only get these ranks when every
        # encoded sequence has the same length, i.e. when both
        # vocabularies were created with a padding length.
        assert len(self.inputs.shape) == 2, 'Inputs could not properly be encoded'
        assert len(self.targets.shape) == 3, 'Targets could not properly be encoded'

    def generator(self, batch_size):
        """
            Creates a generator that can be used in `model.fit_generator()`
            Batches are generated randomly (sampled without replacement
            within a batch), and the generator never terminates.
            :param batch_size: the number of instances to include per batch
            :raises ValueError: on the first `next()` if batch_size is
                                negative or larger than the number of
                                loaded instances
        """
        instance_id = range(len(self.inputs))

        # Fail loudly instead of yielding (None, None) forever: an
        # impossible batch size can never produce a usable batch.
        if batch_size < 0 or batch_size > len(instance_id):
            raise ValueError(
                'batch_size must be between 0 and the number of '
                'instances (%d), got %r' % (len(instance_id), batch_size))

        while True:
            # randomly choose a batch
            batch_ids = random.sample(instance_id, batch_size)
            yield (np.array(self.inputs[batch_ids], dtype=int),
                   np.array(self.targets[batch_ids]))

In [9]:
# NOTE(review): `os` is imported here rather than in the first cell with the
# other imports.
import os
# All paths are resolved against the kernel's current working directory, so
# the notebook has to be run from the directory that contains `data/`.
cwd = os.getcwd()
DATA_FOLDER = os.path.join(cwd, 'data')
# JSON vocabulary files passed to `Vocabulary` below.
input_vocab_file_path = os.path.join(DATA_FOLDER, 'human_vocab.json')
output_vocab_file_path = os.path.join(DATA_FOLDER, 'machine_vocab.json')

In [11]:
# Every encoded input has 50 ids and every encoded target has 12 ids
# (the `padding` length includes the trailing '<eot>' token).
# NOTE(review): the first cell defines INPUT_PADDING = 50 and
# OUTPUT_PADDING = 100, but literals are used here and 12 != 100 -- confirm
# which output length is intended.
input_vocab = Vocabulary(input_vocab_file_path, padding=50)
output_vocab = Vocabulary(output_vocab_file_path, padding=12)

In [12]:
# NOTE(review): despite the `sample_` prefix this points at training.csv.
sample_csv_file_path = os.path.join(DATA_FOLDER, 'training.csv')

# Read the CSV into lists of strings, then encode them in place:
# `transform()` replaces `ds.inputs` / `ds.targets` with numpy arrays, so it
# must run after `load()`.
ds = Data(sample_csv_file_path, input_vocab, output_vocab)
ds.load()
ds.transform()

In [13]:
# Shapes of the encoded dataset: inputs are 2-D, targets are 3-D
# (both ranks are asserted inside `Data.transform`).
print(ds.inputs.shape)
print(ds.targets.shape)

# Build a batch generator with 32 instances per batch; no batch is drawn
# until `next(g)` is called.
g = ds.generator(32)


# Index the arrays with a list of row ids -- the same kind of indexing
# `Data.generator` uses to assemble a batch.
print(ds.inputs[[5,10, 12]].shape)
print(ds.targets[[5,10,12]].shape)

(500000, 50)
(500000, 12, 13)
(3, 50)
(3, 12, 13)


In [14]:
# Display the generator object itself; this does not draw a batch.
g

<generator object generator at 0x7fe096b172d0>