In [5]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import data_source.preproc as pp
import h5py
import numpy as np
import unicodedata
import string

In [None]:
 class DataGenerator():
    """Serve image batches and encoded transcriptions from an HDF5 dataset.

    Every requested partition is read into memory when the object is built.
    The ``next_*_batch`` methods are endless generators: once a partition is
    exhausted its cursor wraps around to the first sample again.
    """

    def __init__(self, source, batchsize, maxTextLenght, predict = False):
        """Load the partitions of ``source`` and initialise the batch cursors.

        Args:
            source: Path of the HDF5 file. Each partition group is read
                through its ``dt`` dataset (inputs) and its ``gt`` dataset
                (byte strings that are decoded to ``str``).
            batchsize: Number of samples per batch; must be at least 1.
            maxTextLenght: Length the encoded labels are padded to.
            predict: When true only the ``test`` partition is loaded,
                otherwise ``train``, ``valid`` and ``test``.

        Raises:
            ValueError: If ``batchsize`` is smaller than 1.
        """
        # A batch size below 1 would either divide by zero when the step
        # count is computed or make the generators return meaningless slices.
        if batchsize < 1:
            raise ValueError("batchsize must be at least 1, got %r" % (batchsize,))

        # First 95 entries of string.printable: every character the model
        # is able to predict.
        self.charset = string.printable[:95]
        self.maxTextLenght = maxTextLenght

        self.tokenizer = Tokenizer(self.charset, self.maxTextLenght)

        self.batchsize = batchsize
        self.partitions = ['test'] if predict else ['train', 'valid', 'test']

        self.size = dict()     # partition -> number of samples
        self.steps = dict()    # partition -> batches needed for one full pass
        self.index = dict()    # partition -> position where the next batch starts
        self.dataset = dict()  # partition -> {'dt': inputs, 'gt': decoded labels}

        with h5py.File(source, "r") as f:
            for pt in self.partitions:
                inputs = f[pt]['dt'][:]
                # Labels are stored as bytes; decode them to str once here.
                labels = [x.decode() for x in f[pt]['gt'][:]]

                self.dataset[pt] = {'dt': inputs, 'gt': labels}
                self.size[pt] = len(labels)
                self.steps[pt] = int(np.ceil(self.size[pt] / self.batchsize))
                self.index[pt] = 0

    def _next_slice(self, partition):
        """Return ``(start, stop)`` of the next batch and advance the cursor."""
        start = self.index[partition]
        if start >= self.size[partition]:
            # Every sample has been served: start over from the beginning.
            start = 0

        stop = start + self.batchsize
        self.index[partition] = stop
        return start, stop

    def _encode_labels(self, texts):
        """Encode ``texts`` with the tokenizer and pad them to ``maxlen``."""
        encoded = [self.tokenizer.encode(text) for text in texts]
        # NOTE(review): pad_sequences truncates from the front by default, so
        # a label longer than maxlen would lose its first characters - confirm
        # that the dataset never contains such labels.
        return pad_sequences(encoded, maxlen=self.tokenizer.maxlen, padding="post")

    def next_train_batch(self):
        """Yield augmented training batches forever.

        Yields:
            Tuple ``(x_train, y_train, [])`` holding the augmented and
            normalized inputs and the padded label indices. The final batch
            of a pass may contain fewer than ``batchsize`` samples.
        """
        while True:
            start, stop = self._next_slice('train')

            x_train = self.dataset['train']['dt'][start:stop]
            y_train = self.dataset['train']['gt'][start:stop]

            # Only the training partition is augmented.
            x_train = pp.augmentation(x_train,
                                      rotation_range=1.5,
                                      scale_range=0.05,
                                      height_shift_range=0.025,
                                      width_shift_range=0.05,
                                      erode_range=5,
                                      dilate_range=3)
            x_train = pp.normalization(x_train)

            y_train = self._encode_labels(y_train)

            # The third element is always an empty list - presumably a
            # placeholder expected by the consumer; confirm before removing.
            yield (x_train, y_train, [])

    def next_valid_batch(self):
        """Yield validation batches forever (normalized, not augmented).

        Yields:
            Tuple ``(x_valid, y_valid, [])`` holding the normalized inputs
            and the padded label indices.
        """
        while True:
            start, stop = self._next_slice('valid')

            x_valid = self.dataset['valid']['dt'][start:stop]
            y_valid = self.dataset['valid']['gt'][start:stop]

            x_valid = pp.normalization(x_valid)
            y_valid = self._encode_labels(y_valid)

            yield (x_valid, y_valid, [])

    def next_test_batch(self):
        """Yield normalized test inputs forever; labels are not included.

        Yields:
            The normalized inputs of the next test batch.
        """
        while True:
            start, stop = self._next_slice('test')

            x_test = self.dataset['test']['dt'][start:stop]
            x_test = pp.normalization(x_test)

            yield x_test
                
                
                
            
            
        

In [41]:
class Tokenizer():
    """Translate between text and sequences of character indices.

    Index 0 is the padding token, index 1 the unknown-character token and
    the characters passed as ``chars`` follow from index 2 onwards.
    """

    def __init__(self, chars, max_TextLenght):
        """Build the vocabulary.

        Args:
            chars: String with every character that gets its own index.
            max_TextLenght: Stored as ``maxlen``; it is not enforced by
                ``encode`` itself.
        """
        self.PAD_TK, self.UNK_TK = "¶", "¤"
        self.chars = (self.PAD_TK + self.UNK_TK + chars)

        self.PAD = self.chars.find(self.PAD_TK)
        self.UNK = self.chars.find(self.UNK_TK)

        self.vocab_size = len(self.chars)
        self.maxlen = max_TextLenght

    def encode(self, text):
        """Encode ``text`` as an integer array of vocabulary indices.

        The text is first reduced to ASCII (NFKD decomposition, characters
        without an ASCII form are dropped) and runs of whitespace are
        collapsed to single spaces. Characters that are not part of the
        vocabulary are mapped to the UNK index.

        Args:
            text: The string to encode.

        Returns:
            One-dimensional integer numpy array with one index per character
            of the cleaned text; empty when no character remains.
        """
        text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("ASCII")
        text = " ".join(text.split())

        positions = (self.chars.find(item) for item in text)
        encoded = [self.UNK if index == -1 else index for index in positions]

        # Pin the dtype: without it an empty list would become a float64
        # array while every other input yields integers.
        return np.asarray(encoded, dtype=int)

    def decode(self, text):
        """Decode a sequence of vocabulary indices back to text.

        Negative entries are skipped, PAD tokens are removed and the result
        is passed through ``pp.text_standardize``.

        Args:
            text: Iterable of numeric indices.

        Returns:
            The decoded, standardized string.
        """
        decoded = "".join([self.chars[int(x)] for x in text if x > -1])
        decoded = self.remove_tokens(decoded)
        decoded = pp.text_standardize(decoded)

        return decoded

    def remove_tokens(self, text):
        """Return ``text`` with every PAD token removed."""
        return text.replace(self.PAD_TK, "")
        
    
    
            