### Working with the parser

In [14]:
from utils import parse_for_tables, preprocess
import spacy

# Load the spaCy model "de_core_news_sm" once; `nlp` is the shared object used
# by tokenize_de (tokenizer only) and translate_sentence (called on raw strings).
nlp = spacy.load("de_core_news_sm")

In [15]:
# Parse one example balance-sheet HTML file and keep the first item returned.
tables = parse_for_tables("./ML Task/balance_sheets_examples/random_cases/0.html")
data = tables[0]
# `data` is rebound from the raw item to its preprocessed form.
data = preprocess(data)
# Display only: the lower-cased result is shown as cell output and not stored.
data.lower()

'robako gasstraßenbeleuchtung gmbh berlin jahresabschluss zum geschäftsjahr vom 01.01.2008 bis zum 31.12.2008 bilanz aktiva 31.12.2008 eur a. anlagevermögen 2.537,48 i. sachanlagen 37,48 1. andere anlagen, betriebs und geschäftsausstattung 37,48 ii. finanzanlagen 2.500,00 1. sonstige ausleihungen 2.500,00 b. umlaufvermögen 29.120,72 i. forderungen und sonstige vermögensgegenstände 28.844,22 1. forderungen aus lieferungen und leistungen 28.397,35 2. sonstige vermögensgegenstände 446,87 ii. kassenbestand, bundesbankguthaben, guthaben bei kreditinstituten und schecks 276,50 bilanzsumme, summe aktiva 31.658,20 passiva 31.12.2008 eur a. eigenkapital 13.161,23 i. gezeichnetes kapital 25.564,59 ii. verlustvortrag 15.780,67 iii. jahresüberschuss 3.377,31 b. verbindlichkeiten 18.496,97 1. verbindlichkeiten gegenüber kreditinstituten 8.964,02 davon mit einer restlaufzeit bis zu einem jahr 8.964,02 2. verbindlichkeiten aus lieferungen und leistungen 2.762,29 davon mit einer restlaufzeit bis zu ei

In [16]:
def tokenize_de(data):
    """Split ``data`` into a list of token strings.

    Only the tokenizer of the module-level spaCy object ``nlp`` is run; the
    text of each resulting token is returned in order.
    """
    return [token.text for token in nlp.tokenizer(data)]

In [17]:
# Show how the lower-cased example text is split into tokens.
tokenize_de(data.lower())

['robako',
 'gasstraßenbeleuchtung',
 'gmbh',
 'berlin',
 'jahresabschluss',
 'zum',
 'geschäftsjahr',
 'vom',
 '01.01.2008',
 'bis',
 'zum',
 '31.12.2008',
 'bilanz',
 'aktiva',
 '31.12.2008',
 'eur',
 'a.',
 'anlagevermögen',
 '2.537,48',
 'i.',
 'sachanlagen',
 '37,48',
 '1.',
 'andere',
 'anlagen',
 ',',
 'betriebs',
 'und',
 'geschäftsausstattung',
 '37,48',
 'ii',
 '.',
 'finanzanlagen',
 '2.500,00',
 '1.',
 'sonstige',
 'ausleihungen',
 '2.500,00',
 'b.',
 'umlaufvermögen',
 '29.120,72',
 'i.',
 'forderungen',
 'und',
 'sonstige',
 'vermögensgegenstände',
 '28.844,22',
 '1.',
 'forderungen',
 'aus',
 'lieferungen',
 'und',
 'leistungen',
 '28.397,35',
 '2.',
 'sonstige',
 'vermögensgegenstände',
 '446,87',
 'ii',
 '.',
 'kassenbestand',
 ',',
 'bundesbankguthaben',
 ',',
 'guthaben',
 'bei',
 'kreditinstituten',
 'und',
 'schecks',
 '276,50',
 'bilanzsumme',
 ',',
 'summe',
 'aktiva',
 '31.658,20',
 'passiva',
 '31.12.2008',
 'eur',
 'a.',
 'eigenkapital',
 '13.161,23',
 'i.',
 

### Dataset example

In [32]:
from torchtext.data import Field, BucketIterator, TabularDataset
import pandas as pd
import torch

# Load the dataset CSV into a DataFrame for inspection.
df = pd.read_csv("./dataset.csv")

In [33]:
# Display the loaded frame as the cell output.
df

Unnamed: 0,src,trg
0,robako gasstraßenbeleuchtung gmbh berlin jahre...,"aktiva : anlagevermögen : 2.537,48 aktiva : sa..."
1,transporte bauer gmbh bockhorn jahresabschluss...,"aktiva : umlaufvermögen : 25.023,63 | aktiva :..."


In [34]:
# One Field is shared by both columns, so source and target text go through the
# same tokenizer, the same <sos>/<eos> markers and (later) the same vocabulary.
field_ = Field(
    tokenize=tokenize_de,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
    batch_first=True,
)

# Pair each dataset column name with the shared field.
fields = [(column, field_) for column in ('src', 'trg')]

In [35]:
# Build the three dataset splits from CSV using the (name, field) pairs in `fields`.
# NOTE(review): train, validation and test all read the same "dataset.csv", so
# the validation/test splits are not held-out data here.
train_data, valid_data, test_data = TabularDataset.splits(
                                        path = './',
                                        train = "dataset.csv",
                                        validation = "dataset.csv",
                                        test = "dataset.csv",
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True
)

In [36]:
# Build the shared vocabulary from the training split only.
field_.build_vocab(train_data, min_freq=1)

In [37]:
# Inspect the token -> index mapping of the shared vocabulary.
field_.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x000001A09FC87C08>>,
            {'<unk>': 0,
             '<pad>': 1,
             '<sos>': 2,
             '<eos>': 3,
             ':': 4,
             '|': 5,
             'passiva': 6,
             'verbindlichkeiten': 7,
             'aktiva': 8,
             'und': 9,
             'sonstige': 10,
             ',': 11,
             'eigenkapital': 12,
             'umlaufvermögen': 13,
             'bis': 14,
             'davon': 15,
             'forderungen': 16,
             'vermögensgegenstände': 17,
             'aus': 18,
             'kreditinstituten': 19,
             '25.023,63': 20,
             'einem': 21,
             'einer': 22,
             'jahr': 23,
             'mit': 24,
             'restlaufzeit': 25,
             'zu': 26,
             '.': 27,
             'i.': 28,
             'leistungen': 29,
             'lieferungen': 30,
             '1.': 31,
             '2

In [38]:
# Examples per batch.
BATCH_SIZE = 2

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [39]:
# Build one batch iterator per split, placing batches on `device`.
# `BucketIterator` is imported by name from torchtext.data; in this notebook
# `data` is the preprocessed table text, so it cannot be used as a module
# prefix here (doing so raises AttributeError on a fresh kernel).
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device
)

### Model

In [40]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np

import random
import math
import time

In [41]:
SEED = 1234

# Seed every random number generator the notebook relies on with the same value:
# Python's `random`, NumPy, and PyTorch (CPU and CUDA).
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(SEED)

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [65]:
class Encoder(nn.Module):
    """Convolutional sequence encoder made of gated, residual Conv1d blocks.

    Token and position embeddings are summed, projected from ``emb_dim`` to
    ``hid_dim``, passed through ``n_layers`` length-preserving convolutions
    (each followed by GLU and a scaled residual connection), and projected
    back to ``emb_dim``.

    Args:
        input_dim: number of rows in the token embedding table.
        emb_dim: embedding size.
        hid_dim: channel count inside the convolutional stack.
        n_layers: number of convolutional blocks.
        kernel_size: convolution width; must be odd so that symmetric padding
            keeps the sequence length unchanged.
        dropout: dropout probability.
        device: device on which the scale constant and position indices live.
        max_length: number of rows in the position embedding table.
    """

    def __init__(self,
                 input_dim,
                 emb_dim,
                 hid_dim,
                 n_layers,
                 kernel_size,
                 dropout,
                 device,
                 max_length = 100):
        super().__init__()

        assert kernel_size % 2 == 1, "Kernel size must be odd!"

        self.device = device

        # sqrt(0.5), applied after every residual sum
        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)

        self.tok_embedding = nn.Embedding(input_dim, emb_dim)
        self.pos_embedding = nn.Embedding(max_length, emb_dim)

        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)

        # each block emits 2 * hid_dim channels; GLU halves them again
        same_padding = (kernel_size - 1) // 2
        conv_blocks = []
        for _ in range(n_layers):
            conv_blocks.append(nn.Conv1d(in_channels = hid_dim,
                                         out_channels = 2 * hid_dim,
                                         kernel_size = kernel_size,
                                         padding = same_padding))
        self.convs = nn.ModuleList(conv_blocks)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of token indices.

        Args:
            src: index tensor, [batch size, src len].

        Returns:
            (conved, combined), both [batch size, src len, emb dim]: the output
            of the convolutional stack, and that output summed with the input
            embeddings and scaled.
        """
        n_batch, n_tokens = src.shape[0], src.shape[1]

        # positions 0 .. src len - 1, repeated for every row of the batch
        #positions = [batch size, src len]
        positions = torch.arange(0, n_tokens).unsqueeze(0).repeat(n_batch, 1).to(self.device)

        # sum of token and position embeddings, with dropout
        #embedded = [batch size, src len, emb dim]
        embedded = self.dropout(self.tok_embedding(src) + self.pos_embedding(positions))

        # project to hid dim and move channels in front of the length axis
        #block_in = [batch size, hid dim, src len]
        block_in = self.emb2hid(embedded).permute(0, 2, 1)

        for conv in self.convs:
            # conv -> [batch size, 2 * hid dim, src len]; GLU -> [batch size, hid dim, src len]
            gated = F.glu(conv(self.dropout(block_in)), dim = 1)

            # scaled residual connection; the result feeds the next block
            block_out = (gated + block_in) * self.scale
            block_in = block_out

        # back to [batch size, src len, emb dim]
        conved = self.hid2emb(block_out.permute(0, 2, 1))

        # output plus input embeddings, used on the attention value side
        combined = (conved + embedded) * self.scale

        return conved, combined

In [66]:
class Decoder(nn.Module):
    """Convolutional sequence decoder with attention over the encoder outputs.

    Each block left-pads its input, applies a Conv1d followed by GLU, attends
    over the encoder outputs, and adds a scaled residual connection. The final
    hidden states are projected to ``emb_dim`` and then to ``output_dim``
    scores per target position.

    Args:
        output_dim: size of the target vocabulary (embedding rows and output scores).
        emb_dim: embedding size.
        hid_dim: channel count inside the convolutional stack.
        n_layers: number of convolutional blocks.
        kernel_size: convolution width.
        dropout: dropout probability.
        trg_pad_idx: value used to fill the left padding of each block's input.
        device: device on which the scale constant, positions and padding live.
        max_length: number of rows in the position embedding table.
    """

    def __init__(self,
                 output_dim,
                 emb_dim,
                 hid_dim,
                 n_layers,
                 kernel_size,
                 dropout,
                 trg_pad_idx,
                 device,
                 max_length = 100):
        super().__init__()

        self.kernel_size = kernel_size
        self.trg_pad_idx = trg_pad_idx
        self.device = device

        # sqrt(0.5), applied after every residual sum
        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)

        self.tok_embedding = nn.Embedding(output_dim, emb_dim)
        self.pos_embedding = nn.Embedding(max_length, emb_dim)

        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)

        self.attn_hid2emb = nn.Linear(hid_dim, emb_dim)
        self.attn_emb2hid = nn.Linear(emb_dim, hid_dim)

        self.fc_out = nn.Linear(emb_dim, output_dim)

        # no built-in padding: forward() pads on the left only
        conv_blocks = []
        for _ in range(n_layers):
            conv_blocks.append(nn.Conv1d(in_channels = hid_dim,
                                         out_channels = 2 * hid_dim,
                                         kernel_size = kernel_size))
        self.convs = nn.ModuleList(conv_blocks)

        self.dropout = nn.Dropout(dropout)

    def calculate_attention(self, embedded, conved, encoder_conved, encoder_combined):
        """Attend over the encoder outputs for every target position.

        Args:
            embedded: target embeddings, [batch size, trg len, emb dim].
            conved: current block output, [batch size, hid dim, trg len].
            encoder_conved: encoder outputs scored against the query,
                [batch size, src len, emb dim].
            encoder_combined: encoder outputs that are averaged by the
                attention weights, [batch size, src len, emb dim].

        Returns:
            (attention, attended_combined) with shapes
            [batch size, trg len, src len] and [batch size, hid dim, trg len].
        """
        # bring the block output back to emb dim and mix in the target embeddings
        #query = [batch size, trg len, emb dim]
        conved_emb = self.attn_hid2emb(conved.permute(0, 2, 1))
        query = (conved_emb + embedded) * self.scale

        # dot-product scores, normalised over the source positions
        #attention = [batch size, trg len, src len]
        energy = torch.matmul(query, encoder_conved.permute(0, 2, 1))
        attention = F.softmax(energy, dim=2)

        # weighted sum of encoder states, projected emb dim -> hid dim
        #context = [batch size, trg len, hid dim]
        context = self.attn_emb2hid(torch.matmul(attention, encoder_combined))

        # scaled residual connection around the attention step
        #attended_combined = [batch size, hid dim, trg len]
        attended_combined = (conved + context.permute(0, 2, 1)) * self.scale

        return attention, attended_combined

    def forward(self, trg, encoder_conved, encoder_combined):
        """Score the next token at every target position.

        Args:
            trg: target index tensor, [batch size, trg len].
            encoder_conved: first encoder output, [batch size, src len, emb dim].
            encoder_combined: second encoder output, [batch size, src len, emb dim].

        Returns:
            (output, attention): scores of shape
            [batch size, trg len, output dim] and the attention weights of the
            last block, [batch size, trg len, src len].
        """
        batch_size, trg_len = trg.shape[0], trg.shape[1]

        #pos = [batch size, trg len]
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)

        # sum of token and position embeddings, with dropout
        #embedded = [batch size, trg len, emb dim]
        embedded = self.dropout(self.tok_embedding(trg) + self.pos_embedding(pos))

        # project to hid dim and move channels in front of the length axis
        #conv_input = [batch size, hid dim, trg len]
        conv_input = self.emb2hid(embedded).permute(0, 2, 1)

        batch_size, hid_dim = conv_input.shape[0], conv_input.shape[1]

        # kernel_size - 1 columns prepended to every block input so that a
        # position never sees later positions; filled with the value trg_pad_idx
        left_pad = torch.zeros(batch_size,
                               hid_dim,
                               self.kernel_size - 1).fill_(self.trg_pad_idx).to(self.device)

        for conv in self.convs:

            # the dropped-out input is also what the residual connection adds back
            conv_input = self.dropout(conv_input)

            #padded input = [batch size, hid dim, trg len + kernel size - 1]
            #after conv   = [batch size, 2 * hid dim, trg len]
            #after GLU    = [batch size, hid dim, trg len]
            conved = F.glu(conv(torch.cat((left_pad, conv_input), dim = 2)), dim = 1)

            #attention = [batch size, trg len, src len]
            attention, conved = self.calculate_attention(embedded,
                                                         conved,
                                                         encoder_conved,
                                                         encoder_combined)

            # scaled residual connection; the result feeds the next block
            conved = (conved + conv_input) * self.scale
            conv_input = conved

        #conved = [batch size, trg len, emb dim]
        conved = self.hid2emb(conved.permute(0, 2, 1))

        #output = [batch size, trg len, output dim]
        output = self.fc_out(self.dropout(conved))

        return output, attention

In [67]:
class Seq2Seq(nn.Module):
    """Thin wrapper that chains an encoder and a decoder."""

    def __init__(self, encoder, decoder):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg):
        """Encode ``src`` and decode ``trg`` against the encoder outputs.

        Args:
            src: source indices, [batch size, src len].
            trg: target indices as passed by the caller; the training loop
                passes the target with its last position sliced off,
                [batch size, trg len - 1].

        Returns:
            (output, attention) exactly as produced by the decoder:
            [batch size, trg len - 1, output dim] and
            [batch size, trg len - 1, src len].
        """
        # the encoder yields two tensors, both [batch size, src len, emb dim]:
        # the conv-stack output and that output combined with the embeddings
        encoder_conved, encoder_combined = self.encoder(src)

        # next-token scores for every target position, plus attention weights
        decoded = self.decoder(trg, encoder_conved, encoder_combined)
        output, attention = decoded

        return output, attention

In [68]:
# Source and target share one vocabulary, so both dimensions are its size.
INPUT_DIM = len(field_.vocab)
OUTPUT_DIM = len(field_.vocab)

EMB_DIM = 256
HID_DIM = 512  # each conv. layer has 2 * hid_dim filters
MAX_LEN = 300  # rows in the position embedding tables of encoder and decoder

ENC_LAYERS = 10  # number of conv. blocks in encoder
ENC_KERNEL_SIZE = 3  # must be odd!
ENC_DROPOUT = 0.25

DEC_LAYERS = 10  # number of conv. blocks in decoder
DEC_KERNEL_SIZE = 3  # can be even or odd
DEC_DROPOUT = 0.25

# Index of the padding token; also ignored by the loss.
TRG_PAD_IDX = field_.vocab.stoi[field_.pad_token]

enc = Encoder(input_dim=INPUT_DIM,
              emb_dim=EMB_DIM,
              hid_dim=HID_DIM,
              n_layers=ENC_LAYERS,
              kernel_size=ENC_KERNEL_SIZE,
              dropout=ENC_DROPOUT,
              device=device,
              max_length=MAX_LEN)
dec = Decoder(output_dim=OUTPUT_DIM,
              emb_dim=EMB_DIM,
              hid_dim=HID_DIM,
              n_layers=DEC_LAYERS,
              kernel_size=DEC_KERNEL_SIZE,
              dropout=DEC_DROPOUT,
              trg_pad_idx=TRG_PAD_IDX,
              device=device,
              max_length=MAX_LEN)

model = Seq2Seq(enc, dec).to(device)

In [69]:
def count_parameters(model):
    """Return the total number of elements in the parameters of ``model`` that require gradients."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Report the model size with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 32,503,148 trainable parameters


In [70]:
# Adam over all model parameters; no hyperparameters are overridden.
optimizer = optim.Adam(model.parameters())

In [71]:
# Cross-entropy loss; targets equal to the padding index are ignored.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [72]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training pass over ``iterator`` and return the mean batch loss.

    Args:
        model: called as ``model(src, trg_input)``; must return a pair whose
            first item has shape [batch size, trg len - 1, output dim].
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``
            ([batch size, trg len]).
        optimizer: optimizer stepping the model parameters.
        criterion: loss taking flattened scores and flattened target indices.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        source, target = batch.src, batch.trg

        optimizer.zero_grad()

        # feed the target without its last position ...
        #scores = [batch size, trg len - 1, output dim]
        scores, _ = model(source, target[:, :-1])

        # ... and compare against the target without its first position
        n_classes = scores.shape[-1]
        flat_scores = scores.contiguous().view(-1, n_classes)
        flat_target = target[:, 1:].contiguous().view(-1)

        #flat_scores = [batch size * (trg len - 1), output dim]
        #flat_target = [batch size * (trg len - 1)]
        loss = criterion(flat_scores, flat_target)
        loss.backward()

        # clip the gradient norm before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

In [73]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [74]:
# Number of passes over the training iterator and the gradient-norm limit
# handed to train().
N_EPOCHS = 200
CLIP = 0.1

# Train for N_EPOCHS epochs, timing each one and printing its mean loss and
# perplexity (exp of the loss). Only the training iterator is used here.
for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

Epoch: 01 | Time: 0m 0s
	Train Loss: 4.655 | Train PPL: 105.155
Epoch: 02 | Time: 0m 0s
	Train Loss: 4.125 | Train PPL:  61.875
Epoch: 03 | Time: 0m 0s
	Train Loss: 4.602 | Train PPL:  99.676
Epoch: 04 | Time: 0m 0s
	Train Loss: 3.048 | Train PPL:  21.083
Epoch: 05 | Time: 0m 0s
	Train Loss: 2.903 | Train PPL:  18.233
Epoch: 06 | Time: 0m 0s
	Train Loss: 2.616 | Train PPL:  13.688
Epoch: 07 | Time: 0m 0s
	Train Loss: 2.459 | Train PPL:  11.689
Epoch: 08 | Time: 0m 0s
	Train Loss: 2.261 | Train PPL:   9.593
Epoch: 09 | Time: 0m 0s
	Train Loss: 2.126 | Train PPL:   8.384
Epoch: 10 | Time: 0m 0s
	Train Loss: 2.129 | Train PPL:   8.406
Epoch: 11 | Time: 0m 0s
	Train Loss: 1.970 | Train PPL:   7.168
Epoch: 12 | Time: 0m 0s
	Train Loss: 1.780 | Train PPL:   5.927
Epoch: 13 | Time: 0m 0s
	Train Loss: 1.710 | Train PPL:   5.528
Epoch: 14 | Time: 0m 0s
	Train Loss: 1.750 | Train PPL:   5.753
Epoch: 15 | Time: 0m 0s
	Train Loss: 1.380 | Train PPL:   3.977
Epoch: 16 | Time: 0m 0s
	Train Loss: 1.3

Epoch: 130 | Time: 0m 0s
	Train Loss: 0.884 | Train PPL:   2.420
Epoch: 131 | Time: 0m 0s
	Train Loss: 0.652 | Train PPL:   1.920
Epoch: 132 | Time: 0m 0s
	Train Loss: 0.306 | Train PPL:   1.358
Epoch: 133 | Time: 0m 0s
	Train Loss: 0.318 | Train PPL:   1.374
Epoch: 134 | Time: 0m 0s
	Train Loss: 0.404 | Train PPL:   1.497
Epoch: 135 | Time: 0m 0s
	Train Loss: 0.381 | Train PPL:   1.464
Epoch: 136 | Time: 0m 0s
	Train Loss: 0.266 | Train PPL:   1.304
Epoch: 137 | Time: 0m 0s
	Train Loss: 0.343 | Train PPL:   1.410
Epoch: 138 | Time: 0m 0s
	Train Loss: 0.218 | Train PPL:   1.243
Epoch: 139 | Time: 0m 0s
	Train Loss: 0.362 | Train PPL:   1.436
Epoch: 140 | Time: 0m 0s
	Train Loss: 0.311 | Train PPL:   1.365
Epoch: 141 | Time: 0m 0s
	Train Loss: 0.194 | Train PPL:   1.215
Epoch: 142 | Time: 0m 0s
	Train Loss: 0.340 | Train PPL:   1.405
Epoch: 143 | Time: 0m 0s
	Train Loss: 0.199 | Train PPL:   1.221
Epoch: 144 | Time: 0m 0s
	Train Loss: 0.180 | Train PPL:   1.197
Epoch: 145 | Time: 0m 0s


In [75]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 50):
    """Greedily decode a target token sequence for one source sentence.

    Args:
        sentence: either a raw string (tokenized with the module-level ``nlp``)
            or an iterable of token strings; tokens are lower-cased either way.
        src_field: provides ``init_token``, ``eos_token`` and ``vocab.stoi``
            for the source side.
        trg_field: provides ``init_token``, ``eos_token``, ``vocab.stoi`` and
            ``vocab.itos`` for the target side.
        model: object with ``eval()``, ``encoder`` and ``decoder``.
        device: device the index tensors are moved to.
        max_len: maximum number of decoding steps.

    Returns:
        (tokens, attention): the predicted tokens without the initial token
        (the end-of-sequence token is included when it was produced), and the
        attention returned by the last decoder call.
    """
    model.eval()

    if isinstance(sentence, str):
        words = [token.text.lower() for token in nlp(sentence)]
    else:
        words = [token.lower() for token in sentence]

    # wrap the sentence in the start/end markers and map it to indices
    words = [src_field.init_token] + words + [src_field.eos_token]
    src_ids = [src_field.vocab.stoi[word] for word in words]

    # batch of one: [1, src len]
    src_tensor = torch.LongTensor(src_ids).unsqueeze(0).to(device)

    # the source is encoded once and reused for every decoding step
    with torch.no_grad():
        encoder_conved, encoder_combined = model.encoder(src_tensor)

    # the prediction starts with the initial token and grows by one per step
    predicted_ids = [trg_field.vocab.stoi[trg_field.init_token]]

    for _ in range(max_len):

        # re-run the decoder on everything predicted so far: [1, current len]
        trg_tensor = torch.LongTensor(predicted_ids).unsqueeze(0).to(device)

        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, encoder_conved, encoder_combined)

        # highest-scoring token at the last position
        next_id = output[:, -1].argmax(1).item()
        predicted_ids.append(next_id)

        if next_id == trg_field.vocab.stoi[trg_field.eos_token]:
            break

    predicted_tokens = [trg_field.vocab.itos[idx] for idx in predicted_ids]

    # drop the initial token
    return predicted_tokens[1:], attention

In [77]:
# Pick one training example and show its tokenized source and target.
example_idx = 1

example_fields = vars(train_data.examples[example_idx])
src, trg = example_fields['src'], example_fields['trg']

print(f'src = {src}')
print(f'trg = {trg}')

src = ['transporte', 'bauer', 'gmbh', 'bockhorn', 'jahresabschluss', 'zum', 'geschäftsjahr', 'vom', '01.01.2010', 'bis', 'zum', '31.12.2010', 'bilanz', 'aktiva', '31.12.2010', 'eur', 'a.', 'umlaufvermögen', '25.023,63', 'i.', 'forderungen', 'und', 'sonstige', 'vermögensgegenstände', '67,28', 'ii', '.', 'kassenbestand', ',', 'bundesbankguthaben', ',', 'guthaben', 'bei', 'kreditinstituten', 'und', 'schecks', '24.956,35', 'bilanzsumme', ',', 'summe', 'aktiva', '25.023,63', 'passiva', '31.12.2010', 'eur', 'a.', 'eigenkapital', '24.189,25', 'i.', 'gezeichnetes', 'kapital', '25.000,00', 'ii.', 'jahresfehlbetrag', '810,75', 'b.', 'rückstellungen', '400,00', 'c.', 'verbindlichkeiten', '434,38', 'bilanzsumme', ',', 'summe', 'passiva', '25.023,63']
trg = ['aktiva', ':', 'umlaufvermögen', ':', '25.023,63', '|', 'aktiva', ':', 'umlaufvermögen', ':', 'forderungen', 'und', 'sonstige', 'vermögensgegenstände', ':', '67,28', '|', 'aktiva', ':', 'umlaufvermögen', ':', 'kassenbestand', 'bundesbankguthabe

### Inference test

In [80]:
# Decode the training example selected above; `src` is already a token list,
# so translate_sentence only lower-cases it.
translation, attention = translate_sentence(src, field_, field_, model, device, max_len = MAX_LEN)

print(f'predicted trg = {translation}')

predicted trg = ['aktiva', ':', 'anlagevermögen', ':', '2.537,48', 'aktiva', ':', 'sachanlagen', ':', '37,48', '|', 'aktiva', ':', 'sachanlagen', ':', 'andere', 'anlagen', 'betriebs', 'und', 'geschäftsausstattung', ':', '37,48', '|', 'aktiva', ':', 'finanzanlagen', ':', '2.500,00', '|', 'aktiva', ':', 'finanzanlagen', ':', 'sonstige', 'ausleihungen', ':', '2.500,00', '|', 'aktiva', ':', 'umlaufvermögen', ':', '29.120,72', '|', 'aktiva', ':', 'umlaufvermögen', ':', 'forderungen', 'und', 'sonstige', 'vermögensgegenstände', ':', '28.844,22', '|', 'aktiva', ':', 'umlaufvermögen', ':', 'forderungen', 'und', 'sonstige', 'vermögensgegenstände', ':', 'forderungen', 'aus', 'lieferungen', 'und', 'leistungen', ':', '28.397,35', '|', 'aktiva', ':', 'umlaufvermögen', ':', 'forderungen', 'und', 'sonstige', 'vermögensgegenstände', ':', 'sonstige', 'vermögensgegenstände', ':', '28.397,35', '|', 'aktiva', ':', 'kassenbestand', 'bundesbankguthaben', 'guthaben', 'bei', 'kreditinstituten', 'und', 'schecks

In [84]:
# Parse a second document and preprocess its first returned item;
# this rebinds `tables` and `data` from the earlier parser cell.
tables = parse_for_tables("./ML Task/2011.html")
data = tables[0]
data = preprocess(data)
# Display only: the lower-cased result is shown as cell output and not stored.
data.lower()

'arcus planung + beratung gmbh & co. service kg cottbus jahresabschluss zum geschäftsjahr vom 01.01.2011 bis zum 31.12.2011 bilanz zum 31. dezember 2011 aktiva geschäftsjahr eur vorjahr eur a. umlaufvermögen i. forderungen und sonstige vermögensgegenstände 135.437,50 117.884,92 ii. kassenbestand, bundesbankguthaben, guthaben bei kreditinstituten und schecks 3.764,17 7.699,61 139.201,67 125.584,53 passiva geschäftsjahr eur vorjahr eur a. eigenkapital i. kapitalanteile kommanditisten 12.000,00 12.000,00 b. rückstellungen 19.153,34 22.175,60 c. verbindlichkeiten 108.048,33 91.408,93 139.201,67 125.584,53'

In [85]:
# Decode the second document. `data` is a string here, so translate_sentence
# tokenizes it with `nlp` and lower-cases the tokens itself.
translation, attention = translate_sentence(data, field_, field_, model, device, max_len = MAX_LEN)

print(f'predicted trg = {translation}')

predicted trg = ['aktiva', ':', 'umlaufvermögen', ':', '25.023,63', '|', 'aktiva', ':', 'umlaufvermögen', ':', 'forderungen', 'und', 'sonstige', 'vermögensgegenstände', ':', '67,28', '|', 'aktiva', ':', 'umlaufvermögen', ':', 'kassenbestand', 'bundesbankguthaben', 'guthaben', 'bei', 'kreditinstituten', 'und', 'schecks', ':', '276,50', '|', 'aktiva', ':', 'total', ':', '31.658,20', '||', 'passiva', ':', 'eigenkapital', ':', '13.161,23', '|', 'passiva', ':', 'eigenkapital', ' ', ':', 'gezeichnetes', 'kapital', ':', '25.000,00', '|', 'passiva', ':', 'eigenkapital', ' ', ':', 'jahresfehlbetrag', ':', '810,75', '|', 'passiva', ':', 'rückstellungen', ':', '400,00', '|', 'passiva', ':', 'verbindlichkeiten', ':', '434,38', '|', 'passiva', ':', 'total', ':', '25.023,63', '<eos>']
