# Character Based NMT - Data Based Translation of HTML Tables to JSON objects

## Notebook Setup
The easy way to run this is to use Google Colab. You can run this notebook by uploading it to Google Colab and running it there. You can also run this notebook locally by installing the required libraries from the requirements.txt file.

If you decided to run this notebook with Google Colab upload the `html2json.zip` file to the runtime and unzip it using the next commented out cell.
Alongside the file project it will also unzip the html and json data files.

If you decided to run this notebook locally, you can skip the next cell.

In [26]:
### Uncomment this cell if you are running this notebook on Google Colab
# !unzip html2json.zip
# !pip install evaluate

In [None]:
### Uncomment this cell if you are running this notebook locally
# !unzip dataset.zip

### Importing Required Libraries

In [1]:
import torch
from torch import nn
from functools import partial
from html2json import HTML_JSON_Dataset, padding_collate_fn
from torch.utils.data import DataLoader, random_split
from html2json.charactertokenizer import HTMLTokenizer, JSONTokenizer
from html2json.charactertokenizer import MASK_TOKEN
from html2json import load_data, reverse_tokenized_json
from html2json.seq2seq import Seq2SeqTransformer
from html2json.seq2seq import translate_greedy_search, translate_beam_search
from html2json.training import train_epoch, evaluate
from timeit import default_timer as timer
from evaluate import load
import os

### CUDA Availablity

In [27]:
torch.cuda.empty_cache()
torch.cuda.is_available()

True

In [3]:
# Set the device to GPU if available
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Setting the paths for the HTML and JSON data directories
html_pth = 'generated_tables/tables'
json_pth = 'generated_tables/metadata'

## Data Loading and Preprocessing

In [5]:
# data loading
html_data, json_data = load_data(html_pth, json_pth, as_string=False, limit=None)

In [6]:
# Building the tokenizers
if os.path.exists("./assets/html_tokenizer.pkl"):
    print("Loading HTML tokenizer")
    html_tokenizer = HTMLTokenizer.load("./assets/html_tokenizer.pkl")
else:
    html_tokenizer = HTMLTokenizer(html_data)
    html_tokenizer.save("./assets/html_tokenizer.pkl")
if os.path.exists("./assets/json_tokenizer.pkl"):
    print("Loading JSON tokenizer")
    json_tokenizer = JSONTokenizer.load("./assets/json_tokenizer.pkl")
else:
    json_tokenizer = JSONTokenizer(json_data)
    json_tokenizer.save("./assets/json_tokenizer.pkl")

Loading HTML tokenizer
Loading JSON tokenizer


In [7]:
# Loading data as strings, tokenizing and creating the dataset
collate_fn = partial(padding_collate_fn, pad_token_html = MASK_TOKEN, pad_token_json = MASK_TOKEN)
html_data_str, json_data_str = load_data(html_pth, json_pth, as_string=True, limit=None)
h2j_dataset = HTML_JSON_Dataset([html_tokenizer.encode(h) for h in html_data_str], [json_tokenizer.encode(j) for j in json_data_str])

In [8]:
# Splitting the dataset into training and validation sets
train_set, val_set = random_split(h2j_dataset, [0.8, 0.2], torch.Generator().manual_seed(42))

## Training the NMT seq2seq Model

In [None]:
# Setting the hyperparameters for the model
torch.manual_seed(42)
SRC_VOCAB_SIZE = len(html_tokenizer)
TGT_VOCAB_SIZE = len(json_tokenizer)
EMB_SIZE = 256
NHEAD = 8
FFN_HID_DIM = 4096
BATCH_SIZE = 32
NUM_ENCODER_LAYERS = 1
NUM_DECODER_LAYERS = 1
LR = 0.001
NUM_EPOCHS = 40

In [None]:
# Creating the dataloaders
torch.cuda.empty_cache()
train_dataloader = DataLoader(train_set, batch_size=BATCH_SIZE, collate_fn=collate_fn)
validation_dataloader = DataLoader(val_set, batch_size=BATCH_SIZE, collate_fn=collate_fn)

In [10]:
# Creating the model and loading the pretrained model if available
transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

if os.path.exists("./assets/transformer.pt"):
    transformer.load_state_dict(torch.load("./assets/transformer.pt", map_location=torch.device(DEVICE)))
else:
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
transformer = transformer.to(DEVICE)

  transformer.load_state_dict(torch.load("./assets/transformer.pt", map_location=torch.device(DEVICE)))


In [11]:
# Setting the loss function, optimizer and learning rate scheduler
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=MASK_TOKEN)
optimizer = torch.optim.Adam(transformer.parameters(), lr=LR, betas=(0.9, 0.98), eps=1e-9)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, threshold=0.1, threshold_mode='rel')

In [16]:
# Training the model
for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    transformer.train()
    train_loss = train_epoch(transformer, optimizer, train_dataloader, loss_fn)
    end_time = timer()
    scheduler.step(train_loss)
    # evaluation
    transformer.eval()
    val_loss = evaluate(transformer, validation_dataloader, loss_fn)
    # add save model checkpoint every 20 epochs
    if epoch % 5 == 0:
        torch.save({
                'epoch': epoch,
                'model_state_dict': transformer.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': train_loss,
                }, f"./checkpoints/checkpoint_{epoch}.pt")
    # val_loss = evaluate(transformer)

    print(f"Epoch: {epoch}, Train loss: {train_loss:.5f}, Val loss: {val_loss:.5f}, "f"Epoch time = {(end_time - start_time):.3f}s, lr: {scheduler.get_last_lr()}")
# save the model after training
torch.save(transformer.state_dict(), "./assets/transformer.pt")



Epoch: 1, Train loss: 1.62429, Val loss: 0.93622, Epoch time = 201.339s, lr: [0.001]
Epoch: 2, Train loss: 0.78052, Val loss: 0.34480, Epoch time = 200.476s, lr: [0.001]
Epoch: 3, Train loss: 0.34372, Val loss: 0.04702, Epoch time = 200.059s, lr: [0.001]
Epoch: 4, Train loss: 0.13985, Val loss: 0.01239, Epoch time = 199.942s, lr: [0.001]
Epoch: 5, Train loss: 0.07132, Val loss: 0.00495, Epoch time = 200.083s, lr: [0.001]
Epoch: 6, Train loss: 0.04995, Val loss: 0.00357, Epoch time = 199.928s, lr: [0.001]
Epoch: 7, Train loss: 0.04093, Val loss: 0.00302, Epoch time = 199.706s, lr: [0.001]
Epoch: 8, Train loss: 0.03551, Val loss: 0.00235, Epoch time = 200.038s, lr: [0.001]
Epoch: 9, Train loss: 0.03179, Val loss: 0.00177, Epoch time = 199.950s, lr: [0.001]
Epoch: 10, Train loss: 0.02919, Val loss: 0.00177, Epoch time = 199.931s, lr: [0.001]
Epoch: 11, Train loss: 0.02691, Val loss: 0.00157, Epoch time = 199.969s, lr: [0.001]
Epoch: 12, Train loss: 0.02483, Val loss: 0.00151, Epoch time =

## Model Evaluation

### Human Evaluation

In [21]:
# generating the validation indices for evaluation - using the same seed to get the same samples
train_idx, val_idx = random_split(range(len(h2j_dataset)), [0.8, 0.2], torch.Generator().manual_seed(42))

In [None]:
#### Change the sample_num to evaluate a different sample

In [22]:
sample_num = 0
idx = val_idx[sample_num]
html_idx = html_data_str[idx]

### Greedy Search Translation

In [28]:
# Translating the HTML to JSON using greedy search
pred_greedy = translate_greedy_search(transformer, html_idx, html_tokenizer, json_tokenizer)

In [15]:
# displaying the original JSON (preprocessed)
json_data_str[idx]

'[{]["body"][:][{]["content"][:][[]"648"[,]"836%"[,]"88"[,]"737"[,]"928%"[,]"692%"[,]"445"[]][,]["headers"][:][{]["col"][:][[]"Freeman-Guerra"[,]"Harper-Wells"[,]"Johnson, Lewis and Harrington"[,]"Burton-Tran"[,]"Johnson and Sons"[,]"Campbell, Smith and Wilson"[,]"Thomas, Kelly and Griffin"[]][,]["row"][:][[]"April Lee"[]][}][}][,]["footer"][:][{]["table_creation_date:"][:]"28Jul2009"[,]["text"][:]"Creation: 28Jul2009 Nigeria"[}][,]["header"][:][{]["table_id"][:]"45.41"[,]["text"][:]"Table 45.41 Interpreter"[}][}]'

In [16]:
# displaying the predicted JSON (preprocessed)
pred_greedy[5:-5]

'[{]["body"][:][{]["content"][:][[]"648"[,]"836%"[,]"88"[,]"737"[,]"928%"[,]"692%"[,]"445"[]][,]["headers"][:][{]["col"][:][[]"Freeman-Guerra"[,]"Harper-Wells"[,]"Johnson, Lewis and Harrington"[,]"Burton-Tran"[,]"Johnson and Sons"[,]"Campbell, Smith and Wilson"[,]"Thomas, Kelly and Griffin"[]][,]["row"][:][[]"April Lee"[]][}][}][,]["footer"][:][{]["table_creation_date:"][:]"28Jul2009"[,]["text"][:]"Creation: 28Jul2009 Nigeria"[}][,]["header"][:][{]["table_id"][:]"45.41"[,]["text"][:]"Table 45.41 Interpreter"[}][}]'

In [17]:
# Checking if the predicted JSON is the same as the original JSON
pred_greedy[5:-5] == json_data_str[idx]

True

In [18]:
# Reversing the tokenized JSON to get the original JSON object
json_pred_greedy = reverse_tokenized_json(pred_greedy, json_tokenizer)
json_pred_greedy

{'body': {'content': ['648', '836%', '88', '737', '928%', '692%', '445'],
  'headers': {'col': ['Freeman-Guerra',
    'Harper-Wells',
    'Johnson, Lewis and Harrington',
    'Burton-Tran',
    'Johnson and Sons',
    'Campbell, Smith and Wilson',
    'Thomas, Kelly and Griffin'],
   'row': ['April Lee']}},
 'footer': {'table_creation_date:': '28Jul2009',
  'text': 'Creation: 28Jul2009 Nigeria'},
 'header': {'table_id': '45.41', 'text': 'Table 45.41 Interpreter'}}

### Beam Search Translation

In [29]:
# Translating the HTML to JSON using beam search
pred_beam = translate_beam_search(transformer, html_idx, html_tokenizer, json_tokenizer)

In [30]:
# displaying the original JSON (preprocessed)
json_data_str[idx]

'[{]["body"][:][{]["content"][:][[]"648"[,]"836%"[,]"88"[,]"737"[,]"928%"[,]"692%"[,]"445"[]][,]["headers"][:][{]["col"][:][[]"Freeman-Guerra"[,]"Harper-Wells"[,]"Johnson, Lewis and Harrington"[,]"Burton-Tran"[,]"Johnson and Sons"[,]"Campbell, Smith and Wilson"[,]"Thomas, Kelly and Griffin"[]][,]["row"][:][[]"April Lee"[]][}][}][,]["footer"][:][{]["table_creation_date:"][:]"28Jul2009"[,]["text"][:]"Creation: 28Jul2009 Nigeria"[}][,]["header"][:][{]["table_id"][:]"45.41"[,]["text"][:]"Table 45.41 Interpreter"[}][}]'

In [31]:
# displaying the predicted JSON (preprocessed)
pred_beam[5:-5]

'[{]["body"][:][{]["content"][:][[]"648"[,]"836%"[,]"88"[,]"737"[,]"928%"[,]"692%"[,]"445"[]][,]["headers"][:][{]["col"][:][[]"Freeman-Guerra"[,]"Harper-Wells"[,]"Johnson, Lewis and Harrington"[,]"Burton-Tran"[,]"Johnson and Sons"[,]"Campbell, Smith and Wilson"[,]"Thomas, Kelly and Griffin"[]][,]["row"][:][[]"April Lee"[]][}][}][,]["footer"][:][{]["table_creation_date:"][:]"28Jul2009"[,]["text"][:]"Creation: 28Jul2009 Nigeria"[}][,]["header"][:][{]["table_id"][:]"45.41"[,]["text"][:]"Table 45.41 Interpreter"[}][}]'

In [32]:
# checking if the predicted JSON is the same as the original JSON
pred_beam[5:-5] == json_data_str[idx]

True

In [18]:
# Reversing the tokenized JSON to get the original JSON object
json_pred_beam = reverse_tokenized_json(pred_beam, json_tokenizer)
json_pred_beam

{'body': {'content': ['648', '836%', '88', '737', '928%', '692%', '445'],
  'headers': {'col': ['Freeman-Guerra',
    'Harper-Wells',
    'Johnson, Lewis and Harrington',
    'Burton-Tran',
    'Johnson and Sons',
    'Campbell, Smith and Wilson',
    'Thomas, Kelly and Griffin'],
   'row': ['April Lee']}},
 'footer': {'table_creation_date:': '28Jul2009',
  'text': 'Creation: 28Jul2009 Nigeria'},
 'header': {'table_id': '45.41', 'text': 'Table 45.41 Interpreter'}}

### BLEU Score Evaluation

In [19]:
bleu = load("bleu")

In [24]:
samples_to_evaluate = 100
predictions = [translate_beam_search(transformer, html_data_str[idx], html_tokenizer, json_tokenizer) for i, idx in enumerate(val_idx) if i <= samples_to_evaluate]
references = [json_data_str[idx] for i, idx in enumerate(val_idx) if i <= samples_to_evaluate]

In [25]:
bleu.compute(predictions=predictions, references=references)

{'bleu': 0.9840250297524256,
 'precisions': [0.9840887174541948,
  0.9840464104423495,
  0.9840038778477945,
  0.9839611178614823],
 'brevity_penalty': 1.0,
 'length_ratio': 1.0161685448309652,
 'translation_length': 4148,
 'reference_length': 4082}