In [1]:
from rdflib import Graph

g_train = Graph()
g_val = Graph()

g_train = g_train.parse('FB15k-237/train.nt', format='nt')
g_val   = g_val.parse('FB15k-237/valid.nt', format='nt')

#### Use objectives from T5 https://arxiv.org/abs/1910.10683

Also look at Bert paper: https://arxiv.org/abs/1810.04805

First is 'Bert Style' masked language modeling. (MLM)

* Corrupt 15% of input tokens. 
* 90% of the corrupted tokens are replaced with out-of-alphabet masking token
* 10% of corrupted tokens are replaced with random tokens

#### Other Sources

* https://towardsdatascience.com/how-to-train-a-bert-model-from-scratch-72cfce554fc6 as example of Bert training
* https://huggingface.co/blog/how-to-train
* tiny bert https://arxiv.org/abs/2110.01518

#### Notes

* Bert: "In contrast to denoising auto-encoders (Vincent et al., 2008), we only predict the masked words rather than reconstructing the entire input."
* Which tokenizer?! BPE useful? I think not. Look at https://huggingface.co/docs/transformers/main/en/tokenizer_summary

## Construct Dataset of 'Sentences'

In [2]:
import torch
import numpy as np

In [3]:
# Just use each triple as a sentence ...
dataset_most_simple = [' '.join(x) for x in g_train]

In [4]:
dataset_most_simple[0:10]

['http://example.org/m/0l14md http://example.org/music/performance_role/track_performances./music/track_contribution/role http://example.org/m/03qlv7',
 'http://example.org/m/011yg9 http://example.org/film/film/genre http://example.org/m/02l7c8',
 'http://example.org/m/0565cz http://example.org/people/person/gender http://example.org/m/05zppz',
 'http://example.org/m/033db3 http://example.org/people/person/profession http://example.org/m/02jknp',
 'http://example.org/m/07m2y http://example.org/music/performance_role/track_performances./music/track_contribution/role http://example.org/m/02sgy',
 'http://example.org/m/0cq7tx http://example.org/award/award_winning_work/awards_won./award/award_honor/award http://example.org/m/0gq9h',
 'http://example.org/m/01p0vf http://example.org/music/artist/track_contributions./music/track_contribution/role http://example.org/m/03qjg',
 'http://example.org/m/05qd_ http://example.org/award/award_nominee/award_nominations./award/award_nomination/award_no

# Define and Train Tokenizer
It is unclear which tokenizer works. I will start with a really simple example.

In [5]:
from tokenizers.models import WordLevel
from tokenizers import Tokenizer
from transformers import BertTokenizer, EncoderDecoderModel, BertForTokenClassification
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.trainers import WordLevelTrainer
from tokenizers.processors import BertProcessing
from transformers import BertConfig, BertModel, AutoModel
import copy
from collections import defaultdict
import torchmetrics

from tqdm import tqdm, trange

2022-11-18 14:51:28.961605: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-18 14:51:28.961625: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [6]:
tz = BertTokenizer.from_pretrained("bert-base-cased")
special_tokens_map = tz.special_tokens_map_extended
special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [7]:
from torch.utils.data import Dataset, DataLoader


def random_mask(token,mask_token,vocab_size,mask_chance = 0.15, mask_token_chance=0.9):
    mask_roll = torch.rand(())
    if mask_roll < mask_chance:
        mask_token_roll = torch.rand(())
        if mask_token_roll < mask_token_chance:
            return mask_token, 1
        else:
            return torch.randint(high=vocab_size,size=()), 2
        
    else:
        return token, 0

def mask_list_of_lists(l, mask_token,vocab_size,special_token_ids):

    # get random mask for each token, but not for special tokens
    return torch.tensor([[random_mask(y,mask_token,vocab_size) if y not in special_token_ids else y for y in x ] for x in l])

def mask_list(l, mask_token,vocab_size,special_token_ids):

    # get random mask for each token, but not for special tokens
    return torch.tensor([random_mask(y,mask_token,vocab_size) if y not in special_token_ids else (y,0) for y in l])

class dataseSimpleTriple(Dataset):
    def __init__(self, triples,special_tokens_map,max_length=128):
        
        word_level_tokenizer = Tokenizer(WordLevel(unk_token=special_tokens_map['unk_token']))
        word_level_trainer = WordLevelTrainer(special_tokens=list(special_tokens_map.values()))
        # Pretokenizer. This is important and could lead to better/worse results!
        word_level_tokenizer.pre_tokenizer = WhitespaceSplit()
        
        word_level_tokenizer.train_from_iterator(dataset_most_simple,word_level_trainer)
        
        word_level_tokenizer.post_processor = BertProcessing(
            ("[SEP]", word_level_tokenizer.token_to_id("[SEP]")),
            ('[CLS]', word_level_tokenizer.token_to_id('[CLS]')),
        )
        
        mask_token = word_level_tokenizer.token_to_id(special_tokens_map['mask_token'])
        word_level_tokenizer.enable_truncation(max_length=max_length)
        self.labels = torch.tensor([x.ids for x in word_level_tokenizer.encode_batch(triples)])
        

        self.special_token_ids = [word_level_tokenizer.token_to_id(x) for x in special_tokens_map.values()]
        
        
        
        self.attention_masks = torch.stack([torch.ones(len(x)) for x in self.labels])
        self.word_level_tokenizer = word_level_tokenizer
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, i):
        return mask_list(self.labels[i],self.word_level_tokenizer.token_to_id(special_tokens_map['mask_token']),self.word_level_tokenizer.get_vocab_size(),self.special_token_ids), self.attention_masks[i], self.labels[i]
    
    def get_tokenizer(self):
        return self.word_level_tokenizer

In [8]:
dataset_simple = dataseSimpleTriple(dataset_most_simple,special_tokens_map)
tz = dataset_simple.get_tokenizer()

In [9]:
torch.tensor([random_mask(10,tz.token_to_id(special_tokens_map['mask_token']),tz.get_vocab_size()) for x in range(10)])

tensor([[ 4,  1],
        [ 4,  1],
        [ 4,  1],
        [10,  0],
        [10,  0],
        [10,  0],
        [10,  0],
        [10,  0],
        [10,  0],
        [10,  0]])

In [10]:
dataset_simple[0]

(tensor([[  3,   0],
         [127,   0],
         [ 21,   0],
         [582,   0],
         [  1,   0]]),
 tensor([1., 1., 1., 1., 1.]),
 tensor([  3, 127,  21, 582,   1]))

# Setup Model

In [11]:
tiny_pretrained = AutoModel.from_pretrained('prajjwal1/bert-tiny')

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:

tiny_config = tiny_pretrained.config
tiny_config._name_or_path="otautz/tiny"

encoder_config = copy.copy(tiny_config)
encoder_config.is_decoder = False
encoder_config.add_cross_attention = False
encoder_config.num_labels=tz.get_vocab_size()

del tiny_pretrained

In [13]:
tiny_encoder  = BertForTokenClassification(encoder_config)
lossF = torch.nn.CrossEntropyLoss()

In [14]:
dl = DataLoader(dataset_simple,batch_size=1000,shuffle=True,pin_memory=True)
optimizer = torch.optim.Adam(tiny_encoder.parameters())

In [15]:
optimizer.zero_grad()

loss_metric = torchmetrics.aggregation.MeanMetric()
history = defaultdict(list)

for epochs in trange(10):
    for inputs, batch_mask, batch_labels in dl:
        optimizer.zero_grad()
        batch_id = inputs[:,:,0]
        
        out = tiny_encoder.forward(batch_id, batch_mask)
        logits = out.logits

        # (batchsize, sequence_len, no_labels)
        logits_shape = logits.shape

        # (batchsize * sequence_len, no_labels)
        logits_no_sequence =  logits.reshape(logits_shape[0]*logits_shape[1],logits_shape[2])

        # (batchsize)
        batch_labels_no_sequence = batch_labels.flatten()
        
        batch_mask = (inputs[:,:,1]>0).flatten()

        loss= lossF(logits_no_sequence[batch_mask],batch_labels_no_sequence[batch_mask])

        loss.backward()
        optimizer.step()
        
        loss_metric(loss)
    
    history['loss'].append(loss_metric.compute().item())
    loss_metric.reset()

  0%|                                                                                                      | 0/1000 [01:21<?, ?it/s]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/olli/stdpy/std_env/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_99465/1031614443.py", line 11, in <module>
    out = tiny_encoder.forward(batch_id, batch_mask)
  File "/home/olli/stdpy/std_env/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 1757, in forward
    logits = self.classifier(sequence_output)
  File "/home/olli/stdpy/std_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/olli/stdpy/std_env/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 103, in forward
    return F.linear(input, self.weight, self.bias)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/olli/stdpy/std_env/lib/python3.10/site-p

TypeError: object of type 'NoneType' has no len()

In [None]:
import pandas as pd
pd.DataFrame(history).plot()