In [161]:
from rdflib import Graph
import sys
g_train = Graph()
g_val = Graph()

g_train = g_train.parse('FB15k-237/train.nt', format='nt')
g_val   = g_val.parse('FB15k-237/valid.nt', format='nt')


In [231]:
sys.path.append('../code')

from settings import VECTOR_SIZE

#### Use objectives from T5 https://arxiv.org/abs/1910.10683

Also look at Bert paper: https://arxiv.org/abs/1810.04805

First is 'Bert Style' masked language modeling. (MLM)

* Corrupt 15% of input tokens. 
* 90% of the corrupted tokens are replaced with out-of-alphabet masking token
* 10% of corrupted tokens are replaced with random tokens

#### Other Sources

* https://towardsdatascience.com/how-to-train-a-bert-model-from-scratch-72cfce554fc6 as example of Bert training
* https://huggingface.co/blog/how-to-train
* tiny bert https://arxiv.org/abs/2110.01518

#### Notes

* Bert: "In contrast to denoising auto-encoders (Vincent et al., 2008), we only predict the masked words rather than reconstructing the entire input."
* Which tokenizer?! BPE useful? I think not. Look at https://huggingface.co/docs/transformers/main/en/tokenizer_summary

## Construct Dataset of 'Sentences'

In [2]:
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Just use each triple as a sentence ...
dataset_most_simple = [' '.join(x) for x in g_train]

In [4]:
dataset_most_simple[0:10]

['http://example.org/m/03jv8d http://example.org/military/military_conflict/combatants./military/military_combatant_group/combatants http://example.org/m/07ssc',
 'http://example.org/m/022lly http://example.org/common/topic/webpage./common/webpage/category http://example.org/m/08mbj5d',
 'http://example.org/m/03tps5 http://example.org/film/film/produced_by http://example.org/m/06cgy',
 'http://example.org/m/06y3r http://example.org/people/person/employment_history./business/employment_tenure/company http://example.org/m/0k8z',
 'http://example.org/m/05zvzf3 http://example.org/film/film/other_crew./film/film_crew_gig/film_crew_role http://example.org/m/0263ycg',
 'http://example.org/m/0412f5y http://example.org/people/person/profession http://example.org/m/0nbcg',
 'http://example.org/m/09xq9d http://example.org/user/tsegaran/random/taxonomy_subject/entry./user/tsegaran/random/taxonomy_entry/taxonomy http://example.org/m/04n6k',
 'http://example.org/m/0bdwqv http://example.org/award/awa

# Define and Train Tokenizer
It is unclear which tokenizer works. I will start with a really simple example.

In [5]:
import transformers

In [6]:
from tokenizers.models import WordLevel
from tokenizers import Tokenizer
from transformers import BertTokenizer, EncoderDecoderModel, BertForTokenClassification
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.trainers import WordLevelTrainer
from tokenizers.processors import BertProcessing
from transformers import BertConfig, BertModel, AutoModel
import copy
from collections import defaultdict
import torchmetrics

from tqdm import tqdm, trange

In [7]:
tz = BertTokenizer.from_pretrained("bert-base-cased")
special_tokens_map = tz.special_tokens_map_extended
special_tokens_map

Downloading: 100%|██████████| 213k/213k [00:00<00:00, 583kB/s] 
Downloading: 100%|██████████| 29.0/29.0 [00:00<00:00, 30.6kB/s]
Downloading: 100%|██████████| 570/570 [00:00<00:00, 595kB/s]


{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [8]:
from torch.utils.data import Dataset, DataLoader


def random_mask(token,mask_token,vocab_size,mask_chance = 0.15, mask_token_chance=0.9):
    mask_roll = torch.rand(())
    if mask_roll < mask_chance:
        mask_token_roll = torch.rand(())
        if mask_token_roll < mask_token_chance:
            return mask_token, 1
        else:
            return torch.randint(high=vocab_size,size=()), 2
        
    else:
        return token, 0

def mask_list_of_lists(l, mask_token,vocab_size,special_token_ids):

    # get random mask for each token, but not for special tokens
    return torch.tensor([[random_mask(y,mask_token,vocab_size) if y not in special_token_ids else y for y in x ] for x in l])

def mask_list(l, mask_token,vocab_size,special_token_ids):

    # get random mask for each token, but not for special tokens
    return torch.tensor([random_mask(y,mask_token,vocab_size) if y not in special_token_ids else (y,0) for y in l])

class dataseSimpleTriple(Dataset):
    def __init__(self, triples,special_tokens_map,max_length=128):
        
        word_level_tokenizer = Tokenizer(WordLevel(unk_token=special_tokens_map['unk_token']))
        word_level_trainer = WordLevelTrainer(special_tokens=list(special_tokens_map.values()))
        # Pretokenizer. This is important and could lead to better/worse results!
        word_level_tokenizer.pre_tokenizer = WhitespaceSplit()
        
        word_level_tokenizer.train_from_iterator(dataset_most_simple,word_level_trainer)
        
        word_level_tokenizer.post_processor = BertProcessing(
            ("[SEP]", word_level_tokenizer.token_to_id("[SEP]")),
            ('[CLS]', word_level_tokenizer.token_to_id('[CLS]')),
        )
        
        mask_token = word_level_tokenizer.token_to_id(special_tokens_map['mask_token'])
        word_level_tokenizer.enable_truncation(max_length=max_length)
        self.labels = torch.tensor([x.ids for x in word_level_tokenizer.encode_batch(triples)])
        

        self.special_token_ids = [word_level_tokenizer.token_to_id(x) for x in special_tokens_map.values()]
        
        
        
        self.attention_masks = torch.stack([torch.ones(len(x)) for x in self.labels])
        self.word_level_tokenizer = word_level_tokenizer
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, i):
        return mask_list(self.labels[i],self.word_level_tokenizer.token_to_id(special_tokens_map['mask_token']),self.word_level_tokenizer.get_vocab_size(),self.special_token_ids), self.attention_masks[i], self.labels[i]
    
    def get_tokenizer(self):
        return self.word_level_tokenizer

In [122]:
dataset_simple = dataseSimpleTriple(dataset_most_simple,special_tokens_map)
tz = dataset_simple.get_tokenizer()
tz.save_model('tiny_bert_from_scratch_simple_tokenizer')

AttributeError: 'tokenizers.Tokenizer' object has no attribute 'save_model'

In [86]:
torch.tensor([random_mask(10,tz.token_to_id(special_tokens_map['mask_token']),tz.get_vocab_size()) for x in range(10)])

tensor([[ 4,  1],
        [10,  0],
        [ 4,  1],
        [10,  0],
        [10,  0],
        [10,  0],
        [10,  0],
        [10,  0],
        [ 4,  1],
        [10,  0]])

In [87]:
dataset_simple[0]

(tensor([[   3,    0],
         [9195,    0],
         [ 184,    0],
         [  51,    0],
         [   1,    0]]),
 tensor([1., 1., 1., 1., 1.]),
 tensor([   3, 9195,  184,   51,    1]))

# Setup Model

In [12]:
tiny_pretrained = AutoModel.from_pretrained('prajjwal1/bert-tiny')

Downloading: 100%|██████████| 285/285 [00:00<00:00, 340kB/s]
Downloading: 100%|██████████| 17.8M/17.8M [00:00<00:00, 34.0MB/s]
Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

In [13]:
tiny_config = tiny_pretrained.config
tiny_config._name_or_path="otautz/tiny"

encoder_config = copy.copy(tiny_config)
encoder_config.is_decoder = False
encoder_config.add_cross_attention = False
encoder_config.num_labels=tz.get_vocab_size()
encoder_config.hidden_size = VECTOR_SIZE
del tiny_pretrained

In [232]:
tiny_config

BertConfig {
  "_name_or_path": "otautz/tiny",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [39]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tiny_encoder  = BertForTokenClassification(encoder_config)
tiny_encoder  = tiny_encoder.to(device)
lossF = torch.nn.CrossEntropyLoss()
dl = DataLoader(dataset_simple,batch_size=5000,shuffle=True,pin_memory=True)
optimizer = torch.optim.Adam(tiny_encoder.parameters())

In [40]:
print(device)

cuda


In [41]:
loss_metric = torchmetrics.aggregation.MeanMetric().to(device)
batchloss_metric = torchmetrics.aggregation.CatMetric().to(device)
history = defaultdict(list)

for epochs in trange(100):
    for inputs, batch_mask, batch_labels in dl:
        optimizer.zero_grad()
        batch_id = inputs[:,:,0]
        
        out = tiny_encoder.forward(batch_id.to(device), batch_mask.to(device))
        logits = out.logits

        # (batchsize, sequence_len, no_labels)
        logits_shape = logits.shape

        # (batchsize * sequence_len, no_labels)
        logits_no_sequence =  logits.reshape(logits_shape[0]*logits_shape[1],logits_shape[2])

        # (batchsize)
        batch_labels_no_sequence = batch_labels.flatten().to(device)
        
        batch_mask = (inputs[:,:,1]>0).flatten().to(device)

        loss= lossF(logits_no_sequence[batch_mask],batch_labels_no_sequence[batch_mask])

        loss.backward()
        optimizer.step()
        
        loss_metric(loss)
        batchloss_metric(loss)
    
    history['loss'].append(loss_metric.compute().item())
    loss_metric.reset()

100%|██████████| 100/100 [1:36:42<00:00, 58.02s/it]


In [226]:
import pandas as pd
pd.DataFrame(history).to_csv('bert_loss.csv')


In [227]:
pd.DataFrame(batchloss_metric.compute().detach().cpu()).to_csv('bert_batchloss.csv')

tensor([9.6154, 9.5263, 9.4488,  ..., 3.4017, 3.5319, 3.2700], device='cuda:0',
       grad_fn=<CatBackward0>)

# Save pretrained

In [95]:
tiny_encoder.save_pretrained("tiny_bert_from_scratch_simple")
tz.save('tiny_bert_from_scratch_simple_tokenizer.json')

# Test Loading

In [116]:
model = AutoModel.from_pretrained('tiny_bert_from_scratch_simple')
tz = Tokenizer(WordLevel(unk_token=special_tokens_map['unk_token']))

Some weights of the model checkpoint at tiny_bert_from_scratch_simple were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at tiny_bert_from_scratch_simple and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [125]:
tz = PreTrainedTokenizerFast(tokenizer_file='tiny_bert_from_scratch_simple_tokenizer.json')

In [144]:
test = dataset_most_simple[0]
test = tz.encode(test)

In [145]:
torch.tensor(test).

tensor([   3, 9195,  184,   51,    1])

In [152]:
model(torch.tensor(test).unsqueeze(0))['last_hidden_state'].squeeze().shape

torch.Size([5, 128])

In [163]:
from utils_graph import get_entities

In [210]:
entities = get_entities([g_train])

[rdflib.term.URIRef('http://example.org/m/03jv8d'),
 rdflib.term.URIRef('http://example.org/m/022lly'),
 rdflib.term.URIRef('http://example.org/m/03tps5'),
 rdflib.term.URIRef('http://example.org/m/06y3r'),
 rdflib.term.URIRef('http://example.org/m/05zvzf3'),
 rdflib.term.URIRef('http://example.org/m/0412f5y'),
 rdflib.term.URIRef('http://example.org/m/09xq9d'),
 rdflib.term.URIRef('http://example.org/m/0bdwqv'),
 rdflib.term.URIRef('http://example.org/m/0j0k'),
 rdflib.term.URIRef('http://example.org/m/02zjd'),
 rdflib.term.URIRef('http://example.org/m/0_lr1'),
 rdflib.term.URIRef('http://example.org/m/0bs8s1p'),
 rdflib.term.URIRef('http://example.org/m/0gqy2'),
 rdflib.term.URIRef('http://example.org/m/02_j1w'),
 rdflib.term.URIRef('http://example.org/m/02mqc4'),
 rdflib.term.URIRef('http://example.org/m/02vtnf'),
 rdflib.term.URIRef('http://example.org/m/02x6dqb'),
 rdflib.term.URIRef('http://example.org/m/0g_zyp'),
 rdflib.term.URIRef('http://example.org/m/0cj2k3'),
 rdflib.term.U

In [228]:
embs = get_embeddings(entities,model,tz)

In [229]:
embs.shape

torch.Size([14505, 128])

In [205]:
def get_embeddings(entities,bert_model,tokenizer):
    entities = [tokenizer.encode(x) for x in np.array(entities)]
    embeddings = model(torch.tensor(entities))
    embeddings = embeddings['last_hidden_state'][:,1]
    return embeddings