
### BERT-Danish

Download the model and vocab from https://github.com/botxo/nordic_bert. Place them in folder named models.

```bash
conda create -n bert -c conda-forge python=3.7 watermark jupyterlab ipykernel ipywidgets gensim scikit-learn pandas seaborn tqdm
conda activate bert
conda install tensorflow-estimator=2.1
conda install tensorflow-gpu=2.1
pip install bert-for-tf2 sentencepiece transformers
python -m ipykernel install --user --name bert --display-name "Bert"
conda install -c anaconda tensorflow-datasets
jupyter lab
```

In [1]:
# Works only on jupyter lab/notebooks
%reload_ext watermark
%watermark -uniz --author "Prayson W. Daniel" -vm -p pandas,numpy,matplotlib,tensorflow,bert,gensim
!nvidia-smi

Prayson W. Daniel 
last updated: Sat Sep 12 2020 Romance Daylight Time 2020-09-12T08:09:50+02:00

CPython 3.7.8
IPython 7.17.0

pandas 1.1.2
numpy 1.19.1
matplotlib 3.3.1
tensorflow 2.1.0
bert 0.14.6
gensim not installed

compiler   : MSC v.1916 64 bit (AMD64)
system     : Windows
release    : 10
machine    : AMD64
processor  : Intel64 Family 6 Model 158 Stepping 13, GenuineIntel
CPU cores  : 12
interpreter: 64bit
Sat Sep 12 08:10:20 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 442.83       Driver Version: 442.83       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Quadro RTX 3000    WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   38C    P8    16W /  N/A |    164MiB /  6144MiB |      0%      

In [2]:
import tensorflow as tf
import bert

In [3]:
class Bert:
    def __init__(self, input_shape, model_dir, vocab_file):
        '''
        input = MAX_SEQ_LENGTH
        model_dir = MODEL_DIRECTORY
        vocal_file = VOCAB_FILE
        '''
        
        # Input Layer
        input_layer =  tf.keras.layers.Input(shape=(input_shape,),
                                        dtype='int32')
        
        # Bert Layer
        bert_params = bert.params_from_pretrained_ckpt(model_dir)
        bert_layer = bert.BertModelLayer.from_params(bert_params, name='bert')
        
        # Output Layer
        output_layer = bert_layer(input_layer)
        
        
        # Build Model
        model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
        model.build(input_shape=(None, input_shape))
        
        
        # To use in functions
        self.input_shape = input_shape
        self.model = model
        self.tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file)
        
    def generate_embeddings(self, text):
        text_ids =  self._convert_text_to_ids(text)
        return self.model.predict([text_ids])
        
        
        
    # Utils    
    def _convert_text_ids(self, tokens):
        
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        
        return tokens_ids + [0]*(self.input_shape - len(tokens_ids))
    
    def _convert_text_to_tokens(self, text):
        
        return ['[CLS]'] + self.tokenizer.tokenize(text) + ['[SEP]']
    
    def _convert_text_to_ids(self, text):
        
        tokens =self._convert_text_to_tokens(text)
        
        return self._convert_text_ids(tokens)

In [4]:
MODEL_DIR = 'models/danish_bert_uncased_v2/'
VOCAB_FILE = f'{MODEL_DIR}/vocab.txt'
MAX_SEQ_LENGTH = 128

In [5]:
b = Bert(input_shape=MAX_SEQ_LENGTH, model_dir=MODEL_DIR, vocab_file=VOCAB_FILE)

In [6]:
print(b.model.summary())

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
bert (BertModelLayer)        (None, 128, 768)          110025216 
Total params: 110,025,216
Trainable params: 110,025,216
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
# test
text = 'Hej min skat pige!'

In [8]:
print(b.generate_embeddings(text))

[[[ 1.566651    0.76866275  0.7201991  ...  0.49181592 -0.71183765
   -0.56769574]
  [ 2.2829924  -0.078491    1.0017303  ...  0.9041473  -1.0147523
    0.15431353]
  [-0.9577784   0.36371198 -0.25006995 ... -1.3931506  -0.01940357
    1.3867458 ]
  ...
  [ 0.5891271  -0.6892314  -0.46708322 ... -0.95656055 -1.0666904
    1.4403996 ]
  [ 1.3486811  -1.0827739   0.615178   ...  0.0667549  -1.5458696
    1.3150258 ]
  [ 1.1293281  -0.79656374  0.11924702 ...  0.3409599  -1.3880537
    1.7114571 ]]]
