In [1]:
from pathlib import Path

In [2]:
# Notebook-wide setup from the project's `utils_data` helpers: logging is
# configured first, then `init_random()` is called (presumably seeds the random
# number generators — see utils_data for what exactly is seeded).
from utils_data import configure_logging, init_random

configure_logging()
init_random()

In [3]:
import mxnet as mx
import gluonnlp as nlp
from bert import *

import torch
import transformers
from transformers import BertModel, BertConfig
from transformers import AutoModelForSequenceClassification

In [4]:
from utils_gluon import setup_bert_epi128bce, setup_bert_epi512bce
from utils_gluon import setup_bert_pro128bce, setup_bert_pro512bce

### Load MXNet/GluonNLP model

In [5]:
# Run configuration: identifies the fine-tuned GluonNLP run to convert.
run_name = "yelp_1_pro128BCE"
# Directory of that run, relative to the notebook's working directory.
fn_run_path = Path("data") / run_name
# Epoch count of the run; the checkpoint file name is derived from it below.
num_epochs = 3

In [6]:
# Build the GluonNLP classifier and its helpers through the project's setup
# function. The progress message is printed *before* the (potentially slow)
# build so that it actually announces the step, and it is a plain string
# because it has no placeholders.
# NOTE: `tokenizer` here comes from the GluonNLP setup; a later cell rebinds
# the same name to the Hugging Face tokenizer.
# NOTE(review): gpu=None presumably selects the CPU context — confirm in utils_gluon.
print("Build BERT same-side classifier model ...")
model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert_pro128bce(gpu=None)

Build BERT same-side classifier model ...


```python
    # ...
    bert_base, vocabulary = nlp.model.get_model(
        'bert_12_768_12',
        dataset_name='book_corpus_wiki_en_uncased',
        pretrained=True,
        ctx=ctx,
        use_pooler=True,
        use_decoder=False,
        use_classifier=False)
    # ...
```

In [178]:
# Restore the fine-tuned weights into the GluonNLP model.
# The checkpoint index is `num_epochs - 1` (assumes checkpoints are numbered
# from 0, so this is the one written after the last epoch — TODO confirm).
fn_model_state = fn_run_path.joinpath(f"bert.model.checkpoint{num_epochs - 1}.params")
print(f"Load {fn_model_state} ...")
model.load_parameters(str(fn_model_state), ctx=ctx)

Load data/yelp_1_pro128BCE/bert.model.checkpoint2.params ...


### Load generic PyTorch BERT model (transformers)

In [179]:
# dev = torch.device("cpu")
# Instantiate the stock "bert-base-uncased" checkpoint with a
# sequence-classification head. It serves as a template: the cells below save
# its state dict and use the parameter names/shapes as the target that the
# GluonNLP weights are mapped onto.
# NOTE: no config is passed, so the default head size applies (the logged model
# config below shows "num_labels": 2).
pytorch_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

I0525 19:59:24.888704 140389655271232 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/ekoerner/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
I0525 19:59:24.890974 140389655271232 configuration_utils.py:168] Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  "pruned_heads": {},
  "torchscript": false

In [184]:
# Scratch directory that receives the PyTorch checkpoints.
dir_name = Path("./temp")
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and mkdir, and the cell can be re-run safely.
dir_name.mkdir(exist_ok=True)

fn_model_params = dir_name / "pytorch_model.bin"
fn_model_bert_params = dir_name / "pytorch_model_bert.bin"
fn_model_classifier_params = dir_name / "pytorch_model_classifier.bin"

# Save the complete model (config.json + pytorch_model.bin) for reference.
# The full checkpoint is intentionally NOT loaded back here: the previous
# `pytorch_params = torch.load(fn_model_params)` was overwritten right away by
# the BERT-only state dict below, so it only cost time and memory.
# just the BERT part
#pytorch_model = pytorch_model.bert
pytorch_model.save_pretrained(str(dir_name))

# save only BERT part, then read it back so that `pytorch_params` is a plain
# dict of tensors that is independent of the live model
torch.save(pytorch_model.bert.state_dict(), str(fn_model_bert_params))
pytorch_params = torch.load(str(fn_model_bert_params))

# save classifier part (same save/reload round trip)
torch.save(pytorch_model.classifier.state_dict(), str(fn_model_classifier_params))
pytorch_classifier_params = torch.load(str(fn_model_classifier_params))

I0525 20:00:30.726449 140389655271232 configuration_utils.py:71] Configuration saved in temp/config.json
I0525 20:00:33.205898 140389655271232 modeling_utils.py:205] Model weights saved in temp/pytorch_model.bin


### Copy params from GluonNLP to PyTorch

In [181]:
# Name translation tables: GluonNLP parameter names -> PyTorch (transformers)
# parameter names.
# NOTE: only BERT part
#
# Both tables are applied as successive substring replacements, `mapping`
# first and `secondary_map` afterwards, each in insertion order. The order
# is therefore significant and must be preserved.
# Trailing comments are the author's notes with alternative target names.

mapping = dict([
    # --- embeddings ---
    ('encoder.layer_norm.beta', 'embeddings.LayerNorm.bias'),
    ('encoder.layer_norm.gamma', 'embeddings.LayerNorm.weight'),
    ('encoder.position_weight', 'embeddings.position_embeddings.weight'),
    ('word_embed.0.weight', 'embeddings.word_embeddings.weight'),
    ('token_type_embed.0.weight', 'embeddings.token_type_embeddings.weight'),

    # --- top-level containers ---
    ('pooler', 'pooler.dense'),
    ('encoder.transformer_cells', 'encoder.layer'),  # 'transformer.layer'

    # --- inside one transformer layer ---
    ('attention_cell', 'attention'),
    ('.proj.', '.attention.output.dense.'),  # '.attention.out_lin.'
    ('proj_key', 'self.key'),  # 'k_lin'
    ('proj_query', 'self.query'),  # 'q_lin'
    ('proj_value', 'self.value'),  # 'v_lin'
    ('ffn.ffn_1', 'intermediate.dense'),  # 'lin1'
    ('ffn.ffn_2', 'output.dense'),  # 'lin2'
    ('ffn.layer_norm.beta', 'output.LayerNorm.bias'),  # output_layer_norm.bias'
    ('ffn.layer_norm.gamma', 'output.LayerNorm.weight'),  # 'output_layer_norm.weight'
])

# Applied only after `mapping`: these keys are substrings of the
# 'ffn.layer_norm.*' keys above, so the more specific replacements have to
# run first; whatever 'layer_norm.*' is left then is the attention layer norm.
secondary_map = dict([
    ('layer_norm.beta', 'attention.output.LayerNorm.bias'),  # 'sa_layer_norm.bias'
    ('layer_norm.gamma', 'attention.output.LayerNorm.weight'),  # 'sa_layer_norm.weight'
])

In [182]:
# Collect the GluonNLP BERT parameters under their structural names.
# params = model.bert.collect_params()
# pytorch_model.bert.named_parameters()

# NOTE: need to feed dummy input to build model to get initialized params ... or something like it
# NOTE(review): an earlier cell already called `model.load_parameters(...)`.
# Gluon's `initialize` is expected to skip parameters that already hold data
# unless `force_reinit=True`, so this call should not overwrite the fine-tuned
# weights — TODO confirm (and confirm the cell execution order).
model.bert.initialize(init=mx.init.Normal(0.02))
# Dummy batch of shape (2, 8) filled with ones; it is passed as both the first
# and the second positional argument of the forward call.
ones = mx.nd.ones((2, 8))
# Presumably the positional arguments are (inputs, token_types, valid_length,
# masked_positions) — verify against the GluonNLP BERT model signature.
# `out` is not used by the visible cells; the call only forces a forward pass.
out = model.bert(ones, ones, mx.nd.array([5, 6]), mx.nd.array([[1], [2]]))
# Dict keyed by prefix-style names (e.g. "encoder.position_weight", as printed
# by the debug cell below) — these are the names the mapping tables operate on.
params = model.bert._collect_params_with_prefix()

In [186]:
# Test/Debug
# Dry run of the name translation: print every GluonNLP -> PyTorch name pair
# and verify that the translated name exists in the PyTorch state dict with an
# identical shape. No weights are copied in this cell.

names_gluon1, names_gluon2 = set(), set()

for name, tensor in params.items():
    print(name, "[GluonNlp]", end=" -> ")
    pytorch_name = name
    # primary table first, then the fallback table (order matters)
    for k, v in mapping.items():
        pytorch_name = pytorch_name.replace(k, v)
    for k, v in secondary_map.items():
        pytorch_name = pytorch_name.replace(k, v)
    print(pytorch_name, "[PyTorch]")

    names_gluon1.add(name)
    names_gluon2.add(pytorch_name)

    assert pytorch_name in pytorch_params, f"Gluon:{name}, Torch?:{pytorch_name} not in PyTorch model?"
    # Compare the shapes directly; converting each PyTorch tensor to an MXNet
    # NDArray just to read `.shape` copied every weight for no benefit.
    assert tuple(pytorch_params[pytorch_name].shape) == tuple(tensor.shape), f"Gluon:{name}, Torch?:{pytorch_name} mismatch?"

names_torch = set(pytorch_params.keys())

# Tails of the sorted name sets, for a visual comparison of both sides.
sorted(names_torch)[-18:], sorted(names_gluon2)[-19:], sorted(names_gluon1)[-20:]
# names_gluon2 - names_torch
# names_torch - names_gluon2

encoder.position_weight [GluonNlp] -> embeddings.position_embeddings.weight [PyTorch]
encoder.layer_norm.gamma [GluonNlp] -> embeddings.LayerNorm.weight [PyTorch]
encoder.layer_norm.beta [GluonNlp] -> embeddings.LayerNorm.bias [PyTorch]
encoder.transformer_cells.0.attention_cell.proj_query.weight [GluonNlp] -> encoder.layer.0.attention.self.query.weight [PyTorch]
encoder.transformer_cells.0.attention_cell.proj_query.bias [GluonNlp] -> encoder.layer.0.attention.self.query.bias [PyTorch]
encoder.transformer_cells.0.attention_cell.proj_key.weight [GluonNlp] -> encoder.layer.0.attention.self.key.weight [PyTorch]
encoder.transformer_cells.0.attention_cell.proj_key.bias [GluonNlp] -> encoder.layer.0.attention.self.key.bias [PyTorch]
encoder.transformer_cells.0.attention_cell.proj_value.weight [GluonNlp] -> encoder.layer.0.attention.self.value.weight [PyTorch]
encoder.transformer_cells.0.attention_cell.proj_value.bias [GluonNlp] -> encoder.layer.0.attention.self.value.bias [PyTorch]
encoder.t

(['encoder.layer.9.attention.output.LayerNorm.bias',
  'encoder.layer.9.attention.output.LayerNorm.weight',
  'encoder.layer.9.attention.output.dense.bias',
  'encoder.layer.9.attention.output.dense.weight',
  'encoder.layer.9.attention.self.key.bias',
  'encoder.layer.9.attention.self.key.weight',
  'encoder.layer.9.attention.self.query.bias',
  'encoder.layer.9.attention.self.query.weight',
  'encoder.layer.9.attention.self.value.bias',
  'encoder.layer.9.attention.self.value.weight',
  'encoder.layer.9.intermediate.dense.bias',
  'encoder.layer.9.intermediate.dense.weight',
  'encoder.layer.9.output.LayerNorm.bias',
  'encoder.layer.9.output.LayerNorm.weight',
  'encoder.layer.9.output.dense.bias',
  'encoder.layer.9.output.dense.weight',
  'pooler.dense.bias',
  'pooler.dense.weight'],
 ['encoder.layer.8.output.dense.weight',
  'encoder.layer.9.attention.output.LayerNorm.bias',
  'encoder.layer.9.attention.output.LayerNorm.weight',
  'encoder.layer.9.attention.output.dense.bias',
 

In [187]:
# Copy GluonNLP to PyTorch
# For every GluonNLP BERT parameter: translate its name, check that the target
# entry exists with the same shape, then replace the entry in `pytorch_params`
# with the GluonNLP values. The updated dict is written back to
# `fn_model_bert_params`.

names_converted = set()

for name, tensor in params.items():
    pytorch_name = name
    # primary table first, then the fallback table (order matters)
    for k, v in mapping.items():
        pytorch_name = pytorch_name.replace(k, v)
    for k, v in secondary_map.items():
        pytorch_name = pytorch_name.replace(k, v)

    assert pytorch_name in pytorch_params, f"Gluon:{name}, Torch?:{pytorch_name} not in PyTorch model?"
    # Compare the shapes directly instead of first copying the PyTorch tensor
    # into an MXNet NDArray.
    assert tuple(pytorch_params[pytorch_name].shape) == tuple(tensor.shape), f"Gluon:{name}, Torch?:{pytorch_name} mismatch?"

    # MXNet NDArray -> NumPy -> new PyTorch tensor
    tensor_pytorch = torch.tensor(tensor.data().asnumpy())
    # tensor_pytorch = torch.nn.parameter.Parameter(tensor_pytorch)
    pytorch_params[pytorch_name] = tensor_pytorch
    names_converted.add(pytorch_name)

# Entries that never received a GluonNLP weight would silently keep the stock
# "bert-base-uncased" values; report them instead of hiding the gap.
names_not_converted = sorted(set(pytorch_params) - names_converted)
if names_not_converted:
    print(f"WARNING: {len(names_not_converted)} PyTorch parameter(s) kept their original values: {names_not_converted}")

torch.save(pytorch_params, str(fn_model_bert_params))

In [188]:
# Transfer the classification head: overwrite the "weight" and "bias" entries
# of the saved PyTorch classifier state dict with the GluonNLP classifier
# parameters and write the file back. On the GluonNLP side the parameters are
# keyed "1.weight" / "1.bias".
params_classifier = model.classifier._collect_params_with_prefix()
pytorch_classifier_params = torch.load(str(fn_model_classifier_params))

for gluon_key, torch_key in (("1.weight", "weight"), ("1.bias", "bias")):
    # MXNet NDArray -> NumPy -> new PyTorch tensor
    gluon_values = params_classifier[gluon_key].data().asnumpy()
    pytorch_classifier_params[torch_key] = torch.tensor(gluon_values)

torch.save(pytorch_classifier_params, str(fn_model_classifier_params))

---

#### Scratch (test): inspect the classifier parameters of both models

In [135]:
# GluonNLP side: keep only the parameters of the classification head
# (prefix-style names starting with "classifier.") and display them.
params2 = {
    key: param
    for key, param in model._collect_params_with_prefix().items()
    if key.startswith("classifier.")
}
params2

{'classifier.1.weight': Parameter bertclassifier0_dense0_weight (shape=(1, 768), dtype=float32),
 'classifier.1.bias': Parameter bertclassifier0_dense0_bias (shape=(1,), dtype=float32)}

In [159]:
# PyTorch side: the classifier entries of the model's state dict, for a
# side-by-side comparison with the GluonNLP head.
# (The author also tried `pytorch_model.named_parameters()` as the source.)
params2_pytorch = {
    key: value
    for key, value in pytorch_model.state_dict().items()
    if key.startswith("classifier.")
}
params2_pytorch

{'classifier.weight': tensor([[-0.0442, -0.0256, -0.0154,  ...,  0.0229,  0.0080, -0.0004],
         [ 0.0159, -0.0092,  0.0290,  ..., -0.0198, -0.0095,  0.0116]]),
 'classifier.bias': tensor([0., 0.])}

### Load converted model ...

In [189]:
from pathlib import Path

# Locations of the converted checkpoints (same layout as in the conversion
# section above, repeated so this section can be run on its own).
dir_name = Path("./temp")
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and mkdir, and the cell can be re-run safely.
dir_name.mkdir(exist_ok=True)

fn_model_params = dir_name / "pytorch_model.bin"
fn_model_bert_params = dir_name / "pytorch_model_bert.bin"
fn_model_classifier_params = dir_name / "pytorch_model_classifier.bin"

In [240]:
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

# Load the converted state dicts first: the size of the classification head is
# taken from the checkpoint instead of being hard-coded.
pytorch_bert_params = torch.load(str(fn_model_bert_params))
pytorch_classifier_params = torch.load(str(fn_model_classifier_params))

# The first dimension of the classifier weight is the number of outputs; for
# the converted checkpoint (a single-output head) this gives 1.
num_labels = pytorch_classifier_params["weight"].shape[0]
assert tuple(pytorch_classifier_params["bias"].shape) == (num_labels,), (
    f"classifier bias shape {tuple(pytorch_classifier_params['bias'].shape)} "
    f"does not match weight shape {tuple(pytorch_classifier_params['weight'].shape)}"
)

# Build a model whose head has exactly that size. The stock
# "bert-base-uncased" weights loaded here are replaced by the state dicts below.
config = AutoConfig.from_pretrained("bert-base-uncased", num_labels=num_labels)
pytorch_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", config=config)

# copy BERT params (load_state_dict is strict by default, so missing or
# unexpected keys raise an error)
pytorch_model.bert.load_state_dict(pytorch_bert_params)

# copy classifier params
# NOTE: despite its name, this variable holds the *result* of load_state_dict
# (the report of missing/unexpected keys), not parameters; the name is kept so
# that existing references keep working.
pytorch_model_classifier_params = pytorch_model.classifier.load_state_dict(pytorch_classifier_params)

I0525 20:36:31.150555 140389655271232 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/ekoerner/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
I0525 20:36:31.152992 140389655271232 configuration_utils.py:168] Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  "pruned_heads": {},
  "torchscript": false

- Usage examples: https://huggingface.co/transformers/usage.html

In [241]:
# Hugging Face tokenizer for "bert-base-uncased"; this rebinds `tokenizer`,
# which previously referred to the tokenizer returned by the GluonNLP setup.
# NOTE(review): assumes this vocabulary is identical (including token ids) to
# the GluonNLP 'book_corpus_wiki_en_uncased' vocabulary the model was
# fine-tuned with — TODO confirm, otherwise the copied embeddings are misaligned.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

I0525 20:37:16.405678 140389655271232 tokenization_utils.py:374] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/ekoerner/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [248]:
# Smoke test of the converted model on one sentence pair.
sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"

# `pytorch_model.num_labels` is no longer overridden by hand here: the model is
# built from a config with the intended head size, and forcing the attribute
# would only hide a mismatch between the config and the classifier weights.

paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="pt")
# Inference only: no_grad avoids building the autograd graph (the logits then
# carry no grad_fn).
with torch.no_grad():
    paraphrase_classification_logits = pytorch_model(**paraphrase)[0]

if pytorch_model.num_labels >= 2:
    # multi-class head: softmax over the class dimension, shown as percentages
    paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
    class_probab = [f"{round(p * 100, 1)}%" for p in paraphrase_results]
else:
    # single-output head: sigmoid gives the probability, rounding it gives the
    # predicted label (0 or 1)
    paraphrase_results = torch.sigmoid(paraphrase_classification_logits).tolist()[0]
    class_probab = [round(p) for p in paraphrase_results]

In [249]:
# Display the raw logits, the activated scores, and the formatted result
# (percent strings for a multi-class head, rounded 0/1 label otherwise).
paraphrase_classification_logits, paraphrase_results, class_probab

(tensor([[-0.2331]], grad_fn=<AddmmBackward>), [0.44198086857795715], [0])