In [2]:
import torch
import math
import numpy as np
from transformers import BertModel
from transformers import BertTokenizer

bert = BertModel.from_pretrained("./bert-base-chinese", return_dict=False)
tokenizer = BertTokenizer.from_pretrained("./bert-base-chinese")

In [3]:
state_dict = bert.state_dict()
bert.eval()
x = np.array([2450, 15486, 102, 2110])
torch_x = torch.LongTensor([x])
squence_output, pooled_output = bert(torch_x)
# print(squence_output.shape, pooled_output.shape)
# print(squence_output, pooled_output)
# print(bert.state_dict().keys())
# print(state_dict["embeddings.word_embeddings.weight"].shape)



  torch_x = torch.LongTensor([x])


In [8]:
def softmax(x):
    return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)

def gelu(x):
    return 0.5 * x * (1 + np.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * np.power(x, 3))))

class DiyBert:
    def __init__(self, state_dict):
        self.num_attention_heads = 12
        self.hidden_size = 768
        self.num_layers = 1
        self.load_weights(state_dict)
    
    def load_weights(self, state_dict):
        #embedding
        self.word_embeddings = state_dict['embeddings.word_embeddings.weight'].numpy()
        self.position_embeddings = state_dict['embeddings.position_embeddings.weight'].numpy()
        self.toeken_type_embeddings = state_dict['embeddings.token_type_embeddings.weight'].numpy()
        self.embeddings_layer_norm_weight = state_dict['embeddings.LayerNorm.weight'].numpy()
        self.embeddings_layer_norm_bias = state_dict['embeddings.LayerNorm.bias'].numpy()
        self.transformer_weights = []

        #transformer
        for i in range(self.num_layers):
            q_w = state_dict['encoder.layer.{}.attention.self.query.weight'.format(i)].numpy()
            q_b = state_dict['encoder.layer.{}.attention.self.query.bias'.format(i)].numpy()
            k_w = state_dict['encoder.layer.{}.attention.self.key.weight'.format(i)].numpy()
            k_b = state_dict['encoder.layer.{}.attention.self.key.bias'.format(i)].numpy()
            v_w = state_dict['encoder.layer.{}.attention.self.value.weight'.format(i)].numpy()
            v_b = state_dict['encoder.layer.{}.attention.self.value.bias'.format(i)].numpy()
            attention_output_weight = state_dict['encoder.layer.{}.attention.output.dense.weight'.format(i)].numpy()
            attention_output_bias = state_dict['encoder.layer.{}.attention.output.dense.bias'.format(i)].numpy()
            attention_layer_norm_weight = state_dict['encoder.layer.{}.attention.output.LayerNorm.weight'.format(i)].numpy()
            attention_layer_norm_bias = state_dict['encoder.layer.{}.attention.output.LayerNorm.bias'.format(i)].numpy()
            intermediate_weight = state_dict['encoder.layer.{}.intermediate.dense.weight'.format(i)].numpy()
            intermediate_bias = state_dict['encoder.layer.{}.intermediate.dense.bias'.format(i)].numpy()
            output_weight = state_dict['encoder.layer.{}.output.dense.weight'.format(i)].numpy()
            output_bias = state_dict['encoder.layer.{}.output.dense.bias'.format(i)].numpy()
            ff_layer_norm_w = state_dict['encoder.layer.{}.output.LayerNorm.weight'.format(i)].numpy()
            ff_layer_norm_b = state_dict['encoder.layer.{}.output.LayerNorm.bias'.format(i)].numpy()
            self.transformer_weights.append([
                q_w, q_b, k_w, k_b, v_w, v_b, attention_output_weight, attention_output_bias, attention_layer_norm_weight, attention_layer_norm_bias,
                intermediate_weight, intermediate_bias, output_weight, output_bias, ff_layer_norm_w, ff_layer_norm_b
            ])
        
        #pooler
        self.pooler_dense_weight = state_dict['pooler.dense.weight'].numpy()
        self.pooler_dense_bias = state_dict['pooler.dense.bias'].numpy()

    def embedding_forward(self, x):
        word_embedding = self.get_embedding(self.word_embeddings, x)
        position_embedding = self.get_embedding(self.position_embeddings, np.array(list(range(len(x)))))
        token_embedding = self.get_embedding(self.toeken_type_embeddings, np.array([0] * len(x)))
        embedding = word_embedding + position_embedding + token_embedding
        embedding = self.layer_norm(embedding, self.embeddings_layer_norm_weight, self.embeddings_layer_norm_bias)
        return embedding

    def get_embedding(self, embedding_matrix, x):
        return np.array([embedding_matrix[i] for i in x])
    
    def all_transformer_layer_forward(self, x):
        for i in range(self.num_layers):
            x = self.single_transformer_layer_forward(x, i)
        return x
    
    def single_transformer_layer_forward(self, x, layer_index):
        weights = self.transformer_weights[layer_index]
        q_w, q_b, k_w, k_b, v_w, v_b, attention_output_weight, attention_output_bias, attention_layer_norm_weight, attention_layer_norm_bias, intermediate_weight, intermediate_bias, output_weight, output_bias, ff_layer_norm_w, ff_layer_norm_b = weights
        attention_output = self.self_attention(x, q_w, q_b, k_w, k_b, v_w, v_b, attention_output_weight, attention_output_bias, self.num_attention_heads, self.hidden_size)
        x = self.layer_norm(x + attention_output, attention_layer_norm_weight, attention_layer_norm_bias)
        feed_forward_x = self.feed_forward(x, intermediate_weight, intermediate_bias, output_weight, output_bias)
        x = self.layer_norm(x + feed_forward_x, ff_layer_norm_w, ff_layer_norm_b)
        return x

    def self_attention(self, x, q_w, q_b, k_w, k_b, v_w, v_b, attention_output_weight, attention_output_bias, num_attention_heads, hidden_size):
        q = np.dot(x, q_w.T) + q_b
        k = np.dot(x, k_w.T) + k_b
        v = np.dot(x, v_w.T) + v_b
        attention_head_size = int(hidden_size / num_attention_heads)
        q = self.transpose_for_scores(q, attention_head_size, num_attention_heads)
        k = self.transpose_for_scores(k, attention_head_size, num_attention_heads)
        v = self.transpose_for_scores(v, attention_head_size, num_attention_heads)
        qk = np.matmul(q, k.swapaxes(1, 2))
        qk /= np.sqrt(attention_head_size)
        qk = softmax(qk)
        qkv = np.matmul(qk, v)
        qkv = qkv.swapaxes(0, 1).reshape(-1, hidden_size)
        attention = np.dot(qkv, attention_output_weight.T) + attention_output_bias
        return attention
    
    def transpose_for_scores(self, x, attention_head_size, num_attention_heads):
        max_len, hidden_size = x.shape
        x = x.reshape(max_len, num_attention_heads, attention_head_size)
        x = x.swapaxes(1, 0)
        return x
    
    def layer_norm(self, x, w, b):
        x = (x - np.mean(x, axis=-1, keepdims=True)) / np.sqrt(np.var(x, axis=-1, keepdims=True))
        x = x * w + b
        return x
    
    def feed_forward(self, x, intermediate_weight, intermediate_bias, output_weight, output_bias):
        x = np.dot(x, intermediate_weight.T) + intermediate_bias
        x = gelu(x)
        x = np.dot(x, output_weight.T) + output_bias
        return x
    
    def pooler_output_layer(self, x):
        x = np.dot(x, self.pooler_dense_weight.T) + self.pooler_dense_bias
        return np.tanh(x)
    
    def forward(self, x):
        print('before embedding_forward', x)
        x = self.embedding_forward(x)
        print('after embedding_forward', x.shape)
        x = self.all_transformer_layer_forward(x)
        print('after all_transformer_layer_forward', x.shape)
        pooled_output = self.pooler_output_layer(x[0])
        print('after pooler_output_layer', x.shape)
        print('pooled_output', pooled_output.shape)
        return x, pooled_output
    
db = DiyBert(state_dict)
diy_sequence_output, diy_pooled_output = db.forward(x)
torch_sequence_output, torch_pooled_output = bert(torch_x)

# print(diy_sequence_output)
# print(torch_sequence_output)
        
            

before embedding_forward [ 2450 15486   102  2110]
after embedding_forward (4, 768)
after all_transformer_layer_forward (4, 768)
after pooler_output_layer (4, 768)
pooled_output (768,)
