In [4]:
import torch as t
from torch import einsum
from einops import rearrange, reduce, repeat
import bert_tests
import numpy as np
from tqdm import tqdm


In [5]:
def raw_attention_pattern(token_activations, num_heads, project_query, project_key):
    """Compute scaled, pre-softmax attention scores for every head.

    Args:
        token_activations: input handed to both projections. The rearrange
            pattern below requires each projected result to be 3-D with a
            last axis divisible by ``num_heads``.
        num_heads: number of heads the projected last axis is split into.
        project_query: callable producing the query projection.
        project_key: callable producing the key projection.

    Returns:
        Tensor laid out as (batch, head, key position, query position) that
        holds the query/key dot products divided by ``sqrt(head_size)``.
    """
    def split_heads(projected):
        # (batch, seq, heads * head_size) -> (batch, heads, seq, head_size)
        return rearrange(projected, "b s (n h) -> b n s h", n=num_heads)

    queries = split_heads(project_query(token_activations))
    keys = split_heads(project_key(token_activations))
    scale = np.sqrt(keys.shape[-1])
    # Note the output order: key position comes before query position.
    scores = t.einsum("bnqh,bnkh->bnkq", queries, keys)
    scores /= scale
    return scores

In [6]:
# Run the bert_tests check for raw_attention_pattern (recorded output reports a match).
bert_tests.test_attention_pattern_fn(raw_attention_pattern)

attention pattern raw MATCH!!!!!!!!
 SHAPE (2, 12, 3, 3) MEAN: 0.0007749 STD: 0.1164 VALS [0.1197 0.02408 0.03616 0.01532 0.1504 -0.1547 -0.04518 0.08466 -0.1337 -0.09462...]


In [7]:
def bert_attention(token_activations, num_heads, attention_pattern, project_value, project_output):
    """Apply an attention pattern to projected values and project the result.

    Args:
        token_activations: input handed to ``project_value``.
        num_heads: number of heads the value projection is split into.
        attention_pattern: raw scores indexed as (batch, head, key, query) by
            the einsum below; they are softmaxed over the second-to-last
            (key) axis.
        project_value: callable producing the value projection.
        project_output: callable applied to the merged per-head results.

    Returns:
        Whatever ``project_output`` returns for the (batch, query, head * size)
        tensor of attention-weighted values.
    """
    probs = t.softmax(attention_pattern, dim=-2)
    # (batch, key, head * size) -> (batch, key, size, head)
    values = rearrange(
        project_value(token_activations),
        "b k (h d) -> b k d h",
        h=num_heads,
    )
    # Sum over key positions, weighting each value by its attention probability.
    mixed = t.einsum("bhkq,bkdh->bdhq", probs, values)
    # Put the query axis back in second place and merge heads into one feature axis.
    merged = rearrange(mixed, "b d h q -> b q (h d)", h=num_heads)
    return project_output(merged)
    

In [8]:
# Run the bert_tests check for bert_attention (recorded output reports a match).
bert_tests.test_attention_fn(bert_attention)

attention MATCH!!!!!!!!
 SHAPE (2, 3, 768) MEAN: -0.001968 STD: 0.1205 VALS [0.1741 -0.002676 -0.08375 0.2172 -0.1002 0.04842 0.04215 -0.1167 0.1185 -0.1826...]


In [9]:
class MultiHeadedSelfAttention(t.nn.Module):
    """Self-attention layer built from four ``hidden_size -> hidden_size`` linear maps.

    The submodules are registered in the order query, key, value, output.
    Keep that order: the weight-loading cell later in this notebook pairs
    parameters with the pretrained state dict by position.
    """

    def __init__(self, num_heads: int, hidden_size:int):
        super().__init__()
        self.num_heads = num_heads
        self.hidden_size = hidden_size

        def square_linear():
            return t.nn.Linear(hidden_size, hidden_size)

        self.query = square_linear()
        self.key = square_linear()
        self.value = square_linear()
        self.output = square_linear()

    def forward(self, input):
        """Compute the raw attention pattern for ``input`` and apply it to the values."""
        scores = raw_attention_pattern(input, self.num_heads, self.query, self.key)
        return bert_attention(input, self.num_heads, scores, self.value, self.output)

In [10]:
# Run the bert_tests check for the MultiHeadedSelfAttention module (recorded output reports a match).
bert_tests.test_bert_attention(MultiHeadedSelfAttention)

bert MATCH!!!!!!!!
 SHAPE (2, 3, 768) MEAN: -0.001554 STD: 0.1736 VALS [-0.08316 -0.09165 -0.03188 -0.03013 0.1001 0.09549 -0.1046 0.07742 0.0424 0.05553...]


In [11]:
def bert_mlp(token_activations, linear_1, linear_2):
    """Feed-forward sub-layer: ``linear_2(gelu(linear_1(token_activations)))``.

    Args:
        token_activations: input handed to ``linear_1``.
        linear_1: callable applied first.
        linear_2: callable applied to the GELU-activated result.

    Returns:
        The output of ``linear_2``.
    """
    hidden = t.nn.functional.gelu(linear_1(token_activations))
    return linear_2(hidden)

# Run the bert_tests check for bert_mlp (recorded output reports a match).
bert_tests.test_bert_mlp(bert_mlp)

bert mlp MATCH!!!!!!!!
 SHAPE (2, 3, 768) MEAN: -0.0001934 STD: 0.1044 VALS [-0.1153 0.1189 -0.0813 0.1021 0.0296 0.06182 0.0341 0.1446 0.2622 -0.08507...]


In [12]:
class BertMLP(t.nn.Module):
    """Module wrapper around ``bert_mlp``: expand, apply GELU, project back.

    ``linear1`` maps ``input_size -> intermediate_size`` and ``linear2`` maps
    ``intermediate_size -> input_size``. They are registered in that order.
    """

    def __init__(self, input_size: int, intermediate_size: int):
        super().__init__()
        self.input_size = input_size
        self.intermediate_size = intermediate_size
        self.linear1 = t.nn.Linear(in_features=input_size, out_features=intermediate_size)
        self.linear2 = t.nn.Linear(in_features=intermediate_size, out_features=input_size)

    def forward(self, input):
        """Apply ``bert_mlp`` using this module's two linear layers."""
        return bert_mlp(token_activations=input, linear_1=self.linear1, linear_2=self.linear2)

# bert_tests

In [13]:
class LayerNorm(t.nn.Module):
    """Layer normalisation over the last axis with a learned scale and shift.

    Args:
        normalized_dim: size of the last axis; ``weight`` (ones) and ``bias``
            (zeros) are created with this length.
        eps: small constant added to the variance before the square root so a
            constant row normalises to zeros instead of producing 0/0 = NaN.
            The default 1e-12 is the value used by the Hugging Face BERT
            configuration.
    """

    def __init__(self, normalized_dim: int, eps: float = 1e-12):
        super().__init__()
        self.eps = eps  # plain float, not part of the state dict
        self.weight = t.nn.Parameter(t.ones((normalized_dim,)))
        self.bias = t.nn.Parameter(t.zeros((normalized_dim,)))

    def forward(self, input):
        """Return ``(input - mean) / sqrt(var + eps) * weight + bias``.

        Mean and (biased) variance are taken over the last axis.
        """
        # NOTE(review): the statistics are detached, so no gradient flows
        # through mean/variance (unlike torch.nn.LayerNorm). Kept as in the
        # original because the bert_tests check may rely on it — confirm.
        mean = t.mean(input, dim=-1, keepdim=True).detach()
        var = t.var(input, dim=-1, unbiased=False, keepdim=True).detach()
        normalized = (input - mean) / t.sqrt(var + self.eps)
        return normalized * self.weight + self.bias
    
# Run the bert_tests check for LayerNorm (recorded output reports a match).
bert_tests.test_layer_norm(LayerNorm)

layer norm MATCH!!!!!!!!
 SHAPE (20, 10) MEAN: -1.431e-08 STD: 1.003 VALS [0.6906 -0.84 1.881 1.711 -0.5117 -0.9577 -0.1387 -0.6943 -0.6741 -0.4662...]


In [14]:
class BertBlock(t.nn.Module):
    """One encoder block: self-attention and an MLP, each followed by a residual add and LayerNorm.

    Submodules are registered as norm1, attn, mlp, norm2, dropout. The order is
    significant because pretrained weights are matched to this model by position.
    """

    def __init__(self, hidden_size: int, intermediate_size: int, num_heads: int, dropout: float):
        super().__init__()
        self.norm1 = LayerNorm(hidden_size)
        self.attn = MultiHeadedSelfAttention(num_heads, hidden_size)
        self.mlp = BertMLP(hidden_size, intermediate_size)
        self.norm2 = LayerNorm(hidden_size)
        self.dropout = t.nn.Dropout(dropout)

    def forward(self, input):
        """Return ``norm2(dropout(mlp(y)) + y)`` where ``y = norm1(attn(input) + input)``."""
        # Attention sub-layer with residual connection.
        attended = self.norm1(self.attn(input) + input)
        # Dropout is applied only to the MLP output, before the second residual add.
        mlp_out = self.dropout(self.mlp(attended))
        return self.norm2(mlp_out + attended)
        
# Run the bert_tests check for BertBlock (recorded output reports a match).
bert_tests.test_bert_block(BertBlock)

bert MATCH!!!!!!!!
 SHAPE (2, 3, 768) MEAN: -2.897e-09 STD: 1 VALS [0.007131 -0.04372 0.6502 -0.5972 -1.097 0.7267 0.1275 -0.6035 -0.2226 0.2145...]


In [15]:
class Embedding(t.nn.Module):
    """Lookup table of ``vocab_size`` rows, each an ``embed_size`` vector.

    The table is a single parameter named ``embedding``, initialised from a
    standard normal distribution.
    """

    def __init__(self, vocab_size, embed_size):
        super().__init__()
        table = t.randn((vocab_size, embed_size))
        self.embedding = t.nn.Parameter(table)

    def forward(self, inputs):
        """Return the table rows selected by ``inputs``, with a trailing ``embed_size`` axis."""
        # TODO: compare against the reference solution.
        return self.embedding[inputs]

# Run the bert_tests check for Embedding (recorded output reports a match).
bert_tests.test_embedding(Embedding)

embedding MATCH!!!!!!!!
 SHAPE (2, 3, 5) MEAN: -0.06748 STD: 1.062 VALS [1.176 -0.1914 0.8212 1.047 -0.481 0.7106 -1.304 -1.307 -0.438 -0.2764...]


In [16]:
def bert_embedding(input_ids, token_type_ids, position_embedding, token_embedding, token_type_embedding, layer_norm, dropout):
    """Sum position, token and token-type embeddings, then normalise and apply dropout.

    Args:
        input_ids: integer tensor of token ids; its last axis is the sequence axis.
        token_type_ids: integer tensor handed to ``token_type_embedding``; its
            result must broadcast against the token embeddings.
        position_embedding: callable mapping position indices to embeddings.
        token_embedding: callable mapping ``input_ids`` to embeddings.
        token_type_embedding: callable mapping ``token_type_ids`` to embeddings.
        layer_norm: callable applied to the summed embeddings.
        dropout: callable applied last.

    Returns:
        ``dropout(layer_norm(position + token + token_type))``.
    """
    seq_len = input_ids.shape[-1]
    # Build the indices on the same device as the ids, so an embedding module
    # living on an accelerator is not handed CPU indices.
    positions = t.arange(seq_len, device=input_ids.device)
    summed = (
        position_embedding(positions)
        + token_embedding(input_ids)
        + token_type_embedding(token_type_ids)
    )
    return dropout(layer_norm(summed))

# Run the bert_tests check for bert_embedding (recorded output reports a match).
bert_tests.test_bert_embedding_fn(bert_embedding)

bert embedding MATCH!!!!!!!!
 SHAPE (2, 3, 768) MEAN: 0 STD: 1 VALS [-1.319 -0.4378 -2.074 0.9679 0.9274 1.479 -0.501 -1.9 -0.212 0.7961...]


In [17]:
class BertEmbedding(t.nn.Module):
    """Embedding front end: token, position and token-type tables plus LayerNorm and dropout.

    Args:
        vocab_size: number of rows in the token table.
        hidden_size: width of every embedding vector.
        max_position_embeddings: number of rows in the position table, and so
            the longest sequence that can be embedded.
        type_vocab_size: number of rows in the token-type table.
        dropout: dropout probability applied after normalisation.

    Submodules are registered as token_em, pos_em, token_type_em, layer_norm;
    pretrained weights are matched to this model by position, so keep the order.
    """

    def __init__(self, vocab_size: int, hidden_size: int, max_position_embeddings: int, type_vocab_size: int, dropout: float):
        super().__init__()
        self.token_em = Embedding(vocab_size, hidden_size)
        self.pos_em = Embedding(max_position_embeddings, hidden_size)
        # Honour the constructor argument; this was previously hard-coded to 2,
        # which silently ignored ``type_vocab_size``.
        self.token_type_em = Embedding(type_vocab_size, hidden_size)
        self.layer_norm = LayerNorm(hidden_size)
        self.dropout = t.nn.Dropout(dropout)

    def forward(self, input_ids, token_type_ids):
        """Embed ``input_ids`` / ``token_type_ids`` via ``bert_embedding`` with this module's tables."""
        return bert_embedding(
            input_ids,
            token_type_ids,
            self.pos_em,
            self.token_em,
            self.token_type_em,
            self.layer_norm,
            self.dropout,
        )
    
# Run the bert_tests check for the BertEmbedding module (recorded output reports a match).
bert_tests.test_bert_embedding(BertEmbedding)
        

bert embedding MATCH!!!!!!!!
 SHAPE (2, 3, 768) MEAN: -4.553e-09 STD: 1 VALS [-0.009385 -0.4919 0.9852 -0.3535 -3.624 1.333 1.163 1.449 1.063 0.246...]


In [18]:
class Bert(t.nn.Module):
    """BERT encoder with a token-prediction head.

    Structure (the attribute names and nesting determine the state-dict keys
    used by the weight-loading cell, so they must not change):

    * ``embedding``: ``BertEmbedding``
    * ``layers[0]``: ``Sequential`` of ``num_layers`` ``BertBlock`` modules
    * ``layers[1..3]``: ``Linear(hidden, hidden)``, ``GELU``, ``LayerNorm``
    * ``linear``: ``Linear(hidden, vocab_size)`` producing per-token logits

    ``dropout`` and ``hidden_size`` are stored as plain attributes; they are
    read by ``BertSentiment``.
    """

    def __init__(self, vocab_size: int, hidden_size: int, 
            max_position_embeddings: int, type_vocab_size: int, 
            dropout: float, intermediate_size: int, num_heads: int, 
            num_layers: int):
        super().__init__()
        self.dropout = dropout
        self.hidden_size = hidden_size
        self.embedding = BertEmbedding(vocab_size, hidden_size, max_position_embeddings, type_vocab_size, dropout)
        self.layers = t.nn.Sequential(
            t.nn.Sequential(*[BertBlock(hidden_size, intermediate_size, num_heads, dropout) for _ in range(num_layers)]),
            t.nn.Linear(hidden_size, hidden_size),
            t.nn.GELU(),
            LayerNorm(hidden_size),
        )
        self.linear = t.nn.Linear(hidden_size, vocab_size)

    def forward(self, input_ids, token_type_ids=None):
        """Return vocabulary logits for every position of ``input_ids``.

        Args:
            input_ids: integer tensor of token ids.
            token_type_ids: optional integer tensor of segment ids matching
                ``input_ids``. When omitted every token is assigned type 0,
                which reproduces the previous single-argument behaviour.
        """
        if token_type_ids is None:
            token_type_ids = t.zeros_like(input_ids)
        hidden = self.layers(self.embedding(input_ids, token_type_ids))
        return self.linear(hidden)

# Run the bert_tests check for the full Bert model (recorded output reports a match).
bert_tests.test_bert(Bert)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

bert MATCH!!!!!!!!
 SHAPE (1, 4, 28996) MEAN: 0.003031 STD: 0.5765 VALS [-0.5742 -0.4321 0.1186 -0.7165 -0.5262 0.4967 1.223 0.3165 -0.3247 -0.5717...]


In [19]:
# Build our own model with the hyperparameters used for the pretrained checkpoint.
my_bert = Bert(
    vocab_size=28996,
    hidden_size=768,
    max_position_embeddings=512,
    type_vocab_size=2,
    dropout=0.1,
    intermediate_size=3072,
    num_heads=12,
    num_layers=12,
)

# Fetch the pretrained model and take its state dict.
pretrained_bert = bert_tests.get_pretrained_bert()
pretrained_bert_dict = pretrained_bert.state_dict()

# Remove the classification-head entries from the dict; after this both state
# dicts have the same number of entries (see the length check below).
del pretrained_bert_dict['classification_head.weight']
del pretrained_bert_dict['classification_head.bias']


Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
# Pair parameters purely by position: the i-th name in our state dict receives
# the i-th tensor of the pretrained state dict. This relies on both dicts
# listing their entries in corresponding order.
mapped_weights = {
    own_name: pretrained_bert_dict[pretrained_name]
    for own_name, pretrained_name in zip(my_bert.state_dict(), pretrained_bert_dict)
}

# Exchange the entries of layers.3 and linear (weight and bias), where the
# positional pairing above would otherwise assign them the wrong way round.
mapping_replacements = {
    'layers.3.weight': 'linear.weight',
    'linear.weight': 'layers.3.weight',
    'layers.3.bias': 'linear.bias',
    'linear.bias': 'layers.3.bias',
}

another_mapped_weights = {
    mapping_replacements.get(name, name): weight
    for name, weight in mapped_weights.items()
}

In [21]:
# Both state dicts should have the same number of entries for the positional pairing.
print(len(my_bert.state_dict()))
print(len(pretrained_bert_dict))


203
203


In [22]:
# Load the remapped pretrained tensors into our model.
my_bert.load_state_dict(another_mapped_weights)
# Switch to evaluation mode, which turns the Dropout layers into no-ops.
my_bert.eval()
# presumably only here so the cell does not end on an expression whose value
# would be displayed — TODO confirm
print()




In [23]:
# Compare our model's output with the pretrained model's, using tolerance 0.1
# (recorded output reports a match).
bert_tests.test_same_output(my_bert, pretrained_bert, tol=0.1)

comparing Berts MATCH!!!!!!!!
 SHAPE (10, 20, 28996) MEAN: -2.732 STD: 2.414 VALS [-5.65 -6.041 -6.096 -6.062 -5.945 -5.777 -5.977 -6.015 -6.028 -5.935...]


In [24]:
import transformers
# Tokenizer for the cased checkpoint; this is the one used with my_bert below.
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased")
# Tokenizer for the uncased checkpoint, loaded only for the mismatch demo in
# the next cell (hence the name).
bad_tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [25]:
# Encode with the uncased tokenizer but decode with the cased one: the recorded
# output is unrelated words, showing that ids are not interchangeable between them.
tokenizer.decode(bad_tokenizer.encode("Hi, my name is bert"))

'[CLS] colleges 天 largest happened smile donation [SEP]'

In [26]:
# Inspect the encoding of a lone [MASK]; the recorded output shows ids [101, 103, 102].
tokenizer("[MASK]")

{'input_ids': [101, 103, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [27]:
# Check the element type returned by tokenizer.encode (recorded output: int).
type(tokenizer.encode("The firetruck was painted bright [MASK].")[0])

int

In [28]:
# Predict the masked word of one sentence and print the four most likely tokens.
encoding = t.tensor([tokenizer.encode("The fish loves to eat [MASK].")], dtype=t.long)
# Column indices (sequence positions) of every [MASK] token.
mask_location = t.where(encoding == tokenizer.mask_token_id)[1]
# Inference only: skip building the autograd graph.
with t.no_grad():
    probs = t.softmax(my_bert(encoding)[0, mask_location, :], dim=-1)
ids = t.topk(probs, 4).indices[0]
# convert_ids_to_tokens yields exactly one string per id. decode(...).split(" ")
# can yield fewer strings than ids when word pieces are merged, which would
# pair words with the wrong probabilities.
print(list(zip(tokenizer.convert_ids_to_tokens(ids.tolist()), probs[0, ids])))
print(probs[0, ids])

[('it', tensor(0.1736, grad_fn=<UnbindBackward0>)), ('fish', tensor(0.0982, grad_fn=<UnbindBackward0>)), ('them', tensor(0.0946, grad_fn=<UnbindBackward0>)), ('meat', tensor(0.0410, grad_fn=<UnbindBackward0>))]
tensor([0.1736, 0.0982, 0.0946, 0.0410], grad_fn=<IndexBackward0>)


In [29]:
def ascii_art_probs(input, tokenizer, model, top_k=4):
    """Print a sentence with its ``[MASK]`` blanked out, then the model's top guesses.

    Each guess is printed on its own line, indented to the column where
    ``[MASK]`` starts, followed by a tab and its probability as a percentage.
    If the sentence contains several masks, only the first is reported.

    Args:
        input: sentence containing the literal text ``[MASK]``.
        tokenizer: object providing ``encode``, ``mask_token_id`` and
            ``convert_ids_to_tokens``.
        model: callable mapping a (1, seq) id tensor to per-position logits.
        top_k: number of candidate tokens to print (default 4).

    Raises:
        ValueError: if ``input`` contains no ``[MASK]``.
    """
    if "[MASK]" not in input:
        raise ValueError("input must contain a [MASK] token")
    print(input.replace("[MASK]", "___"))
    encoding = t.tensor([tokenizer.encode(input)], dtype=t.long)
    mask_location = t.where(encoding == tokenizer.mask_token_id)[1]
    # Inference only: no autograd graph needed.
    with t.no_grad():
        probs = t.softmax(model(encoding)[0, mask_location, :], dim=-1)
    ids = t.topk(probs, top_k).indices[0]
    spaces = " " * input.index("[MASK]")
    # One token string per id keeps words and probabilities aligned;
    # decode(...).split(" ") loses alignment when word pieces are merged.
    words = tokenizer.convert_ids_to_tokens(ids.tolist())
    for word, p in zip(words, probs[0, ids].tolist()):
        print(f"{spaces}{word}\t{p*100:.1f}%")


In [30]:
# Show the model's top guesses for the masked colour word.
ascii_art_probs("The firetruck was painted bright [MASK].", tokenizer, my_bert)

The firetruck was painted bright ___.
                                 red	56.0%
                                 yellow	12.1%
                                 white	7.1%
                                 blue	6.9%


In [31]:
class BertSentiment(t.nn.Module):
    """Classification head on top of an existing BERT-style model.

    Args:
        bert_model: object exposing ``embedding(input_ids, token_type_ids)``,
            ``layers(...)``, a float ``dropout`` and an int ``hidden_size``.
        num_classes: number of output classes.

    Only the new linear head is reported by ``parameters()``, so an optimizer
    built from ``model.parameters()`` trains the head alone.
    """

    def __init__(self, bert_model, num_classes: int):
        super().__init__()
        self.bert_model = bert_model
        self.num_classes = num_classes
        self.dropout = t.nn.Dropout(bert_model.dropout)
        self.linear = t.nn.Linear(bert_model.hidden_size, num_classes)

    def forward(self, input_ids):
        """Return class logits for every position of ``input_ids``.

        The result keeps the sequence axis; ``train_sentiments`` selects
        position 0 from it.
        """
        # A single type id of 0, broadcast over all tokens by the embedding sum.
        token_type_ids = t.zeros(1, dtype=t.long, device=input_ids.device)
        hidden = self.bert_model.layers(self.bert_model.embedding(input_ids, token_type_ids))
        return self.linear(self.dropout(hidden))

    def parameters(self, recurse: bool = True):
        """Yield only the classification head's parameters.

        Accepts ``recurse`` like ``torch.nn.Module.parameters`` so code that
        passes it keeps working; the previous override rejected the argument.

        NOTE(review): the wrapped model's parameters still require grad, so
        ``backward()`` still computes gradients for them even though they are
        never stepped or zeroed through this method — consider freezing them.
        """
        return self.linear.parameters(recurse=recurse)

In [33]:
import torchtext
import random

# NOTE(review): the recorded output of this cell is a TypeError about missing
# 'path', 'text_field' and 'label_field' arguments, i.e. the installed
# torchtext does not accept this call form. Presumably a torchtext release
# where IMDB takes (root, split) is required — verify the installed version.
data_train, data_test = torchtext.datasets.IMDB(root='.data', split=('train', 'test'))
data_train = list(data_train)
data_test = list(data_test)
# Shuffle with fixed, private seeds so a Restart & Run All reproduces the same
# batch order without touching the global random state.
random.Random(0).shuffle(data_train)
random.Random(1).shuffle(data_test)


TypeError: IMDB.__init__() missing 3 required positional arguments: 'path', 'text_field', and 'label_field'

In [30]:
def get_batches(data, batch_size, tokenizer, max_seq_len = 2048):
    """Tokenize ``(label, text)`` pairs into fixed-size padded batches.

    Args:
        data: indexable sequence of ``(label, text)`` pairs.
        batch_size: examples per batch; must be positive and at most ``len(data)``.
        tokenizer: object with ``encode(text) -> list[int]``. Its
            ``pad_token_id`` is used for padding when present, otherwise 0.
        max_seq_len: width of every row. Longer token lists are cut off at
            this length (including any trailing special token); shorter ones
            are right-padded.
            NOTE(review): the model built in this notebook has 512 position
            embeddings, so values above 512 presumably cannot be fed to it.

    Returns:
        ``(batched_data, batched_labels)``: long tensors of shape
        ``(n_batches, batch_size, max_seq_len)`` and ``(n_batches, batch_size)``.
        A label is 1 when it equals the string ``'pos'`` and 0 otherwise.
        Trailing examples that do not fill a whole batch are dropped.
    """
    # ``>=``: exactly one full batch is a valid input (it used to be rejected).
    assert 0 < batch_size <= len(data)
    n_batches = len(data) // batch_size

    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0

    batched_data = t.full((n_batches, batch_size, max_seq_len), pad_id, dtype=t.long)
    batched_labels = t.zeros(n_batches, batch_size, dtype=t.long)
    for index in range(n_batches * batch_size):
        batch, i = divmod(index, batch_size)
        label, text = data[index]
        # Truncate the Python list first so only the kept ids become a tensor.
        tokens = tokenizer.encode(text)[:max_seq_len]
        batched_data[batch, i, :len(tokens)] = t.tensor(tokens, dtype=t.long)
        batched_labels[batch, i] = int(label == 'pos')
    return batched_data, batched_labels
            
        

In [31]:
# Smoke-test get_batches: batch size 16, sequences capped at 50 tokens.
res = get_batches(data_train, 16, tokenizer, max_seq_len=50)

Token indices sequence length is longer than the specified maximum sequence length for this model (642 > 512). Running this sequence through the model will result in indexing errors


In [32]:
# Shape of the token-id tensor, the first element of the returned pair.
res[0].shape

torch.Size([1562, 16, 50])

In [39]:
def train_sentiments(data_train, data_test, model, batch_size, tokenizer, max_seq_len=2048, lr=1e-5, epochs=1):
    """Train ``model`` on batched sentiment data and return the per-step losses.

    Args:
        data_train: ``(label, text)`` pairs, batched with ``get_batches``.
        data_test: accepted for interface compatibility; not used here.
        model: module whose output has shape (batch, seq, classes); the
            prediction for each example is read from sequence position 0.
        batch_size: examples per optimisation step.
        tokenizer: tokenizer handed to ``get_batches``.
        max_seq_len: row width handed to ``get_batches``.
        lr: Adam learning rate.
        epochs: number of passes over the batches.

    Returns:
        List with one detached scalar loss tensor per completed step. A
        ``KeyboardInterrupt`` stops training early and returns what has been
        collected so far.
    """
    batched_data, batched_labels = get_batches(data_train, batch_size, tokenizer, max_seq_len=max_seq_len)
    # Use the GPU when there is one instead of failing on CPU-only machines.
    device = t.device("cuda" if t.cuda.is_available() else "cpu")
    model = model.to(device)
    optimizer = t.optim.Adam(model.parameters(), lr=lr)
    loss_fn = t.nn.CrossEntropyLoss()  # built once rather than on every step
    losses = []
    try:
        for _ in range(epochs):
            for batch in tqdm(range(batched_data.shape[0])):
                inputs = batched_data[batch].to(device)
                targets = batched_labels[batch].to(device)
                optimizer.zero_grad()
                # Keep only the logits at sequence position 0.
                pred = model(inputs)[:, 0, :]
                loss = loss_fn(pred, targets)
                # Store a detached copy: keeping the live loss would hold every
                # step's autograd graph in memory for the whole run.
                losses.append(loss.detach())
                loss.backward()
                optimizer.step()
    except KeyboardInterrupt:
        # Manual interruption is an expected way to stop; return partial results.
        pass
    return losses

            

In [41]:
# Create the classifier here so the cell works on a fresh kernel; it used to
# depend on a `sentiment_model` left over from an earlier, commented-out run.
sentiment_model = BertSentiment(my_bert, 2)
# train_sentiments moves the model to the training device itself, so no
# separate .cuda() call is needed. max_seq_len is passed by keyword for clarity.
losses = train_sentiments(data_train, data_test, sentiment_model, 32, tokenizer, max_seq_len=512)

  0%|          | 0/781 [00:00<?, ?it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [35]:
# NOTE(review): leftover debugging. `pred` is only assigned inside
# train_sentiments, so at notebook level this raises NameError (see the
# recorded output). Consider deleting this cell.
print(pred.shape)

NameError: name 'pred' is not defined

In [None]:
# Scratch: batch the training set with batch size 64 and a 2048-token row width.
batched_data, batched_labels = get_batches(data_train, 64, tokenizer, max_seq_len=2048)

In [None]:
# Display the shape of the batched token ids.
batched_data.shape

In [None]:
# Scratch: move the whole batched tensor to the "cuda" device, rebinding the same name.
batched_data = batched_data.to("cuda")

In [None]:
# Display the shape again after the device move.
batched_data.shape

In [None]:
# Scratch: create a tensor and a second one via .cuda() to compare their devices below.
A = t.zeros(10)
B = A.cuda()

In [None]:
# Display the device of the original tensor.
A.device

In [None]:
# Display the device of the tensor returned by .cuda().
B.device