In [81]:
from transformers import AutoTokenizer, AutoModel, AutoConfig
import torch
# Hugging Face checkpoint name shared by the tokenizer, config and model below.
checkpoint = 'roberta-base'

# Load the bare encoder, its configuration and the matching tokenizer from the
# same checkpoint. AutoModel builds a RobertaModel here, so the checkpoint's
# lm_head.* weights are reported as unused when it loads.
model = AutoModel.from_pretrained(checkpoint)
config = AutoConfig.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Before modification

In [85]:
# Baseline run: "yeet" is not in the vocabulary yet, so the tokenizer splits it
# into several sub-word ids before the sentence is fed through the model.
sentences = ["yeet the cat out of here"]
batch = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
print("Input: ", batch)

# Forward pass; show the per-token hidden states of the last layer.
output = model(**batch)
last_hidden_state = output['last_hidden_state']
print("\n\nOutput: ", last_hidden_state)

Input:  {'input_ids': tensor([[   0, 4717,  594,    5, 4758,   66,    9,  259,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}


Output:  tensor([[[-0.0431,  0.0622, -0.0273,  ..., -0.0284, -0.0659, -0.0054],
         [ 0.0162,  0.2585,  0.3546,  ...,  0.7177,  0.2241, -0.0208],
         [-0.0698,  0.0751,  0.2477,  ..., -0.3056,  0.1697,  0.0822],
         ...,
         [ 0.0429, -0.3221, -0.0973,  ..., -0.4146, -0.0282,  0.3446],
         [ 0.1115, -0.0527, -0.0417,  ...,  0.5172,  0.0404,  0.1875],
         [-0.0341,  0.0536, -0.0554,  ..., -0.0705, -0.0627, -0.0357]]],
       grad_fn=<NativeLayerNormBackward>)


## After modification

In [86]:
# Assume we want to add the word "yeet" to the vocabulary and already have an
# embedding for it from our own network; a random vector stands in for it here.
new_token = 'yeet'

# Take the embedding width from the model instead of hard-coding 768.
embedding_dim = model.get_input_embeddings().weight.shape[1]
yeet_embed = torch.rand(embedding_dim).reshape(1, embedding_dim)

# Register the token, then grow the embedding matrix to the new vocabulary
# size. resize_token_embeddings keeps all existing rows and appends new ones.
# add_tokens is a no-op if the token is already present, so the cell can be
# re-run safely.
tokenizer.add_tokens(new_token)
model.resize_token_embeddings(len(tokenizer))

# Write our vector into the row that belongs to the new token. The previous
# version concatenated two names (`a`, `new_embed`) that are never defined in
# this notebook, so it failed with NameError on a fresh kernel; it also
# appended an extra row on every re-run. The in-place copy is done under
# no_grad because the weight is a leaf Parameter that requires grad.
new_token_id = tokenizer.convert_tokens_to_ids(new_token)
with torch.no_grad():
    model.get_input_embeddings().weight[new_token_id] = yeet_embed[0]

In [87]:
# Same sentence as in the baseline run, now that "yeet" has been added to the
# tokenizer: it should come back as a single new token id.
sentences = ["yeet the cat out of here"]
batch = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
print("Input: ", batch)

# Forward pass through the model with the resized embedding matrix.
output = model(**batch)
last_hidden_state = output['last_hidden_state']
print("\n\nOutput: ", last_hidden_state)

Input:  {'input_ids': tensor([[    0, 50265,     5,  4758,    66,     9,   259,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


Output:  tensor([[[-0.0427,  0.0577, -0.0447,  ..., -0.0186, -0.0503,  0.0067],
         [-0.0822, -0.2524, -0.2250,  ...,  0.1061,  0.1537,  0.1627],
         [-0.2247, -0.2411, -0.0886,  ..., -0.1973, -0.0052, -0.2409],
         ...,
         [ 0.0658, -0.2896, -0.0971,  ..., -0.3445, -0.0174,  0.3703],
         [ 0.1236,  0.0275, -0.0706,  ...,  0.5766,  0.0081,  0.2497],
         [-0.0303,  0.0460, -0.0752,  ..., -0.0514, -0.0464, -0.0197]]],
       grad_fn=<NativeLayerNormBackward>)
