## This example demonstrates modifying the Base LLM Embedding and Head Layers with a New Tokenizer.
### The vocabulary size of the combined tokenizer is larger than that of the base LLM, so its embedding and head layers need to be extended.

In [2]:
!pip install -q transformers==4.44.1

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
seed = 42

import random
random.seed(seed)

import torch
torch.manual_seed(seed)

<torch._C.Generator at 0x7f895f94d830>

In [4]:
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoTokenizer, AutoConfig

# Loading and saving the base LLM
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tinyllama_model = AutoModelForCausalLM.from_pretrained(model_name)
tinyllama_model.save_pretrained("tiny_llama")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Loading the base model and its architecture
tinyllama_model = AutoModelForCausalLM.from_pretrained("./tiny_llama")
tinyllama_config = AutoConfig.from_pretrained("./tiny_llama")

In [6]:
tinyllama_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): 

In [7]:
tinyllama_config

LlamaConfig {
  "_name_or_path": "./tiny_llama",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 22,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.44.1",
  "use_cache": true,
  "vocab_size": 32000
}

In [8]:
import torch

# Loading the embedding layer weights
embed_weight = tinyllama_model.state_dict()[f'model.embed_tokens.weight']
print(f"Shape of original embedding layer: {embed_weight.shape}")
print(torch.max(embed_weight))
print(torch.min(embed_weight))

Shape of original embedding layer: torch.Size([32000, 2048])
tensor(0.1494)
tensor(-0.1172)


In [9]:
# Loading the head layer weights
head_weight = tinyllama_model.state_dict()[f'lm_head.weight']
print(f"Shape of original head layer: {head_weight.shape}")
print(torch.max(head_weight))
print(torch.min(head_weight))

Shape of original head layer: torch.Size([32000, 2048])
tensor(0.3945)
tensor(-0.4707)


In [10]:
import json
import os

# Loading the merged tokenizer
old_vocab = json.load(open(os.path.join('./tiny_tokenizer', 'tokenizer.json')))["model"]["vocab"]
new_vocab = json.load(open(os.path.join('./merged_tokenizer', 'tokenizer.json')))["model"]["vocab"]
print(len(old_vocab))
print(len(new_vocab))

# Calculating the number of new tokens in merged tokenizer
tokenizer_diff = len(new_vocab) - len(old_vocab)
print(tokenizer_diff)

32000
55796
23796


In [11]:
hidden_size = embed_weight.shape[1]
print(hidden_size)

# Initilizing weights for new vocabs both in the embedding layer
random_embed_weight = torch.zeros((tokenizer_diff, hidden_size)) #.to('cuda')
# random_embed_weight = torch.rand((tokenizer_diff, hidden_size)) #.to('cuda')
print(random_embed_weight.shape)
print(torch.max(random_embed_weight))
print(torch.min(random_embed_weight))

# Initilizing weights for new vocabs both in the head layer
random_embed_head = torch.zeros((tokenizer_diff, hidden_size)) #.to('cuda')
# random_embed_head = torch.rand((tokenizer_diff, hidden_size)) #.to('cuda')
print(random_embed_head.shape)
print(torch.max(random_embed_head))
print(torch.min(random_embed_head))

2048
torch.Size([23796, 2048])
tensor(0.)
tensor(0.)
torch.Size([23796, 2048])
tensor(0.)
tensor(0.)


In [12]:
# Adding initialized weights for new vocabulary to the head weights
new_head_weight = torch.cat((head_weight, random_embed_head), dim=0)
print(new_head_weight.shape)
print(torch.max(new_head_weight))
print(torch.min(new_head_weight))

torch.Size([55796, 2048])
tensor(0.3945)
tensor(-0.4707)


In [13]:
# Adding initialized weights for new vocabulary to the embedding layer weights
new_embed_weight = torch.cat((embed_weight, random_embed_weight), dim=0)
print(new_embed_weight.shape)
print(torch.max(new_embed_weight))
print(torch.min(new_embed_weight))

torch.Size([55796, 2048])
tensor(0.1494)
tensor(-0.1172)


In [14]:
print(torch.count_nonzero(head_weight))
print(torch.count_nonzero(new_head_weight))

tensor(65536000)
tensor(65536000)


In [15]:
# Updating the model state dictionary with new weights for head and embedding layers
state_dict = tinyllama_model.state_dict()
state_dict[f'model.embed_tokens.weight'] = new_embed_weight
state_dict[f'lm_head.weight'] = new_head_weight

print(state_dict[f'model.embed_tokens.weight'].shape)
print(state_dict[f'lm_head.weight'].shape)

torch.Size([55796, 2048])
torch.Size([55796, 2048])


In [16]:
# Updating the base LLM config file
print(type(tinyllama_config))

print(tinyllama_config.vocab_size)
tinyllama_config.vocab_size = state_dict[f'model.embed_tokens.weight'].shape[0]

print(tinyllama_config._name_or_path)
tinyllama_config._name_or_path = "./extended_tiny_llama_model"

print(tinyllama_config)

<class 'transformers.models.llama.configuration_llama.LlamaConfig'>
32000
./tiny_llama
LlamaConfig {
  "_name_or_path": "./extended_tiny_llama_model",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 22,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.44.1",
  "use_cache": true,
  "vocab_size": 55796
}



In [17]:
from transformers import LlamaForCausalLM

extended_tiny_llama_model = LlamaForCausalLM(tinyllama_config)

In [18]:
extended_tiny_llama_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(55796, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): 

In [19]:
# Saving the extended model
extended_tiny_llama_model.load_state_dict(state_dict)
extended_tiny_llama_model.tie_weights()
extended_tiny_llama_model.save_pretrained('./extended_tiny_llama_model')

### Testing the extended model

In [20]:
new_tokenizer = AutoTokenizer.from_pretrained("./merged_tokenizer")
print(new_tokenizer.eos_token)
print(new_tokenizer.bos_token)
print(new_tokenizer.unk_token)
print(new_tokenizer.all_special_tokens)

def print_tokenize(txt):
    for el in list(zip(new_tokenizer.tokenize(txt), new_tokenizer(txt)['input_ids'])):
        print(el)
        
txt = 'The TinyLlama project aims'

print_tokenize(txt)    

</s>
<s>
<unk>
['<s>', '</s>', '<unk>']
('▁The', 1)
('▁T', 450)
('iny', 323)
('L', 4901)
('l', 29931)
('ama', 29880)
('▁project', 3304)
('▁aims', 2060)


In [21]:
from transformers import AutoTokenizer
import transformers 
import torch

model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model)

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

sequences = pipeline(
    'Germany is a country',
    do_sample=False,
    top_k=10,
    num_return_sequences=1,
    repetition_penalty=1.5,
    eos_token_id=tokenizer.eos_token_id,
    max_length=100,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Result: Germany is a country in Europe that has been influenced by many different cultures. The German language, music and cuisine are all unique to the region due to its long history of migration from other countries such as France, England, Poland, Russia etc..
The most famous example would be Beethoven's Ninth Symphony which was composed during his stay at an Austrian monastery where he learned about Gregorian chanting (a form of choral singing). This influence


In [22]:
from transformers import AutoTokenizer
import transformers 
import torch

model = "extended_tiny_llama_model"

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=new_tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

sequences = pipeline(
    'Germany is a country',
    do_sample=False,
    top_k=10,
    num_return_sequences=1,
    repetition_penalty=1.5,
    eos_token_id=tokenizer.eos_token_id,
    max_length=100,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Result: Germany is a country in Europe that has been influenced by many different cultures. The German language, music and cuisine are all unique to the region due to its long history of migration from other countries such as France, England, Poland, Russia etc..
The most famous example would be Beethoven's Ninth Symphony which was composed during his stay at an Austrian monastery where he learned about Gregorian chanting (a form of choral singing). This influence


### After extending the model, it is necessary to continue pretraining on a multilingual dataset to ensure effective adaptation. It can be done with transformer library. Refer to Fine-tune a pretrained model https://huggingface.co/docs/transformers/en/training.