In [None]:
from huggingface_hub import login

login(token = "YOUR_TOKEN") # Replace YOUR_TOKEN with your Hugging Face token

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")

In [3]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [None]:
import torch
from transformers import pipeline
import torch.nn as nn

original_vocab_size, embedding_dim = model.model.embed_tokens.weight.shape

extra_embedding_1 = nn.Embedding(original_vocab_size, embedding_dim)
extra_embedding_2 = nn.Embedding(original_vocab_size, embedding_dim)

nn.init.xavier_uniform_(extra_embedding_1.weight)
nn.init.xavier_uniform_(extra_embedding_2.weight)

extra_embedding_1.weight.requires_grad = True
extra_embedding_2.weight.requires_grad = True

# Add the new embeddings as attributes
model.extra_embedding_1 = extra_embedding_1
model.extra_embedding_2 = extra_embedding_2

In [None]:
class CustomEmbedding(nn.Module):
    def __init__(self, original_embeddings, new_embeddings_1, new_embeddings_2):
        super().__init__()
        self.original_embeddings = original_embeddings
        self.new_embeddings_1 = new_embeddings_1
        self.new_embeddings_2 = new_embeddings_2
    
    def forward(self, input_ids):
        original_embeds = self.original_embeddings(input_ids)
        new_embeds_1 = self.new_embeddings_1(input_ids)
        new_embeds_2 = self.new_embeddings_2(input_ids)
        
        # Combine embeddings
        combined_embeds = original_embeds + new_embeds_1 + new_embeds_2  # Summing up
        return combined_embeds

# Replace original embedding layer with custom one
model.model.embed_tokens = CustomEmbedding(model.model.embed_tokens, extra_embedding_1, extra_embedding_2)

In [13]:
# Freeze everething, except the new embeddings
for param in model.parameters():
    param.requires_grad = False

for param in model.extra_embedding_1.parameters():
    param.requires_grad = True

for param in model.extra_embedding_2.parameters():
    param.requires_grad = True

In [14]:
#check if the embeddings are trainable
for name, param in model.named_parameters():
    print(name, param.requires_grad)

model.embed_tokens.original_embeddings.weight False
model.embed_tokens.new_embeddings_1.weight True
model.embed_tokens.new_embeddings_2.weight True
model.layers.0.self_attn.q_proj.weight False
model.layers.0.self_attn.k_proj.weight False
model.layers.0.self_attn.v_proj.weight False
model.layers.0.self_attn.o_proj.weight False
model.layers.0.mlp.gate_proj.weight False
model.layers.0.mlp.up_proj.weight False
model.layers.0.mlp.down_proj.weight False
model.layers.0.input_layernorm.weight False
model.layers.0.post_attention_layernorm.weight False
model.layers.1.self_attn.q_proj.weight False
model.layers.1.self_attn.k_proj.weight False
model.layers.1.self_attn.v_proj.weight False
model.layers.1.self_attn.o_proj.weight False
model.layers.1.mlp.gate_proj.weight False
model.layers.1.mlp.up_proj.weight False
model.layers.1.mlp.down_proj.weight False
model.layers.1.input_layernorm.weight False
model.layers.1.post_attention_layernorm.weight False
model.layers.2.self_attn.q_proj.weight False
model

In [7]:
from datasets import load_dataset

ds = load_dataset("mlabonne/FineTome-Alpaca-100k", split="train")


In [8]:
print(ds[0])

{'source': 'infini-instruct-top-500k', 'score': 5.212620735168457, 'instruction': 'Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. \n\nFurthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.\n\nFinally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles cases where truthiness and falsiness are implemented differently across different programm

In [None]:
def tokenize(examples):
    texts = [
        f"### Instruction: {instruction}\n### Response: {output}" 
        for instruction, output in zip(examples['instruction'], examples['output'])
    ]
    
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=1024,  
        padding="max_length",
        return_tensors=None  
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    
    return tokenized

tokenizer.pad_token = tokenizer.eos_token

tokenized_ds = ds.map(
    tokenize, 
    batched=True,
    remove_columns=['instruction',"source","score",'output']  
)

tokenized_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
from torch.utils.data import DataLoader
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

model.train()

optimizer = torch.optim.AdamW([
        {"params": model.model.embed_tokens.new_embeddings_1.parameters(), "lr": 2e-5},
        {"params": model.model.embed_tokens.new_embeddings_2.parameters(), "lr": 2e-5}
])
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./trained_embeddings",
    per_device_train_batch_size=4,  
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="epoch",
    learning_rate=2e-5,
    remove_unused_columns=False,  
    fp16=True,  
    optim="adamw_torch"  
)

# Define Trainer with the data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, None)  
)


trainer.train()


  trainer = Trainer(


Step,Training Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.96 GiB. GPU 