In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import torch
from torch.nn.utils.rnn import pad_sequence



In [None]:

model_name = "meta-llama/Llama-3.2-1B"

# Tokenizer first: it does not depend on the model weights.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token  # Required

# Half-precision weights, with placement across devices delegated to device_map='auto'.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map='auto',
)

# Keep the model config in sync with the tokenizer's padding token.
model.config.pad_token_id = tokenizer.pad_token_id

# NOTE(review): presumably set so that Trainer treats the model as
# model-parallel under device_map='auto' — confirm these flags are still needed.
model._is_parallelizable = True
model.model_parallel = True




Device set to use cuda:0


In [4]:
# NOTE(review): `pipe` is not defined in any visible cell — presumably a
# text-generation pipeline created in a cell that was later removed or edited;
# confirm and restore its definition so the notebook survives Restart & Run All.
pipe("Can a dog Fly?")  # Test the pipeline

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': 'Can a dog Fly? No, a dog cannot fly, but it can walk on the ceiling. In this article, we'}]

In [2]:
# Sanity check on the first parameter: it must not be a meta tensor.
first_param = next(model.parameters())
print(first_param.is_meta)  # Should print False


False


In [3]:
# Display the module tree of the loaded model.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [4]:
# Load the "pqa_artificial" train split and keep only the first 2000 rows.
# Both names end up bound to the same 2000-row subset.
check_dataset = load_dataset("qiaojin/PubMedQA", "pqa_artificial")["train"].select(range(2000))
dataset = check_dataset


In [5]:
# Display the feature names and row count of the working subset.
dataset

Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
    num_rows: 2000
})

In [6]:
def format_prompt(example):
    """Render one dataset record as a single training prompt.

    Args:
        example: mapping with the keys 'question', 'context' (itself a mapping
            holding a 'contexts' list), 'final_decision' and 'long_answer'.

    Returns:
        A dict with one key, "text": the numbered contexts, the task
        instructions, the question, the answer and the explanation, each
        terminated by a newline.
    """
    question = example['question']

    # Number the context passages starting from 1, one per line.
    numbered = []
    for idx, passage in enumerate(example['context']['contexts'], start=1):
        numbered.append(f"Context {idx}: {passage}")
    context = "\n".join(numbered)

    instructions = (
        "Based on the contexts above, answer the question below with 'yes', 'no', or 'maybe'.\n"
        "Then, provide a short explanation that justifies your answer using evidence from the context.\n"
    )
    segments = [
        f"Contexts:\n{context}\n\n",
        instructions,
        f"Question: {question}\n",
        f"Answer: {example['final_decision']}\n",
        f"Explanation: {example['long_answer']}\n",
    ]
    return {"text": "".join(segments)}

# Build the prompt text for every row and drop all of the original columns,
# leaving only the "text" field produced by format_prompt.
formatted_dataset = dataset.map(
    format_prompt,
    remove_columns=dataset.column_names,
)




In [7]:
# Display the formatted dataset's feature names and row count.
formatted_dataset

Dataset({
    features: ['text'],
    num_rows: 2000
})

In [8]:
def tokenize_function(example):
    """Tokenize the "text" field of one example without truncation.

    Returns whatever the module-level `tokenizer` returns for that text.
    """
    return tokenizer(example["text"], truncation=False)
# Tokenize one example at a time, then drop the raw "text" column so only the
# tokenizer outputs remain.
tokenized_dataset = (
    formatted_dataset
    .map(tokenize_function, batched=False)
    .remove_columns(['text'])
)



In [9]:
from torch.nn.utils.rnn import pad_sequence

def collator(batch):
  """Pad a list of tokenized examples into one causal-LM training batch.

  Args:
    batch: list of mappings, each with an 'input_ids' list and an
      'attention_mask' list of the same length.

  Returns:
    dict of three tensors, each padded to the longest sequence in the batch:
      'input_ids'      — right-padded with tokenizer.pad_token_id
      'attention_mask' — right-padded with 0
      'labels'         — copy of input_ids with padded positions set to -100
  """
  # Explicit long dtype so an empty sequence does not become a float tensor.
  input_ids = pad_sequence(
      [torch.tensor(x['input_ids'], dtype=torch.long) for x in batch],
      batch_first=True,
      padding_value=tokenizer.pad_token_id,
  )
  attention_mask = pad_sequence(
      [torch.tensor(x['attention_mask'], dtype=torch.long) for x in batch],
      batch_first=True,
      padding_value=0,
  )

  # Padding must not contribute to the loss: -100 is the ignore index used by
  # the cross-entropy loss. Mask via attention_mask rather than by token id,
  # because the pad token is the EOS token and real EOS tokens must be kept.
  labels = input_ids.masked_fill(attention_mask == 0, -100)

  return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


# def collator(batch):
#     # Convert lists of token ids to tensors
#     input_ids = [torch.tensor(example['input_ids'], dtype=torch.long) for example in batch]
#     attention_masks = [torch.tensor(example['attention_mask'], dtype=torch.long) for example in batch]

#     # Compute the maximum sequence length in the batch
#     max_len = max(len(seq) for seq in input_ids)
#     print(f"Max length in batch: {max_len}")

#     # Pad all sequences to max_len
#     input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
#     attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)

#     # Sanity check
#     assert input_ids.shape == attention_masks.shape, "Shape mismatch between input_ids and attention_mask"

#     # Create labels
#     labels = input_ids.clone()

#     return {
#         'input_ids': input_ids,
#         'attention_mask': attention_masks,
#         'labels': labels
#     }

In [10]:
# Preview loader: batches of two examples, padded per batch by `collator`.
# NOTE(review): this variable holds a loader *instance* despite its class-like
# name; consider renaming it together with the cell that iterates over it.
DataLoader = torch.utils.data.DataLoader(
    tokenized_dataset,
    collate_fn=collator,
    batch_size=2,
)

In [11]:
# Print the tensor shapes of the first four batches only.
# zip() stops once range(4) is exhausted, so no fifth batch is fetched.
for _, batch in zip(range(4), DataLoader):
    print('---- Batch ----')
    for key in ('input_ids', 'attention_mask', 'labels'):
        print(batch[key].shape)
    

---- Batch ----
torch.Size([2, 615])
torch.Size([2, 615])
torch.Size([2, 615])
---- Batch ----
torch.Size([2, 355])
torch.Size([2, 355])
torch.Size([2, 355])
---- Batch ----
torch.Size([2, 438])
torch.Size([2, 438])
torch.Size([2, 438])
---- Batch ----
torch.Size([2, 548])
torch.Size([2, 548])
torch.Size([2, 548])


In [13]:

# Enable gradient checkpointing on the model before handing it to Trainer.
model.gradient_checkpointing_enable()


# Training args
# NOTE(review): the model-loading cell uses torch_dtype=torch.float16. Combining
# fp16 weights with fp16=True (mixed precision with gradient scaling) is known to
# fail with "Attempting to unscale FP16 gradients" — confirm, and consider
# loading the weights in float32 or switching to bf16=True.
training_args = TrainingArguments(
    output_dir="./llama-finetuned",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=2e-5,
    num_train_epochs=1,
    logging_dir="./logs",
    fp16=True,
    save_total_limit=2,
    save_steps=100,
    # gradient_checkpointing=True

    # torch_compile=False,
    # skip_memory_metrics=True,
)

# `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement
# and avoids the deprecation warning raised at construction time.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer,
    data_collator=collator,
)
trainer.train()


  trainer = Trainer(


AttributeError: module 'torch' has no attribute 'xla'

In [13]:
# List the name of every parameter that will receive gradients.
trainable_names = [
    param_name
    for param_name, param in model.named_parameters()
    if param.requires_grad
]
for param_name in trainable_names:
    print(param_name)

model.embed_tokens.weight
model.layers.0.self_attn.q_proj.weight
model.layers.0.self_attn.k_proj.weight
model.layers.0.self_attn.v_proj.weight
model.layers.0.self_attn.o_proj.weight
model.layers.0.mlp.gate_proj.weight
model.layers.0.mlp.up_proj.weight
model.layers.0.mlp.down_proj.weight
model.layers.0.input_layernorm.weight
model.layers.0.post_attention_layernorm.weight
model.layers.1.self_attn.q_proj.weight
model.layers.1.self_attn.k_proj.weight
model.layers.1.self_attn.v_proj.weight
model.layers.1.self_attn.o_proj.weight
model.layers.1.mlp.gate_proj.weight
model.layers.1.mlp.up_proj.weight
model.layers.1.mlp.down_proj.weight
model.layers.1.input_layernorm.weight
model.layers.1.post_attention_layernorm.weight
model.layers.2.self_attn.q_proj.weight
model.layers.2.self_attn.k_proj.weight
model.layers.2.self_attn.v_proj.weight
model.layers.2.self_attn.o_proj.weight
model.layers.2.mlp.gate_proj.weight
model.layers.2.mlp.up_proj.weight
model.layers.2.mlp.down_proj.weight
model.layers.2.inp