# Fine-Tune Llama3.2 1B

## Objective

We will be fine-tuning a Llama 3.2 3B model using a synthetic dataset with approximately 5,000 datapoints.

## Set Up


1. Install required libraries
2. Mount google drive
3. Load librarires
4. Log in to Huggingface

### Install required libraries for colab, if needed

In [1]:
!pip install -q datasets bitsandbytes accelerate loralib peft

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m72.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### Import libraries

In [2]:
import bitsandbytes as bnb
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer)
from peft import get_peft_model, LoraConfig
from huggingface_hub import notebook_login

## Mount google drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Log in to Huggingface

In [4]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Prepare Tokenizer and Model

In [5]:
# Define base model to use
base_model = "meta-llama/Llama-3.2-1B-Instruct"

# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Define model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    device_map="auto",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

### Set Up LoRa Adapter

In [6]:
# Set up Lora Config
config = LoraConfig(
    r=16, # Attention heads
    lora_alpha=32, # Alpha scaling
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    base_model_name_or_path="meta-llama/Llama-3.2-1B-Instruct"
)

# Combine base model and Lora Config
model = get_peft_model(model, config)

## Load and Prepare Data

In [7]:
# Load dataset
dataset = load_dataset("json", data_files="/content/drive/MyDrive/data/clean_training_data.json") # Update data file path, if necessary

# Define padding token
tokenizer.pad_token = tokenizer.eos_token

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
# Functions to process the data for fine-tuning

def format_alpaca(data_point):
  """
  Format the data in the Alpaca format.
  """
  text = f"""
  ### Instructions:
  {data_point['input']}

  ### Response:
  {data_point['output']}
  """

  return {"text": text}

def tokenize_function(examples):
  """
  Tokenize the formatted instruction-response text and create the labels.
  """
  # Tokenize the full instruction-response text
  tokenized = tokenizer(
      examples["text"],
      padding="max_length",
      truncation=True,
      max_length=512
  )

  batch_size = len(examples["text"])  # Get the batch size

  # Compute the instruction length for each example in the batch
  instruction_texts = [
      f"### Instructions:\n{inp}\n\n### Response:\n"
      for inp in examples["input"]
  ]
  instruction_ids = tokenizer(
      instruction_texts,
      truncation=True,
      max_length=512
  )["input_ids"]

  # Create labels by masking the instruction part with -100
  labels = []
  for i in range(batch_size):
      input_len = len(instruction_ids[i])  # Length of the instruction part
      labels.append(
          [-100] * input_len + tokenized["input_ids"][i][input_len:]  # Mask input, keep response
      )

  # Ensure labels are padded to max_length=512
  for i in range(batch_size):
      labels[i] = labels[i][:512] + [-100] * (512 - len(labels[i]))  # Truncate/pad to 512

  tokenized["labels"] = labels

  return tokenized

In [9]:
# Format the dataset
formated_dataset = dataset["train"].map(format_alpaca)

# Tokenize the dataset
tokenized_dataset = formated_dataset.map(tokenize_function, batched=True)

# Create train-test split
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)

Map:   0%|          | 0/5827 [00:00<?, ? examples/s]

Map:   0%|          | 0/5827 [00:00<?, ? examples/s]

In [10]:
# Remove unnecessary columns
tokenized_dataset = tokenized_dataset.remove_columns(["input", "output", "text"])

# Verify the updated dataset structure
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5244
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 583
    })
})


## Model Training

In [11]:
# Training arguments
training_args = TrainingArguments(
    num_train_epochs=3,
    output_dir="./results",
    warmup_ratio=0.03,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir="./logs",
    logging_steps=500,
    learning_rate=1e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    fp16=False,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)



In [12]:
# Fine-tune the model
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msuniljit[0m ([33msuniljit-beyond-limits[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss,Validation Loss
500,0.9883,0.169918
1000,0.1717,0.167135
1500,0.1635,0.165375
2000,0.1671,0.16417
2500,0.1607,0.163851
3000,0.1628,0.162937
3500,0.1578,0.161883
4000,0.1575,0.161134
4500,0.1519,0.160876
5000,0.1584,0.160329




TrainOutput(global_step=7866, training_loss=0.21022948665895325, metrics={'train_runtime': 4529.0841, 'train_samples_per_second': 3.474, 'train_steps_per_second': 1.737, 'total_flos': 4.711323941417779e+16, 'train_loss': 0.21022948665895325, 'epoch': 3.0})

## Save Model

In [13]:
# Save tokenizer and model to huggingface
model.push_to_hub("Sunil91/llama3_2-1B-trump")

adapter_model.safetensors:   0%|          | 0.00/6.82M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Sunil91/llama3_2-1B-trump/commit/f65064bcb36b345a88a34eaf191357d2f5bed72f', commit_message='Upload model', commit_description='', oid='f65064bcb36b345a88a34eaf191357d2f5bed72f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Sunil91/llama3_2-1B-trump', endpoint='https://huggingface.co', repo_type='model', repo_id='Sunil91/llama3_2-1B-trump'), pr_revision=None, pr_num=None)