In [None]:
%%capture
# ! pip install pip3-autoremove
# ! pip-autoremove torch torchvision torchaudio -y
# ! pip install torch==2.3.0 xformers triton
# ! pip install transformers==4.45.0
# ! pip install unsloth==2024.9.post3
# ! pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# ! pip install --no-deps --upgrade "flash-attn>=2.6.3"

In [1]:
# Expose only physical GPU 1 to this kernel.
# NOTE(review): presumably the visible GPU is then addressed as device index 0 in-process — confirm.
%env CUDA_VISIBLE_DEVICES=1
# Sets TOKENIZERS_PARALLELISM=false for this kernel
# (NOTE(review): presumably to avoid the tokenizers fork/parallelism warning — confirm).
%env TOKENIZERS_PARALLELISM=false

env: CUDA_VISIBLE_DEVICES=1
env: TOKENIZERS_PARALLELISM=false


In [2]:
# Root of the local project checkout; the notebook derives its import paths,
# log location and model output directory from it.
BASE_PATH = "/home/stepan/kaggle-arc-agi"
# Model identifier handed to FastLanguageModel.from_pretrained.
MODEL_ID = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
# Upper bound on the number of tokens produced per generate() call.
MAX_NEW_TOKENS = 4096
# Sequence-length budget for prompts/training text: a 32768-token window minus
# the room reserved for generated tokens (28672 with the values above).
MAX_SEQ_LENGTH = 32768 - MAX_NEW_TOKENS

In [3]:
import sys

# Make the project root and its scripts/ folder importable from this notebook.
for extra_path in (BASE_PATH, f"{BASE_PATH}/scripts"):
    sys.path.append(extra_path)

In [4]:
import torch  # type: ignore
import numpy as np  # type: ignore

from datasets import DatasetDict, Dataset  # type: ignore

from unsloth import FastLanguageModel  # type: ignore

from tqdm.auto import tqdm  # type: ignore

from trl import SFTTrainer  # type: ignore
from transformers import TrainingArguments  # type: ignore
from unsloth import is_bfloat16_supported  # type: ignore

from logger import get_logger  # type: ignore
import train_utils  # type: ignore
import data_utils  # type: ignore

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [5]:
# Notebook-wide logger obtained from the project's `logger` module.
# NOTE(review): presumably the first argument is the log location and the second
# the logger name — confirm against logger.get_logger.
log = get_logger(f"{BASE_PATH}/logs/llama-3_2-3b", "arc-agi")

In [6]:
def get_model_tokenizer(dtype=None, load_in_4bit=True, add_lora=False):
    """Load ``MODEL_ID`` through Unsloth and optionally attach LoRA adapters.

    Args:
        dtype: Forwarded unchanged to ``FastLanguageModel.from_pretrained``.
        load_in_4bit: Forwarded unchanged to ``FastLanguageModel.from_pretrained``.
        add_lora: When true, the loaded model is passed through
            ``FastLanguageModel.get_peft_model`` (rank 16, alpha 16, no dropout,
            no bias, Unsloth gradient checkpointing) before being returned.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    base_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_ID,
        max_seq_length=MAX_SEQ_LENGTH,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        attn_implementation="flash_attention_2",
        device_map="auto",
        # Memory caps handed to the loader: device index 0 and CPU.
        max_memory={0: "23GiB", "cpu": "16GiB"},
    )

    if not add_lora:
        return base_model, tokenizer

    # Attention projections plus the MLP projections receive adapters.
    lora_target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ]
    lora_model = FastLanguageModel.get_peft_model(
        base_model,
        r=16,
        target_modules=lora_target_modules,
        lora_alpha=16,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
    )
    return lora_model, tokenizer

In [7]:
def eval(f):
    """Decorator that puts the model into Unsloth inference mode before calling ``f``.

    The wrapped callable must accept ``(model, tokenizer, *args, **kwargs)``.
    On every call the wrapper invokes ``FastLanguageModel.for_inference(model)``,
    logs the model and the tokenizer's ``padding_side``, then delegates to ``f``
    and returns its result.

    ``functools.wraps`` is applied so the decorated function keeps its own
    ``__name__`` / ``__doc__`` (useful in logs and tracebacks).

    NOTE: this name shadows the built-in ``eval`` inside the notebook; it is kept
    unchanged so existing ``@eval`` usages continue to work.
    """
    import functools  # local import so this cell does not depend on the import cell

    @functools.wraps(f)
    def wrapper(model, tokenizer, *args, **kwargs):
        FastLanguageModel.for_inference(model)
        log.info(f"Evaluating model {model}, tokenizer {tokenizer.padding_side}")
        return f(model, tokenizer, *args, **kwargs)

    return wrapper


def train(f):
    """Decorator that puts the model into Unsloth training mode before calling ``f``.

    The wrapped callable must accept ``(model, tokenizer, *args, **kwargs)``.
    On every call the wrapper invokes ``FastLanguageModel.for_training(model)``
    and then delegates to ``f``, returning its result.

    ``functools.wraps`` is applied so the decorated function keeps its own
    ``__name__`` / ``__doc__`` (useful in logs and tracebacks).
    """
    import functools  # local import so this cell does not depend on the import cell

    @functools.wraps(f)
    def wrapper(model, tokenizer, *args, **kwargs):
        FastLanguageModel.for_training(model)
        return f(model, tokenizer, *args, **kwargs)

    return wrapper

In [8]:
# Load the model (4-bit by default) with LoRA adapters attached, ready for fine-tuning.
model, tokenizer = get_model_tokenizer(add_lora=True)

==((====))==  Unsloth 2024.9.post3: Fast Llama patching. Transformers = 4.45.1.
   \\   /|    GPU: NVIDIA RTX A5000. Max memory: 23.679 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.9.post3 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [9]:
def prepare_inputs(dct, prepare_solution=False):
    """Render grid data as tagged, whitespace-separated text.

    Args:
        dct: With ``prepare_solution=True``, an iterable of rows (the solution
            grid itself). Otherwise a mapping with an ``"input"`` grid and an
            optional ``"output"`` grid.
        prepare_solution: Selects which of the two layouts is produced.

    Returns:
        For a solution: ``<output>...</output>``. Otherwise an
        ``<input>...</input>`` section, a blank line, and an
        ``<output>...</output>`` section whose body is empty when ``dct`` has
        no ``"output"`` key. Cells are joined by spaces, rows by newlines.
    """

    def render(grid):
        # One text line per row, cells separated by a single space.
        return "\n".join(" ".join(str(cell) for cell in row) for row in grid)

    if prepare_solution:
        return f"<output>\n{render(dct)}\n</output>"

    rendered_input = render(dct["input"])
    rendered_output = render(dct["output"]) if "output" in dct else ""
    return f"<input>\n{rendered_input}\n</input>\n\n<output>\n{rendered_output}\n</output>"

In [10]:
# Build the dataset splits via the project helper, using prepare_inputs as the grid formatter.
# NOTE(review): the meaning of fit_dataset / final_training is defined in
# data_utils.prepare_dataset — confirm there before changing them.
dataset = data_utils.prepare_dataset(tokenizer, fit_dataset=False, base_path=BASE_PATH, final_training=False, prepare_inputs_func=prepare_inputs)
dataset

Map: 100%|██████████| 105/105 [00:00<00:00, 1395.34 examples/s]
Map: 100%|██████████| 416/416 [00:00<00:00, 1361.42 examples/s]
Map: 100%|██████████| 419/419 [00:00<00:00, 697.84 examples/s]
Map: 100%|██████████| 416/416 [00:00<00:00, 1477.83 examples/s]
Map: 100%|██████████| 419/419 [00:00<00:00, 956.06 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'challenge', 'solution', 'texts', 'messages'],
        num_rows: 416
    })
    test: Dataset({
        features: ['id', 'challenge', 'solution', 'texts', 'messages'],
        num_rows: 293
    })
    val: Dataset({
        features: ['id', 'challenge', 'solution', 'texts', 'messages'],
        num_rows: 126
    })
    predict: Dataset({
        features: ['id', 'challenge', 'texts', 'messages'],
        num_rows: 105
    })
})

In [12]:
# Calculate min, max, and avg number of training samples per dataset record for each split
def calculate_training_samples_stats(dataset_split):
    """Summarise how many ``"train"`` examples each challenge in a split has.

    Args:
        dataset_split: Object whose ``["challenge"]`` entry is an iterable of
            mappings, each with a sized ``"train"`` entry.

    Returns:
        Dict with ``"min"``, ``"max"`` and ``"avg"`` of the per-challenge
        ``len(challenge["train"])`` values. For a split with no challenges the
        result is ``{"min": 0, "max": 0, "avg": 0.0}`` instead of raising from
        ``min()`` on an empty sequence.
    """
    training_samples_counts = [len(challenge["train"]) for challenge in dataset_split["challenge"]]

    # An empty split has no meaningful statistics; report zeros so callers
    # that format the values (e.g. with ``:.2f``) keep working.
    if not training_samples_counts:
        return {"min": 0, "max": 0, "avg": 0.0}

    return {
        "min": min(training_samples_counts),
        "max": max(training_samples_counts),
        "avg": sum(training_samples_counts) / len(training_samples_counts),
    }


splits = ["train", "val", "test", "predict"]
for split in splits:
    # Only report splits that are actually present in the dataset.
    if split in dataset:
        stats = calculate_training_samples_stats(dataset[split])
        print(f"{split.capitalize()} split:")
        print(f"  Min training samples: {stats['min']}")
        print(f"  Max training samples: {stats['max']}")
        print(f"  Avg training samples: {stats['avg']:.2f}")
        print()

Train split:
  Min training samples: 2
  Max training samples: 10
  Avg training samples: 3.35

Val split:
  Min training samples: 2
  Max training samples: 7
  Avg training samples: 3.48

Test split:
  Min training samples: 2
  Max training samples: 7
  Avg training samples: 3.47

Predict split:
  Min training samples: 2
  Max training samples: 8
  Avg training samples: 3.34



In [13]:
def filter_dataset_by_token_limit(dataset, max_seq_length):
    """Keep only records whose ``"texts"`` token count is within ``max_seq_length``.

    Handles the ``train``, ``val``, ``test`` and ``predict`` splits, skipping
    any that ``dataset`` does not contain. Token counts come from
    ``data_utils.count_tokens`` using the notebook-level ``tokenizer``. For each
    processed split, two summary lines are printed (number removed, number kept).

    Args:
        dataset: Mapping from split name to a dataset exposing ``.filter`` and ``len``.
        max_seq_length: Inclusive token limit per record.

    Returns:
        A plain ``dict`` mapping each processed split name to its filtered dataset.
    """

    def within_limit(record):
        return data_utils.count_tokens(tokenizer, record["texts"]) <= max_seq_length

    filtered_splits = {}
    for split in ("train", "val", "test", "predict"):
        if split in dataset:
            kept = dataset[split].filter(within_limit)
            filtered_out_tasks = len(dataset[split]) - len(kept)
            filtered_splits[split] = kept
            print(f"{filtered_out_tasks} {split} tasks were filtered out because they exceed the {max_seq_length} token limit.")
            print(f"The filtered {split} dataset contains {len(filtered_splits[split])} tasks for {'fine-tuning' if split == 'train' else 'evaluation'}.")

    return filtered_splits


# Per split, drop every task whose text is longer than MAX_SEQ_LENGTH tokens.
filtered_dataset = filter_dataset_by_token_limit(dataset, MAX_SEQ_LENGTH)

Filter: 100%|██████████| 416/416 [00:01<00:00, 304.80 examples/s]


0 train tasks were filtered out because they exceed the 28672 token limit.
The filtered train dataset contains 416 tasks for fine-tuning.


Filter: 100%|██████████| 126/126 [00:00<00:00, 227.66 examples/s]


0 val tasks were filtered out because they exceed the 28672 token limit.
The filtered val dataset contains 126 tasks for evaluation.


Filter: 100%|██████████| 293/293 [00:01<00:00, 204.70 examples/s]


0 test tasks were filtered out because they exceed the 28672 token limit.
The filtered test dataset contains 293 tasks for evaluation.


Filter: 100%|██████████| 105/105 [00:00<00:00, 332.98 examples/s]

0 predict tasks were filtered out because they exceed the 28672 token limit.
The filtered predict dataset contains 105 tasks for evaluation.





In [14]:
def generate_with_temp(model, inputs, temperature):
    """Sample a completion from ``model`` at the given temperature.

    Args:
        model: Object exposing ``generate(**kwargs)``.
        inputs: Mapping of keyword arguments forwarded to ``model.generate``.
        temperature: Sampling temperature.

    Returns:
        Whatever ``model.generate`` returns, called with sampling enabled,
        ``top_k=50``, KV caching on, and at most ``MAX_NEW_TOKENS`` new tokens.
    """
    sampling_options = dict(
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=temperature,
        top_k=50,
        use_cache=True,
    )
    return model.generate(**inputs, **sampling_options)


def evaluate_batch(model, tokenizer, batch):
    """Generate two sampled completions per prompt (temperatures 0.3 and 0.7).

    Args:
        model: Model passed to ``generate_with_temp``.
        tokenizer: Tokenizer used to decode the generated ids.
        batch: Mapping providing ``"input_ids"`` and ``"attention_mask"``.

    Returns:
        ``(texts_at_0_3, texts_at_0_7)``: decoded continuations only — the
        first ``input_ids.shape[1]`` positions of each output are sliced off
        and special tokens are skipped.
    """
    inputs = {key: batch[key] for key in ("input_ids", "attention_mask")}
    prompt_length = inputs["input_ids"].shape[1]  # sequence length without new tokens

    # Generation order (0.3 first, then 0.7) is kept so RNG consumption is unchanged.
    with torch.no_grad():
        sampled_outputs = [generate_with_temp(model, inputs, temp) for temp in (0.3, 0.7)]

    decoded = [
        tokenizer.batch_decode(output[:, prompt_length:], skip_special_tokens=True)
        for output in sampled_outputs
    ]
    return decoded[0], decoded[1]

In [15]:
@train
def training(model, tokenizer, dataset, max_seq_length):
    """Fine-tune ``model`` on ``dataset["train"]`` with TRL's ``SFTTrainer``.

    Training runs for 500 optimizer steps with batch size 1 and 8 gradient
    accumulation steps, reading text from the ``"texts"`` column. Checkpoints
    go to ``{BASE_PATH}/models/llama-3_2-3b-it``.

    When ``dataset`` contains a ``"val"`` split it is evaluated every 100
    steps, and the checkpoint with the lowest ``eval_loss`` is reloaded at the
    end of training. In that case checkpoints are also written every 100 steps,
    because ``load_best_model_at_end`` requires ``save_steps`` to be a multiple
    of ``eval_steps`` and the final step must be checkpointed to be eligible.
    Without a ``"val"`` split, checkpoints are written every 250 steps.

    Args:
        model: Model to train (switched to training mode by the ``@train`` decorator).
        tokenizer: Tokenizer handed to the trainer.
        dataset: Mapping with a ``"train"`` split and optionally a ``"val"`` split.
        max_seq_length: Maximum sequence length handed to the trainer.

    Returns:
        ``(trainer, stats)`` where ``stats`` is the value returned by ``trainer.train()``.
    """
    eval_every = 100
    use_bf16 = is_bfloat16_supported()

    common_args = {
        "model": model,
        "tokenizer": tokenizer,
        "train_dataset": dataset["train"],
        "dataset_text_field": "texts",
        "max_seq_length": max_seq_length,
        "dataset_num_proc": 2,
        "packing": False,
    }

    args_kwargs = dict(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        logging_steps=100,
        warmup_steps=5,
        max_steps=500,
        learning_rate=2e-5,
        fp16=not use_bf16,
        bf16=use_bf16,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir=f"{BASE_PATH}/models/llama-3_2-3b-it",
        save_strategy="steps",
        save_steps=250,
        save_total_limit=2,
    )

    if "val" in dataset:
        common_args["eval_dataset"] = dataset["val"]
        # Passed through the constructor (not assigned afterwards) so that
        # TrainingArguments validates and normalises them.
        args_kwargs.update(
            per_device_eval_batch_size=1,
            eval_strategy="steps",
            eval_steps=eval_every,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            # The previous `save_best_model = True` attribute is not a
            # TrainingArguments field and had no effect; this is the real option.
            load_best_model_at_end=True,
            save_steps=eval_every,
        )

    training_args = TrainingArguments(**args_kwargs)

    trainer = SFTTrainer(args=training_args, **common_args)
    stats = trainer.train()
    return trainer, stats

In [16]:
# Train on the token-length-filtered splits (computed earlier as `filtered_dataset`)
# so that tasks exceeding MAX_SEQ_LENGTH never reach the trainer; previously the
# unfiltered `dataset` was passed and the filtering result went unused.
trainer, stats = training(model, tokenizer, filtered_dataset, max_seq_length=MAX_SEQ_LENGTH)
stats

Map (num_proc=2): 100%|██████████| 416/416 [00:01<00:00, 265.26 examples/s]
Map (num_proc=2): 100%|██████████| 126/126 [00:01<00:00, 87.52 examples/s] 
max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 416 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 8
\        /    Total batch size = 8 | Total steps = 500
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss,Validation Loss
100,0.4212,0.219637
200,0.1911,0.200357
300,0.1831,0.196181
400,0.1755,0.194012
500,0.1723,0.19318


TrainOutput(global_step=500, training_loss=0.22864407348632812, metrics={'train_runtime': 4324.6254, 'train_samples_per_second': 0.925, 'train_steps_per_second': 0.116, 'total_flos': 1.8298041746009088e+17, 'train_loss': 0.22864407348632812, 'epoch': 9.615384615384615})

In [None]:
# Persist the trained model into the same directory the trainer uses as output_dir.
trainer.save_model(f"{BASE_PATH}/models/llama-3_2-3b-it")