In [None]:
%%capture
# ! pip install pip3-autoremove
# ! pip-autoremove torch torchvision torchaudio -y
# ! pip install torch==2.3.0 xformers triton
# ! pip install unsloth
# ! pip install --no-deps --upgrade "flash-attn>=2.6.3"

In [1]:
%env CUDA_VISIBLE_DEVICES=1
%env TOKENIZERS_PARALLELISM=false

env: CUDA_VISIBLE_DEVICES=1
env: TOKENIZERS_PARALLELISM=false


In [2]:
# Project root; the import paths, log directory and model output directory in this
# notebook are all built from it.
BASE_PATH = "/home/stepan/kaggle-arc-agi"
# Model identifier handed to FastLanguageModel.from_pretrained.
MODEL_ID = "unsloth/gemma-2-9b-it-bnb-4bit"
# Total token budget shared by the prompt and the generated continuation.
# NOTE(review): presumably chosen to match the model's context window — confirm.
MAX_CONTEXT_LENGTH = 8192
# Upper bound on the number of tokens produced per generation call.
MAX_NEW_TOKENS = 2048
# Prompt budget: what remains of the context once room for generation is reserved.
MAX_SEQ_LENGTH = MAX_CONTEXT_LENGTH - MAX_NEW_TOKENS

In [3]:
import sys

# Make the project root and its scripts/ directory importable (the local modules
# imported later in the notebook are resolved through these entries).
# The membership check keeps the cell idempotent: re-running it no longer stacks
# duplicate entries onto sys.path.
for extra_path in (BASE_PATH, f"{BASE_PATH}/scripts"):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

In [4]:
import torch  # type: ignore
import numpy as np  # type: ignore

from datasets import DatasetDict, Dataset  # type: ignore

from unsloth import FastLanguageModel  # type: ignore

from tqdm.auto import tqdm  # type: ignore

from trl import SFTTrainer  # type: ignore
from transformers import TrainingArguments  # type: ignore
from unsloth import is_bfloat16_supported  # type: ignore

from logger import get_logger  # type: ignore
import train_utils  # type: ignore
import data_utils  # type: ignore

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [5]:
# Notebook-wide logger from the project's `logger` module; the first argument is
# presumably the log directory and the second the logger name — verify in logger.py.
# NOTE(review): the directory is named "gemma-2-2b" although MODEL_ID points at a
# gemma-2-9b checkpoint — confirm this is intentional and not a copy-paste leftover.
log = get_logger(f"{BASE_PATH}/logs/gemma-2-2b", "arc-agi")

In [6]:
def get_model_tokenizer(dtype=None, load_in_4bit=True, add_lora=False):
    """Load MODEL_ID through Unsloth and optionally attach LoRA adapters.

    Args:
        dtype: Forwarded unchanged to ``FastLanguageModel.from_pretrained``.
        load_in_4bit: Forwarded unchanged to ``FastLanguageModel.from_pretrained``.
        add_lora: When true, wrap the loaded model with
            ``FastLanguageModel.get_peft_model`` (rank-16 LoRA on the attention
            and MLP projection layers) so it can be fine-tuned.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    import os  # local import: keeps this cell independent of the import cell

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_ID,
        max_seq_length=MAX_SEQ_LENGTH,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        # Put the whole model on device index 0 of the *visible* devices.
        device_map={"": 0},
        attn_implementation="flash_attention_2",
        # Access tokens must never be pasted into the notebook. For gated models,
        # export HF_TOKEN in the environment; when it is unset this passes None.
        token=os.environ.get("HF_TOKEN"),
    )

    if add_lora:
        model = FastLanguageModel.get_peft_model(
            model,
            r=16,  # LoRA rank
            target_modules=[
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj",
            ],
            lora_alpha=16,
            lora_dropout=0,
            bias="none",
            # Unsloth's own gradient-checkpointing mode (the alternative is True).
            use_gradient_checkpointing="unsloth",
            random_state=3407,
            use_rslora=False,  # rank-stabilized LoRA disabled
            loftq_config=None,  # LoftQ initialization disabled
        )

    return model, tokenizer

In [7]:
def eval(f):
    """Decorator that puts the model into Unsloth inference mode before calling ``f``.

    The decorated callable must accept ``(model, tokenizer, *args, **kwargs)``.
    Every call first runs ``FastLanguageModel.for_inference(model)``, logs the
    model and the tokenizer's padding side, and then returns whatever ``f`` returns.

    Note: this name shadows the ``eval`` builtin within the notebook; it is kept
    because it is the helper's public name.
    """
    import functools  # local import: keeps this cell independent of the import cell

    @functools.wraps(f)  # preserve f's __name__/__doc__ on the wrapper
    def wrapper(model, tokenizer, *args, **kwargs):
        FastLanguageModel.for_inference(model)
        log.info(f"Evaluating model {model}, tokenizer {tokenizer.padding_side}")
        return f(model, tokenizer, *args, **kwargs)

    return wrapper


def train(f):
    """Decorator that puts the model into Unsloth training mode before calling ``f``.

    The decorated callable must accept ``(model, tokenizer, *args, **kwargs)``.
    Every call first runs ``FastLanguageModel.for_training(model)`` and then
    returns whatever ``f`` returns.
    """
    import functools  # local import: keeps this cell independent of the import cell

    @functools.wraps(f)  # preserve f's __name__/__doc__ on the wrapper
    def wrapper(model, tokenizer, *args, **kwargs):
        FastLanguageModel.for_training(model)
        return f(model, tokenizer, *args, **kwargs)

    return wrapper

In [8]:
# Load the base model with the function's default load_in_4bit=True and wrap it
# with LoRA adapters (add_lora=True) so that it can be fine-tuned below.
model, tokenizer = get_model_tokenizer(add_lora=True)

==((====))==  Unsloth 2024.9: Fast Gemma2 patching. Transformers = 4.43.4.
   \\   /|    GPU: NVIDIA RTX A5000. Max memory: 23.679 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.9 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


In [9]:
# Build the dataset through the project helper; the bare `dataset` expression on
# the last line renders the resulting object as the cell output.
# NOTE(review): the meaning of fit_dataset / final_training is defined in
# data_utils.prepare_dataset — confirm there before changing these flags.
dataset = data_utils.prepare_dataset(tokenizer, fit_dataset=True, base_path=BASE_PATH, final_training=True)
dataset

Map: 100%|██████████| 430/430 [00:00<00:00, 960.74 examples/s]
Map: 100%|██████████| 112/112 [00:00<00:00, 1516.51 examples/s]
Map: 100%|██████████| 459/459 [00:00<00:00, 984.14 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'challenge', 'solution', 'texts', 'messages'],
        num_rows: 430
    })
    test: Dataset({
        features: ['id', 'challenge', 'solution', 'texts', 'messages'],
        num_rows: 321
    })
    val: Dataset({
        features: ['id', 'challenge', 'solution', 'texts', 'messages'],
        num_rows: 138
    })
    predict: Dataset({
        features: ['id', 'challenge', 'texts', 'messages'],
        num_rows: 112
    })
})

In [10]:
# Summarize how many "train" samples each record of a dataset split carries.
def calculate_training_samples_stats(dataset_split):
    """Return min / max / average count of "train" samples per record.

    Args:
        dataset_split: Mapping-like object whose ``"challenge"`` entry is an
            iterable of records, each exposing a sized ``"train"`` entry.

    Returns:
        Dict with keys ``"min"``, ``"max"`` and ``"avg"``. An empty split yields
        zeros instead of raising, so report loops keep working on empty splits.
    """
    training_samples_counts = [len(challenge["train"]) for challenge in dataset_split["challenge"]]
    if not training_samples_counts:
        # min()/max() raise on an empty sequence and the average would divide by zero.
        return {"min": 0, "max": 0, "avg": 0.0}
    return {
        "min": min(training_samples_counts),
        "max": max(training_samples_counts),
        "avg": sum(training_samples_counts) / len(training_samples_counts),
    }


# Print a short per-split report of the training-sample statistics.
splits = ["train", "val", "test", "predict"]
for split in splits:
    stats = calculate_training_samples_stats(dataset[split])
    report_lines = (
        f"{split.capitalize()} split:",
        f"  Min training samples: {stats['min']}",
        f"  Max training samples: {stats['max']}",
        f"  Avg training samples: {stats['avg']:.2f}",
        "",  # trailing blank line separating consecutive splits
    )
    print(*report_lines, sep="\n")

Train split:
  Min training samples: 1
  Max training samples: 10
  Avg training samples: 3.24

Val split:
  Min training samples: 1
  Max training samples: 7
  Avg training samples: 3.16

Test split:
  Min training samples: 1
  Max training samples: 7
  Avg training samples: 3.18

Predict split:
  Min training samples: 1
  Max training samples: 8
  Avg training samples: 3.13



In [11]:
def filter_dataset_by_token_limit(dataset, max_seq_length):
    """Keep only the examples whose "texts" field fits within ``max_seq_length`` tokens.

    Token counts come from ``data_utils.count_tokens`` using the notebook-level
    ``tokenizer``. The "train", "val", "test" and "predict" splits are processed
    in that order, and a two-line summary is printed for each.

    Args:
        dataset: Mapping from split name to a dataset exposing ``.filter`` and ``len``.
        max_seq_length: Largest token count (inclusive) an example may have.

    Returns:
        A plain dict mapping each of the four split names to its filtered split.
    """

    def fits_token_budget(example):
        # True when the example's prompt text stays within the token limit.
        return data_utils.count_tokens(tokenizer, example["texts"]) <= max_seq_length

    filtered_splits = {}
    for split in ["train", "val", "test", "predict"]:
        unfiltered = dataset[split]
        filtered_splits[split] = unfiltered.filter(fits_token_budget)
        filtered_out_tasks = len(unfiltered) - len(filtered_splits[split])
        print(f"{filtered_out_tasks} {split} tasks were filtered out because they exceed the {max_seq_length} token limit.")
        print(f"The filtered {split} dataset contains {len(filtered_splits[split])} tasks for {'fine-tuning' if split == 'train' else 'evaluation'}.")

    return filtered_splits


# Apply the MAX_SEQ_LENGTH token limit to every split; the result is a plain dict
# keyed by split name.
filtered_dataset = filter_dataset_by_token_limit(dataset, MAX_SEQ_LENGTH)

Filter: 100%|██████████| 430/430 [00:00<00:00, 745.32 examples/s]


0 train tasks were filtered out because they exceed the 6144 token limit.
The filtered train dataset contains 430 tasks for fine-tuning.


Filter: 100%|██████████| 138/138 [00:00<00:00, 674.92 examples/s]


0 val tasks were filtered out because they exceed the 6144 token limit.
The filtered val dataset contains 138 tasks for evaluation.


Filter: 100%|██████████| 321/321 [00:00<00:00, 672.33 examples/s]


0 test tasks were filtered out because they exceed the 6144 token limit.
The filtered test dataset contains 321 tasks for evaluation.


Filter: 100%|██████████| 112/112 [00:00<00:00, 1025.25 examples/s]

0 predict tasks were filtered out because they exceed the 6144 token limit.
The filtered predict dataset contains 112 tasks for evaluation.





In [12]:
def generate_with_temp(model, inputs, temperature):
    """Sample a continuation from ``model`` at the given temperature.

    Args:
        model: Object exposing a ``generate`` method.
        inputs: Mapping unpacked into ``model.generate`` as keyword arguments.
        temperature: Sampling temperature passed through to ``generate``.

    Returns:
        Whatever ``model.generate`` returns.
    """
    sampling_options = dict(
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=temperature,
        top_k=50,
        use_cache=True,
    )
    return model.generate(**inputs, **sampling_options)


def evaluate_batch(model, tokenizer, batch):
    """Generate two sampled answers per batch item, at temperatures 0.3 and 0.7.

    Args:
        model: Model forwarded to ``generate_with_temp``.
        tokenizer: Tokenizer used to decode the generated token ids.
        batch: Mapping providing ``"input_ids"`` and ``"attention_mask"``.

    Returns:
        A tuple ``(texts_at_0_3, texts_at_0_7)`` of decoded strings containing only
        the tokens that follow the prompt positions.
    """
    inputs = {key: batch[key] for key in ("input_ids", "attention_mask")}
    # Width of the prompt; everything past this column is treated as newly generated.
    # Assumes generate() returns the prompt followed by the new tokens — TODO confirm.
    prompt_length = inputs["input_ids"].shape[1]

    decoded_per_temperature = []
    for temperature in (0.3, 0.7):  # same order as the two original passes
        with torch.no_grad():
            sequences = generate_with_temp(model, inputs, temperature)
        completion_ids = sequences[:, prompt_length:]
        decoded_per_temperature.append(tokenizer.batch_decode(completion_ids, skip_special_tokens=True))

    return decoded_per_temperature[0], decoded_per_temperature[1]

In [13]:
@train
def training(model, tokenizer, dataset, max_seq_length):
    """Fine-tune ``model`` with TRL's ``SFTTrainer`` on the "texts" field.

    Args:
        model: Model to train (switched to training mode by the ``@train`` decorator).
        tokenizer: Tokenizer handed to the trainer.
        dataset: Mapping providing a ``"train"`` and a ``"val"`` split.
        max_seq_length: Maximum sequence length passed to the trainer.

    Returns:
        A tuple ``(trainer, stats)`` where ``stats`` is the value returned by
        ``trainer.train()``.
    """
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset["train"],
        eval_dataset=dataset["val"],
        dataset_text_field="texts",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=False,  # one example per sequence
        args=TrainingArguments(
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=8,  # 1 x 8 = 8 examples per optimizer step per device
            eval_strategy="steps",
            eval_steps=100,
            logging_steps=100,
            warmup_steps=5,
            max_steps=500,  # training length is step-based rather than epoch-based
            learning_rate=2e-5,
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=3407,
            output_dir=f"{BASE_PATH}/models/gemma-2-9b-it",
            metric_for_best_model="eval_loss",
            # `save_best_model` is not a TrainingArguments field and makes the
            # constructor raise TypeError; `load_best_model_at_end` is the option
            # that restores the checkpoint with the best eval_loss after training.
            load_best_model_at_end=True,
            save_strategy="steps",
            # With load_best_model_at_end, save_steps must be a round multiple of
            # eval_steps, so checkpoints are written at every evaluation.
            save_steps=100,
            save_total_limit=2,
        ),
    )
    stats = trainer.train()
    return trainer, stats

In [14]:
# Train on the token-limit-filtered splits so the filter computed earlier in the
# notebook is actually applied to the training and validation data (previously the
# unfiltered `dataset` was passed and `filtered_dataset` went unused).
# NOTE(review): the recorded output below reports 1,000 total steps although
# `training` sets max_steps=500 — the output is stale; re-run the cell to refresh it.
trainer, stats = training(model, tokenizer, filtered_dataset, max_seq_length=MAX_SEQ_LENGTH)
stats

Map (num_proc=2): 100%|██████████| 430/430 [00:01<00:00, 280.17 examples/s]
Map (num_proc=2): 100%|██████████| 138/138 [00:01<00:00, 88.99 examples/s] 
max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 430 | Num Epochs = 19
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 8
\        /    Total batch size = 8 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 54,018,048


Step,Training Loss,Validation Loss
100,0.4442,0.303148
200,0.2951,0.283015
300,0.2505,0.261734
400,0.2055,0.260017
500,0.1817,0.268342
600,0.1514,0.283499
700,0.1284,0.290501
800,0.1081,0.304979
900,0.0975,0.320863
1000,0.0889,0.332176


TrainOutput(global_step=1000, training_loss=0.1951298542022705, metrics={'train_runtime': 14093.8311, 'train_samples_per_second': 0.568, 'train_steps_per_second': 0.071, 'total_flos': 6.154787005051023e+17, 'train_loss': 0.1951298542022705, 'epoch': 18.6046511627907})

In [None]:
# Save the trained model into the same directory the trainer uses as output_dir.
trainer.save_model(f"{BASE_PATH}/models/gemma-2-9b-it")

In [15]:
! nvidia-smi

Sun Sep 22 09:13:05 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX A5000               On  | 00000000:2B:00.0 Off |                  Off |
| 30%   33C    P8              20W / 230W |  15171MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A5000               On  | 00000000:41:00.0 Off |  