In [None]:
%%capture
! pip install pip3-autoremove
! pip-autoremove torch torchvision torchaudio -y
! pip install torch xformers triton
# ! pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"
! pip install unsloth

In [1]:
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


In [None]:
BASE_PATH = '..'

In [2]:
import os

print(os.getenv('CUDA_VISIBLE_DEVICES'))

0


In [None]:
import sys
sys.path.append(BASE_PATH)

In [4]:
import json

import torch
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from datasets import Dataset, DatasetDict

from unsloth import FastLanguageModel

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

from tqdm.auto import tqdm

from logger import get_logger


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [5]:
# ! pip install --no-deps --upgrade "flash-attn>=2.6.3"

In [6]:
EVAL_BATCH_SIZE = 1
MAX_SEQ_LENGTH = 1024

In [7]:
log = get_logger(f'logs/gemma-2-2b', 'arc-agi')

In [8]:
device_id = 0

In [9]:
def get_model_tokenizer(max_seq_length=1024, dtype=None, load_in_4bit=True, add_lora=False):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/gemma-2-2b-it",
        # model_name="models/gemma-2-2b/checkpoint-1562",
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        device_map={'': 0},
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )

    if add_lora:
        model = FastLanguageModel.get_peft_model(
            model,
            r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                            "gate_proj", "up_proj", "down_proj", ],
            lora_alpha=16,
            lora_dropout=0,  # Supports any, but = 0 is optimized
            bias="none",  # Supports any, but = "none" is optimized
            # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
            use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
            random_state=3407,
            use_rslora=False,  # We support rank stabilized LoRA
            loftq_config=None,  # And LoftQ
        )

    tokenizer.truncation_side = 'left'
    tokenizer.padding_side = 'left'

    return model, tokenizer

In [10]:
def eval(f):
    def wrapper(model, *args, **kwargs):
        FastLanguageModel.for_inference(model)
        return f(model, *args, **kwargs)

    return wrapper


def train(f):
    def wrapper(model, *args, **kwargs):
        FastLanguageModel.for_training(model)
        return f(model, *args, **kwargs)

    return wrapper

In [11]:
model, tokenizer = get_model_tokenizer(max_seq_length=MAX_SEQ_LENGTH, add_lora=True)

Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2024.8: Fast Gemma2 patching. Transformers = 4.43.4.
   \\   /|    GPU: NVIDIA RTX A5000. Max memory: 23.677 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.8 patched 26 layers with 26 QKV layers, 26 O layers and 26 MLP layers.


In [12]:
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

In [13]:
def prepare_dataset(tokenizer, device, seed):
    # 3 classes
    # Load data from JSON files
    def load_data(file_path):
        with open(file_path, 'r') as f:
            data = json.load(f)
        return data
    
    # Load all datasets
    training_challenges = load_data(f'{BASE_PATH}/arc-agi_training_challenges.json')
    training_solutions = load_data(f'{BASE_PATH}/arc-agi_training_solutions.json')
    evaluation_challenges = load_data(f'{BASE_PATH}/arc-agi_evaluation_challenges.json')
    evaluation_solutions = load_data(f'{BASE_PATH}/arc-agi_evaluation_solutions.json')
    test_challenges = load_data(f'{BASE_PATH}/arc-agi_test_challenges.json')

    PROMPT = '''### Tweet:
    {}
    
    ### Classification:
    {}'''

    def formatting_prompts_func(examples):
        inputs = examples["tweet"]
        outputs = [
            HATE_TOKEN if label in [1, 0] else NORMAL_TOKEN for label in examples["class"]
        ]
        texts = []
        prompts = []
        for input, output in zip(inputs, outputs):
            text = PROMPT.format(input.strip(), output) + EOS_TOKEN
            texts.append(text)
            prompts.append(PROMPT.format(input, ""))
        return {"ref": texts, 'prompt': prompts}

    dataset = dataset.map(formatting_prompts_func, batched=True)

    # Split the training set into training (80%) and validation (20%) sets
    train_testvalid = dataset['train'].train_test_split(test_size=0.2, seed=42)
    test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

    # Assign datasets
    train_dataset = train_testvalid['train']
    val_dataset = test_valid['test']
    test_dataset = test_valid['train']

    dataset = DatasetDict({
        'train': train_dataset,
        'test': test_dataset,
        'val': val_dataset
    })
    return dataset.rename_columns({'class': 'label'})


In [14]:
dataset = prepare_dataset(tokenizer, f"cuda:{device_id}", seed=3407)
dataset

DatasetDict({
    train: Dataset({
        features: ['count', 'hate_speech_count', 'offensive_language_count', 'neither_count', 'label', 'tweet', 'ref', 'prompt'],
        num_rows: 19826
    })
    test: Dataset({
        features: ['count', 'hate_speech_count', 'offensive_language_count', 'neither_count', 'label', 'tweet', 'ref', 'prompt'],
        num_rows: 2478
    })
    val: Dataset({
        features: ['count', 'hate_speech_count', 'offensive_language_count', 'neither_count', 'label', 'tweet', 'ref', 'prompt'],
        num_rows: 2479
    })
})

In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
inputs = tokenizer(
    [
        dataset['test']['prompt'][0],
    ], return_tensors="pt").to(f"cuda:{device_id}")

outputs = model.generate(**inputs, max_new_tokens=1)
res = tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [17]:
model.device

device(type='cuda', index=0)

In [18]:
def gpu_stats(device_id=0):
    #@title Show current memory stats
    gpu_stats = torch.cuda.get_device_properties(device_id)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    return {'gpu': gpu_stats.name, 'max_memory': max_memory, 'start_gpu_memory': start_gpu_memory}

In [19]:
TOKENS = {
    'Positive': ...,
    'Negative': ...,

    'Hate': ...,
    'Normal': ...,
    'Fake': ...,
    'Truth': ...
}

In [20]:
for key, val in TOKENS.items():
    code = tokenizer.encode(key, add_special_tokens=False)
    print(f"{key}: {code}")
    TOKENS[key] = code[0]

Positive: [35202]
Negative: [39654]
Hate: [88060]
Normal: [15273]
Fake: [41181]
Truth: [55882]


In [21]:
def softmax(x, axis=None):
    # Subtract the max for numerical stability
    x_max = np.max(x, axis=axis, keepdims=True)
    x_stable = x - x_max

    # Compute the exponentials
    exp_x = np.exp(x_stable)

    # Compute the softmax
    sum_exp_x = np.sum(exp_x, axis=axis, keepdims=True)
    softmax_x = exp_x / sum_exp_x

    return softmax_x

In [22]:
@eval
def evaluate(model, tokenizer, dataset, batch_size, threshold=0.5):
    eval_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)

    preds = []
    labels = []
    for i, batch in tqdm(enumerate(eval_dataloader), total=len(eval_dataloader)):
        texts = batch["prompt"]

        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(model.device)

        outputs = model.generate(
            **inputs,
            max_new_tokens=1,
            output_scores=True,
            return_dict_in_generate=True
        )  # (B, len, vocab_size)

        pos = outputs.scores[0][:, TOKENS['Hate']].cpu().float().numpy()
        neg = outputs.scores[0][:, TOKENS['Normal']].cpu().float().numpy()

        preds.extend(np.array([neg, pos]).T)
        labels.extend(batch["label"])

        if (i + 1) % 1000 == 0:
            log.warn(f'GPU Stats: {gpu_stats(device_id)}')

    preds = softmax(np.array(preds), axis=-1)[:, 1] > threshold

    val_acc = accuracy_score(labels, preds)
    val_f1 = f1_score(labels, preds, average='macro')
    val_precision = precision_score(labels, preds, average='macro')
    val_recall = recall_score(labels, preds, average='macro')

    log.info(f"Accuracy: {val_acc}, F1: {val_f1}, Precision: {val_precision}, Recall: {val_recall}")
    return val_acc, val_f1, val_precision, val_recall, preds, labels

In [23]:
# eval_acc, eval_f1, eval_precision, eval_recall, preds, labels = evaluate(model, tokenizer, dataset["test"],
#                                                                          batch_size=EVAL_BATCH_SIZE)

In [24]:
@train
def training(model, tokenizer, dataset, max_seq_length):
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="ref",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=False,  # Can make training 5x faster for short sequences.
        args=TrainingArguments(
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,
            warmup_steps=500,
            num_train_epochs=1,  # Set this for 1 full training run.
            # max_steps=60,
            learning_rate=2e-5,
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=200,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=3407,
            output_dir="models/gemma-2-2b",
        ),
    )
    return trainer.train()

In [26]:
stats = training(model, tokenizer, dataset["train"], max_seq_length=MAX_SEQ_LENGTH)
stats

Map (num_proc=2): 100%|██████████| 19826/19826 [00:01<00:00, 9955.52 examples/s] 
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 19,826 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 1,239
 "-____-"     Number of trainable parameters = 20,766,720


Step,Training Loss
200,3.6554
400,2.4263
600,2.3498
800,2.3386
1000,2.3196
1200,2.2922


TrainOutput(global_step=1239, training_loss=2.5560759574391363, metrics={'train_runtime': 1122.5902, 'train_samples_per_second': 17.661, 'train_steps_per_second': 1.104, 'total_flos': 1.9255666156886016e+16, 'train_loss': 2.5560759574391363, 'epoch': 0.9997982650796853})

In [27]:
eval_acc, eval_f1, eval_precision, eval_recall, preds, labels = evaluate(model, tokenizer, dataset["test"], batch_size=EVAL_BATCH_SIZE)

 40%|████      | 998/2478 [00:56<01:23, 17.71it/s]GPU Stats: {'gpu': 'NVIDIA RTX A5000', 'max_memory': 23.677, 'start_gpu_memory': 6.328}
 81%|████████  | 1998/2478 [01:52<00:26, 17.85it/s]GPU Stats: {'gpu': 'NVIDIA RTX A5000', 'max_memory': 23.677, 'start_gpu_memory': 6.328}
100%|██████████| 2478/2478 [02:19<00:00, 17.79it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Accuracy: 0.761501210653753, F1: 0.3198172890128062, Precision: 0.30552159442649357, Recall: 0.3397935378674577
