In [2]:
from unsloth import FastLanguageModel
import torch
# Maximum sequence length in tokens; the same value is later handed to SFTTrainer(max_seq_length=...).
max_seq_length = 2048
# Tensor dtype for the model. None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
dtype = torch.float16
# Flag for 4-bit quantized loading of the model weights.
load_in_4bit = True

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [3]:
# Load the previously fine-tuned checkpoint. The settings from the config cell
# (sequence length, dtype, 4-bit loading) are passed explicitly; before, they
# were defined but never used, so the loader silently fell back to its own
# defaults and could disagree with the max_seq_length given to SFTTrainer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/home/rna3535/105K",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla V100-PCIE-32GB. Max memory: 31.739 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.0. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.11.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [4]:
from datasets import load_dataset

# Fetch the competition dataset from the Hugging Face Hub; the bare expression
# on the last line displays the DatasetDict summary (splits, features, row counts).
dataset = load_dataset(path="ad6398/nyu-dl-teach-maths-comp")
dataset

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 1000000
    })
    test: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 10000
    })
})

In [6]:
# Instruction template with four positional `{}` slots. formatting_prompts_func
# fills them, in order, with: question, answer, solution (explanation) and the
# is_correct label.
prompt = """You are a highly skilled mathematician. Determine if the provided Answer and Explanation to a math question is correct or incorrect. Return True if it’s correct and False if it’s wrong. 

### Question:
{}

### Answer:
{}

### Explanation:
{}

### True/False:
{}"""


# End-of-sequence marker appended to every training example; without it the
# model never sees where an example ends and generation will go on forever.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Render a batch of dataset rows into full training strings.

    Args:
        examples: Column-oriented batch (a mapping from column name to a list
            of values) containing the "question", "answer", "solution" and
            "is_correct" columns.

    Returns:
        A dict with a single "text" key holding one string per row: the
        filled-in ``prompt`` template followed by ``EOS_TOKEN``.
    """
    rows = zip(
        examples["question"],
        examples["answer"],
        examples["solution"],
        examples["is_correct"],
    )
    # Each row is (question, answer, solution, is_correct), matching the
    # order of the template's four slots.
    return {"text": [prompt.format(*fields) + EOS_TOKEN for fields in rows]}

In [7]:
# Add a "text" column with the rendered prompt for every row of the train split.
complete_train_dataset = dataset["train"].map(
    function=formatting_prompts_func,
    batched=True,
)

Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]

In [8]:
# Hold out 0.5% of the formatted training data for validation; the fixed seed
# keeps the split reproducible across runs.
split_dataset = complete_train_dataset.train_test_split(test_size=0.005, seed=42)
train_dataset, val_dataset = split_dataset["train"], split_dataset["test"]

print(
    f"Training set size: {len(train_dataset)}",
    f"Validation set size: {len(val_dataset)}",
    sep="\n",
)

Training set size: 995000
Validation set size: 5000


In [9]:
# Batch geometry for the run: each optimizer step consumes
# per_device_train_batch_size * gradient_accumulation_steps samples,
# and training stops after max_steps optimizer steps.
per_device_train_batch_size = 6
gradient_accumulation_steps = 4
max_steps = 200

print(
    f'Total training samples: {per_device_train_batch_size * gradient_accumulation_steps * max_steps}',
    f'Samples per step: {per_device_train_batch_size * gradient_accumulation_steps}',
    sep="\n",
)

Total training samples: 4800
Samples per step: 24


In [10]:
# Directory of the earlier run; this is the same path the model was loaded
# from, and it is passed to TrainingArguments(resume_from_checkpoint=...).
# NOTE(review): absolute, user-specific path — consider making it configurable.
checkpoint_path = "/home/rna3535/105K"

In [11]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
# Training configuration for a short, step-capped fine-tuning run.
training_args = TrainingArguments(
        # -- batch geometry and run length --
        per_device_train_batch_size = per_device_train_batch_size,
        gradient_accumulation_steps = gradient_accumulation_steps,
        max_steps = max_steps,                # step cap; takes precedence over num_train_epochs
        warmup_steps = int(0.1 * max_steps),  # warm up over the first 10% of the steps

        # -- optimisation --
        # NOTE(review): the SFTTrainer in this notebook is given an explicit
        # (optimizer, scheduler) pair (torch AdamW + cosine schedule), so
        # `optim` and `lr_scheduler_type` below look superseded — confirm
        # which configuration is actually intended.
        learning_rate = 1e-5,
        weight_decay = 0.01,
        optim = "adamw_8bit",
        lr_scheduler_type = "linear",

        # -- mixed precision: bf16 where the GPU supports it, otherwise fp16 --
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),

        # -- logging, evaluation, checkpointing --
        logging_steps = 1,
        report_to = "none",                   # Use this for WandB etc
        eval_strategy = "steps",
        eval_steps = 10,                      # evaluate on the validation set every 10 steps
        save_steps = 50,                      # save a checkpoint every 50 steps
        save_total_limit = 2,                 # keep only the two most recent checkpoints
        output_dir = "outputs",

        # NOTE(review): per the transformers documentation this field is not
        # used by Trainer directly; actually resuming requires
        # trainer.train(resume_from_checkpoint=...). Confirm the intent.
        resume_from_checkpoint = checkpoint_path,

        seed = 3407,
    )

In [12]:
from torch.optim import AdamW
from transformers import TrainingArguments, get_scheduler
# Optimizer: built by hand and passed to the trainer via `optimizers=`.
# Hyperparameters are read from `training_args` so they are defined in exactly
# one place (previously lr / weight decay / warmup / step count were repeated
# here and could silently drift from the TrainingArguments values).
# Only parameters that require gradients are handed to AdamW, so the frozen
# base weights are not tracked by the optimizer; if nothing is trainable,
# AdamW raises immediately instead of running a no-op training loop.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(
    trainable_params,
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay,
)

# Scheduler: cosine decay with warmup, kept as-is to preserve the behavior of
# the existing run.
# NOTE(review): training_args.lr_scheduler_type says "linear" but is not what
# drives this schedule — reconcile the two.
scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_warmup_steps=training_args.warmup_steps,
    num_training_steps=training_args.max_steps,
)

In [13]:
from transformers import TrainerCallback, TrainerControl, TrainerState
import numpy as np

# Supervised fine-tuning trainer over the pre-rendered "text" column.
trainer = SFTTrainer(
    # model and tokenizer loaded earlier in the notebook
    model=model,
    tokenizer=tokenizer,
    # data: both splits carry the rendered prompt in the "text" field
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field="text",
    dataset_num_proc=4,
    # sequence handling: one example per sequence (no packing)
    max_seq_length=max_seq_length,
    packing=False,
    # training configuration plus the hand-built optimizer / LR scheduler pair
    args=training_args,
    optimizers=(optimizer, scheduler),
)

Map (num_proc=4):   0%|          | 0/995000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [14]:
# Launch fine-tuning with the trainer configured above. As the last expression
# of the cell, the returned TrainOutput (global step, training loss, runtime
# metrics) is displayed as the cell output.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 995,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 6 | Gradient Accumulation steps = 4
\        /    Total batch size = 24 | Total steps = 200
 "-____-"     Number of trainable parameters = 83,886,080


Step,Training Loss,Validation Loss
10,0.3917,0.452648
20,0.4409,0.452542
30,0.4013,0.452526
40,0.4588,0.452613
50,0.4235,0.452611
60,0.4152,0.452523
70,0.4448,0.452415
80,0.4073,0.452282
90,0.3817,0.452255
100,0.3927,0.452269


TrainOutput(global_step=200, training_loss=0.4281532627344131, metrics={'train_runtime': 15157.1457, 'train_samples_per_second': 0.317, 'train_steps_per_second': 0.013, 'total_flos': 1.059439567662121e+17, 'train_loss': 0.4281532627344131, 'epoch': 0.004824101209643378})

Move on to inference.
