# Training for critical question

## Setup
First we define some constant values and also install all needed libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive




### Installation

In [2]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth


from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
import logging
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

### Path Variables and Logger

In [3]:
################################################################################
#######################   STATIC VARIABLES      ################################
################################################################################

TRAINING_NUMBER = 1
BASE_MODEL_REPO = "unsloth/Meta-Llama-3.1-8B-Instruct"
MODEL_NAME = "Meta-Llama-3.1-8B-Instruct"

################################################################################
#######################   PATH VARIABLES        ################################
################################################################################

train_dataset_path = "/content/drive/MyDrive/HSG/NLP/Project NLP/Data/Datasets/SocraticQ/train.csv"

log_base_path = f"/content/drive/MyDrive/HSG/NLP/Project NLP/Training/Training_{TRAINING_NUMBER}/Logs/"
os.makedirs(log_base_path, exist_ok=True)

log_file_path = f"/content/drive/MyDrive/HSG/NLP/Project NLP/Training/Training_{TRAINING_NUMBER}/Logs/training_{TRAINING_NUMBER}.log"

model_save_path = f"/content/drive/MyDrive/HSG/NLP/Project NLP/Training/Training_{TRAINING_NUMBER}/Model/{MODEL_NAME}_finetuned/"
os.makedirs(model_save_path, exist_ok=True)

model_lora_adapter_save_path = f"/content/drive/MyDrive/HSG/NLP/Project NLP/Training/Training_{TRAINING_NUMBER}/Model/{MODEL_NAME}_lora_adapters/"
os.makedirs(model_lora_adapter_save_path, exist_ok=True)


################################################################################
#######################   LOGGER                ################################
################################################################################

# Setup logger manually
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create file handler (only if not already added)
if not logger.handlers:
    fh = logging.FileHandler(log_file_path)
    fh.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    logger.addHandler(fh)

# Detect device
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

In [4]:
logger.info("--------  Start with Training  -------------")
logger.info(f'Device selected: {device}')
logger.info(f'Model: {MODEL_NAME}')
logger.info(f'Training number: {TRAINING_NUMBER}')

INFO:__main__:--------  Start with Baseline Predictions  -------------
INFO:__main__:Device selected: cuda
INFO:__main__:Model: Meta-Llama-3.1-8B-Instruct
INFO:__main__:Training number: 1


## Training Parameters

In [5]:
################################################################################
#######################   Unlsoth Parameters    ################################
################################################################################

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


################################################################################
#######################   PEFT Parameters       ################################
################################################################################

r = 16 # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                  "gate_proj", "up_proj", "down_proj",]
lora_alpha = 16
lora_dropout = 0 # Supports any, but = 0 is optimized
bias = "none"    # Supports any, but = "none" is optimized
use_gradient_checkpointing = "unsloth" # True or "unsloth" for very long context
random_state = 3407
use_rslora = False  # Unsloth supports rank stabilized LoRA
loftq_config = None # And LoftQ


################################################################################
#######################   SFT Trainer Parameters   #############################
################################################################################

per_device_train_batch_size = 2
gradient_accumulation_steps = 4
warmup_steps = 5
max_steps = 1 # Normaly 60
learning_rate = 2e-4
fp16 = not is_bfloat16_supported()
bf16 = is_bfloat16_supported()
logging_steps = 1
optim = "adamw_8bit"
weight_decay = 0.01
lr_scheduler_type = "linear"
seed = 3407
output_dir = "outputs"
report_to = "none" # Use this for WandB etc

################################################################################
#######################   Log Parameters            ############################
################################################################################

logger.info("Unlsoth Parameters")
logger.info(f"max_seq_length: {max_seq_length}")
logger.info(f"dtype: {dtype}")
logger.info(f"load_in_4bit: {load_in_4bit}")

logger.info("PEFT Parameters")
logger.info(f"r: {r}")
logger.info(f"target_modules: {target_modules}")
logger.info(f"lora_alpha: {lora_alpha}")
logger.info(f"lora_dropout: {lora_dropout}")
logger.info(f"bias: {bias}")
logger.info(f"use_gradient_checkpointing: {use_gradient_checkpointing}")
logger.info(f"random_state: {random_state}")
logger.info(f"use_rslora: {use_rslora}")

logger.info("SFT Trainer Parameters")
logger.info(f"per_device_train_batch_size: {per_device_train_batch_size}")
logger.info(f"gradient_accumulation_steps: {gradient_accumulation_steps}")
logger.info(f"warmup_steps: {warmup_steps}")
logger.info(f"max_steps: {max_steps}")
logger.info(f"learning_rate: {learning_rate}")
logger.info(f"fp16: {fp16}")
logger.info(f"bf16: {bf16}")
logger.info(f"logging_steps: {logging_steps}")
logger.info(f"optim: {optim}")
logger.info(f"weight_decay: {weight_decay}")
logger.info(f"lr_scheduler_type: {lr_scheduler_type}")
logger.info(f"seed: {seed}")
logger.info(f"output_dir: {output_dir}")
logger.info(f"report_to: {report_to}")

INFO:__main__:Unlsoth Parameters
INFO:__main__:max_seq_length: 2048
INFO:__main__:dtype: None
INFO:__main__:load_in_4bit: True
INFO:__main__:PEFT Parameters
INFO:__main__:r: 16
INFO:__main__:target_modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
INFO:__main__:lora_alpha: 16
INFO:__main__:lora_dropout: 0
INFO:__main__:bias: none
INFO:__main__:use_gradient_checkpointing: unsloth
INFO:__main__:random_state: 3407
INFO:__main__:use_rslora: False
INFO:__main__:SFT Trainer Parameters
INFO:__main__:per_device_train_batch_size: 2
INFO:__main__:gradient_accumulation_steps: 4
INFO:__main__:warmup_steps: 5
INFO:__main__:max_steps: 1
INFO:__main__:learning_rate: 0.0002
INFO:__main__:fp16: True
INFO:__main__:bf16: False
INFO:__main__:logging_steps: 1
INFO:__main__:optim: adamw_8bit
INFO:__main__:weight_decay: 0.01
INFO:__main__:lr_scheduler_type: linear
INFO:__main__:seed: 3407
INFO:__main__:output_dir: outputs
INFO:__main__:report_to: none


### Unsloth

In [6]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = BASE_MODEL_REPO,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.4.3: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]



tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [7]:
model = FastLanguageModel.get_peft_model(
    model,
    r = r,
    target_modules = target_modules,
    lora_alpha = lora_alpha,
    lora_dropout = lora_dropout,
    bias = bias,
    use_gradient_checkpointing = use_gradient_checkpointing,
    random_state = random_state,
    use_rslora = use_rslora,
    loftq_config = loftq_config,
)

## Load and preprocess dataset
The raw dataset [SocratiQ](https://github.com/NUS-IDS/eacl23_soqg/tree/main) has a label at the begining of the context. We have to remove that and also tokenize the input for the model training.

In [8]:
dataset = load_dataset('csv', data_files=train_dataset_path,split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [9]:
# crop the dataset for testing
#dataset['train'] = dataset['train'].select(range(500))

In [10]:
# extract the input text and the label
def split_input(example):
    if ':' in example['input']:
        before, after = example['input'].split(':', 1)
        return {
            'label': before.strip(),
            'input_text': after.strip()
        }
    else:
        return {
            'label': example['input'].strip(),
            'input_text': ''
        }

# Apply the map function
dataset = dataset.map(split_input)

Map:   0%|          | 0/59207 [00:00<?, ? examples/s]

In [11]:
# If you want to see a sample
print(dataset)

Dataset({
    features: ['Unnamed: 0', 'input', 'target', 'label', 'input_text'],
    num_rows: 59207
})


In [12]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You will see a text and you should generate one critical question for this text.

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    inputs       = examples["input_text"]
    outputs      = examples["target"]
    texts = []
    for input, output in zip(inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/59207 [00:00<?, ? examples/s]

In [13]:
# Check the structure of the dataset
print(dataset[0]['text'])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You will see a text and you should generate one critical question for this text.

### Input:
I'm referring only to obese or morbidly obese. Nothing against you being overweight. Not everybody can be thin or a few people have it extremely difficult to not be fat. Like I said, yes, you can be overweight, yes, you can feel "fabulous" but it isn't in general healthy, so fashion brands shouldn't be inclusive in their marketing campaign in particular to obese.

### Response:
Are they obligated to make clothes that are as intentionally unattractive as possible?<|eot_id|>


In [14]:
print(dataset)

Dataset({
    features: ['Unnamed: 0', 'input', 'target', 'label', 'input_text', 'text'],
    num_rows: 59207
})


In [20]:
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_seq_length,
    )
    return model_inputs

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=2,
    remove_columns=dataset.column_names, # remove all unnecessary
)

Map (num_proc=2):   0%|          | 0/59207 [00:00<?, ? examples/s]

In [21]:
print(tokenized_dataset)

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 59207
})


<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [23]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = tokenized_dataset,
    dataset_text_field = "input_ids",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = per_device_train_batch_size,
        gradient_accumulation_steps = gradient_accumulation_steps,
        warmup_steps = warmup_steps,
        max_steps = max_steps,
        learning_rate = learning_rate,
        fp16 = fp16,
        bf16 = bf16,
        logging_steps = logging_steps,
        optim = optim,
        weight_decay = weight_decay,
        lr_scheduler_type = lr_scheduler_type,
        seed = seed,
        output_dir = output_dir,
        report_to = report_to,
    ),
)

In [24]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)

logger.info(f"GPU  Information before Training")
logger.info(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
logger.info(f"{start_gpu_memory} GB of memory reserved.")

INFO:__main__:GPU  Information before Training
INFO:__main__:GPU = Tesla T4. Max memory = 14.741 GB.
INFO:__main__:7.135 GB of memory reserved.


In [25]:
trainer_stats = trainer.train()

Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.0467


In [26]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

logger.info(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
logger.info(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
logger.info(f"Peak reserved memory = {used_memory} GB.")
logger.info(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
logger.info(f"Peak reserved memory % of max memory = {used_percentage} %.")
logger.info(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

INFO:__main__:52.4431 seconds used for training.
INFO:__main__:0.87 minutes used for training.
INFO:__main__:Peak reserved memory = 8.521 GB.
INFO:__main__:Peak reserved memory for training = 1.386 GB.
INFO:__main__:Peak reserved memory % of max memory = 57.805 %.
INFO:__main__:Peak reserved memory for training % of max memory = 9.402 %.


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [28]:
model.save_pretrained(model_lora_adapter_save_path)  # Local saving
tokenizer.save_pretrained(model_lora_adapter_save_path)
logger.info(f"Saved LoRA adapters to {model_lora_adapter_save_path}")

INFO:__main__:Saved LoRA adapters to /content/drive/MyDrive/HSG/NLP/Project NLP/Training/Training_1/Model/Meta-Llama-3.1-8B-Instruct_lora_adapters/


### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [29]:
# Merge to 16bit
if True:
  model.save_pretrained_merged(model_save_path, tokenizer, save_method = "merged_16bit",)
  logger.info(f"Saved merged model in 16bit to {model_save_path}")

# Merge to 4bit
if False:
  model.save_pretrained_merged(model_save_path, tokenizer, save_method = "merged_4bit",)
  logger.info(f"Saved merged model in 4bit to {model_save_path}")

# Just LoRA adapters
if False:
  model.save_pretrained_merged(model_save_path, tokenizer, save_method = "lora",)
  logger.info(f"Saved LoRA adapters to {model_save_path}")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 29.98 out of 50.99 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [00:23<00:00,  1.36it/s]


Unsloth: Saving tokenizer... Done.
Done.


INFO:__main__:Saved merged model in 16bit to /content/drive/MyDrive/HSG/NLP/Project NLP/Training/Training_1/Model/Meta-Llama-3.1-8B-Instruct_finetuned/


### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

[**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)

In [None]:
# Save to 8bit Q8_0
if False:
  model.save_pretrained_gguf(model_save_path, tokenizer,)
  logger.info(f"Saved merged model in 8bit Q8_0 to {model_save_path}")

# Save to 16bit GGUF
if False:
  model.save_pretrained_gguf(model_save_path, tokenizer, quantization_method = "f16")
  logger.info(f"Saved merged model in 16bit GGUF to {model_save_path}")

# Save to q4_k_m GGUF
if False:
  model.save_pretrained_gguf(model_save_path, tokenizer, quantization_method = "q4_k_m")
  logger.info(f"Saved merged model in q4_k_m GGUF to {model_save_path}")

# Save to multiple GGUF options - much faster if you want multiple!
if False:
    model.push_to_hub_gguf(
        "hf/model", # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = "",
    )