In [1]:
# ==========================================
# 1. Imports
# ==========================================
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments

# ==========================================
# 2. Configuration (RTX 5090 / Ubuntu)
# ==========================================
# Input / output locations.
DATA_FILE: str = "llama3_finetune_data.jsonl"  # training records, loaded with the "json" builder
OUTPUT_DIR: str = "llama3_financial_analyst_checkpoint"  # trainer output_dir and save target

# Model-loading settings.
LOAD_IN_4BIT: bool = True  # forwarded as load_in_4bit to the model loader
DTYPE = None  # forwarded as dtype; presumably None means auto-detect -- confirm in the Unsloth docs
MAX_SEQ_LENGTH: int = 4096  # shared by the model loader and the trainer

# Report whether CUDA is visible before any heavy loading starts.
print(f"üî• Âî§ÈÜí RTX 5090... CUDA: {torch.cuda.is_available()}")

# 3. Load the model
# Fetch the "unsloth/llama-3-8b-bnb-4bit" checkpoint together with its
# tokenizer, using the sequence length / dtype / 4-bit settings from the
# configuration section.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
)

# 4. Configure LoRA
# Wrap the base model with rank-32 adapters on every attention and MLP
# projection; dropout and bias training are both disabled.
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # attention projections
        "gate_proj", "up_proj", "down_proj",  # MLP projections
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,  # same value as the trainer seed
)

# 5. Prepare the data
# Load the JSONL training set. Only a missing file triggers the one-row
# placeholder dataset (a smoke-test aid). Any other failure -- malformed JSON,
# schema errors, Ctrl-C -- propagates instead of silently training on dummy
# data, which the previous bare `except:` allowed.
try:
    dataset = load_dataset("json", data_files=DATA_FILE, split="train")
except FileNotFoundError:
    print("‚ö†Ô∏è Ê≤°ÊâæÂà∞Êï∞ÊçÆÊñá‰ª∂ÔºåÁîüÊàêÊ®°ÊãüÊï∞ÊçÆÊµãËØï...")
    from datasets import Dataset

    dataset = Dataset.from_list(
        [{"instruction": "test", "input": "test", "output": "test"}]
    )

# Alpaca-style prompt template. The three "{}" slots are filled positionally
# by formatting_prompts_func with the instruction, the input, and the response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def formatting_prompts_func(examples):
    """Render a batch of records into Alpaca-format training strings.

    Args:
        examples: Batched mapping holding parallel "instruction", "input"
            and "output" sequences.

    Returns:
        A dict with a single key, "text", containing one prompt per record:
        the filled-in ``alpaca_prompt`` followed by the tokenizer's EOS token.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(instruction, context, response) + tokenizer.eos_token
            for instruction, context, response in rows
        ],
    }

# Run the formatter over the dataset in batches; it returns a "text" field
# for every record.
dataset = dataset.map(formatting_prompts_func, batched=True)

# 6. Trainer
# Supervised fine-tuning on the rendered "text" field, without sequence packing.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    dataset_num_proc=4,
    packing=False,
    args=TrainingArguments(
        # Start at 8 to be safe; raise to 16 once training is stable.
        per_device_train_batch_size=8,
        gradient_accumulation_steps=2,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        # Exactly one precision flag is enabled: bf16 when the GPU reports
        # support for it, fp16 otherwise.
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir=OUTPUT_DIR,
    ),
)

# 7. Train
print("üöÄ [Start] ËÆ≠ÁªÉÂºÄÂßã...")
# Keep the return value of train() available for later inspection.
trainer_stats = trainer.train()

print("‚úÖ ËÆ≠ÁªÉÂÆåÊàêÔºÅ")
# Save the trained model and the tokenizer side by side so the checkpoint
# directory carries both; previously only the model was written.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
üî• Âî§ÈÜí RTX 5090... CUDA: True
==((====))==  Unsloth 2025.12.9: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    NVIDIA GeForce RTX 5090. Num GPUs = 1. Max memory: 31.357 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 12.0. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Unsloth 2025.12.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/526 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=212):   0%|          | 0/526 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


üöÄ [Start] ËÆ≠ÁªÉÂºÄÂßã...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 526 | Num Epochs = 2 | Total steps = 60
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 2 x 1) = 16
 "-____-"     Trainable parameters = 83,886,080 of 8,114,147,328 (1.03% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.7412
2,2.7621
3,2.7631
4,2.6562
5,2.432
6,2.0596
7,1.6592
8,1.202
9,1.0418
10,0.8223


‚úÖ ËÆ≠ÁªÉÂÆåÊàêÔºÅ
