In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"

In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device_count = torch.cuda.device_count()
print(f"Using device: {torch.cuda.current_device()}, Number of GPUs: {device_count}")

Using device: 0, Number of GPUs: 2


In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from src.data import CustomTrainDataset, DataCollatorForSupervisedDataset
from transformers import Trainer, TrainingArguments
import gc, torch

  from .autonotebook import tqdm as notebook_tqdm
2025-07-28 06:24:21.147164: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753683861.167669 1387230 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753683861.174024 1387230 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1753683861.190850 1387230 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1753683861.190871 1387230 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1753683861.190874 1387230

In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.bfloat16
)

model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, quantization_config=bnb_config)

Loading checkpoint shards: 100%|██████████| 4/4 [00:39<00:00,  9.93s/it]


In [None]:
# Load model 

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = "LGAI-EXAONE/EXAONE-4.0-32B"
model_id

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, device_map="auto", quantization_config=bnb_config)

Loading checkpoint shards: 100%|██████████| 14/14 [00:23<00:00,  1.71s/it]
We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [None]:
from peft import LoraConfig, get_peft_model


lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 33,554,432 || all params: 32,036,770,816 || trainable%: 0.1047


In [None]:
TRAIN_PATH = "./data/korean_language_rag_V1.0_train.json"
DEV_PATH = "./data/korean_language_rag_V1.0_dev.json"

train_data = CustomTrainDataset(TRAIN_PATH, tokenizer)
dev_data = CustomTrainDataset(DEV_PATH, tokenizer)

In [None]:
training_args = TrainingArguments(
    output_dir = "./exaone_32b", 
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=10,
    save_total_limit=2,
    report_to="none",
    save_steps=100
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSupervisedDataset(tokenizer)
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [10]:
# garbage VRAM collecting
gc.collect()
torch.cuda.empty_cache()

In [11]:
trainer.train()

  [torch.tensor(ids) for ids in input_ids], batch_first=True, padding_value=self.tokenizer.pad_token_id
  labels = torch.nn.utils.rnn.pad_sequence([torch.tensor(lbls) for lbls in labels], batch_first=True, padding_value=-100)


Step,Training Loss
10,1.4208
20,1.583
30,1.3824
40,1.4381
50,1.3724
60,1.2957
70,1.2212
80,1.1574
90,0.9078
100,0.9908


  [torch.tensor(ids) for ids in input_ids], batch_first=True, padding_value=self.tokenizer.pad_token_id
  labels = torch.nn.utils.rnn.pad_sequence([torch.tensor(lbls) for lbls in labels], batch_first=True, padding_value=-100)
  [torch.tensor(ids) for ids in input_ids], batch_first=True, padding_value=self.tokenizer.pad_token_id
  labels = torch.nn.utils.rnn.pad_sequence([torch.tensor(lbls) for lbls in labels], batch_first=True, padding_value=-100)


TrainOutput(global_step=234, training_loss=0.9825639551521367, metrics={'train_runtime': 1867.3678, 'train_samples_per_second': 0.999, 'train_steps_per_second': 0.125, 'total_flos': 1.587514430816133e+17, 'train_loss': 0.9825639551521367, 'epoch': 3.0})