In [None]:
import json, os, re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoModel
from transformers import Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training, PeftModel
from datasets import load_dataset
from huggingface_hub import login
from typing import List, Dict

In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment first.
# Without this call the import above is unused and HF_TOKEN is only found when
# it has already been exported in the shell that started the kernel.
load_dotenv()

# Authenticate against the Hugging Face Hub with the token from the environment.
login(token=os.getenv('HF_TOKEN'))

In [None]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
model_name = "skt/A.X-4.0-Light"

# Both model copies are loaded with the same options: dtype and device placement
# are left for the library to choose.
load_options = {"torch_dtype": "auto", "device_map": "auto"}

tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): `base_model` is not referenced by any other cell visible in this
# notebook, yet it keeps a second full copy of the weights in memory. Confirm it
# is still needed before keeping this load.
base_model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

# The copy that gets wrapped with LoRA and trained further down.
model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

In [None]:
# LoRA configuration: adapters are attached to the q_proj and v_proj modules only.
lora_target_modules = ["q_proj", "v_proj"]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=8,             # rank
    lora_alpha=16,   # scale
    lora_dropout=0.05,
    bias="none",
)

# Apply LoRA to the model.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell
# would pass the already-wrapped model to get_peft_model again.
model = get_peft_model(model, lora_config)
model.enable_input_require_grads()
model.gradient_checkpointing_enable()
model.print_trainable_parameters()

In [None]:
def tokenize_function(example):
    """Tokenize one chat example and build its loss labels.

    The conversation in ``example["messages"]`` is rendered with the tokenizer's
    chat template, tokenized to exactly 256 tokens (padded or truncated), and
    given a ``labels`` list in which positions that must not contribute to the
    loss are set to -100.

    Args:
        example: A dataset row with a ``"messages"`` entry that is accepted by
            ``tokenizer.apply_chat_template``.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry. Labels equal
        ``input_ids`` except that (a) everything up to and including the first
        ``<|assistant|>`` token and (b) every padding position is -100.
    """
    messages = example["messages"]

    chat_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )

    # The rendered chat text is tokenized as-is: letting the tokenizer add its
    # own special tokens on top of the template's output can duplicate them.
    tokenized = tokenizer(
        chat_text,
        padding="max_length",
        truncation=True,
        max_length=256,
        add_special_tokens=False,
        return_attention_mask=True,
    )

    labels = list(tokenized["input_ids"])

    # NOTE(review): assumes "<|assistant|>" is a single token in this tokenizer's
    # vocabulary - confirm against the model's chat template. When the id is not
    # found in the sequence, nothing is masked as prompt (start stays 0).
    assistant_id = tokenizer.convert_tokens_to_ids("<|assistant|>")
    try:
        start = labels.index(assistant_id) + 1
    except ValueError:
        start = 0

    labels[:start] = [-100] * start

    # Padding must not be learned. The attention mask is used instead of the pad
    # token id so real tokens that share the pad id are left untouched.
    labels = [
        token_id if is_real else -100
        for token_id, is_real in zip(labels, tokenized["attention_mask"])
    ]
    tokenized["labels"] = labels

    return tokenized


In [None]:
folder_path = "TTA_scaling_v4__1"
file_list = os.listdir(folder_path)

# Collect the training files in a stable (sorted) order. Sub-directories and
# hidden entries (for example Jupyter's .ipynb_checkpoints or .DS_Store) are
# skipped so that only regular data files reach the JSON dataset loader.
train_list = sorted(
    os.path.join(folder_path, file)
    for file in file_list
    if not file.startswith(".")
    and os.path.isfile(os.path.join(folder_path, file))
)
train_list

In [None]:
# Load the dataset: the glossary file first, followed by every collected training file.
train_files = ["finetune_glossary_scaling.jsonl", *train_list]
dataset = load_dataset("json", data_files={"train": train_files})

# Show the first training row as a quick sanity check.
first_row = dataset["train"][0]
print(first_row)

In [None]:
# Tokenize the training split one example at a time (no batching).
train_split = dataset["train"]
tokenized_train = train_split.map(tokenize_function, batched=False)

In [None]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",

    # Batching: 4 sequences per device, accumulated over 4 steps before each
    # optimizer update, for 3 passes over the data.
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,

    # Mixed precision: bfloat16 on, float16 off.
    fp16=False,
    bf16=True,

    # Optimizer and learning-rate schedule.
    optim="adamw_torch",
    learning_rate=2e-4,
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",

    # Log every 50 steps; checkpoint once per epoch and keep at most 3.
    logging_steps=50,
    save_total_limit=3,
    save_strategy="epoch",

    ddp_find_unused_parameters=False,
)


In [None]:
from transformers import Trainer
from transformers import default_data_collator


def collate_keep_labels(features):
    """Batch pre-tokenized examples while keeping their precomputed labels.

    DataCollatorForLanguageModeling(mlm=False) rebuilds ``labels`` from
    ``input_ids``, which throws away the prompt masking prepared during
    tokenization. This collator stacks the features as they are and only forces
    padding positions (attention_mask == 0) to -100 so they never enter the loss.

    Args:
        features: List of per-example dicts containing equally sized
            ``input_ids``, ``attention_mask`` and ``labels`` sequences.

    Returns:
        A dict of batched tensors suitable for the model's forward pass.
    """
    batch = default_data_collator(features)
    batch["labels"] = batch["labels"].masked_fill(batch["attention_mask"] == 0, -100)
    return batch


data_collator = collate_keep_labels

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    data_collator=data_collator,
)


trainer.train()

In [None]:
# Once training has finished, write the model to disk.
save_dir = "./ax_trained_model_v3"
trainer.save_model(save_dir)

# Save the tokenizer alongside it (needed when loading later).
tokenizer.save_pretrained(save_dir)

In [None]:
from huggingface_hub import (
    HfApi,
    HfFolder,
    Repository,
    create_repo,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)

# 1. Load the saved model and tokenizer back from the local output directory.
saved_model_dir = "./ax_trained_model_v3"
model = AutoModelForCausalLM.from_pretrained(saved_model_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)

In [None]:
# 2. Upload the reloaded model to the Hugging Face Hub.
model.push_to_hub(repo_id="ax-trained-model-v3")

In [None]:
# Upload the tokenizer to the same Hub repository name.
tokenizer.push_to_hub(repo_id="ax-trained-model-v3")