In [1]:
! pip install transforms datasets torch wandb huggingface_hub peft bitsandbytes



In [6]:
import os
import wandb
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, 
    Trainer, TrainingArguments, DataCollatorForSeq2Seq, 
    Seq2SeqTrainer, Seq2SeqTrainingArguments
)
from datasets import load_dataset
import huggingface_hub
import torch
from peft import LoraConfig, get_peft_model, TaskType
from bitsandbytes.optim import PagedAdamW8bit

In [None]:
# Central run configuration: credentials, model, optimisation hyper-parameters
# and dataset locations. The values are unpacked into module-level names below
# because the later cells refer to them directly.
config = {
  # Credentials are taken from the environment so that secrets never have to be
  # written into (and saved with) the notebook. Both fall back to "" when unset.
  "hugginface_token": os.environ.get("HF_TOKEN", ""),
  "wandb_key": os.environ.get("WANDB_API_KEY", ""),
  "model_path": "meta-llama/Llama-3.2-1B", #  google/gemma-2-2b-it
  "save_model_name": "pretrain_open_source",
  # "use_lora": True,
  # "lora_r": 16,
  # "lora_alpha": 32,
  "lr": 3e-5,
  "epoch": 1,
  "batch_size": 1,
  "max_seq_len": 512, # max token sequence length; adjust to the available GPU memory
  "checkpoint_path": "./checkpoints",
  "OpenSource_data_path": "FiscalNote/billsum", #  ccdv/govreport-summarization 
  "OpenSource_version": "",
  "Youtube_data_path": "ht324/WhiteBoard_LLM_Data_response" # "ht324/WhiteBoard_LLM_Data_response"
}

hugginface_token = config["hugginface_token"]
wandb_key = config["wandb_key"]

model_path = config["model_path"]
save_model_name = config["save_model_name"]

# use_lora = config["use_lora"]
# lora_r = config["lora_r"]
# lora_alpha = config["lora_alpha"]

lr = config["lr"]
epoch = config["epoch"]
batch_size = config["batch_size"]
max_seq_len = config["max_seq_len"]

# checkpoint_path = config["checkpoint_path"]
OpenSource_data_path = config["OpenSource_data_path"]
OpenSource_version = config["OpenSource_version"]
Youtube_data_path = config["Youtube_data_path"]

In [8]:
# Log in to the Hugging Face Hub and, when a key is configured, to Weights & Biases.

# An empty token string is not a usable credential, so it is passed as None,
# which makes `login` fall back to its interactive prompt instead.
huggingface_hub.login(token=hugginface_token or None)
if wandb_key:
    wandb.login(key=wandb_key)
    # Record the hyper-parameters of this run alongside its metrics.
    wandb.init(
        project="WhiteBoard_LLM",
        config={
            "model_name": save_model_name,
            "lr": lr,
            "epoch": epoch,
            "batch_size": batch_size,
            "max_seq_len": max_seq_len,
            # "use_lora": use_lora,
            # "lora_r": lora_r,
            # "lora_alpha": lora_alpha,
        },
        name=save_model_name
    )



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [None]:
# Load the tokenizer and the base causal LM named by `model_path`.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Some tokenizers (the original note mentions the Llama tokenizer) define no pad
# token. Register one and grow the embedding matrix on the *base* model, before
# it is wrapped by PEFT and before the collator is built, so every later object
# sees the final vocabulary.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
# Keep the model config in agreement with the tokenizer's padding id.
model.config.pad_token_id = tokenizer.pad_token_id

# LoRA adapters on the attention projections only.
lora_config = LoraConfig(
    r=4,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type=TaskType.CAUSAL_LM,
)

model = get_peft_model(model, lora_config)

# Batches are padded dynamically by the collator.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)


In [None]:
# Load the configured datasets. For an open-source dataset, check its column
# names first: the later cells expect "text" and "summary".
if OpenSource_data_path:
    # Pass the dataset configuration/version name only when one is set.
    open_source_args = [OpenSource_data_path]
    if OpenSource_version:
        open_source_args.append(OpenSource_version)
    open_source_data = load_dataset(*open_source_args)

if Youtube_data_path:
    youtube_data = load_dataset(Youtube_data_path)
    # Rename the train split's columns to the names used by the rest of the notebook.
    for old_name, new_name in (('content', 'text'), ('response', 'summary')):
        youtube_data["train"] = youtube_data["train"].rename_column(old_name, new_name)
    

In [11]:
# preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of examples into causal-LM training features.

    A decoder-only model predicts each token from the tokens before it, so the
    summary has to be part of the input sequence itself. Each example becomes
    ``prompt tokens + summary tokens + EOS``, padded to ``max_seq_len``.
    ``labels`` mirrors ``input_ids`` but holds -100 (the index ignored by the
    loss) on prompt and padding positions, so only summary tokens are trained on.

    No separator or prompt template is inserted between text and summary.
    NOTE(review): add one here if the "text" column does not already end with
    an instruction.

    Args:
        examples: A batch mapping with "text" and "summary" keys, each a list of
            strings of the same length (as passed by ``Dataset.map(batched=True)``).

    Returns:
        dict with "input_ids", "attention_mask" and "labels"; each is a list
        holding one list of exactly ``max_seq_len`` ints per example.
    """
    eos_id = tokenizer.eos_token_id
    eos_tail = [] if eos_id is None else [eos_id]

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        # Padded positions are masked in both attention and labels, so the
        # concrete id only has to be a valid vocabulary index.
        pad_id = eos_id if eos_id is not None else 0

    # Tokenize the input text (special tokens such as BOS are kept here).
    prompt_batch = tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_seq_len,
    )["input_ids"]
    # Tokenize the target text; no special tokens in the middle of the sequence.
    summary_batch = tokenizer(
        examples["summary"],
        add_special_tokens=False,
        truncation=True,
        max_length=max_seq_len,
    )["input_ids"]

    # The summary may use at most half of the window, so a long input text can
    # never crowd the training target out entirely.
    target_budget = max(1, max_seq_len // 2)

    features = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt_ids, summary_ids in zip(prompt_batch, summary_batch):
        target_ids = list(summary_ids[: max(0, target_budget - len(eos_tail))]) + eos_tail
        # The prompt gets whatever room the target leaves free.
        prompt_ids = list(prompt_ids[: max_seq_len - len(target_ids)])

        n_real = len(prompt_ids) + len(target_ids)
        n_pad = max_seq_len - n_real

        features["input_ids"].append(prompt_ids + target_ids + [pad_id] * n_pad)
        features["attention_mask"].append([1] * n_real + [0] * n_pad)
        features["labels"].append([-100] * len(prompt_ids) + target_ids + [-100] * n_pad)

    # Return the model input features.
    return features

def fix_dataset_format(example):
    """Flatten list-valued "text"/"summary" fields into single strings.

    A field holding a list is replaced by its items joined with single spaces;
    any other value is left as it is. The example is modified in place and
    also returned.
    """
    for field in ("text", "summary"):
        value = example[field]
        if isinstance(value, list):
            example[field] = " ".join(value)
    return example

In [12]:
# Display the loaded dataset (splits, columns, row counts); swap in `open_source_data` to inspect that one instead.
youtube_data

DatasetDict({
    train: Dataset({
        features: ['custom_id', 'role', 'text', 'summary'],
        num_rows: 4840
    })
})

In [14]:
# Choose the split to train on. To train on the open-source dataset instead,
# start from open_source_data["train"] (and open_source_data["test"] for
# evaluation) and apply the same two steps to each split.
raw_train = youtube_data["train"]

# Normalise list-valued fields into plain strings.
normalized_train = raw_train.map(fix_dataset_format)


def _keep_text_and_summary(example):
    """Project an example onto the two columns the tokenizer step needs."""
    return {"text": example["text"], "summary": example["summary"]}


# Remove every other column, keeping only "text" and "summary".
train_data = normalized_train.map(
    _keep_text_and_summary,
    remove_columns=normalized_train.column_names,
)

train_data

Map:   0%|          | 0/4840 [00:00<?, ? examples/s]

Map:   0%|          | 0/4840 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'summary'],
    num_rows: 4840
})

In [None]:
# Tokenize the training split; batched=True hands preprocess_function lists of examples.
tokenized_train_dataset = train_data.map(
    function=preprocess_function,
    batched=True,
)
# tokenized_test_dataset = test_data.map(preprocess_function, batched=True)

tokenized_train_dataset

In [None]:
# Drop the raw string columns so only the tokenized features reach the trainer.
# Only columns that are still present are removed, so re-running this cell
# does not raise on columns that are already gone.
raw_columns = [
    name for name in train_data.column_names
    if name in tokenized_train_dataset.column_names
]
tokenized_train_dataset = tokenized_train_dataset.remove_columns(raw_columns)
# tokenized_test_dataset = tokenized_test_dataset.remove_columns(test_data.column_names)

tokenized_train_dataset

In [None]:

# Training configuration. Evaluation is switched off ("no"), so `eval_steps`
# and the eval batch size only take effect if an eval strategy is re-enabled.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # eval_strategy="steps",
    eval_strategy="no",
    eval_steps=500,
    save_steps=500,
    logging_steps=20,
    save_total_limit=2,
    per_device_train_batch_size=batch_size, # adjust to the GPU
    per_device_eval_batch_size=batch_size,  # adjust to the GPU
    gradient_accumulation_steps=8,          # adjust to the GPU
    optim="paged_adamw_8bit",
    num_train_epochs=epoch,
    learning_rate=lr,
    weight_decay=0.01,
    report_to="wandb",
    fp16=True, # mixed precision training
    hub_model_id="ht324/WhiteBoard_LLM_Models",  # the comma was missing here, which made the cell a SyntaxError
    push_to_hub=True, # upload the model to the Hugging Face Hub
    run_name="pretrain with opensource",
    remove_unused_columns=False
)

In [None]:
# Collect the trainer inputs in one place, then build the trainer from them.
# No eval dataset is passed (eval_dataset=tokenized_test_dataset is disabled),
# and the LoRA config is not passed here because the model is already PEFT-wrapped.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

In [None]:
# Show the GPU status (memory, utilisation) before training is started.
!nvidia-smi

In [None]:
# Run the fine-tuning loop with the trainer configured above.
trainer.train()

In [None]:
# Manual upload to the Hugging Face Hub
# trainer.push_to_hub(repo_id="ht324/WhiteBoard_LLM_Models")

In [None]:
# save model
# save_dir = f"./checkpoints/{save_model_name}"
# model.save_pretrained(save_dir)
# tokenizer.save_pretrained(save_dir)