In [1]:
#!pip install transformers datasets evaluate rouge_score

from datasets import load_dataset

billsum = load_dataset("billsum", split="ca_test")

billsum = billsum.train_test_split(test_size=0.2)

from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_billsum = billsum.map(preprocess_function, batched=True)

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

import evaluate

rouge = evaluate.load("rouge")

import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_billsum_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

from transformers import pipeline

summarizer = pipeline("summarization", model="stevhliu/my_awesome_billsum_model")
summarizer(text)


from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_billsum_model")
inputs = tokenizer(text, return_tensors="pt").input_ids

from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("stevhliu/my_awesome_billsum_model")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

tokenizer.decode(outputs[0], skip_special_tokens=True)



Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mthebestday[0m ([33mthebestdayor[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.836272,0.1258,0.0334,0.1044,0.1042,19.0
2,No log,2.618432,0.1331,0.0435,0.1109,0.1108,19.0
3,No log,2.555056,0.1441,0.0515,0.1191,0.1188,19.0
4,No log,2.537973,0.1419,0.0505,0.1183,0.118,19.0




config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Your max_length is set to 200, but your input_length is only 103. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


"The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history. It'll ask the ultra-wealthy and corporations to pay their fair share."

---

---

In [None]:
!pip install transformers datasets evaluate rouge_score

In [1]:
from huggingface_hub import notebook_login
notebook_login()

# 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

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import load_dataset

dataset = load_dataset('csv', data_files={'train': "/data/ephemeral/home/data/train.csv", 'val': "/data/ephemeral/home/data/dev.csv"})

from transformers import AutoTokenizer

checkpoint="RichardErkhov/EleutherAI_-_polyglot-ko-5.8b-4bits"
#checkpoint= "EleutherAI/polyglot-ko-5.8b"
#checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

OSError: RichardErkhov/EleutherAI_-_polyglot-ko-5.8b-4bits does not appear to have a file named config.json. Checkout 'https://huggingface.co/RichardErkhov/EleutherAI_-_polyglot-ko-5.8b-4bits/None' for available files.

In [2]:
from datasets import concatenate_datasets

# The maximum total input sequence length after tokenization. 
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["val"]]).map(lambda x: tokenizer(x["dialogue"], truncation=True), batched=True, remove_columns=["dialogue", "summary"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")
min_source_length = min([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Min source length: {min_source_length}")


# The maximum total sequence length for target text after tokenization. 
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([dataset["train"], dataset["val"]]).map(lambda x: tokenizer(x["summary"], truncation=True), batched=True, remove_columns=["dialogue", "summary"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")
min_target_length = min([len(x) for x in tokenized_targets["input_ids"]])
print(f"Min target length: {min_target_length}")

Map:   0%|          | 0/12956 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Max source length: 1418
Min source length: 55


Map:   0%|          | 0/12956 [00:00<?, ? examples/s]

Max target length: 247
Min target length: 10


In [6]:
prefix = "다음 대화를 요약하세요.: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=max_source_length+20, padding="max_length",truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=max_target_length, padding="max_length",truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, 
    model=checkpoint, 
    padding="max_length", 
    max_length=max_target_length, 
    return_tensors="pt")

import evaluate

rouge = evaluate.load("rouge")

import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


Map:   0%|          | 0/12457 [00:00<?, ? examples/s]

Map:   0%|          | 0/499 [00:00<?, ? examples/s]

In [None]:
# t5-small
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

training_args = Seq2SeqTrainingArguments(
    output_dir="t5-small",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,     
    fp16=True,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dialogue = f"summarize: {dataset['val']['dialogue'][0]}"

from transformers import pipeline

repository_path="t5-small"

summarizer = pipeline("summarization", model=repository_path)
summarizer(dialogue)


from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(repository_path)
inputs = tokenizer(dialogue, return_tensors="pt").input_ids

from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(repository_path)
outputs = model.generate(inputs, 
                        max_new_tokens=100, 
                        do_sample=False,
                        num_beams=5, 
                        no_repeat_ngram_size=2
                        )

tokenizer.decode(outputs[0], skip_special_tokens=True)



In [7]:
checkpoint

'RichardErkhov/EleutherAI_-_polyglot-ko-5.8b-4bits'

In [19]:
    from transformers import AutoModelForCausalLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoTokenizer, BitsAndBytesConfig
    from peft import LoraConfig, get_peft_model
    import torch
    import gc  # 가비지 컬렉션
    import accelerate

    # PEFT 설정 (LoRA 사용)
    lora_config = LoraConfig(
        r=6,  # Low-rank matrix 차원
        lora_alpha=16,  # Scaling factor
        lora_dropout=0.1,  # Dropout 적용 (Optional)
        target_modules=["query_key_value", "dense"]  # 미세 조정할 모듈
    )

    # 명시적으로 각 모듈을 GPU 또는 CPU로 매핑
    device_map = {
        "transformer": "cuda",  # Transformer layers on GPU
        "lm_head": "cpu",  # lm_head on CPU to save GPU memory
        "embed_in": "cuda",  # Embedding layer on GPU
    }

    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
        load_in_8bit_fp32_cpu_offload=True  # CPU에 오프로드된 모듈을 32비트로 유지
    )

    # 모델 로드 및 PEFT 적용
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        torch_dtype=torch.float16,
        device_map=device_map,  # 명시적 device_map 사용
        #device_map = "auto",
        low_cpu_mem_usage=True,  # 메모리 최적화
        quantization_config=bnb_config
    )

    # Gradient Checkpointing 활성화
    model.gradient_checkpointing_enable()

    # PEFT 적용
    model = get_peft_model(model, lora_config)

    # 토크나이저 로드
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # 가속 라이브러리 적용 (가비지 컬렉션 및 메모리 최적화)
    accelerator = accelerate.Accelerator()

    # 훈련 설정
    training_args = Seq2SeqTrainingArguments(
        output_dir=checkpoint,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=2,
        predict_with_generate=True,
        fp16=True,  # Mixed precision 사용
        push_to_hub=False,
        load_best_model_at_end=True,
    )

    # 데이터셋 및 메트릭 설정
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["val"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # 훈련 중 캐시 정리
    def train_with_cache_clean(trainer):
        for epoch in range(int(training_args.num_train_epochs)):
            print(f"Epoch {epoch + 1}/{training_args.num_train_epochs}")
            trainer.train()
            # 캐시 정리
            torch.cuda.empty_cache()  # CUDA 메모리 캐시 해제
            gc.collect()  # 파이썬 가비지 컬렉션
            print(f"Cache cleared after epoch {epoch + 1}")

    # 훈련 시작
    train_with_cache_clean(trainer)

    # 추론 예시
    dialogue = f"다음 대화문을 요약하세요.: {dataset['val']['dialogue'][0]}"
    repository_path = checkpoint

    from transformers import pipeline

    summarizer = pipeline("summarization", model=repository_path)
    summary = summarizer(dialogue)

    # 모델을 사용한 직접 추론
    inputs = tokenizer(dialogue, return_tensors="pt").input_ids
    outputs = model.generate(inputs,
                            max_new_tokens=max_target_length,
                            do_sample=False,
                            num_beams=5,
                            no_repeat_ngram_size=2)

    print(tokenizer.decode(outputs[0], skip_special_tokens=True))


ValueError: 
                    Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the
                    quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules
                    in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to
                    `from_pretrained`. Check
                    https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
                    for more details.
                    

In [8]:
from transformers import AutoModelForCausalLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoTokenizer, GPTQConfig
from datasets import load_dataset
from torch.utils.data import ConcatDataset
from peft import LoraConfig, get_peft_model
import torch
import gc
import accelerate

# PEFT 설정 (LoRA 사용)
lora_config = LoraConfig(
    r=6,  # Low-rank matrix 차원
    lora_alpha=16,  # Scaling factor
    lora_dropout=0.1,  # Dropout 적용 (Optional)
    target_modules=["query_key_value", "dense"]  # 미세 조정할 모듈
)


# Extract raw text (dialogue) from both train and val datasets for calibration
calibration_dataset = list(tokenized_dataset["train"]["dialogue"]) + list(tokenized_dataset["val"]["dialogue"])


# GPTQConfig 설정
gptq_config = GPTQConfig(
    bits=4,  # 4-bit 양자화 사용
    group_size=2,  # 그룹 크기 설정
    desc_act="relu",  # Activation function description for optimization
    enable_fp32_cpu_offload=True,  # CPU 오프로드 사용
    tokenizer=tokenizer,
    dataset=calibration_dataset
)

device_map = {
    "embed_in": "cpu",  # Move embeddings to CPU
    "transformer": "cuda",  # Transformer layers on GPU
    "lm_head": "cuda",  # Output head on GPU
}

# 모델 로드 및 PEFT 적용
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype=torch.float16,
    device_map="auto",  # Use auto to let the library handle device placement
    #device_map=device_map
    low_cpu_mem_usage=True,  # 메모리 최적화
    quantization_config=gptq_config  # GPTQConfig 적용
)

# Gradient Checkpointing 활성화
model.gradient_checkpointing_enable()

# PEFT 적용
model = get_peft_model(model, lora_config)

# 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# 훈련 설정
training_args = Seq2SeqTrainingArguments(
    output_dir=checkpoint,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=True,  # Mixed precision 사용
    push_to_hub=False,
    load_best_model_at_end=True,
)

# 데이터셋 및 메트릭 설정
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 훈련 중 캐시 정리
def train_with_cache_clean(trainer):
    for epoch in range(int(training_args.num_train_epochs)):
        print(f"Epoch {epoch + 1}/{training_args.num_train_epochs}")
        trainer.train()
        # 캐시 정리
        torch.cuda.empty_cache()  # CUDA 메모리 캐시 해제
        gc.collect()  # 파이썬 가비지 컬렉션
        print(f"Cache cleared after epoch {epoch + 1}")

# 훈련 시작
train_with_cache_clean(trainer)

# 추론 예시
dialogue = f"다음 대화문을 요약하세요.: {tokenized_dataset['val']['dialogue'][0]}"
repository_path = checkpoint

from transformers import pipeline

summarizer = pipeline("summarization", model=repository_path)
summary = summarizer(dialogue)

# 모델을 사용한 직접 추론
inputs = tokenizer(dialogue, return_tensors="pt").input_ids
outputs = model.generate(inputs,
                        max_new_tokens=max_target_length,
                        do_sample=False,
                        num_beams=5,
                        no_repeat_ngram_size=2)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


OSError: RichardErkhov/EleutherAI_-_polyglot-ko-5.8b-4bits does not appear to have a file named config.json. Checkout 'https://huggingface.co/RichardErkhov/EleutherAI_-_polyglot-ko-5.8b-4bits/None' for available files.

In [9]:
print(tokenized_dataset)


DatasetDict({
    train: Dataset({
        features: ['fname', 'dialogue', 'summary', 'topic', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 12457
    })
    val: Dataset({
        features: ['fname', 'dialogue', 'summary', 'topic', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 499
    })
})
