In [None]:
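# NOTE(review): the Gemma 2 checkpoints loaded below need a transformers release
# with Gemma 2 support (4.42.0 or newer, to my knowledge); the 4.38.2 pin here
# looks stale. Confirm the versions before re-enabling these lines.
# !pip3 install -q -U transformers==4.38.2
# !pip3 install -q -U datasets==2.18.0
# !pip3 install -q -U bitsandbytes==0.42.0
# !pip3 install -q -U peft==0.9.0
# !pip3 install -q -U trl==0.7.11
# !pip3 install -q -U accelerate==0.27.2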

In [None]:
import torch
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, TrainingArguments
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
from huggingface_hub import notebook_login

# Authenticate interactively. Never paste an access token into the notebook:
# the token that used to be written in this cell must be treated as leaked and
# revoked at https://huggingface.co/settings/tokens.
notebook_login()

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Read the local CSV splits into DataFrames.
train_df, dev_df, test_df = map(
    pd.read_csv,
    ('data/train.csv', 'data/dev.csv', 'data/test.csv'),
)

# Wrap each DataFrame as a Dataset.
train_dataset, dev_dataset, test_dataset = map(
    Dataset.from_pandas,
    (train_df, dev_df, test_df),
)

# Bundle the three splits into a single DatasetDict.
split_names = ('train', 'validation', 'test')
split_datasets = (train_dataset, dev_dataset, test_dataset)
dataset = DatasetDict(dict(zip(split_names, split_datasets)))

# Inspect the resulting structure.
print(dataset)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


BASE_MODEL = "google/gemma-2-2b-it"
# Gemma 2 is implemented inside transformers itself, so the checkpoint loads
# without trust_remote_code. Leaving the flag off means no code downloaded from
# the Hub repository can be executed during loading.
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map={"": 0})
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)


In [None]:
# Take the first training example's text from the 'dialogue' column, the same
# field that generate_prompt and the test-time inference loop read.
doc = dataset['train'][0]['dialogue']

In [None]:
# Text-generation pipeline over the loaded model and tokenizer, capped at 512 new tokens.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
)

In [None]:
# Wrap the summarization request for `doc` as a single user turn and render it
# to a prompt string with the tokenizer's chat template.
request_template = "다음 글을 요약해주세요 :\n\n{}"
messages = [{"role": "user", "content": request_template.format(doc)}]
prompt = pipe.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

In [None]:
# Display the rendered chat prompt for inspection.
prompt

In [None]:
# Sample a summary from the base model.
outputs = pipe(
    prompt,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    # `prompt` was rendered by apply_chat_template and already carries the
    # template's special tokens, so the tokenizer must not add them a second time.
    add_special_tokens=False,
)

In [None]:
# Print only the continuation by skipping the prompt prefix of the returned text.
generated_text = outputs[0]["generated_text"]
print(generated_text[len(prompt):])

---

In [None]:
import torch
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, TrainingArguments
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Load every split from its local CSV file.
train_df = pd.read_csv('data/train.csv')
dev_df = pd.read_csv('data/dev.csv')
test_df = pd.read_csv('data/test.csv')

# Convert the DataFrames to Dataset objects in one pass.
train_dataset, dev_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, dev_df, test_df)
)

# Merge the splits into a DatasetDict keyed by split name.
splits = {'train': train_dataset, 'validation': dev_dataset, 'test': test_dataset}
dataset = DatasetDict(splits)

# Inspect the resulting structure.
print(dataset)


In [None]:
# Run nvidia-smi in a shell to show the current GPU status.
!nvidia-smi

In [None]:
# 4.1 Build the training prompts
def generate_prompt(example):
    """Format a batch of examples as Gemma-style chat transcripts for SFT.

    Args:
        example: a batch mapping with parallel sequences under 'dialogue'
            (the text to summarize) and 'summary' (the target summary).

    Returns:
        A list with one formatted training string per (dialogue, summary) pair.
    """
    # The template is assembled from explicit line fragments so that no source-code
    # indentation leaks into the prompt. The previous indented triple-quoted string
    # put 32 leading spaces on every line, which the inference prompts do not have.
    template = (
        "<bos><start_of_turn>user\n"
        "다음 글을 요약해주세요:\n"
        "\n"
        "{}<end_of_turn>\n"
        "<start_of_turn>model\n"
        "{}<end_of_turn><eos>"
    )
    return [
        template.format(dialogue, summary)
        for dialogue, summary in zip(example['dialogue'], example['summary'])
    ]

# Preview the first formatted training prompt.
train_data = dataset['train']
first_batch = train_data[:1]
print(generate_prompt(first_batch)[0])


In [None]:
# Select the training split and print the prompt built from its first example.
train_data = dataset['train']
preview_prompts = generate_prompt(train_data[:1])
print(preview_prompts[0])

In [None]:
# LoRA settings: rank 6, applied to the listed projection modules.
lora_target_modules = [
    "q_proj", "o_proj", "k_proj", "v_proj",
    "gate_proj", "up_proj", "down_proj",
]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=6,
    target_modules=lora_target_modules,
)

# 4-bit NF4 quantization settings with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [None]:
# BASE_MODEL = "google/gemma-2-2b-it"
BASE_MODEL = "rtzr/ko-gemma-2-9b-it"

# Load the base checkpoint with the quantization settings held in `bnb_config`.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
)

# Matching tokenizer, set to pad on the right.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.padding_side = 'right'

In [None]:
# Estimate steps per epoch: samples // (batch size * gradient accumulation steps).
num_train_samples = len(train_data)
batch_size = 1  # mirrors per_device_train_batch_size=1 in the trainer cell
gradient_accumulation_steps = 4

samples_per_step = batch_size * gradient_accumulation_steps
steps_per_epoch = num_train_samples // samples_per_step

print(f"한 에포크당 스텝 수: {steps_per_epoch}")


In [None]:
# 4.3 Build the trainer
trainer = SFTTrainer(
    model=model,
    # Hand over the tokenizer configured earlier so its padding_side='right'
    # setting is actually used; without this argument SFTTrainer loads a fresh
    # tokenizer of its own and that setting is silently ignored.
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=dataset['validation'],  # evaluate on the validation split
    max_seq_length=512,
    # generate_prompt already writes <bos>/<eos> into every example, so stop the
    # tokenizer from prepending a second BOS token during dataset preparation.
    dataset_kwargs={"add_special_tokens": False},
    args=TrainingArguments(
        output_dir="outputs",
        num_train_epochs=1,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        optim="paged_adamw_8bit",
        warmup_steps=100,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=100,
        evaluation_strategy="epoch",  # run validation at the end of every epoch
        save_strategy="epoch",  # save a checkpoint at the end of every epoch
        push_to_hub=False,
        report_to='none',
    ),
    peft_config=lora_config,
    formatting_func=generate_prompt,
)


In [None]:

# Train the model
trainer.train()


In [None]:
# Output directory for the saved adapter.
ADAPTER_MODEL = "lora_adapter"

# Save trainer.model into that directory (presumably only the LoRA adapter weights; confirm).
trainer.model.save_pretrained(ADAPTER_MODEL)

In [None]:
# List the files written to the adapter directory.
!ls -alh lora_adapter

In [None]:
# Reload the base model in float16, attach the saved adapter, merge the adapter
# into the base weights, and save the merged model.
fp16_load_kwargs = dict(device_map='auto', torch_dtype=torch.float16)
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(BASE_MODEL, **fp16_load_kwargs),
    ADAPTER_MODEL,
    **fp16_load_kwargs,
).merge_and_unload()
model.save_pretrained('ko-gemma-2-9b_1EPOCH')

In [None]:
# List the files written to the merged-model directory.
!ls -alh ./ko-gemma-2-9b_1EPOCH

In [None]:

# 4-bit NF4 quantization settings with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [None]:

import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# Load the fine-tuned (merged) model
BASE_MODEL = "rtzr/ko-gemma-2-9b-it"
FINETUNE_MODEL = "./ko-gemma-2-9b_1EPOCH"

# Optional 8-bit quantization config. It is only applied if the
# quantization_config argument below is uncommented. NF4 and double quantization
# are 4-bit options (bnb_4bit_*), so no such settings belong in an 8-bit config.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

finetune_model = AutoModelForCausalLM.from_pretrained(
    FINETUNE_MODEL,
    device_map="auto",
    torch_dtype=torch.float16,
    # quantization_config=bnb_config,
)
# The merged checkpoint directory was saved without a tokenizer, so the
# tokenizer is loaded from the base model instead.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Text-generation pipeline for the fine-tuned model
pipe_finetuned = pipeline("text-generation", model=finetune_model, tokenizer=tokenizer, max_new_tokens=512)

# Load the test split
test_df = pd.read_csv('data/test.csv')

# Collected {"fname", "summary"} records, one per test row
results = []

# Summarize every test dialogue with the fine-tuned model
for idx, row in test_df.iterrows():
    doc = row['dialogue']
    messages = [
        {
            "role": "user",
            "content": "다음 글을 요약해주세요:\n\n{}".format(doc)
        }
    ]
    prompt = pipe_finetuned.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    output = pipe_finetuned(
        prompt,
        # Greedy decoding: sampling knobs (temperature, top_k, top_p) have no
        # effect here, so none are passed.
        do_sample=False,
        # `prompt` was rendered by apply_chat_template and already carries the
        # template's special tokens, so the tokenizer must not add them again.
        add_special_tokens=False,
    )
    # Keep only the continuation by dropping the prompt prefix of the returned text.
    summary = output[0]["generated_text"][len(prompt):]

    # Record the prediction for this file
    results.append({
        "fname": row['fname'],
        "summary": summary
    })

# Convert the collected records to a DataFrame
results_df = pd.DataFrame(results)

# Save as CSV. The path is named once so the confirmation message below always
# reports the file that was actually written (it used to say 'submission.csv').
submission_path = 'data/submission-ko-gemma-2-9b_1EPOCH.csv'
results_df.to_csv(submission_path, index=False)

print(f"결과가 '{submission_path}' 파일로 저장되었습니다.")
