In [None]:
# !pip install accelerate -U  # 허깅페이스에서 제공하는 라이브러리: 분산학습 지원, 혼합 정밀도 학습, 간편한 설정

In [6]:
import os
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, Trainer, TrainingArguments, EarlyStoppingCallback  # Trainer는 자동 GPU사용
from torch.utils.data import Dataset, DataLoader

In [3]:
# 모델과 토크나이저 불러오기
model_name = 'skt/kogpt2-base-v2'
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)

# 추가 토큰 정의 및 추가
additional_special_tokens = ['<question>', '<answer>']
tokenizer.add_special_tokens({'additional_special_tokens': additional_special_tokens})

# 모델에 토크나이저의 추가 토큰 수용하도록 설정
model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

# 패딩 토큰 설정
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# 데이터셋 준비
train_data = pd.read_csv('rawdata.csv', index_col=0)

# 컬럼 이름 변경
train_data['question'] = train_data['Q']
train_data['answer'] = train_data['A']
train_data = train_data[['question', 'answer']]

In [5]:
# 데이터 전처리 함수
def preprocess_function(examples):
    inputs = ['<question> ' + ex for ex in examples['question']]
    targets = ['<answer> ' + ex for ex in examples['answer']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length", return_tensors="pt")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [7]:
# 데이터를 train set과 test set으로 분리
train_data, test_data = train_test_split(train_data, test_size=0.2, random_state=42)

# 전처리 함수로 train set과 test set을 각각 토큰화
train_encodings = preprocess_function(train_data)
test_encodings = preprocess_function(test_data)

In [8]:
# encoding의 결과가 tensor이므로 torch로 변경해야함
class QADataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

In [9]:
# 데이터를 torch형식으로 변환
train_dataset = QADataset(train_encodings)
test_dataset = QADataset(test_encodings)

In [10]:
# 정확도를 계산하는 함수
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    accuracy = (predictions == labels).mean()
    return {"accuracy": accuracy}

In [11]:
# 학습 설정
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=20,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_total_limit=1
)



In [12]:
# Trainer 설정
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [13]:
# 모델 학습
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# 모델 저장
trainer.save_model('./kogpt2-finetuned')
tokenizer.save_pretrained('./kogpt2-finetuned')

# 평가
metrics = trainer.evaluate()
print(metrics)

In [None]:
train_metrics = trainer.state.log_history

# 마지막 에포크의 train_loss와 valid_loss
train_loss = train_metrics[-2]['loss']
valid_loss = train_metrics[-1]['eval_loss']

# train_accuracy와 test_accuracy는 가장 좋은 모델의 평가 메트릭으로 가져옵니다.
train_accuracy = trainer.state.best_metric

# train_accuracy와 test_accuracy는 가장 좋은 모델의 평가 메트릭으로 가져옵니다.
test_accuracy = metrics['eval_accuracy']

print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")
print(f"Train Loss: {train_loss}")
print(f"Valid Loss: {valid_loss}")


# 전체 코드

In [14]:
import os
import torch
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, Trainer, TrainingArguments, EarlyStoppingCallback
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import numpy as np

# 모델과 토크나이저 불러오기
model_name = 'skt/kogpt2-base-v2'
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)

# 추가 토큰 정의 및 추가
additional_special_tokens = ['<question>', '<answer>']
tokenizer.add_special_tokens({'additional_special_tokens': additional_special_tokens})

# 모델에 토크나이저의 추가 토큰 수용하도록 설정
model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

# 패딩 토큰 설정
tokenizer.pad_token = tokenizer.eos_token

# 데이터셋 준비
train_data = pd.read_csv('rawdata.csv', index_col=0)
train_data['question'] = train_data['Q']
train_data['answer'] = train_data['A']
train_data = train_data[['question', 'answer']]

# 데이터 샘플링 (예: 10%만 사용)
train_data = train_data.sample(frac=0.1, random_state=42)

# 데이터 전처리 함수
def preprocess_function(examples):
    inputs = ['<question> ' + ex for ex in examples['question']]
    targets = ['<answer> ' + ex for ex in examples['answer']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length", return_tensors="pt")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# 데이터를 train set과 test set으로 분리
train_data, test_data = train_test_split(train_data, test_size=0.2, random_state=42)

# 전처리 함수로 train set과 test set을 각각 토큰화
train_encodings = preprocess_function(train_data)
test_encodings = preprocess_function(test_data)

# PyTorch Dataset 클래스 정의
class QADataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

# 전처리된 데이터를 PyTorch Dataset 형식으로 변환
train_dataset = QADataset(train_encodings)
test_dataset = QADataset(test_encodings)

# 정확도를 계산하는 함수
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    accuracy = (predictions == labels).mean()
    return {"accuracy": accuracy}

# 학습 설정
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,  # 에포크 수 줄이기
    per_device_train_batch_size=4,  # 배치 크기 증가
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_total_limit=1,
    fp16=True  # 혼합 정밀도 학습
)

# Trainer 설정
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# 모델 학습
trainer.train()

# 모델 저장
trainer.save_model('./kogpt2-finetuned')
tokenizer.save_pretrained('./kogpt2-finetuned')

# 평가
metrics = trainer.evaluate()
print(metrics)

train_metrics = trainer.state.log_history

# 마지막 에포크의 train_loss와 valid_loss
train_loss = train_metrics[-2]['loss']
valid_loss = train_metrics[-1]['eval_loss']

# train_accuracy와 test_accuracy는 가장 좋은 모델의 평가 메트릭으로 가져옵니다.
train_accuracy = trainer.state.best_metric
test_accuracy = metrics['eval_accuracy']

print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")
print(f"Train Loss: {train_loss}")
print(f"Valid Loss: {valid_loss}")




Epoch,Training Loss,Validation Loss,Accuracy
1,11.4078,3.981872,0.468605


KeyboardInterrupt: 