<a href="https://colab.research.google.com/github/Tiabet/Complete_story/blob/master/KoGPT_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
import locale
locale.getpreferredencoding = lambda: "UTF-8"

!pip install transformers
!pip install accelerate
!pip install datasets
!pip install evaluate
!pip install rouge
!pip install konlpy

In [None]:
import torch
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, DataCollatorWithPadding, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset, load_dataset
from rouge import Rouge
import evaluate
from konlpy.tag import Okt
import os
import numpy as np

In [None]:
# Hub id of the pretrained checkpoint; both the tokenizer and the model are loaded from it.
model_checkpoint = "skt/kogpt2-base-v2"

# Special tokens are passed explicitly; note that BOS and EOS are both '</s>'.
# padding_side='left' makes the tokenizer add padding at the start of a sequence.
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_checkpoint, bos_token='</s>', eos_token='</s>', unk_token='<unk>',
  pad_token='<pad>', mask_token='<mask>', padding_side='left')

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [None]:
 # Load the pretrained GPT-2 language-model-head weights for the checkpoint.
 model = GPT2LMHeadModel.from_pretrained(model_checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

# 모델 테스트

In [None]:
# Smoke test: let the pretrained model continue a short Korean prompt.
text = '근육이 커지기 위해서는'

# Prefer the GPU when one is present, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The model and the prompt ids have to live on the same device.
model.to(device)
input_ids = tokenizer.encode(text, return_tensors='pt').to(device)

generation_options = dict(
    max_length=32,
    repetition_penalty=2.0,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    use_cache=True,
)
gen_ids = model.generate(input_ids, **generation_options).to("cpu")

# Decode the first generated sequence; the bare name displays it as cell output.
generated = tokenizer.decode(gen_ids[0])
generated

'근육이 커지기 위해서는 무엇보다 규칙적인 생활습관이 중요하다.\n특히, 아침식사는 단백질과 비타민이 풍부한 과일과 채소를 많이 섭취하는 것이 좋다.\n또한 하루 30분'

정상적으로 작동하는 것 확인

# 데이터셋 전처리

In [None]:
# Directory (relative to the working directory) that holds the JSONL split files.
data_dir = 'drive/MyDrive'

# Split name -> file name inside data_dir.
data_files = {}
data_files['train'] = 'nikluge-sc-2023-train.jsonl'
data_files['validation'] = 'nikluge-sc-2023-dev.jsonl'
data_files['test'] = 'nikluge-sc-2023-test.jsonl'

dataset = load_dataset(data_dir, data_files=data_files)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# Pull the two context sentences out of the nested 'input' field of every
# training example so they can later be added as flat columns.
train_inputs = dataset['train']['input']
sentence1 = [entry["sentence1"] for entry in train_inputs]
sentence3 = [entry["sentence3"] for entry in train_inputs]

In [None]:
# Attach the extracted sentences to the train split as top-level columns.
dataset['train'] = (
    dataset['train']
    .add_column('sentence1', sentence1)
    .add_column('sentence3', sentence3)
)

In [None]:
# Same flattening as for the train split: copy sentence1/sentence3 out of the
# nested 'input' field and add them as columns of the validation split.
val_inputs = dataset['validation']['input']
sentence1_val = [entry["sentence1"] for entry in val_inputs]
sentence3_val = [entry["sentence3"] for entry in val_inputs]

dataset['validation'] = (
    dataset['validation']
    .add_column('sentence1', sentence1_val)
    .add_column('sentence3', sentence3_val)
)

In [None]:
# Same flattening as for the other splits: copy sentence1/sentence3 out of the
# nested 'input' field and add them as columns of the test split.
test_inputs = dataset['test']['input']
sentence1_test = [entry["sentence1"] for entry in test_inputs]
sentence3_test = [entry["sentence3"] for entry in test_inputs]

dataset['test'] = (
    dataset['test']
    .add_column('sentence1', sentence1_test)
    .add_column('sentence3', sentence3_test)
)

In [None]:
def tokenize_function(example, max_length=32):
    """Tokenize the story context as model inputs and the answer as labels.

    ``sentence1`` and ``sentence3`` are encoded together as a text pair and
    ``output`` is encoded as the target text, which the tokenizer returns
    under the ``labels`` key. Inputs and labels are padded/truncated to
    ``max_length`` tokens.

    Args:
        example: A single example or a batch (as passed by ``Dataset.map``)
            providing ``sentence1``, ``sentence3`` and ``output``.
        max_length: Fixed token length for both inputs and labels.

    Returns:
        The tokenizer encoding, with padding positions in ``labels`` set to
        -100 so the loss ignores them.
    """
    # text_target is given by keyword: passed positionally as a third argument
    # it is easy to misread as a third input segment.
    encoded = tokenizer(
        example["sentence1"],
        example["sentence3"],
        text_target=example["output"],
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors="pt",
    )

    # Padding in the labels must not contribute to the loss; -100 is the
    # ignore index that compute_metrics in this notebook also expects.
    labels = encoded["labels"]
    encoded["labels"] = labels.masked_fill(labels == tokenizer.pad_token_id, -100)
    return encoded

In [None]:
# Apply tokenize_function to every split; batched=True hands it batches of examples.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/120140 [00:00<?, ? examples/s]

Map:   0%|          | 0/15017 [00:00<?, ? examples/s]

Map:   0%|          | 0/15018 [00:00<?, ? examples/s]

In [None]:
# Drop the columns that existed before tokenization, keeping only what the tokenizer added.
tokenized_datasets = tokenized_datasets.remove_columns(dataset["train"].column_names)

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # dynamic padding: brings attention_mask, input_ids, etc. to the same length

# 평가 지표 작성 및 모델 파인튜닝

평가 지표를 구현하는 방식에 따라 점수가 달라지는 부분이 있어서, 두 가지 버전(Mecab 기반 `evaluate`, Okt 기반 `compute_metrics`)을 모두 작성해 비교해보고자 한다.

In [None]:
def evaluate(eval_preds):
    """Score predictions against references with ROUGE-1, Google BLEU and BERTScore.

    Args:
        eval_preds: A ``(references, predictions)`` pair. Each item of
            ``references`` is indexed with ``[0]`` to obtain the reference
            sentence; ``predictions`` is a sequence of predicted sentences.

    Returns:
        float: Mean of the ROUGE-1 F1, the Google BLEU score and the average
        BERTScore F1.
    """
    # Local imports: this function is itself named ``evaluate`` and therefore
    # shadows the notebook-level ``import evaluate``, and Mecab is not part of
    # the notebook's import cell.
    import evaluate as hf_evaluate
    from konlpy.tag import Mecab

    references, predictions = eval_preds

    mecab = Mecab()

    # Split into morphemes and re-join with spaces so the word-level metrics
    # operate on morphemes. New lists are built so the caller's sequences are
    # left untouched.
    references = [" ".join(mecab.morphs(sentence_array[0])) for sentence_array in references]
    predictions = [" ".join(mecab.morphs(s)) for s in predictions]

    # Calculate ROUGE-1 score
    rouge = Rouge()
    results = rouge.get_scores(predictions, references, avg=True)
    rouge_1_score = results['rouge-1']['f']

    # Calculate Bleu Score
    google_bleu = hf_evaluate.load("google_bleu")
    results = google_bleu.compute(predictions=predictions, references=references)
    bleu_score = results['google_bleu']

    # Calculate BERTScore
    # NOTE(review): distilbert-base-uncased is an English checkpoint; confirm it
    # is the intended BERTScore backbone for Korean text.
    bertscore = hf_evaluate.load("bertscore")
    results = bertscore.compute(predictions=predictions, references=references, model_type="distilbert-base-uncased")
    bert_score_f1 = sum(results['f1']) / len(predictions)

    # Calculate the mean of the three scores
    mean_score = (rouge_1_score + bleu_score + bert_score_f1) / 3.0

    return mean_score

In [None]:
def compute_metrics(eval_pred):
    """Decode predicted/label token ids and score them for the Trainer.

    Args:
        eval_pred: ``(predictions, labels)`` as supplied by the Trainer.
            ``predictions`` may itself be a tuple whose first item holds the
            predicted token ids, which is what
            ``preprocess_logits_for_metrics`` in this notebook returns.

    Returns:
        dict: ``rouge1``, ``google_bleu`` and ``bertscore_f1`` plus their mean
        under ``mean_score``. The Trainer expects a dict of metric values.
    """
    # Local import: a function named ``evaluate`` defined earlier in the
    # notebook shadows the notebook-level ``import evaluate``.
    import evaluate as hf_evaluate

    predictions, labels = eval_pred
    if isinstance(predictions, (tuple, list)):
        predictions = predictions[0]

    okt = Okt()

    # -100 marks ignored/padded positions and cannot be decoded, so swap it
    # for the pad token in both predictions and labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Split into morphemes and re-join with spaces so the word-level metrics
    # operate on morphemes.
    decoded_preds = [" ".join(okt.morphs(pred.strip())) for pred in decoded_preds]
    decoded_labels = [" ".join(okt.morphs(label.strip())) for label in decoded_labels]

    # Calculate ROUGE-1 score
    rouge = Rouge()
    results = rouge.get_scores(decoded_preds, decoded_labels, avg=True)
    rouge_1_score = results['rouge-1']['f']

    # Calculate Bleu Score
    google_bleu = hf_evaluate.load("google_bleu")
    results = google_bleu.compute(predictions=decoded_preds, references=decoded_labels)
    bleu_score = results['google_bleu']

    # Calculate BERTScore
    # NOTE(review): distilbert-base-uncased is an English checkpoint; confirm it
    # is the intended BERTScore backbone for Korean text.
    bertscore = hf_evaluate.load("bertscore")
    results = bertscore.compute(predictions=decoded_preds, references=decoded_labels, model_type="distilbert-base-uncased")
    bert_score_f1 = sum(results['f1']) / len(decoded_preds)

    # Calculate the mean of the three scores
    mean_score = (rouge_1_score + bleu_score + bert_score_f1) / 3.0

    return {
        "rouge1": rouge_1_score,
        "google_bleu": bleu_score,
        "bertscore_f1": bert_score_f1,
        "mean_score": mean_score,
    }

In [None]:
def preprocess_logits_for_metrics(logits, labels):
    """Reduce raw logits to predicted token ids.

    Args:
        logits: Either the logits tensor itself or a tuple/list of model
            outputs whose first element is the logits tensor.
        labels: Label ids for the batch; passed through unchanged.

    Returns:
        tuple: ``(pred_ids, labels)`` where ``pred_ids`` is the argmax over
        the last dimension of the logits.
    """
    # Only unwrap when a tuple was handed in: indexing a plain tensor with [0]
    # would silently keep just the first example of the batch.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login; the training arguments set push_to_hub=True
# and the trained model is pushed to the Hub later in the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
batch_size = 32
num_train_epochs = 8
# Number of full batches in the train split, i.e. roughly one log entry per epoch.
logging_steps = len(tokenized_datasets["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-koGPT-complete_story",
    evaluation_strategy="epoch",
    # Save on the same schedule as evaluation so the checkpoint with the lowest
    # validation loss can be restored at the end; the recorded run shows the
    # validation loss bottoming out at epoch 3 and rising afterwards.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    # Author's note: search results suggested setting this to True, but the
    # error went away once it was set to False. Possibly True made the trainer
    # return generated results (a list) that then reached compute_metrics.
    predict_with_generate=False,
    logging_steps=logging_steps,
    push_to_hub=True,
)

# NOTE(review): this only sets an ad-hoc attribute on the arguments object and
# nothing visible in this notebook reads it -- confirm whether it has any effect.
training_args.max_seq_length = 32


In [None]:
# The metric function is only handed to the trainer when
# training_args.predict_with_generate is truthy; otherwise evaluation reports
# no custom metrics.
if training_args.predict_with_generate:
    metrics_fn = compute_metrics
else:
    metrics_fn = None

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=metrics_fn,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)


In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,2.2495,2.124238
2,2.046,2.085293
3,1.9224,2.075103
4,1.8067,2.086714
5,1.6903,2.128392
6,1.586,2.165923
7,1.4973,2.205131
8,1.4294,2.228153


TrainOutput(global_step=30040, training_loss=1.7783810695858993, metrics={'train_runtime': 3672.9266, 'train_samples_per_second': 261.677, 'train_steps_per_second': 8.179, 'total_flos': 1.569581236224e+16, 'train_loss': 1.7783810695858993, 'epoch': 8.0})

In [None]:
# Upload the trainer's model to the Hub with the given commit message and tag.
trainer.push_to_hub(commit_message="Training complete", tags="text generation")

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

'https://huggingface.co/Tiabet/kogpt2-base-v2-finetuned-koGPT-complete_story/tree/main/'

In [None]:
predictions, _, _ = trainer.predict(tokenized_datasets['test'])

# preprocess_logits_for_metrics returns (pred_ids, labels), so the trainer hands
# back a tuple here; only the predicted token ids can be decoded.
if isinstance(predictions, (tuple, list)):
    predictions = predictions[0]

# -100 marks padded/ignored positions and is not a valid token id.
predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

TypeError: ignored

In [None]:
from transformers import pipeline

# Hub repository the fine-tuned model was pushed to; the text-generation
# pipeline loads the model from there.
hub_model_id = "Tiabet/kogpt2-base-v2-finetuned-koGPT-complete_story"
generator = pipeline("text-generation", model=hub_model_id)

In [None]:
# Generate a continuation for the first test example's sentence1.
# NOTE(review): the recorded output of this cell is a TypeError; the cause is
# not visible from this notebook and still needs to be investigated.
generator(dataset['test']['sentence1'][0])

TypeError: ignored

In [None]:
# Display sentence1 of the first test example (row lookup first, then the field).
dataset['test'][0]['sentence1']

'서영이는 기차에 타서 자리에 앉았다.'