In [1]:
# !pip install transformers
# !pip install datasets # Hugging Face와 연동

In [2]:
import argparse
import copy
import json
import logging
import os
import warnings
warnings.filterwarnings("ignore")
import logging
logging.basicConfig(level=logging.ERROR)

import numpy as np
import pandas as pd

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch

* Hugging Face Auto Class
  * Auto 키워드: 사전 학습된(pre-trained) 모델을 이용해 원하는 작업을 수행한다.
    * AutoTokenizer: 입력을 토큰(token)으로 바꾸는 기능을 수행한다.
사전 학습된 tokenizer를 불러올 수 있다.
    * AutoModelForSequenceClassification: 문장 분류(classification)을 위한 자동화된 모델을 제공한다.
  * 사전 학습된 BERT 모델을 불러올 수 있다.
    * 예시) bert-base-multilingual-sentiment
  * from_pretrained() 함수를 이용해 특정한 경로에서 모델을 불러올 수 있다.
  * PyTorch 혹은 TensorFlow 상관없이 사용할 수 있다.

In [3]:
import transformers
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

#### <b>기본 모델 클래스 정의</b>

In [4]:
# 기초적인(base) 모델 정의
class BaseModel(torch.nn.Module): 
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.from_pretrained()

    def save_pretrained(self, save_dir):
        self.model.save_pretrained(save_dir)
        for key in ["special_tokens_map_file", "tokenizer_file"]:
            self.tokenizer.init_kwargs.pop(key, None)
        self.tokenizer.save_pretrained(save_dir)

    def from_pretrained(self):
        raise NotImplementedError

    def forward(self, inputs):
        return self.model(**inputs)

    def eval_step(self, outputs):
        raise NotImplementedError

    def get_optimizer(self):
        """Prepare optimizer"""
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.config.weight_decay,
            },
            {
                "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": self.config.weight_decay,
            },
        ]

        optimizer = AdamW(optimizer_grouped_parameters, lr=self.config.learning_rate, eps=self.config.adam_epsilon)
        return optimizer

    def get_scheduler(self, batch_num, optimizer):
        """Prepare scheduler"""
        if self.config.warmup_proportion == 0.0:
            return None

        t_total = batch_num // self.config.gradient_accumulation_steps * self.config.num_train_epochs

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(t_total * self.config.warmup_proportion),
            num_training_steps=t_total,
        )

        return scheduler

    def tensor_to_array(self, tensor):
        return tensor.detach().cpu().numpy()

    def tensor_to_list(self, tensor):
        return self.tensor_to_array(tensor).tolist()

In [5]:
class ClsModel(BaseModel):
    def __init__(self, config):
        super().__init__(config)
        config.label2id = self.config.label2id

    # 사전 학습된 모델 가중치 불러오기
    def from_pretrained(self):
        data_file = os.path.join(self.config.data_dir, str(self.config.train_file))
        self.config.label2id = process_map[self.config.dataset](self.config, data_file, True, get_label_map=True)
        num_labels = len(self.config.label2id)
        if num_labels != self.config.num_labels:
            print(
                f"given args num_labels({self.config.num_labels}) is not same with num_labels({num_labels}) from dataset."
            )
            print(f"switch num_labels {self.config.num_labels} -> {num_labels}")
            self.config.num_labels = num_labels
        model_config = AutoConfig.from_pretrained(self.config.model_name_or_path, num_labels=self.config.num_labels)
        model_config.label2id = self.config.label2id
        model_config.id2label = {int(v): k for k, v in model_config.label2id.items()}
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.config.model_name_or_path, config=model_config, cache_dir=self.config.cache_dir
        )
        # BERT model과 별개로, 입력 문자열을 토큰의 index로 바꾸어주는 tokenizer가 사용됨
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name_or_path, cache_dir=self.config.cache_dir)

    def forward(self, inputs):
        outputs = self.model(**inputs)
        return outputs

    def eval_step(self, inputs, outputs):
        logits = outputs.logits.detach().cpu()
        predictions = self.tensor_to_list(torch.argmax(logits, dim=-1))
        labels = self.tensor_to_list(inputs["labels"])
        results = [{"prediction": prediction, "label": label} for prediction, label in zip(predictions, labels)]
        return results

In [6]:
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

def cal_running_avg_loss(loss, running_avg_loss, decay=0.99):
    if running_avg_loss == 0:
        return loss
    running_avg_loss = running_avg_loss * decay + (1 - decay) * loss
    return running_avg_loss

#### <b> 데이터 세트 다운로드</b>

In [7]:
df = pd.read_csv("10000_labeled.csv")
df = df.fillna(0)  # nan값 0으로 채우기
df = df[df["선호도"]!= -1] # 선호도 -1 인 문장 학습에서 배제
new_df = df[["comment", "선호도"]]  # 학습에 사용할 column 선택
new_df

Unnamed: 0,comment,선호도
13,평생에 한 번 뿐인 결혼. 한녀랑 할거야? 제정신임?,0.0
14,여기 조산명 이러는새끼들 싹 다 독거틀딱들임,0.0
15,헬조선은 멸망해야만 한다,0.0
16,좌빨 혐일은 돈이 된다,0.0
17,짱개가 괜히 한국와서 노가다 뛰는게 아니지,0.0
...,...,...
10032,윤석열 이혼하고 정치했으면 좋겠음. 영부인이 너무 걸림돌임 내가봐도 재산 증식과정 ...,2.0
10033,나찌빠는 군첩 나찌다운 글 ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ,2.0
10034,쟤 재테크 하는 족족 성공하는거 보면 저런 드립 자체도 대중이 수용하는가 반발이 올...,3.0
10035,외모 둘다 괜찮다고 썻는데 여자 외모보고 사겼겠지 그러니까 저런 태도지 내가 너 만...,2.0


In [8]:
new_df.to_csv("new.csv")  # 새로운 df로 저장

In [9]:
max_length = new_df['comment'].str.len().max()
max_length

400

#### <b>전처리 라이브러리</b>

In [10]:
# def sample_writer(data, config, tokenizer, is_train):
def sample_writer(data):
    feature = tokenizer(
        data["text"],
        max_length=config.max_seq_length,
        padding="max_length",
        truncation=True,
        add_special_tokens=True,
    )
    write_data = {
        "input_ids": feature["input_ids"],
        "attention_mask": feature["attention_mask"],
        "labels": data["label"],
    }
    return write_data


def make_label_map(labels):
    unique_labels = sorted(list(set(labels)))
    label2id = dict()
    for i, label in enumerate(unique_labels):
        label2id[label] = i
    return label2id


def postprocess():
    def decorator(fn):
        def wrapped(config, data_file, is_train, **kwargs):
            # 인자 값으로 get_label_map이 넘어온 경우
            get_label_map = kwargs.get("get_label_map", False)
            texts, labels = fn(config, data_file, is_train)

            try:
                label2id = config.label2id
            except Exception:
                label2id = label2id = make_label_map(labels)

            labels = [label2id[label] for label in labels]

            if get_label_map:
                return label2id

            # 문자열을 수로 바꿔준 뒤에 그 파일을 "학습 파일" 목적으로 저장하는 코드
            data = [{"text": text, "label": label} for text, label in zip(texts, labels)]
            pd.DataFrame(data).to_csv(
                "{}_{}_{}.csv".format(data_file, config.dataset, "train" if is_train else "valid"),
                index=False,
                encoding="utf-8-sig",
            )
            if is_train:
                pd.DataFrame(list(label2id.items()), columns=["label", "id"]).to_csv(
                    "{}_{}_label2id.csv".format(data_file, config.dataset), index=False, encoding="utf-8-sig"
                )

            return data

        return wrapped

    return decorator


def train_split(config, texts, labels, is_train):
    x_train, y_train, x_label, y_label = train_test_split(
        texts, labels, test_size=0.2, random_state=config.seed, stratify=labels
    )
    if is_train:
        texts, labels = x_train, x_label
    else:
        texts, labels = y_train, y_label
    return texts, labels


@postprocess()
def process_comment_cls(config, data_file, is_train):
    df = pd.read_csv(data_file)
    try:
        labels = df["선호도"].astype(str).values.tolist()
    except Exception:
        labels = df["선호도"].astype(str).values.tolist()
    texts = df["comment"].astype(str).values.tolist()
    texts, labels = train_split(config, texts, labels, is_train)
    return texts, labels

process_map = {
    "comment": process_comment_cls
}

def collate_fn(features):
    input_ids = [sample["input_ids"] for sample in features]
    attention_mask = [sample["attention_mask"] for sample in features]
    labels = [sample["labels"] for sample in features]

    input_ids = torch.tensor(np.array(input_ids).astype(np.int64), dtype=torch.long)
    attention_mask = torch.tensor(np.array(attention_mask).astype(np.int8), dtype=torch.long)
    labels = torch.tensor(np.array(labels).astype(np.int64), dtype=torch.long)
    inputs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }
    return inputs

#### <b>실질적인 데이터 로더 파트</b>

In [11]:
import multiprocessing

def init_sample_writer(_config, _tokenizer, _is_train, _writer):
    global config
    global tokenizer
    global is_train
    global writer
    config = _config
    tokenizer = _tokenizer
    is_train = _is_train
    writer = _writer

In [12]:
def write_samples(config, tokenizer, is_train, processor, writer_file, data, workers=4):
    write_cnt = 0
    with multiprocessing.Pool(
        processes=workers,
        initializer=init_sample_writer,
        initargs=(config, tokenizer, is_train, sample_writer),
    ) as pool:
        for write_data in tqdm(
            pool.imap(sample_writer, data),
            total=len(data),
            dynamic_ncols=True,
            desc="writing samples...",
            position=0,
            leave=True
        ):
            if isinstance(write_data, list):
                for datum in write_data:
                    writer_file.write(json.dumps(datum) + "\n")
                write_cnt += len(write_data)
            else:
                writer_file.write(json.dumps(write_data) + "\n")
                write_cnt += 1
    return write_cnt

In [13]:
class IterableDatasetPad(torch.utils.data.IterableDataset):
    def __init__(
        self,
        dataset: torch.utils.data.IterableDataset,
        batch_size: int = 1,
        num_devices: int = 1,
        seed: int = 0,
    ):
        self.dataset = dataset
        self.batch_size = batch_size
        self.seed = seed
        self.num_examples = 0

        chunk_size = self.batch_size * num_devices
        length = len(dataset)
        self.length = length + (chunk_size - length % chunk_size)

    def __len__(self):
        return self.length

    def __iter__(self):
        self.num_examples = 0
        if (
            not hasattr(self.dataset, "set_epoch")
            and hasattr(self.dataset, "generator")
            and isinstance(self.dataset.generator, torch.Generator)
        ):
            self.dataset.generator.manual_seed(self.seed + self.epoch)

        first_batch = None
        current_batch = []
        for element in self.dataset:
            self.num_examples += 1
            current_batch.append(element)
            # Wait to have a full batch before yielding elements.
            if len(current_batch) == self.batch_size:
                for batch in current_batch:
                    yield batch
                    if first_batch is None:
                        first_batch = batch.copy()
                current_batch = []

        while self.num_examples < self.length:
            add_num = self.batch_size - len(current_batch)
            self.num_examples += add_num
            current_batch += [first_batch] * add_num
            for batch in current_batch:
                yield batch
            current_batch = []

#### <b>실제로 처리를 담당하는 함수</b>

In [14]:
import torch.utils.data as torch_data

def get_data(config, tokenizer, is_train=True, overwrite=False):
    if is_train:
        data_file = config.train_file
    else:
        data_file = config.predict_file

    data_path = config.data_dir
    if data_file is not None:
        data_path = os.path.join(data_path, data_file)
    else:
        data_path += "/"

    data_processor = process_comment_cls # 추적
    if data_processor is None:
        raise Exception(f"Invalid data task {config.task}!")

    processor = process_comment_cls
    if processor is None:
        raise Exception(f"Invalid task dataset {config.dataset}!")

    comps = [
        data_path,
        config.dataset,
        config.model_name_or_path.replace("/", "_"),
        config.max_seq_length,
        "train" if is_train else "dev",
        "dataset.txt",
    ]
    dataset_file = "_".join([str(comp) for comp in comps])
    print("dataset_file:", dataset_file)

    if not os.path.exists(dataset_file) or overwrite:
        with open(dataset_file, "w", encoding="utf-8") as writer_file:
            if data_file is None or not os.path.isdir(data_path):
                data = processor(config, data_path, is_train)
                cnt = write_samples(
                    config, tokenizer, is_train, data_processor, writer_file, data, workers=config.threads
                )
            else:

                cnt = 0
                for filename in sorted([f for f in os.listdir(data_path) if f.endswith(".json")]):
                    data = processor(config, os.path.join(data_path, filename), is_train)
                    cnt += write_samples(
                        config, tokenizer, is_train, data_processor, writer_file, data, workers=config.threads
                    )
            print(f"{cnt} features processed from {data_path}")

    dataset = load_dataset("text", data_files=dataset_file, download_mode="force_redownload")["train"]
    dataset = dataset.map(lambda x: json.loads(x["text"]), batched=False)

    if not is_train:
        dataset = IterableDatasetPad(
            dataset=dataset,
            batch_size=config.train_batch_size if is_train else config.eval_batch_size,
            num_devices=config.world_size,
            seed=config.seed,
        )

    dataloader = torch_data.DataLoader(
        dataset,
        sampler=torch_data.RandomSampler(dataset) if is_train else None,
        drop_last=False,
        batch_size=config.train_batch_size if is_train else config.eval_batch_size,
        collate_fn=(collate_fn),
    )

    return dataloader

#### <b>모델 학습 및 평가 라이브러리</b>

In [15]:
from functools import partial
import sklearn.metrics as sklearn_metrics

binary_metrics = {
    "accuracy": sklearn_metrics.accuracy_score,
    "precision": sklearn_metrics.precision_score, # TP / (TP + FP)
    "recall": sklearn_metrics.recall_score, # recall = sensitivity (민감도)
    "f1": sklearn_metrics.f1_score,
    "matthews_corrcoef": sklearn_metrics.matthews_corrcoef,
    "roc_auc": sklearn_metrics.roc_auc_score,
}

metrics = {
    "accuracy": sklearn_metrics.accuracy_score,
    "f1-macro": partial(sklearn_metrics.f1_score, average="macro"),
}


def eval_cls(results, **kwargs):
    predictions = np.array([result["prediction"] for result in results])
    labels = np.array([result["label"] for result in results])
    is_binary = len(set(labels.tolist())) < 3
    results = {
        metric: round(f(labels, predictions) * 100, 2)
        for metric, f in (binary_metrics.items() if is_binary else metrics.items())
    }
    return {
        "results": results,
        "best_score": results["f1" if is_binary else "f1-macro"],
    }

In [16]:
# _run_epoch() 함수는, 우리가 흔히 알고있는 학습 함수와 동일
def _run_epoch(model, loader, device=None, context=None, **kwargs):
    config = kwargs["config"]
    is_train = kwargs["is_train"]
    
    avg_loss = 0
    results = []
    batch_num = len(loader)

    if is_train: 
        model.train() 
        if config.use_tpu:
            optimizer = context.getattr_or(
                "optimizer",
                lambda: model.get_optimizer(),
            )
            scheduler = context.getattr_or(
                "scheduler",
                lambda: model.get_scheduler(batch_num, optimizer),
            )
        else:
            optimizer = kwargs["optimizer"]
            scheduler = kwargs["scheduler"]
    else:
        model.eval()

    is_master = True
    
    pbar = tqdm(enumerate(loader), total=batch_num, disable=not is_master, dynamic_ncols=True, position=0, leave=True)

    for i, inputs in pbar: 
        if not config.use_tpu:
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to(device)

        outputs = model(inputs)
        loss = outputs.loss.mean()

        # AutoClass는 loss가 내부적으로 사용됨 => cross entropy 쓰임.
        avg_loss = cal_running_avg_loss(loss.item(), avg_loss)
        loss /= config.gradient_accumulation_steps

        if is_train:
            loss.backward()
            if i % config.gradient_accumulation_steps == 0 or i == batch_num - 1:
                if config.max_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)

                optimizer.step()
                optimizer.zero_grad()

                if scheduler is not None:
                    scheduler.step()
        else:
            result = (model.module if hasattr(model, "module") else model).eval_step(inputs, outputs)
            results.extend(result)

        if is_master:
            pbar.set_description(
                f"epoch: {kwargs['epoch'] + 1}, {('train' if is_train else 'valid')} loss: {min(100, round(avg_loss, 4))}"
            )

    return {
        "loss": avg_loss,
        "result": results,
    }

# 학습 코드에서 호출하는 함수
def run_epoch(**kwargs):
    model = kwargs.pop("model")
    if kwargs["config"].use_tpu:
        results = model(_run_epoch, **kwargs)
    else: 
        results = _run_epoch(model, **kwargs)

    if isinstance(results, list): 
        loss = sum([result["loss"] for result in results]) / len(results)
        result = []
        for res in results:
            result.extend(res["result"])
        results = {"loss": loss, "result": result}

    return results

#### <b>하이퍼 파라미터 설정</b>

In [17]:
from types import SimpleNamespace

config = SimpleNamespace()

config.task = "cls"
config.dataset = "comment"

config.cache_dir = "cache"
config.output_dir = "output"

config.use_tpu = False
config.model_name_or_path = "monologg/kobigbird-bert-base" # Model name or path
config.data_dir = "./" # The input data dir

config.train_file = "new.csv"
config.predict_file = "new.csv"

config.max_seq_length = 1024 # The maximum total input sequence length after tokenization.
config.train_batch_size = 4 # Batch size for training.
config.eval_batch_size = 2 # Batch size for evaluation.

config.learning_rate = 3e-5 # The initial learning rate for Adam.
config.num_train_epochs = 10 # Total number of training epochs to perform.

config.num_labels = 5
config.gradient_accumulation_steps = 2 # Number of updates steps to accumulate before performing a backward/update pass.

config.threads = 4
config.seed = 42 # random seed for initialization

config.do_train = True # Whether to run training.
config.do_eval_during_train = True
config.do_eval = True # Whether to run prediction.

config.do_lower_case = False
config.weight_decay = 0.0 # Weight decay if we apply some.
config.adam_epsilon = 1e-8 # Epsilon for Adam optimizer.
config.max_grad_norm = 1.0 # Max gradient norm.
config.warmup_proportion = 0.0 # Warmup proportion for linear warmup
#config.attention_type = "original_full"

In [18]:
if not os.path.exists(config.cache_dir):
    os.makedirs(config.cache_dir)

output_dir = os.path.join(config.output_dir, config.task, config.dataset)
print("Output directory:", output_dir)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

Output directory: output/cls/comment


#### 딥러닝 모델 초기화하기

In [19]:
# 현재 모델 이름이 "monologg/kobigbird-bert-base" 이므로, Hugging Face에서 찾아서 불러옴
set_seed(config.seed)

# 딥러닝 모델 초기화
model = ClsModel(config)

print(f"configuration: {str(config)}")

if torch.cuda.is_available(): # GPU를 사용할 수 있다면
    gpu_count = torch.cuda.device_count()
    print(f"{gpu_count} GPU device detected")
    devices = ["cuda:{}".format(i) for i in range(gpu_count)]
    model_dp = torch.nn.DataParallel(model, device_ids=devices)
    model.to(devices[0])
else: # GPU를 사용할 수 없다면 CPU로 구동
    devices = ["cpu"]
    model_dp = model

Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


configuration: namespace(task='cls', dataset='comment', cache_dir='cache', output_dir='output', use_tpu=False, model_name_or_path='monologg/kobigbird-bert-base', data_dir='./', train_file='new.csv', predict_file='new.csv', max_seq_length=1024, train_batch_size=4, eval_batch_size=2, learning_rate=3e-05, num_train_epochs=10, num_labels=5, gradient_accumulation_steps=2, threads=4, seed=42, do_train=True, do_eval_during_train=True, do_eval=True, do_lower_case=False, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, warmup_proportion=0.0, label2id={'0.0': 0, '1.0': 1, '2.0': 2, '3.0': 3, '4.0': 4})
4 GPU device detected


#### 데이터 및 학습 세팅 초기화하기

* 아까 전에 "실제로 학습용 데이터를 만드는" get_data() 함수를 정의했었음.

In [20]:
config.world_size = len(devices)
if config.do_train:
    train_loader = get_data(config, tokenizer=model.tokenizer, overwrite=True) # 이 코드를 처음 실행하는 경우
    # train_loader = get_data(config, tokenizer=model.tokenizer, overwrite=False) # 한 번 데이터 처리를 한 경우
valid_loader = get_data(config, tokenizer=model.tokenizer, is_train=False)

optimizer = None
scheduler = None
if config.do_train: # 학습 모드(train mode)인 경우
    optimizer = model.get_optimizer()
    scheduler = model.get_scheduler(len(train_loader), optimizer)

params = {
    "config": config,
    "model": model_dp,
    "optimizer": optimizer,
    "scheduler": scheduler,
}
if not config.use_tpu:
    params["device"] = devices[0]

dataset_file: ./new.csv_comment_monologg_kobigbird-bert-base_1024_train_dataset.txt


writing samples...: 100%|██████████| 8012/8012 [00:02<00:00, 2683.80it/s]


8012 features processed from ./new.csv


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/8012 [00:00<?, ? examples/s]

dataset_file: ./new.csv_comment_monologg_kobigbird-bert-base_1024_dev_dataset.txt


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/2003 [00:00<?, ? examples/s]

Map:   0%|          | 0/2003 [00:00<?, ? examples/s]

#### 모델 학습하기

In [None]:
def do_eval(epoch):
    with torch.no_grad():
        results = run_epoch(loader=valid_loader, epoch=epoch, is_train=False, **params)["result"]
        results = eval_cls(
            config=config,
            model=model,
            loader=valid_loader,
            tokenizer=model.tokenizer,
            results=results,
        )

    print("Eval results.")
    for k, v in results["results"].items():
        print(f"{k} : {v}")

    return results["best_score"]

train_losses = []  
val_accuracies = []  
if config.do_train: 
    best_score = 0
    for epoch in range(config.num_train_epochs):
        # run_epoch으로 학습 진행 => train_loader를 이용해 학습
        train_results = run_epoch(loader=train_loader, epoch=epoch, is_train=True, **params)
        train_loss = train_results['loss']  
        train_losses.append(train_loss)  

        score = 0
        if config.do_eval_during_train:
            score = do_eval(epoch)
            val_accuracies.append(score)
        
        if score >= best_score:
            best_score = score
            output_dir = os.path.join(config.output_dir, config.task, config.dataset, f"{epoch}-{best_score}-ckpt")
            copy.deepcopy(
                model_dp.module
                if hasattr(model_dp, "module")
                else model_dp._models[0]
                if hasattr(model_dp, "_models")
                else model_dp
            ).cpu().save_pretrained(output_dir)
            with open(os.path.join(output_dir, "finetune_config.json"), "w") as save_config:
                json.dump(vars(config), save_config, sort_keys=True, indent=4)
            print(f"Checkpoint {output_dir} saved.")

epoch: 1, train loss: 0.7639:  84%|████████▍ | 1692/2003 [30:56<05:26,  1.05s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

epoch: 1, train loss: 0.7246: 100%|██████████| 2003/2003 [36:34<00:00,  1.10s/it]
epoch: 1, valid loss: 0.6603: 100%|██████████| 1004/1004 [02:33<00:00,  6.56it/s]


Eval results.
accuracy : 72.41
f1-macro : 36.17
Checkpoint output/cls/comment/0-36.17-ckpt saved.


epoch: 2, train loss: 0.629: 100%|██████████| 2003/2003 [36:43<00:00,  1.10s/it] 
epoch: 2, valid loss: 0.6963: 100%|██████████| 1004/1004 [02:33<00:00,  6.55it/s]


Eval results.
accuracy : 72.61
f1-macro : 45.86
Checkpoint output/cls/comment/1-45.86-ckpt saved.


epoch: 3, train loss: 0.4938: 100%|██████████| 2003/2003 [36:48<00:00,  1.10s/it]
epoch: 3, valid loss: 0.8698: 100%|██████████| 1004/1004 [02:32<00:00,  6.60it/s]


Eval results.
accuracy : 72.01
f1-macro : 45.89
Checkpoint output/cls/comment/2-45.89-ckpt saved.


epoch: 4, train loss: 0.397: 100%|██████████| 2003/2003 [36:49<00:00,  1.10s/it] 
epoch: 4, valid loss: 1.2086: 100%|██████████| 1004/1004 [02:29<00:00,  6.70it/s]


Eval results.
accuracy : 73.66
f1-macro : 47.81
Checkpoint output/cls/comment/3-47.81-ckpt saved.


epoch: 5, train loss: 0.3408:  98%|█████████▊| 1970/2003 [35:56<00:33,  1.02s/it]

In [None]:
import matplotlib.pyplot as plt

# 그래프 출력
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(range(1, config.num_train_epochs + 1), train_losses, label="Train Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Train Loss Over Epochs")
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(range(1, config.num_train_epochs + 1), val_accuracies, label="Validation Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Validation Accuracy Over Epochs")
plt.legend()

plt.tight_layout()
plt.show()

#### Test Dataset Inference

In [None]:
def inference(model, tokenizer, sentences, device):
    inference_results = []

    model.eval()
    with torch.no_grad():
        for sentence in sentences:
            inputs = tokenizer.encode_plus(
                sentence, return_tensors="pt", padding="max_length", truncation=True, max_length=config.max_seq_length
            )
            inputs = {key: value.to(device) for key, value in inputs.items()}  # 입력을 GPU로 이동
            outputs = model(inputs)  # 모델 추론 수행
            predictions = torch.argmax(outputs.logits, dim=-1)  # 예측된 클래스 인덱스 추출
            inference_results.append(predictions.item())  # 결과 리스트에 추가
    
    return inference_results

In [None]:
test_df = pd.read_csv("test_comment.csv")
sentences_to_infer = test_df["comment"].tolist()

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inference_results = inference(model, model.tokenizer, sentences_to_infer, device)
test_df["선호도"] = inference_results

In [None]:
output_csv_path = "inference_results2.csv"
test_df.to_csv(output_csv_path, index=False)

In [None]:
test_df

In [157]:
from transformers import AutoModelForSequenceClassification

best_model = AutoModelForSequenceClassification.from_pretrained(output_dir)
best_model.to(device)
best_model.eval()

BigBirdForSequenceClassification(
  (bert): BigBirdModel(
    (embeddings): BigBirdEmbeddings(
      (word_embeddings): Embedding(32500, 768, padding_idx=0)
      (position_embeddings): Embedding(4096, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BigBirdEncoder(
      (layer): ModuleList(
        (0-11): 12 x BigBirdLayer(
          (attention): BigBirdAttention(
            (self): BigBirdBlockSparseAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): BigBirdSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [160]:
with torch.no_grad():
    val_results = run_epoch(loader=valid_loader, epoch=-1, is_train=False, **params)["result"]
    val_results = eval_cls(
        config=config,
        model=best_model,
        loader=valid_loader,
        tokenizer=model.tokenizer,
        results=val_results,
    )

epoch: 0, valid loss: 1.9356: 100%|█████████| 1004/1004 [01:31<00:00, 11.00it/s]


In [164]:
val_results

{'results': {'accuracy': 74.25, 'f1-macro': 52.72}, 'best_score': 52.72}