In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn


# Prefer the first CUDA GPU when PyTorch reports one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [2]:
# Load the cleaned hate-speech CSV into a DataFrame and preview the first rows.

df = pd.read_csv(filepath_or_buffer="./data/혐오표현_정제.csv")
df.head(5)

Unnamed: 0,content,hate,gender_hate
0,걍 이 병신나라 민도가 좆박은건데 어쩔수가 잇나 걍 망해야지,1,0
1,그 앰생 쓰레기 병신들은 짐승이고. 니는 짐승을 사람 취급해주냐.,1,0
2,제발 너그집으로 꺼져 애미뒤진새끼야,1,0
3,몸만지면 대주는 걸레년임,1,1
4,환경오염도 이쯤되면 심각하고 자식 낳아봐야 지들 애미애비(그러므로 너 나 우리)가 ...,1,0


In [3]:
# Optional: uncomment to compute per-class weights for the "gender_hate" label.
# Each weight is n_samples / (n_classes * count_of_that_class), converted to a
# float tensor so it can be passed as `weight=` to a class-weighted loss.

# weight = df.shape[0] / (len(df["gender_hate"].unique()) * np.bincount(df["gender_hate"]))
# weight = torch.from_numpy(weight).float()
# weight

tensor([0.5651, 4.3406])

In [4]:
# 모델 불러오기

from transformers import ElectraForSequenceClassification, ElectraTokenizer
from tokenization_kocharelectra import KoCharElectraTokenizer


# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint, then place the model on the selected device.
tokenizer = KoCharElectraTokenizer.from_pretrained("monologg/kocharelectra-small-discriminator")
electramodel = ElectraForSequenceClassification.from_pretrained(
    "monologg/kocharelectra-small-discriminator"
).to(device)

Some weights of the model checkpoint at monologg/kocharelectra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/kocharelectra-small-discriminator and are newly initializ

In [5]:
# 데이터프레임을 TensorDataset으로 만들기

from torch.utils.data import TensorDataset

from data_preprocess import df_to_feature_and_label


# Base learning rate handed to Adam below.
lr = 0.001

# Convert the DataFrame into tensors (sequences limited to max_length=256) and
# wrap them in a single TensorDataset.
all_data = TensorDataset(*df_to_feature_and_label(df, tokenizer, max_length=256))

# criterion = torch.nn.CrossEntropyLoss(weight=weight.to(device))  # cross-entropy loss with per-class weights applied
criterion = nn.CrossEntropyLoss()

# Only the classifier head's parameters are given to the optimizer.
optimizer = torch.optim.Adam(params=electramodel.classifier.parameters(), lr=lr)

# Multiply the learning rate by 0.5 after every 10 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.5)

In [12]:
# Label callback handed to ImbalancedDatasetSampler.
# Slicing the whole dataset yields its fields; the field at index 3 is the label.

def get_labels(dataset):
    """Return the label field (index 3) of ``dataset`` sliced in full."""
    every_sample = dataset[slice(None)]
    return every_sample[3]

In [8]:
from tqdm.notebook import tqdm, trange


def train(dataloader, model, criterion, optimizer, scheduler=None):
    """Run one training pass over ``dataloader``, updating ``model`` in place.

    Args:
        dataloader: Yields ``(ids, mask, tti, label)`` batches; must support
            ``len()`` and expose ``.dataset``.
        model: Called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``; its output must expose ``.logits``.
        criterion: Loss callable taking ``(logits, labels)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Optional learning-rate scheduler, stepped once per pass
            (after the last batch).

    Batches are moved to the module-level ``device`` before the forward pass.
    The loss is printed every 20 batches.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)

    # Evaluation code may have left the model in eval mode (dropout disabled);
    # always switch back to training mode before updating weights.
    model.train()

    with tqdm(total=num_batches, desc="train") as progressbar:
        for batch, (ids, mask, tti, label) in enumerate(dataloader):
            ids, mask, tti, label = ids.to(device), mask.to(device), tti.to(device), label.to(device)

            pred = model(input_ids=ids, attention_mask=mask, token_type_ids=tti)
            logits = pred.logits
            # Flatten to (N, num_classes) using the model's own class count
            # instead of a hard-coded 2.
            loss = criterion(logits.view(-1, logits.size(-1)), label.view(-1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch % 20 == 0:
                loss_, current = loss.item(), batch * len(ids)
                print(f"loss: {loss_:>7f} [{current:>5d}/{size:>5d}]")

            progressbar.update(1)

    # Step the scheduler once per pass, not once per batch: epoch-based
    # schedulers such as StepLR would otherwise shrink the learning rate to
    # almost zero within the first few hundred batches.
    if scheduler is not None:
        scheduler.step()

In [9]:
def test(dataloader, model, criterion):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and mean loss.

    Args:
        dataloader: Yields ``(ids, mask, tti, label)`` batches; must support
            ``len()`` and expose ``.dataset``.
        model: Called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``; its output must expose ``.logits``.
        criterion: Loss callable taking ``(logits, labels)``.

    Batches are moved to the module-level ``device``. The model is switched to
    eval mode for the duration of the call and its previous train/eval mode is
    restored afterwards.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)

    # Remember the caller's mode so evaluation does not silently leave the
    # model in eval mode for subsequent training.
    was_training = model.training
    model.eval()
    test_loss, correct = 0.0, 0

    try:
        with torch.no_grad(), tqdm(total=num_batches, desc="test") as progressbar:
            for ids, mask, tti, label in dataloader:
                ids, mask, tti, label = ids.to(device), mask.to(device), tti.to(device), label.to(device)

                pred = model(input_ids=ids, attention_mask=mask, token_type_ids=tti)
                # Flatten logits and labels the same way for both the loss and
                # the accuracy so the two metrics see identical shapes.
                logits = pred.logits.view(-1, pred.logits.size(-1))
                flat_label = label.view(-1)
                loss = criterion(logits, flat_label)

                # Accumulate plain Python numbers rather than device tensors.
                test_loss += loss.item()
                correct += (logits.argmax(1) == flat_label).sum().item()

                progressbar.update(1)
    finally:
        model.train(was_training)

    # Guard against an empty loader/dataset to avoid ZeroDivisionError.
    test_loss /= max(num_batches, 1)
    correct /= max(size, 1)
    print(f"Test Error:\n Accuracy: {100 * correct:>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [13]:
from datetime import datetime

from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Subset, DataLoader

from torchsampler import ImbalancedDatasetSampler


def training(dataset, model, criterion, optimizer, epochs, scheduler=None, cv=5, batch_size=64):
    """Train ``model`` for ``epochs`` epochs over stratified folds, then save it.

    In every epoch, each of the ``cv`` stratified folds is used in turn: the
    model is trained on the fold's training split and evaluated on its
    validation split. After the last epoch the model's ``state_dict`` is saved
    to ``model<YYMMDD>.pth`` in the current working directory.

    Args:
        dataset: Sliceable dataset whose full slice has features at index 0
            and labels at index 3.
        model: Model passed through to ``train`` and ``test``.
        criterion: Loss callable passed through to ``train`` and ``test``.
        optimizer: Optimizer passed through to ``train``.
        epochs: Number of passes over all folds.
        scheduler: Optional learning-rate scheduler passed through to ``train``.
        cv: Number of stratified folds.
        batch_size: Batch size for both loaders.
    """
    # The split uses a fixed random_state, so the folds are the same in every
    # epoch; materialise the dataset and compute the folds once up front.
    all_tensors = dataset[:]
    kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    folds = list(kfold.split(all_tensors[0], all_tensors[3]))

    for epoch in trange(1, epochs + 1, desc="Epoch"):
        print(f"Epoch {epoch}\n------------------------------------------")

        # NOTE(review): the same model instance is trained across all folds, so
        # each fold's validation samples are training samples in the other
        # folds; the printed scores are progress indicators, not independent
        # cross-validation estimates.
        for train_idx, val_idx in folds:
            train_data = Subset(dataset, train_idx)
            val_data = Subset(dataset, val_idx)

            train_loader = DataLoader(
                train_data,
                batch_size=batch_size,
                sampler=ImbalancedDatasetSampler(train_data, callback_get_label=get_labels))
            # NOTE(review): validation also goes through the resampling
            # sampler, so the reported metrics are computed on resampled data,
            # not on the original class distribution; confirm this is intended.
            val_loader = DataLoader(
                val_data,
                batch_size=batch_size,
                sampler=ImbalancedDatasetSampler(val_data, callback_get_label=get_labels))

            train(train_loader, model, criterion, optimizer, scheduler)
            test(val_loader, model, criterion)

    # Date-stamped file name, e.g. model240131.pth.
    today = datetime.now().strftime("%y%m%d")
    file_name = f"model{today}.pth"

    print("완료")
    torch.save(model.state_dict(), file_name)
    print("모델 저장:", file_name)

In [None]:
# Kick off the full run: 200 epochs with the default fold count and batch size.
training(
    dataset=all_data,
    model=electramodel,
    criterion=criterion,
    optimizer=optimizer,
    epochs=200,
    scheduler=scheduler,
)