In [None]:
!pip install transformers
!pip install datasets

In [None]:
!pip install --upgrade datasets

In [None]:
import torch
import datetime
import pandas as pd
import numpy as np
import torch.nn as nn

from tqdm import tqdm
from datasets import load_dataset
from torch.optim import AdamW
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, BertConfig
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split

# Load Data

In [None]:
# Load the K-MHaS Korean hate-speech dataset by its hub identifier.
dataset = load_dataset("jeanlee/kmhas_korean_hate_speech")
# Bare expression: shows the loaded object as the cell output.
dataset

In [None]:
# Bind each of the three splits of `dataset` to its own variable.
train, validation, test = (dataset[split_name] for split_name in ('train', 'validation', 'test'))

In [None]:
# Peek at the first record of every split.
for split in (train, validation, test):
  print(split[0])

In [None]:
def _wrap_with_markers(texts):
  """Return every text as a string framed by the literal '[CLS] ' and ' [SEP]' markers."""
  return ['[CLS] ' + str(text) + ' [SEP]' for text in texts]

train_sentences = _wrap_with_markers(train['text'])
valid_sentences = _wrap_with_markers(validation['text'])
test_sentences = _wrap_with_markers(test['text'])

In [None]:
# Sanity check: first training sentence after the '[CLS]' / '[SEP]' markers were added.
print(train_sentences[0])

In [None]:
# Multi-hot encoding: write 1 at the position of every true label and 0 everywhere else.

enc = MultiLabelBinarizer()

def multi_label(examples, num_classes = 9):
  """Encode the label-id lists of a split as fixed-width multi-hot vectors.

  The column layout is fixed up front (column k <-> class id k) instead of
  being re-learned from each split. Fitting an encoder separately per split
  makes the number and order of columns depend on which classes happen to
  occur in that split, which can silently misalign train/validation/test
  targets.

  Args:
    examples: mapping-like object whose 'label' entry is a sequence of
      label-id sequences, one per example.
    num_classes: total number of classes, i.e. the width of every vector.

  Returns:
    A list with one list of `num_classes` floats per example, holding 1.0 at
    the positions of that example's labels and 0.0 elsewhere.

  Raises:
    ValueError: if a label id lies outside `[0, num_classes)`.
  """
  label_lists = examples['label']
  multi_hot = np.zeros((len(label_lists), num_classes), dtype = float)

  for row, label_ids in enumerate(label_lists):
    for label_id in label_ids:
      # Explicit range check: a negative id would otherwise wrap around silently.
      if not 0 <= label_id < num_classes:
        raise ValueError(f'label id {label_id} is outside [0, {num_classes})')
      multi_hot[row, label_id] = 1.0

  return multi_hot.tolist()

In [None]:
# Encode the label column of every split as multi-hot vectors.
train_labels, valid_labels, test_labels = [multi_label(split) for split in (train, validation, test)]

In [None]:
# Display the first five marker-wrapped training sentences.
train_sentences[:5]

In [None]:
# Display the encoded labels of the first five training examples.
train_labels[:5]

In [None]:
# Tokenizer loaded from the 'klue/bert-base' checkpoint; the classifier created
# later in this notebook is loaded from the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained('klue/bert-base')

In [None]:
max_len = 128

def data_to_tensor(sentences, labels, max_len):
  """Turn marker-wrapped sentences and their labels into fixed-length tensors.

  Every sentence is tokenized with the notebook-level `tokenizer`, mapped to
  vocabulary ids, and right-padded or right-truncated to exactly `max_len`
  ids. When a sequence has to be truncated and originally ended with the
  separator token, the separator is kept as the last id instead of being cut
  off. Padding uses the tokenizer's own pad id (0 if it has none), and the
  attention mask is derived from the real sequence length.

  Args:
    sentences: iterable of strings to encode.
    labels: label data accepted by `torch.as_tensor` (e.g. a list of float
      lists, or an already-built tensor when the cell is re-run).
    max_len: number of token ids per example after padding/truncation.

  Returns:
    Tuple `(tensor_inputs, tensor_labels, tensor_masks)`:
      tensor_inputs: int64 tensor of shape (len(sentences), max_len).
      tensor_labels: `labels` as a tensor.
      tensor_masks: float32 tensor, 1.0 on real tokens and 0.0 on padding.
  """
  sentences = list(sentences)

  sep_id = tokenizer.sep_token_id
  pad_id = tokenizer.pad_token_id
  if pad_id is None:
    pad_id = 0

  # Pre-filled with the pad id, so only the real tokens need to be written.
  input_ids = np.full((len(sentences), max_len), pad_id, dtype = np.int64)
  attention_masks = np.zeros((len(sentences), max_len), dtype = np.float32)

  for row, sent in enumerate(sentences):
    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))

    if len(ids) > max_len:
      ended_with_sep = ids[-1] == sep_id
      ids = ids[:max_len]
      # Plain right-truncation would drop the closing separator; restore it.
      if ended_with_sep and ids:
        ids[-1] = sep_id

    input_ids[row, :len(ids)] = ids
    attention_masks[row, :len(ids)] = 1.0

  tensor_inputs = torch.tensor(input_ids)
  # `as_tensor` passes an existing tensor through unchanged, so re-running the
  # calling cell (which overwrites the label lists with tensors) stays clean.
  tensor_labels = torch.as_tensor(labels)
  tensor_masks = torch.tensor(attention_masks)

  return tensor_inputs, tensor_labels, tensor_masks

In [None]:
# Tensorize every split with the shared maximum sequence length.
(train_inputs, train_labels, train_masks), \
(valid_inputs, valid_labels, valid_masks), \
(test_inputs, test_labels, test_masks) = [
  data_to_tensor(split_sentences, split_labels, max_len = max_len)
  for split_sentences, split_labels in (
    (train_sentences, train_labels),
    (valid_sentences, valid_labels),
    (test_sentences, test_labels),
  )
]

In [None]:
# Inspect one encoded test example: ids, decoded text, mask, length and label.
sample_idx = 0
divider = '-' * 100

print('정수 인코딩 결과 :', test_inputs[sample_idx])
print(divider)

print('원본 문장 복원 결과 :', tokenizer.decode(test_inputs[sample_idx]))
print(divider)

print('어텐션 마스크 :', test_masks[sample_idx])
print(divider)

print('샘플의 길이 :', len(test_inputs[sample_idx]))
print(divider)

print('레이블 :', test_labels[sample_idx])

In [None]:
batch_size = 64

# Training: examples are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler = train_sampler, batch_size = batch_size)

# Validation: fixed dataset order.
valid_data = TensorDataset(valid_inputs, valid_masks, valid_labels)
valid_sampler = SequentialSampler(valid_data)
valid_loader = DataLoader(valid_data, sampler = valid_sampler, batch_size = batch_size)

# Test: shuffling brings nothing to evaluation. A sequential sampler keeps the
# batches in dataset order (so predictions can be matched back to the original
# rows) and is consistent with the validation loader.
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_loader = DataLoader(test_data, sampler = test_sampler, batch_size = batch_size)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

In [None]:
num_labels = 9

# Options forwarded to `from_pretrained` to configure the classification head.
classifier_options = dict(num_labels = num_labels, problem_type = 'multi_label_classification')

model = BertForSequenceClassification.from_pretrained('klue/bert-base', **classifier_options)
# Kept as the last expression of the cell, exactly like the original call.
model.to(device)

In [None]:
# Training hyper-parameters.
epochs = 3
learning_rate = 2e-5

optimizer = AdamW(model.parameters(), lr = learning_rate)

In [None]:
def metrics(pred, labels, threshold = 0.5):
  """Compute multi-label classification metrics from per-label scores.

  Accuracy and the F1 variants are computed on the predictions obtained by
  thresholding `pred`. ROC-AUC is a ranking metric, so it is computed on the
  continuous scores themselves; feeding it the already-thresholded 0/1
  predictions discards the ranking information it measures.

  Args:
    pred: array-like of per-label scores, one row per example.
    labels: array-like of 0/1 ground-truth indicators with the same layout
      as `pred`.
    threshold: a label is predicted when its score is >= this value.

  Returns:
    Dict with the keys 'Accuracy', 'f1_micro', 'f1_macro', 'f1_weight' and
    'Roc_auc_score'.
  """
  pred = np.asarray(pred)
  labels = np.asarray(labels)

  y_pred = (pred >= threshold).astype(int)

  accuracy = accuracy_score(labels, y_pred)

  f1_micro = f1_score(labels, y_pred, average = 'micro', zero_division = 0)
  f1_macro = f1_score(labels, y_pred, average = 'macro', zero_division = 0)
  f1_weight = f1_score(labels, y_pred, average = 'weighted', zero_division = 0)

  # Continuous scores, not `y_pred`: see the docstring.
  roc_auc = roc_auc_score(labels, pred, average = 'micro')

  return {'Accuracy' : accuracy,
          'f1_micro' : f1_micro,
          'f1_macro' : f1_macro,
          'f1_weight' : f1_weight,
          'Roc_auc_score' : roc_auc}

In [None]:
def train_epoch(model, loader, optimizer, device):
  """Run one optimisation pass over `loader` and return the mean batch loss.

  Args:
    model: module called as `model(inputs, attention_mask=..., labels=...,
      token_type_ids=None)` whose output exposes a `.loss` attribute.
    loader: sized iterable of `(inputs, masks, labels)` tensor batches.
    optimizer: optimizer updating the parameters of `model`.
    device: device every batch tensor is moved to before the forward pass.

  Returns:
    Sum of the per-batch losses divided by `len(loader)`.
  """
  total_loss = 0.0
  model.train()

  # Wrapping the loader itself (not `enumerate(loader)`) and passing its length
  # gives tqdm a total, so the bar can show progress and an ETA.
  for batch in tqdm(loader, desc = 'Training Batch', total = len(loader)):
    b_inputs, b_masks, b_labels = (t.to(device) for t in batch)

    # Clear the gradients left over from the previous step.
    optimizer.zero_grad()

    outputs = model(b_inputs, attention_mask = b_masks, labels = b_labels, token_type_ids = None)
    loss = outputs.loss

    loss.backward()
    optimizer.step()

    total_loss += loss.item()

  return total_loss / len(loader)

In [None]:
def evaluation(model, loader, device, threshold = 0.5):
  """Evaluate `model` on `loader` without updating its weights.

  Puts the model in eval mode, runs every batch with gradient tracking
  disabled, converts the logits to per-label probabilities with a sigmoid and
  hands all probabilities and labels to `metrics`.

  Args:
    model: module called as `model(inputs, attention_mask=..., labels=...,
      token_type_ids=None)` whose output exposes `.loss` and `.logits`.
    loader: sized iterable of `(inputs, masks, labels)` tensor batches.
    device: device every batch tensor is moved to before the forward pass.
    threshold: decision threshold forwarded to `metrics`.

  Returns:
    Tuple `(avg_loss, eval_metrics)`: the sum of the per-batch losses divided
    by `len(loader)`, and the value returned by `metrics`.
  """
  model.eval()
  total_loss = 0.0
  pred = []
  true = []

  with torch.no_grad():
    for batch in loader:
      b_inputs, b_masks, b_labels = (t.to(device) for t in batch)

      outputs = model(b_inputs, attention_mask = b_masks, labels = b_labels, token_type_ids = None)
      total_loss += outputs.loss.item()

      # Sigmoid is applied to the logits tensor directly; no per-batch module
      # construction and no tensor -> numpy -> tensor round trip.
      pred.append(torch.sigmoid(outputs.logits).cpu().numpy())
      true.append(b_labels.cpu().numpy())

  eval_metrics = metrics(np.concatenate(pred), np.concatenate(true), threshold = threshold)
  avg_loss = total_loss / len(loader)

  return avg_loss, eval_metrics

In [None]:
# Lowest validation loss seen so far; drives the checkpointing below.
min_val_loss = float('inf')

for epoch in range(epochs):
  print(f'{epoch + 1} | {epochs}')

  # Report the training loss as well: compared with the validation loss it
  # shows whether the model is still learning or starting to overfit.
  train_loss = train_epoch(model, train_loader, optimizer, device)
  print(f'Training Loss : {train_loss}')

  print('Running Validation...')
  avg_loss, eval_metrics = evaluation(model, valid_loader, device)
  print(f'Validation Loss : {avg_loss}')
  print('Accuracy : {0:.2f}'.format(eval_metrics['Accuracy']))
  print('F1_micro : {0:.2f}'.format(eval_metrics['f1_micro']))
  print('F1_macro : {0:.2f}'.format(eval_metrics['f1_macro']))
  print('F1_weight : {0:.2f}'.format(eval_metrics['f1_weight']))
  print('Roc_auc : {0:.2f}'.format(eval_metrics['Roc_auc_score']))

  # Checkpoint only when the validation loss improves on the best so far.
  if avg_loss < min_val_loss:
    print(f'Validation loss decreased ({min_val_loss:.2f} --> {avg_loss:.2f}). Saving model...')
    torch.save(model.state_dict(), 'best_model.pt')
    min_val_loss = avg_loss

In [None]:
from transformers import pipeline

# Serve the checkpoint with the lowest validation loss rather than whatever
# weights the last epoch left in memory. The training loop writes
# 'best_model.pt'; if it has not been produced, keep the in-memory weights.
try:
  model.load_state_dict(torch.load('best_model.pt', map_location = device))
except FileNotFoundError:
  print('best_model.pt not found - using the weights currently in memory.')
model.eval()

# `device = device` (instead of the hard-coded GPU index 0) keeps the cell
# runnable on CPU-only hosts. `truncation = True` makes `max_length` effective,
# so over-long inputs are cut instead of overflowing the model.
pipe = pipeline('text-classification', model = model.to(device), tokenizer = tokenizer,
                max_length = 512, truncation = True, device = device,
                return_all_scores = True, function_to_apply = 'sigmoid')

In [None]:
# Maps the pipeline's generic label names to the notebook's class names.
label_dict = {'LABEL_0' : '출신차별', 'LABEL_1' : '외모차별', 'LABEL_2' : '정치성향차별', 'LABEL_3' : '혐오욕설', 'LABEL_4' : '연령차별',
              'LABEL_5' : '성차별', 'LABEL_6' : '인종차별', 'LABEL_7' : '종교차별', 'LABEL_8' : '해당사항없음'}

def prediction(text, threshold = 0.5):
  """Return the class names whose predicted score for `text` exceeds `threshold`.

  Args:
    text: input passed unchanged to the notebook-level `pipe`.
    threshold: a class is reported when its score is strictly greater than
      this value.

  Returns:
    List of `label_dict` values, in the order the pipeline reports the labels.
  """
  result = pipe(text)

  # NOTE(review): depending on the transformers version/options, the per-label
  # dicts for a single text arrive either nested in an outer list or as a flat
  # list; both layouts are accepted here.
  scores = result[0] if result and isinstance(result[0], list) else result

  return [label_dict[res['label']] for res in scores if res['score'] > threshold]

In [None]:
# Example: classify one sample comment; the returned class names are shown as the cell output.
prediction('틀 니 들은 왜 그렇게 민폐를 끼치냐? 특히 나이 먹은 남자들이 심하다')