### Install Prerequisite Libraries

In [1]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m95.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2
Looking in in

### Load Datasets with HuggingFace's datasets Library

In [2]:
import pandas as pd
import numpy as np

In [3]:
from datasets import load_dataset

kmhas = load_dataset("jeanlee/kmhas_korean_hate_speech")
unsmile = load_dataset('smilegate-ai/kor_unsmile')

Downloading builder script:   0%|          | 0.00/4.73k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading and preparing dataset kmhas_korean_hate_speech/default to /root/.cache/huggingface/datasets/jeanlee___kmhas_korean_hate_speech/default/1.0.0/17406fbed45548c92e0795df0675e21fb2a09ceaa098bd5ff58c7fdc7f8a63d4...


Downloading data:   0%|          | 0.00/2.96M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/326k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/823k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/78977 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8776 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/21939 [00:00<?, ? examples/s]

Dataset kmhas_korean_hate_speech downloaded and prepared to /root/.cache/huggingface/datasets/jeanlee___kmhas_korean_hate_speech/default/1.0.0/17406fbed45548c92e0795df0675e21fb2a09ceaa098bd5ff58c7fdc7f8a63d4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading and preparing dataset None/None (download: 1.39 MiB, generated: 4.93 MiB, post-processed: Unknown size, total: 6.32 MiB) to /root/.cache/huggingface/datasets/smilegate-ai___parquet/smilegate-ai--kor_unsmile-e0f75c6e3be1af78/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/290k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/15005 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/3737 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/smilegate-ai___parquet/smilegate-ai--kor_unsmile-e0f75c6e3be1af78/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

### Preprocess the datasets

In [4]:
combined = []

for data in kmhas:
  for row in zip(kmhas[data]['text'], kmhas[data]['label']):
    if len(row[1]) == 1 and 8 in row[1]:
      combined.append((row[0], 0))
    else:
      combined.append((row[0], 1))

for data in unsmile:
  for row in zip(unsmile[data]['문장'], unsmile[data]['clean']):
    if row[1] == 1:
      combined.append((row[0], 0))
    else:
      combined.append((row[0], 1))

dataset = pd.DataFrame(data=combined, columns=['document', 'label'])
dataset['label'].value_counts()
#dataset.to_csv('dataset.csv')

0    64289
1    64145
Name: label, dtype: int64

### Split datasets to train, test, validate

In [5]:
def train_validate_test_split(df, train_percent=.7, validate_percent=.2):
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [6]:
train_data, validate_data, test_data = train_validate_test_split(dataset)

In [7]:
print(len(train_data), len(validate_data), len(test_data))

89903 25686 12845


In [8]:
train_data = train_data.dropna(how = 'any') # Null 값이 존재하는 행 제거
print(train_data.isnull().values.any()) # Null 값이 존재하는지 확인

False


In [9]:
print(len(train_data))

89903


### Tokenize the data into wordpiece with BertTokenizer

In [10]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("klue/roberta-base")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

In [11]:
val_data = validate_data.dropna(how = 'any')

In [12]:
print(len(val_data))

25686


In [13]:
X_train_list = train_data['document'].tolist()
X_val_list = validate_data['document'].tolist()
y_train = train_data['label'].tolist()
y_val = validate_data['label'].tolist()

In [14]:
X_train = tokenizer(X_train_list, truncation=True, padding=True)
X_val = tokenizer(X_val_list, truncation=True, padding=True)

### Fine-tune the KLUE Bert with keras

In [16]:
import tensorflow as tf

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(X_train),
    y_train
))

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(X_val),
    y_val
))

In [17]:
from transformers import TFBertForSequenceClassification
from tensorflow.keras.callbacks import EarlyStopping

In [18]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

In [19]:
model = TFBertForSequenceClassification.from_pretrained("klue/roberta-base", num_labels=2, from_pt=True)
model.compile(optimizer=optimizer, loss=model.hf_compute_loss, metrics=['accuracy'])

Downloading (…)lve/main/config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Downloading pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['roberta.encoder.layer.0.intermediate.dense.bias', 'roberta.encoder.layer.1.attention.self.query.bias', 'roberta.encoder.layer.11.attention.output.dense.weight', 'roberta.encoder.layer.4.attention.output.dense.bias', 'roberta.encoder.layer.9.attention.output.LayerNorm.bias', 'roberta.encoder.layer.9.output.dense.bias', 'roberta.encoder.layer.10.attention.output.LayerNorm.bias', 'roberta.encoder.layer.10.intermediate.dense.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.7.attention.self.query.weight', 'roberta.encoder.layer.10.output.LayerNorm.bias', 'roberta.encoder.layer.11.output.dense.bias', 'roberta.encoder.layer.6.attention.output.dense.weight', 'roberta.encoder.layer.6.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention

In [20]:
model.hf_compute_loss

<bound method TFSequenceClassificationLoss.hf_compute_loss of <transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification object at 0x7f9adb263d00>>

In [21]:
tf.device(0)

callback_earlystop = EarlyStopping(
    monitor="val_accuracy", 
    min_delta=0.001,
    patience=2
)

model.fit(
    train_dataset.shuffle(10000).batch(128), epochs=5, batch_size=128,
    validation_data = val_dataset.shuffle(10000).batch(128),
    callbacks = [callback_earlystop]
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


<keras.callbacks.History at 0x7f99c81a5150>

In [22]:
model.evaluate(val_dataset.batch(64))



[0.4690099060535431, 0.8169820308685303]

### Save the fine-tuned model

In [23]:
model.save_pretrained('curse_detection/roberta-base')
tokenizer.save_pretrained('curse_detection/roberta-base')

('curse_detection/roberta-base/tokenizer_config.json',
 'curse_detection/roberta-base/special_tokens_map.json',
 'curse_detection/roberta-base/vocab.txt',
 'curse_detection/roberta-base/added_tokens.json')

### Load and Test

In [26]:
from transformers import TextClassificationPipeline

# 로드하기
loaded_tokenizer = BertTokenizer.from_pretrained('curse_detection/roberta-base')
loaded_model = TFBertForSequenceClassification.from_pretrained('curse_detection/roberta-base', output_attentions=True)

text_classifier = TextClassificationPipeline(
    tokenizer=loaded_tokenizer, 
    model=loaded_model, 
    framework='tf',
    return_all_scores=True,
    device=0
)

Some layers from the model checkpoint at curse_detection/roberta-base were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at curse_detection/roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [27]:
def classify_with_better_output(text_classifier, test_document):
  output = text_classifier(test_document)[0]
  clean = output[0]['score']
  curse = output[1]['score']
  print(f'{test_document} 가 입력되었으며,')
  if clean > curse:
    print(f'모델은 이 문장을 {clean * 100}% 확률로 깨끗한 문장이라고 추론했습니다.')
  else:
    print(f'모델은 이 문장을 {curse * 100}% 확률로 욕설이나 혐오표현이 있는 문장이라고 추론했습니다.')

In [28]:
classify_with_better_output(text_classifier, '시바견은 너무 귀엽다.')

시바견은 너무 귀엽다. 가 입력되었으며,
모델은 이 문장을 78.0305564403534% 확률로 욕설이나 혐오표현이 있는 문장이라고 추론했습니다.


In [29]:
classify_with_better_output(text_classifier, '시발자동차는 1955년에 출시된 우리나라 최초의 자동차이다.')

시발자동차는 1955년에 출시된 우리나라 최초의 자동차이다. 가 입력되었으며,
모델은 이 문장을 99.55005049705505% 확률로 욕설이나 혐오표현이 있는 문장이라고 추론했습니다.


In [30]:
classify_with_better_output(text_classifier, '수박씨 발아는 심은 후 10~15일 후 진행된다.')

수박씨 발아는 심은 후 10~15일 후 진행된다. 가 입력되었으며,
모델은 이 문장을 95.43393850326538% 확률로 깨끗한 문장이라고 추론했습니다.


In [31]:
classify_with_better_output(text_classifier, '만두 몇 개 시키고 옴')

만두 몇 개 시키고 옴 가 입력되었으며,
모델은 이 문장을 99.39292669296265% 확률로 깨끗한 문장이라고 추론했습니다.


In [32]:
classify_with_better_output(text_classifier, '내 만두 가져가지 마 개시키야')

내 만두 가져가지 마 개시키야 가 입력되었으며,
모델은 이 문장을 99.51743483543396% 확률로 깨끗한 문장이라고 추론했습니다.


In [33]:
classify_with_better_output(text_classifier, '배고픈데 앞에서 만두 먹네 개시키')

배고픈데 앞에서 만두 먹네 개시키 가 입력되었으며,
모델은 이 문장을 99.46988821029663% 확률로 깨끗한 문장이라고 추론했습니다.


In [34]:
classify_with_better_output(text_classifier, '아니 ㅅㅂ 네이버 메인 왜 이렇게 바꿨냐고')

아니 ㅅㅂ 네이버 메인 왜 이렇게 바꿨냐고 가 입력되었으며,
모델은 이 문장을 58.273279666900635% 확률로 욕설이나 혐오표현이 있는 문장이라고 추론했습니다.


In [35]:
classify_with_better_output(text_classifier, '조밥나물 조팝나무는 실제로 있다.')

조밥나물 조팝나무는 실제로 있다. 가 입력되었으며,
모델은 이 문장을 60.00569462776184% 확률로 욕설이나 혐오표현이 있는 문장이라고 추론했습니다.


In [37]:
classify_with_better_output(text_classifier, '너가 시발점이야.')

너가 시발점이야. 가 입력되었으며,
모델은 이 문장을 99.52327609062195% 확률로 깨끗한 문장이라고 추론했습니다.


In [36]:
classify_with_better_output(text_classifier, '2023-1학기 텍스트마이닝 기말 텀프로젝트')

2023-1학기 텍스트마이닝 기말 텀프로젝트 가 입력되었으며,
모델은 이 문장을 99.50994849205017% 확률로 깨끗한 문장이라고 추론했습니다.


In [38]:
labels_test = []
alpha_error = []
beta_error = []
test = zip(test_data['document'] , test_data['label'])
for i, (doc, label) in enumerate(test):

  if i == 1000:
    break

  output = text_classifier(doc)[0]
  clean = output[0]['score']
  curse = output[1]['score']
  result = 1 if curse > clean else 0
  if (result == label):
    labels_test.append(1)
  else:
    labels_test.append(0)
  if (result != label and label == 1):
    alpha_error.append(1)
  if (result != label and label == 0):
    beta_error.append(1)
  

print(f'Accuracy Percentage with unseen data : {(sum(labels_test) / len(labels_test)) * 100}%')
print(f'Alpha Error Percentage with unseen data : {(sum(alpha_error) / sum(labels_test)) * 100}%')
print(f'Beta Error Percentage with unseen data : {(sum(beta_error) / sum(labels_test)) * 100}%')

Accuracy Percentage with unseen data : 82.0%
Alpha Error Percentage with unseen data : 8.536585365853659%
Beta Error Percentage with unseen data : 13.414634146341465%


In [None]:
# zip the model in colab
!tar -cvf klue_roberta_large_finetuned curse_detection/


In [15]:
# use google drive in colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
# copy zipped model into google drive in colab
!cp -r ./curse_detection ./drive/MyDrive/

## Use the model with Hugging Face's transformers Library

In [None]:
from transformers import TextClassificationPipeline
from transformers import TFBertForSequenceClassification
from transformers import BertTokenizerFast

# 파인튜닝 된 모델 로드하기
loaded_tokenizer = BertTokenizerFast.from_pretrained('Tolerblanc/klue-bert-finetuned')
loaded_model = TFBertForSequenceClassification.from_pretrained('Tolerblanc/klue-bert-finetuned', output_attentions=True)

# 기존 모델 로드하기
# loaded_tokenizer = BertTokenizerFast.from_pretrained('klue/bert-base')
# loaded_model = TFBertForSequenceClassification.from_pretrained('klue/bert-base', output_attentions=True)

text_classifier = TextClassificationPipeline(
    tokenizer=loaded_tokenizer, 
    model=loaded_model, 
    framework='tf',
    return_all_scores=True,
    device=0
)

# 추론 테스트는 위 classify_with_better_output 적용!

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)okenizer_config.json: 100%|██████████| 367/367 [00:00<00:00, 93.5kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 248k/248k [00:00<00:00, 561kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 752k/752k [00:00<00:00, 4.55MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 393kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 612/612 [00:00<00:00, 1.20MB/s]
Downloading tf_model.h5: 100%|██████████| 443M/443M [00:28<00:00, 15.5MB/s] 
Some layers from the model checkpoint at Tolerblanc/klue-bert-finetuned were not used when initializing TFBertForSequenceClassification: ['dropout_113']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are init