### Install Prerequisite Libraries

In [None]:
!pip install transformers
!pip install datasets

### Load pretained models with HuggingFace's transformers Library

In [2]:
import numpy as np
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TextClassificationPipeline
from transformers import TFBertForSequenceClassification

In [5]:
# load finetuned models from local
klue_bert_tokenizer = AutoTokenizer.from_pretrained('curse_detection/klue-bert-base', from_tf=True)
klue_bert_model = TFBertForSequenceClassification.from_pretrained('curse_detection/klue-bert-base', output_attentions=True)
kcelectra_tokenizer = AutoTokenizer.from_pretrained('curse_detection/kcelectra-base')
kcelectra_model = AutoModelForSequenceClassification.from_pretrained('curse_detection/kcelectra-base', output_attentions=True)
kcbert_tokenizer = AutoTokenizer.from_pretrained('curse_detection/kcbert-base')
kcbert_model = AutoModelForSequenceClassification.from_pretrained('curse_detection/kcbert-base', output_attentions=True)
klue_roberta_tokenizer = AutoTokenizer.from_pretrained('curse_detection/klue-roberta-base', from_tf=True)
klue_roberta_model = TFBertForSequenceClassification.from_pretrained('curse_detection/klue-roberta-base', output_attentions=True)

Some layers from the model checkpoint at curse_detection/klue-bert-base were not used when initializing TFBertForSequenceClassification: ['dropout_113']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at curse_detection/klue-bert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.
Some layers from the model checkpoint at curse_d

In [None]:
# load finetuned models from hugging face
klue_bert_tokenizer = AutoTokenizer.from_pretrained('Tolerblanc/klue-bert-finetuned', from_tf=True)
klue_bert_model = AutoModelForSequenceClassification.from_pretrained('Tolerblanc/klue-bert-finetuned', output_attentions=True, from_tf=True)
kcelectra_tokenizer = AutoTokenizer.from_pretrained('Tolerblanc/kcelectra-base')
kcelectra_model = AutoModelForSequenceClassification.from_pretrained('Tolerblanc/kcelectra-base', output_attentions=True)
kcbert_tokenizer = AutoTokenizer.from_pretrained('Tolerblanc/kcbert-base')
kcbert_model = AutoModelForSequenceClassification.from_pretrained('Tolerblanc/kcbert-base', output_attentions=True)
klue_roberta_tokenizer = AutoTokenizer.from_pretrained('Tolerblanc/klue-roberta-base', from_tf=True)
klue_roberta_model = AutoModelForSequenceClassification.from_pretrained('Tolerblanc/klue-roberta-base', output_attentions=True, from_tf=True)

In [6]:
# tensorflow, torch GPU Acceleration in M1 Mac
import torch
torch_device = torch.device('mps:0' if torch.backends.mps.is_available() else 'cpu')
import tensorflow as tf
tf.device('/GPU:0')

<tensorflow.python.eager.context._EagerDeviceContext at 0x2cdc329c0>

In [7]:
klue_bert_classifier = TextClassificationPipeline(
    tokenizer=klue_bert_tokenizer, 
    model=klue_bert_model, 
    framework='tf',
)
kcelectra_classifier = TextClassificationPipeline(
    tokenizer=kcelectra_tokenizer, 
    model=kcelectra_model, 
    framework='pt',
    device=torch_device
)
klue_roberta_classifier = TextClassificationPipeline(
    tokenizer=klue_roberta_tokenizer, 
    model=klue_roberta_model, 
    framework='tf',
)
kcbert_classifier = TextClassificationPipeline(
    tokenizer=kcbert_tokenizer, 
    model=kcbert_model, 
    framework='pt',
    device=torch_device
)

In [8]:
# 대조군으로 사용할 모델
# JminJ 님의 kcElectra 기반 파인튜닝 모델
# 아웃풋 0이 bad, 1이 clean
# https://github.com/JminJ/Bad_text_classifier
comparison_tokenizer = AutoTokenizer.from_pretrained('JminJ/kcElectra_base_Bad_Sentence_Classifier')
comparison_model = AutoModelForSequenceClassification.from_pretrained('JminJ/kcElectra_base_Bad_Sentence_Classifier')
comparison_classifier = TextClassificationPipeline(
    tokenizer=comparison_tokenizer, 
    model=comparison_model, 
    framework='pt',
    device=torch_device
)

In [None]:
!git clone https://github.com/2runo/Curse-detection-data.git benchmark_dataset 

In [71]:
# 456번째 라인에서 | 하나 지워야 함, 맨 윗줄에 'document|label' 추가
benchmark_dataset = pd.read_csv('dataset.txt', sep='|')
benchmark_dataset

Unnamed: 0,document,label
0,좌배 까는건 ㅇㅂ,1
1,집에 롱 패딩만 세 개다. 10년 더 입어야지 ㅋㅋ,0
2,개소리야 니가 빨갱이를 옹호하고 드루킹을 ㅇㅇ짓이라고 말못해서 삐진거야 빨갱아,1
3,세탁이라고 봐도 된다,0
4,애새끼가 초딩도 아니고 ㅋㅋㅋㅋ,1
...,...,...
5820,좌우 헬파이어 3개씩 6개 장착에 아파치보다 약하지만 20mm 기관포 장착임,0
5821,"세금 내놓으라고 데모질 중 ㅋㅋ간첩, 도둑놈 새끼들이 대통령 해처먹으니까 나도 같...",1
5822,너가 한 말 중에,0
5823,제갈대중 ㅇㅂ,0


### Benchmark models

In [9]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def benchmark_model(classifier, dataset):
    documents = dataset['document'].to_list()
    true_labels = dataset['label'].to_list()
    inference = []

    for doc in tqdm(documents):
        output = classifier(doc)[0]
        if output['label'] == 'ok_sen' or output['label'] == 'LABEL_0':
            inference.append(0)
        else:
            inference.append(1)

    acc = accuracy_score(true_labels, inference)
    prec = precision_score(true_labels, inference)
    rec = recall_score(true_labels, inference)
    f1 = f1_score(true_labels, inference)

    return [acc, prec, rec, f1], inference

In [94]:
classifiers = {
    "comparison" : comparison_classifier,
    "klue_BERT" : klue_bert_classifier, 
    "kcElectra" : kcelectra_classifier, 
    "kcBERT" : kcbert_classifier, 
    "klue_roBERTa" : klue_roberta_classifier
}
inferences = []
true_labels = benchmark_dataset['label'].to_list()

for name, model in classifiers.items():
    print(f':: :: benchmarking the {name} model :: ::')
    scores, infer = benchmark_model(model, benchmark_dataset)
    print(f'accuracy={scores[0]:.2f}, precision={scores[1]:.2f}, recall={scores[2]:.2f}, f1={scores[3]:.2f}')
    inferences.append(infer)
print()

:: :: benchmarking the comparison model :: ::


100%|██████████| 5825/5825 [01:56<00:00, 50.01it/s]


accuracy=0.81, precision=0.69, recall=0.87, f1=0.77
:: :: benchmarking the klue_BERT model :: ::


100%|██████████| 5825/5825 [22:10<00:00,  4.38it/s]


accuracy=0.83, precision=0.76, recall=0.75, f1=0.75
:: :: benchmarking the kcElectra model :: ::


100%|██████████| 5825/5825 [02:17<00:00, 42.37it/s]


accuracy=0.68, precision=0.54, recall=0.66, f1=0.59
:: :: benchmarking the kcBERT model :: ::


100%|██████████| 5825/5825 [03:03<00:00, 31.68it/s]


accuracy=0.76, precision=0.61, recall=0.85, f1=0.71
:: :: benchmarking the klue_roBERTa model :: ::


 31%|███       | 1777/5825 [07:04<18:43,  3.60it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (602 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 5825/5825 [23:10<00:00,  4.19it/s]

accuracy=0.79, precision=0.67, recall=0.77, f1=0.72






### Save the benchmarking result

In [None]:
df = pd.DataFrame()
df['label'] = true_labels
for i, name in enumerate(classifiers.keys()):
	df[name] = inferences[i]

df.to_csv('inference.csv')

## Test with Youtube Comments

In [12]:
youtube_comments = pd.read_csv('Youtube_Comments.csv', sep=',')
youtube_comments


Unnamed: 0,document,label
0,이새끼는 근본이 안되어있어범죄자 새끼,1
1,개극혐이다 진짜,1
2,토할 것 같다..,0
3,"승리 비판 전혀 상관없이 , 저 댓글들만 보여주는게 과연 sbs수준을 올릴까 내릴까?",0
4,승리는 사람 아니다. 같이 우리나라 사회에 있어서는 안된다. 해외로 추방시켜야 한다...,0
...,...,...
195,이새키보소,1
196,쓰레기는 변하지 않아~,1
197,멍ㄸㄹㅇ~옥살이하면 범죄자의 모든것이 없어지나?대대손손 기록에 남지않을까? 꼬리표처럼!!,1
198,잘가라~멀리 안나간다 ㅡㅡ,0


In [13]:
classifiers = {
    "comparison" : comparison_classifier,
    "klue_BERT" : klue_bert_classifier, 
}
inferences = []
true_labels = youtube_comments['label'].to_list()

for name, model in classifiers.items():
    print(f':: :: benchmarking the {name} model :: ::')
    scores, infer = benchmark_model(model, youtube_comments)
    print(f'accuracy={scores[0]:.2f}, precision={scores[1]:.2f}, recall={scores[2]:.2f}, f1={scores[3]:.2f}')
    inferences.append(infer)
print()

:: :: benchmarking the comparison model :: ::


100%|██████████| 200/200 [00:09<00:00, 20.47it/s]


accuracy=0.77, precision=0.52, recall=0.90, f1=0.66
:: :: benchmarking the klue_BERT model :: ::


100%|██████████| 200/200 [00:47<00:00,  4.19it/s]

accuracy=0.89, precision=0.75, recall=0.80, f1=0.78






In [14]:
df = pd.DataFrame()
df['label'] = true_labels
for i, name in enumerate(classifiers.keys()):
	df[name] = inferences[i]

df.to_csv('inference_youtube.csv')