In [1]:
import pandas as pd
import tensorflow as tf
import re
from transformers import TextClassificationPipeline
import tensorflow as tf
import numpy as np
import random
import os

def my_seed_everywhere(seed: int = 42):
    """Apply one seed to every random source used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and
    TensorFlow's global generator, and stores the seed in the
    ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed shared by all of the above (default 42).
    """
    # Environment variables must be strings, hence the str() conversion.
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Each of these takes the integer seed as its only argument.
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)

# Notebook-wide seed, applied once right after the helper is defined.
my_seed = 42
my_seed_everywhere(seed=my_seed)

In [2]:
# Name of the training CSV; it is also used further down to name the
# submission file.
file_name = 'train_ri_2000.csv'
data_path = '/aiffel/aiffel/aiffelthon/선별된 실험데이터/{}'.format(file_name)
data = pd.read_csv(data_path)

# Preview the first five rows.
data.head()

Unnamed: 0,label_sentence,non_label_sentence,class,binary_class
0,1:행님 가족 소개 좀 하하\n2:가족은 3남매 누나 나 동생\n1:난 누나 2명 ...,행님 가족 소개 좀 하하\n가족은 3남매 누나 나 동생\n난 누나 2명 엄마 아빠 ...,일반 대화,일반 대화
1,1:또 확진자 3천 명 넘어섰네\n1:언제 멈추는겨\n2:아 가을이라서 사람들 다 ...,또 확진자 3천 명 넘어섰네\n언제 멈추는겨\n아 가을이라서 사람들 다 나다니니까 ...,일반 대화,일반 대화
2,1:문의 주셔서 감사합니다 입니다\n2:네 등산 교육이 있다고 하는데 이 뭐였죠\n...,문의 주셔서\n네 등산 교육이 있다고 하는데 이 뭐였죠\n노르딕 워킹 말씀하시는 건...,일반 대화,일반 대화
3,1:연일 기름값이 많이 오르는데 인플레이션이 가속화 되지 않을까\n2:미국의 물류난...,연일 기름값이 많이 오르는데 인플레이션이 가속화 되지 않을까\n미국의 물류난도 그렇...,일반 대화,일반 대화
4,1: 전화 주셔서 감사합니다 입니다\n2: 예 안녕하세요 제가 를 이용하고 있는데요...,전화 주셔서\n예 제가 를 이용하고 있는데요\n백신 프로그램에서 가 바이러스로 인...,일반 대화,일반 대화


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode the string class names as integer ids.
# The ids are kept in `target` instead of being written back into
# data['class'], so this cell is idempotent: re-running it always fits the
# encoder on the original class names. Overwriting the column would make a
# second run fit on integers, and label_encoder.inverse_transform would then
# return integers instead of class names in the later cells.
label_encoder = LabelEncoder()
target = label_encoder.fit_transform(data['class']).tolist()

# Model input: the conversation text without speaker labels.
train = data['non_label_sentence'].tolist()

# Hold out 20% of the rows for evaluation ...
train_x, val_x, train_y, val_y = train_test_split(train, target, random_state=27, test_size=0.2)
# ... then split that hold-out in half: 10% validation and 10% test overall.
val_x, test_x, val_y, test_y = train_test_split(val_x, val_y, random_state=27, test_size=0.5)

from transformers import BertTokenizer

# Tokenizer loaded from the same 'klue/bert-base' checkpoint as the model.
tokenizer = BertTokenizer.from_pretrained('klue/bert-base')

# Encode the training and validation texts separately, both with padding and
# truncation enabled.
train_tensor, val_tensor = (
    tokenizer(texts, padding=True, truncation=True)
    for texts in (train_x, val_x)
)

# Pair each split's tokenizer output (converted to a plain dict of features)
# with its integer labels, one dataset element per example.

# Training set
train_features = dict(train_tensor)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_y))

# Validation set
val_features = dict(val_tensor)
val_dataset = tf.data.Dataset.from_tensor_slices((val_features, val_y))

In [4]:
from transformers import TFBertForSequenceClassification

# Size the classification head from the fitted label encoder instead of a
# hard-coded 5, so the head always matches the classes present in the
# training file.
num_classes = len(label_encoder.classes_)

# from_pt=True: the checkpoint is loaded from PyTorch weights. The classifier
# layer is newly initialised and is trained by the fit() call below.
model = TFBertForSequenceClassification.from_pretrained(
    'klue/bert-base', num_labels=num_classes, from_pt=True
)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# The batch size is applied on the datasets themselves. Keras documents that
# `batch_size` must not be passed to fit() when the input is a tf.data.Dataset
# (the dataset already yields batches), so it is no longer passed here.
BATCH_SIZE = 8

history = model.fit(
    # Shuffle with a buffer covering the whole training set, then batch.
    train_dataset.shuffle(len(train_dataset)).batch(BATCH_SIZE),
    # Validation metrics are aggregated over the whole set, so the
    # validation data is batched without shuffling.
    validation_data=val_dataset.batch(BATCH_SIZE),
    epochs=3,
)

Epoch 1/3

In [None]:
# Replace the model's label maps with the original class names so that
# predictions come back as class names.
#
# The maps are built directly from the fitted encoder, where index i of
# classes_ is the name encoded as id i. Unlike parsing 'LABEL_<n>' out of the
# current config, this can be re-run safely: the parsing approach raises
# ValueError on a second run because the config then already holds class
# names.
class_names = [str(name) for name in label_encoder.classes_]

model.config.id2label = dict(enumerate(class_names))
model.config.label2id = {name: idx for idx, name in enumerate(class_names)}

model.config.id2label

In [None]:
# Load the test set. The frame is transposed after loading and the 'text'
# column is then taken as a plain list of strings.
test_file_path = '/aiffel/aiffel/aiffelthon/test.json'
with open(test_file_path, mode='rt', encoding='utf-8') as f:
    test_dataset = pd.read_json(f)

test_data = test_dataset.transpose()
test_data = test_data['text'].tolist()

# Inference pipeline that reuses the fine-tuned model and its tokenizer.
text_classifier = TextClassificationPipeline(
    tokenizer=tokenizer,
    model=model,
    framework='tf'
)

# Submission code for each class name returned by the pipeline.
labels = {'갈취 대화': '00', '기타 괴롭힘 대화': '01', '일반 대화': '02', '직장 내 괴롭힘 대화': '03', '협박 대화': '04'}

pred_label = []

for text in test_data:
    # truncation=True mirrors the tokenizer settings used for the training
    # data. Without it, an over-length test text is fed to the model
    # untruncated and can exceed the model's maximum sequence length.
    # NOTE(review): assumes the installed transformers version forwards
    # tokenizer kwargs from the pipeline call - confirm.
    label = text_classifier(text, truncation=True)[0]['label']
    pred_label.append(labels[label])

In [None]:
import json

# Read the test file again to get a frame to attach the predictions to.
test_file_path = '/aiffel/aiffel/aiffelthon/test.json'
with open(test_file_path, mode='rt', encoding='utf-8') as f:
    test_dataset = pd.read_json(f)

test_data = test_dataset.transpose()

# Attach the predicted codes as a 'class' column, drop the input text, and
# transpose back.
submission = (
    test_data
    .assign(CLASS=pred_label)
    .rename(columns={'CLASS': 'class'})
    .drop(['text'], axis=1)
    .transpose()
)

# The output name is derived from the training file name minus its last four
# characters (the '.csv' suffix).
submission_file_path = '/aiffel/aiffel/aiffelthon/submission/submission_KlueBertBase_{}'.format(file_name[:-4])
result = submission.to_json(submission_file_path)

# Round-trip the written file through the json module to re-save it with
# 4-space indentation.
with open(submission_file_path) as f:
    parsed = json.load(f)

with open(submission_file_path, 'w') as f:
    json.dump(parsed, f, indent=4)