In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from transformers import TFBertModel, BertConfig, AutoTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout

In [2]:
# List the physical GPUs TensorFlow can see and report how many there are.
gpus = tf.config.list_physical_devices('GPU')
gpu_count = len(gpus)
print("Num GPUs Available:", gpu_count)
print("GPU Details:", gpus)

Num GPUs Available: 0
GPU Details: []


### 데이터 전처리

In [13]:
# Class name -> integer class id used as the training target.
class_dict = {
    '협박 대화': 0,
    '갈취 대화': 1,
    '직장 내 괴롭힘 대화': 2,
    '기타 괴롭힘 대화': 3,
    '일반 대화': 4,
}

# Kaggle data; the first CSV column is used as the index.
train_df = pd.read_csv('../data/train.csv', index_col=0)

# Synthetic data; the first CSV column is used as the index.
gen_df = pd.read_csv('../data/gen_data_final998.csv', index_col=0)

In [14]:
# Label every synthetic row as '일반 대화' under the 'class' column.
# Rename first, then assign: this gives the same result as assigning 'topic'
# and renaming it, but stays correct when the cell is executed again (the
# assign-then-rename order produced a second, duplicate 'class' column on a
# re-run, because 'topic' was re-created next to the already renamed column).
gen_df = gen_df.rename(columns={'topic': 'class'})
gen_df['class'] = '일반 대화'

In [15]:
# Stack the Kaggle rows and the synthetic rows into one frame with a fresh 0..n-1 index.
data_df = pd.concat((train_df, gen_df), axis=0, ignore_index=True)

In [16]:
# Replace the textual class names with their integer ids from class_dict.
# The guard keeps the cell re-runnable: once the column holds only ids, mapping
# again would fail with KeyError (the ids are not keys of class_dict), so the
# conversion is skipped. Unknown class names still raise KeyError as before.
if not data_df['class'].isin(list(class_dict.values())).all():
    data_df['class'] = data_df['class'].apply(lambda x: class_dict[x])

In [17]:
# Longest conversation in the data, measured in whitespace-separated words
# (a word count, not a tokenizer token count).
data_len = list(map(lambda conversation: len(conversation.split()), data_df['conversation']))
MAX_LEN = max(data_len)
MAX_LEN

223

In [18]:
labels = data_df['class'].tolist()
# Check that the number of conversations matches the number of labels.
(data_df['conversation'].shape[0], len(labels))

(4948, 4948)

In [19]:
# Show the distinct label values present in the data.
np.unique(labels)

array([0, 1, 2, 3, 4])

In [20]:
# Number of distinct label values; used as the size of the classification head.
num_classes = np.unique(labels).size

In [21]:
# Pretrained model identifier used for both the tokenizer and the classifier below.
model_name = "klue/bert-base"

In [22]:
# Load the tokenizer that belongs to model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Tokenize every conversation in one call and get NumPy arrays back.
token_data = tokenizer(
    data_df['conversation'].tolist(),
    truncation=True,       # cut off anything beyond the maximum length the model can handle
    padding='max_length',  # pad every sequence up to the maximum length
    return_tensors='np',
)

In [15]:
# Length of each encoded sequence, then report the longest one.
lengths = list(map(len, token_data['input_ids']))
print(f"토크나이저 후 데이터 내 최대 시퀀스 길이: {max(lengths)}")

토크나이저 후 데이터 내 최대 시퀀스 길이: 512


### 모델

In [16]:
# Split row positions (rather than the arrays themselves) 80/20 into train / validation.
num_samples = data_df.shape[0]    # total number of samples
indices = np.arange(num_samples)  # one index per sample

train_indices, val_indices = train_test_split(
    indices,
    stratify=labels,  # keep the class proportions the same in both splits
    test_size=0.2,
    random_state=42,
)

In [17]:
# Pick the train / validation rows out of every tokenizer output array.
train_inputs, val_inputs = {}, {}
for key, encoded in token_data.items():
    train_inputs[key] = tf.gather(encoded, train_indices)
    val_inputs[key] = tf.gather(encoded, val_indices)

# Select the labels with the same indices so they stay aligned with the inputs.
train_labels = tf.gather(labels, train_indices)
val_labels = tf.gather(labels, val_indices)

# Training dataset: shuffled, then grouped into batches of 8.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_inputs, train_labels))
    .shuffle(buffer_size=10000)
    .batch(8)
)

# Validation dataset: batched only; validation data is usually not shuffled.
val_dataset = tf.data.Dataset.from_tensor_slices((val_inputs, val_labels)).batch(8)

In [18]:
# Print the first batch only, to inspect the structure of one (inputs, labels) pair.
print(next(iter(train_dataset)))

({'input_ids': <tf.Tensor: shape=(8, 512), dtype=int64, numpy=
array([[    2,  5680,  2098, ...,     0,     0,     0],
       [    2,  3776,  1513, ...,     0,     0,     0],
       [    2, 16094,  2065, ...,     0,     0,     0],
       ...,
       [    2,   743,   732, ...,     0,     0,     0],
       [    2,  4442,   743, ...,     0,     0,     0],
       [    2,  1396,  1558, ...,     0,     0,     0]])>, 'token_type_ids': <tf.Tensor: shape=(8, 512), dtype=int64, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(8, 512), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>}, <tf.Tensor: shape=(8,), dtype=int32, nu

In [19]:
# Load the pretrained KLUE model with a classification head of num_classes outputs.
# from_pt=True requests loading from PyTorch weights.
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes, from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Run a single dummy sentence through the model to build it and check that it works.
dummy_input = tokenizer(
    ["더미 텍스트"],
    return_tensors='tf',
    truncation=True,
    padding='max_length',
)
model(dummy_input)

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(1, 5), dtype=float32, numpy=
array([[ 0.25585747, -0.04101317,  0.2202857 , -0.03755243, -0.08563735]],
      dtype=float32)>, hidden_states=None, attentions=None)

In [21]:
# Freeze the BERT encoder and leave the classification head trainable.
model.bert.trainable = False
model.classifier.trainable = True

In [22]:
# Print the trainable flag of every top-level layer of the model.
for layer in model.layers:
    print(f"Layer {layer.name} is trainable: {layer.trainable}")

Layer bert is trainable: False
Layer dropout_37 is trainable: True
Layer classifier is trainable: True


In [23]:
# Show the layers together with the trainable / non-trainable parameter counts.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  110617344 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  3845      
Total params: 110,621,189
Trainable params: 3,845
Non-trainable params: 110,617,344
_________________________________________________________________


In [24]:
# The bert layer is frozen and only the classifier part is trained;
# print the trainable flag of every top-level layer to confirm it.
for layer in model.layers:
    print(f"Layer {layer.name} is trainable: {layer.trainable}")

Layer bert is trainable: False
Layer dropout_37 is trainable: True
Layer classifier is trainable: True


In [28]:
# 10. Compile the model
# from_logits=True: the loss is applied to raw logits, and labels are integer class ids.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = keras.optimizers.Adam(learning_rate=5e-5)
model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [29]:
# Stop training when val_loss has not improved for 2 epochs and restore the
# weights of the best epoch.
early_stopping_cb = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=2)

# Save a checkpoint whenever val_loss improves.
# save_weights_only=True is required here: the model is a subclassed Keras model,
# and saving the *whole* model to an HDF5 ('.h5') file is only supported for
# Functional / Sequential models, so save_weights_only=False fails on the first
# save. Restore with model.load_weights('klue_weight.h5') on a model built the
# same way.
model_checkpoint_cb = keras.callbacks.ModelCheckpoint(
    filepath='klue_weight.h5',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)

In [30]:
# Maximum number of training epochs passed to model.fit.
NUM_EPOCHS = 50

In [None]:
# Train on train_dataset with val_dataset as validation data.
# Only early stopping is registered; model_checkpoint_cb is not passed to fit().
active_callbacks = [early_stopping_cb]
model.fit(
    train_dataset,
    validation_data=val_dataset,
    callbacks=active_callbacks,
    epochs=NUM_EPOCHS,
    verbose=1,
)

Epoch 1/50
 14/495 [..............................] - ETA: 2:36:07 - loss: 1.6468 - accuracy: 0.2500

In [None]:
def predict(text):
    """Return the predicted class id for a piece of text.

    Args:
        text: The text to classify. If a list of texts is given, it is
            tokenized as one batch but only the first prediction is returned.

    Returns:
        int: Index of the largest logit for the first input.
    """
    # padding='longest' adds no padding for a single text, so the model only
    # processes the real tokens instead of a sequence padded out to the maximum
    # length. Padded positions are masked by attention_mask anyway, so dropping
    # them only saves compute. Over-long inputs are still truncated.
    inputs = tokenizer(text, return_tensors='tf', padding='longest', truncation=True)
    # training=False makes inference mode explicit.
    logits = model(inputs, training=False).logits
    return int(tf.argmax(logits, axis=1).numpy()[0])

In [None]:
# Load the test inputs and the submission template; the first CSV column is the index in both.
test_df = pd.read_csv('../data/test.csv', index_col=0)
submission = pd.read_csv('../data/submission.csv', index_col=0)

In [None]:
# Classify each test text one at a time with predict() and store the result as 'target'.
# NOTE(review): the assigned Series is matched to submission by index label, so this
# assumes test_df and submission share the same index — confirm.
submission['target'] = test_df['text'].apply(predict)

In [None]:
# Write the filled-in submission frame to a CSV file in the working directory.
submission.to_csv('bert_klue_sub.csv')