In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from transformers import TFBertModel, BertConfig, AutoTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout

In [2]:
# List the GPU devices that TensorFlow can see and report how many there are.
gpus = tf.config.list_physical_devices(device_type='GPU')
print("Num GPUs Available:", len(gpus))
print("GPU Details:", gpus)

Num GPUs Available: 1
GPU Details: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


### 데이터 전처리

In [3]:
# Mapping from the class name stored in the CSVs to the integer id used for training.
class_dict = {
    '협박 대화': 0,
    '갈취 대화': 1,
    '직장 내 괴롭힘 대화': 2,
    '기타 괴롭힘 대화': 3,
    '일반 대화': 4,
}

# Kaggle training data; the first CSV column is used as the index.
train_df = pd.read_csv('train.csv', index_col=0)
# Synthetic (generated) data; the first CSV column is used as the index.
gen_df = pd.read_csv('gen_data_final998.csv', index_col=0)

In [4]:
# Label every synthetic row as an ordinary conversation ('일반 대화').
# Rename first, then assign: if a 'topic' column exists it becomes 'class' in
# place, otherwise the rename is a no-op and 'class' is simply added/overwritten.
# Unlike add-'topic'-then-rename, this is safe to re-run: a second execution
# does not create a duplicate 'class' column.
gen_df = gen_df.rename(columns={'topic': 'class'})
gen_df['class'] = '일반 대화'

In [5]:
# Stack the Kaggle rows and the synthetic rows into one frame; ignore_index=True
# discards both original indexes and assigns a fresh 0..n-1 index.
data_df = pd.concat([train_df, gen_df], ignore_index=True)

In [6]:
# Replace each class name with its integer id; an unknown name raises KeyError.
data_df['class'] = [class_dict[class_name] for class_name in data_df['class']]

In [7]:
# Longest conversation measured in whitespace-separated words (not tokenizer tokens).
# NOTE(review): MAX_LEN is not referenced by any later visible cell.
data_len = list(map(lambda text: len(text.split()), data_df['conversation']))
MAX_LEN = max(data_len)
MAX_LEN

223

In [8]:
# Integer class ids as a plain list, in row order.
labels = [class_id for class_id in data_df['class']]
# Sanity check: the number of conversations and the number of labels should match.
(len(data_df['conversation']), len(labels))

(4948, 4948)

In [9]:
# Show the sorted distinct class ids present in labels.
np.unique(labels)

array([0, 1, 2, 3, 4])

In [10]:
# Number of distinct class ids; used to size the classification head.
num_classes = np.unique(labels).size

In [11]:
# Hugging Face model identifier passed to both AutoTokenizer and the classifier's from_pretrained.
model_name = "klue/bert-base"

In [12]:
# Load the tokenizer published under model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [13]:
# Tokenize every conversation into NumPy arrays of identical width.
token_data = tokenizer(
    list(data_df['conversation']),
    padding='max_length', # pad every sequence up to the maximum length
    truncation=True, # cut sequences that exceed the maximum length the model can handle
    return_tensors='np'
)

In [14]:
# Count the real (non-padding) tokens in each sequence via the attention mask.
# Measuring len() of each input_ids row is useless here: the tokenizer was
# called with padding='max_length', so every row has the same padded width and
# the reported maximum would always equal that width.
lengths = [int(mask.sum()) for mask in token_data['attention_mask']]
print(f"토크나이저 후 데이터 내 최대 시퀀스 길이: {max(lengths)}")

토크나이저 후 데이터 내 최대 시퀀스 길이: 512


### 모델

In [15]:
# Split row positions (rather than the data itself) so the same indices can be
# applied to every tokenizer output array and to the labels.
num_samples = len(data_df)
indices = np.arange(num_samples)

# 80/20 split with a fixed seed; stratify on the labels so both parts keep the
# same class proportions.
train_indices, val_indices = train_test_split(
    indices, stratify=labels, test_size=0.2, random_state=42
)

In [16]:
# Pick the train / validation rows out of every tokenizer output array.
train_inputs = {name: tf.gather(array, train_indices) for name, array in token_data.items()}
val_inputs = {name: tf.gather(array, val_indices) for name, array in token_data.items()}

# Select the labels with the same indices so they stay aligned with the inputs.
train_labels = tf.gather(labels, train_indices)
val_labels = tf.gather(labels, val_indices)

# Training pipeline: shuffle, then group into batches of 8.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_inputs, train_labels))
    .shuffle(buffer_size=10000)
    .batch(8)
)

# Validation pipeline: batches of 8, left unshuffled.
val_dataset = tf.data.Dataset.from_tensor_slices((val_inputs, val_labels)).batch(8)

In [17]:
# Print a single batch to eyeball the structure of the inputs and labels.
for first_batch in train_dataset.take(1):
    print(first_batch)

({'input_ids': <tf.Tensor: shape=(8, 512), dtype=int64, numpy=
array([[    2,  1396,  1507, ...,     0,     0,     0],
       [    2, 11683,  1269, ...,     0,     0,     0],
       [    2,  7082,  2059, ...,     0,     0,     0],
       ...,
       [    2,  1396,  7171, ...,     0,     0,     0],
       [    2,  1535,  2259, ...,     0,     0,     0],
       [    2,  1370,    18, ...,     0,     0,     0]])>, 'token_type_ids': <tf.Tensor: shape=(8, 512), dtype=int64, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(8, 512), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>}, <tf.Tensor: shape=(8,), dtype=int32, nu

In [18]:
# Load the pretrained KLUE model with a sequence-classification head producing
# num_classes logits; from_pt=True loads the weights from the PyTorch checkpoint.
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes, from_pt=True)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Run one dummy input through the model to build it and confirm the forward pass works.
dummy_input = tokenizer(
    ["더미 텍스트"],
    padding='max_length',
    truncation=True,
    return_tensors='tf'
)
model(dummy_input)

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(1, 5), dtype=float32, numpy=
array([[-0.23880652,  0.20831637,  0.0197846 ,  0.05509594,  0.09337355]],
      dtype=float32)>, hidden_states=None, attentions=None)

In [20]:
# Train only the classification head: keep the classifier trainable and
# freeze the BERT encoder.
model.classifier.trainable = True
model.bert.trainable = False

In [21]:
# Report the trainable flag of each top-level layer.
for sub_layer in model.layers:
    print(f"Layer {sub_layer.name} is trainable: {sub_layer.trainable}")

Layer bert is trainable: False
Layer dropout_37 is trainable: True
Layer classifier is trainable: True


In [22]:
# Print the layer and parameter summary, including trainable vs non-trainable parameter counts.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  110617344 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  3845      
                                                                 
Total params: 110621189 (421.99 MB)
Trainable params: 3845 (15.02 KB)
Non-trainable params: 110617344 (421.97 MB)
_________________________________________________________________


In [23]:
# The bert layer is frozen and only the classifier part is trained; print each
# top-level layer's trainable flag to confirm.
for sub_layer in model.layers:
    print(f"Layer {sub_layer.name} is trainable: {sub_layer.trainable}")

Layer bert is trainable: False
Layer dropout_37 is trainable: True
Layer classifier is trainable: True


In [24]:
# The model outputs raw logits, so the loss applies the softmax itself (from_logits=True);
# labels are integer class ids, hence the sparse variant.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = keras.optimizers.Adam(learning_rate=5e-5)

# Compile the model with accuracy as the reported metric.
model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [25]:
# Stop training once val_loss has gone 2 epochs without improving, and restore
# the weights from the best epoch.
early_stopping_cb = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=2)

# Checkpoint the weights whenever val_loss reaches a new best.
# save_weights_only must be True: the classifier is a subclassed Keras model
# (its summary reports 'multiple' output shapes), and Keras cannot write a whole
# subclassed model into a single HDF5 (.h5) file, so whole-model saving to
# 'klue_weight.h5' would raise at the first checkpoint. Reload later by
# rebuilding the model and calling model.load_weights('klue_weight.h5').
model_checkpoint_cb = keras.callbacks.ModelCheckpoint(
    filepath='klue_weight.h5',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)

In [26]:
# Upper bound on training epochs passed to model.fit; the early-stopping callback may end training sooner.
NUM_EPOCHS = 50

In [27]:
# Train on the batched training set and evaluate on the validation set after
# each epoch; only the early-stopping callback is attached.
model.fit(
    x=train_dataset,
    validation_data=val_dataset,
    epochs=NUM_EPOCHS,
    callbacks=[early_stopping_cb],
    verbose=1,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tf_keras.src.callbacks.History at 0x7d457bdbe290>

In [None]:
def predict(text):
    """Return the predicted class id for ``text``.

    The text is tokenized with the module-level ``tokenizer`` (padded to the
    maximum length, truncated if too long), passed through the module-level
    ``model``, and the index of the largest logit is taken.

    Args:
        text: Input accepted by the tokenizer. Only the prediction for the
            first row of the resulting batch is returned.

    Returns:
        int: Index of the highest-scoring class for the first row.
    """
    encoded = tokenizer(text, padding='max_length', truncation=True, return_tensors='tf')
    scores = model(encoded).logits
    best_per_row = tf.argmax(scores, axis=1)
    return int(best_per_row.numpy()[0])

In [None]:
# Predict a class id for every test text, one row at a time, and store the
# result in the submission frame.
# NOTE(review): neither `test_df` nor `submission` is created in any visible
# cell, so this fails on a fresh kernel unless they are loaded elsewhere — confirm.
submission['target'] = test_df['text'].map(predict)

In [34]:
# Write the submission frame to disk; the index is written as well (to_csv default).
# NOTE(review): `submission` is not created in any visible cell — confirm where it is loaded.
submission.to_csv('bert_klue_sub.csv')