In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from transformers import TFBertModel, BertConfig, AutoTokenizer, TFBertForSequenceClassification, TFAutoModel
from sklearn.model_selection import train_test_split
from tensorflow import keras
import tf_keras
from tf_keras.src.layers import Input, Dense, Dropout
from tf_keras.src.models import Model

In [2]:
from tf_keras.metrics import Metric
from tf_keras.metrics import Precision, Recall

class MacroF1Score(Metric):
    def __init__(self, num_classes, name='macro_f1_score', **kwargs):
        super(MacroF1Score, self).__init__(name=name, **kwargs)
        self.num_classes = num_classes
        self.precision = [tf.keras.metrics.Precision(class_id=i) for i in range(num_classes)]
        self.recall = [tf.keras.metrics.Recall(class_id=i) for i in range(num_classes)]

    def update_state(self, y_true, y_pred, sample_weight=None):
        # y_pred: 소프트맥스 출력, y_true: 정수 라벨
        y_pred = tf.argmax(y_pred, axis=-1)  # 예측 클래스
        y_true = tf.cast(y_true, tf.int64)   # 정수형 라벨
        for i in range(self.num_classes):
            y_true_i = tf.cast(tf.equal(y_true, i), tf.float32)
            y_pred_i = tf.cast(tf.equal(y_pred, i), tf.float32)
            self.precision[i].update_state(y_true_i, y_pred_i, sample_weight)
            self.recall[i].update_state(y_true_i, y_pred_i, sample_weight)

    def result(self):
        f1_scores = []
        for i in range(self.num_classes):
            p = self.precision[i].result()
            r = self.recall[i].result()
            f1 = 2 * (p * r) / (p + r + tf.keras.backend.epsilon())
            f1_scores.append(f1)
        return tf.reduce_mean(f1_scores)  # 매크로 평균

    def reset_states(self):
        for i in range(self.num_classes):
            self.precision[i].reset_states()
            self.recall[i].reset_states()

In [3]:
def get_token_data(data, tokenizer):
    return_data = tokenizer(
    data,
    padding='max_length', # 자동으로 최대 길이로 패딩해줌
    truncation=True, # 모델이 감당 가능한 최대 길이 초과하면 자름
    return_tensors='np',
    # max_length = token_max_len)
    )
    return return_data

In [4]:
print(tf.__version__)

2.18.0


In [5]:
# 사용 가능한 GPU 목록 확인
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available:", len(gpus))
print("GPU Details:", gpus)

Num GPUs Available: 0
GPU Details: []


### 데이터 전처리

In [6]:
class_dict = {'협박 대화': 0, '갈취 대화':1, '직장 내 괴롭힘 대화':2, '기타 괴롭힘 대화':3, '일반 대화':4}

In [8]:
train_df = pd.read_csv('trainset_v1.csv') # 캐글 데이터
val_df = pd.read_csv('valset_v1.csv') # 합성데이터

In [9]:
train_df

Unnamed: 0,class,conversation
0,0,너네 딸 어디있어?\n 누구시죠?\n 너네 딸이 우리 딸 이렇게 만들었다고! 당장 ...
1,0,입금받았으니 물건 빨리 보내시죠\n 당분간 바빠서 그래요\n 장난하십니까.벌써 몇번...
2,2,이사원 이업무좀 내일까지 할수있나\n저 지금 하고있는거도있고 내일까지는 힘들거같습니...
3,1,안녕 꼬마야 몇살이야\n안녕하세요 6살이요\n손에 든건뭐야?\n돈이에요 엄마가 애호...
4,1,아니 이자가 왜 이리 늘어요? 너무한거 아니에요\n뭐 새끼야 애초에 돈을 빌렸으면 ...
...,...,...
3870,4,이상으로 이번 프로젝트의 주요 성과를 말씀드렸습니다. 질문 있으시면 부탁드립니다.\...
3871,1,저기요 저 1000원만 주실 수 있나요?\n네? 제가 왜요?\n제가 버스비가 없어서...
3872,1,내가 너 도와줬으니까 돈 줘\n그런게 어딨어 밥만 사줄게\n돈 내놓으라고\n왜 밥으...
3873,3,병신이 아이스크림 먹게 돼 있냐?\n난 먹으면 안 돼? 그만 좀 해.\n당연히 안 ...


In [10]:
all_df = pd.concat([train_df, val_df])

In [11]:
# 모든 데이터의 공백 제거 최대 길이를 구함
data_len = [len(x.split()) for x in all_df['conversation']]
MAX_LEN = max(data_len)
MAX_LEN

223

In [12]:
# 훈련 데이터 class 개수
train_df['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
3,809
4,798
1,778
2,776
0,714


In [13]:
# 훈련 데이터 class 개수
val_df['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
3,202
4,200
1,195
2,194
0,178


In [49]:
num_classes = len(np.unique(train_df['class']))
print(f'class 개수 : {num_classes}')

class 개수 : 5


### 토크나이저

In [16]:
model_name = "klue/bert-base"

In [17]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [28]:
train_labels = tf.convert_to_tensor(train_df['class'])
train_token_data = get_token_data(list(train_df['conversation']), tokenizer)
train_token_data = {key: tf.convert_to_tensor(train) for key, train in train_token_data.items()}

In [31]:
val_labels = tf.convert_to_tensor(val_df['class'])
val_token_data = get_token_data(list(val_df['conversation']), tokenizer)
val_token_data = {key: tf.convert_to_tensor(val) for key, val in val_token_data.items()}

In [29]:
train_token_data

{'input_ids': <tf.Tensor: shape=(3875, 512), dtype=int64, numpy=
 array([[    2,   743,  2203, ...,     0,     0,     0],
        [    2, 12793,  2757, ...,     0,     0,     0],
        [    2,  4188,  2252, ...,     0,     0,     0],
        ...,
        [    2,   732,  2116, ...,     0,     0,     0],
        [    2, 26784,  2052, ...,     0,     0,     0],
        [    2,  1453,  1504, ...,     0,     0,     0]])>,
 'token_type_ids': <tf.Tensor: shape=(3875, 512), dtype=int64, numpy=
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])>,
 'attention_mask': <tf.Tensor: shape=(3875, 512), dtype=int64, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]])>}

In [32]:
train_labels

<tf.Tensor: shape=(3875,), dtype=int64, numpy=array([0, 0, 2, ..., 1, 3, 4])>

In [33]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_token_data, train_labels))
train_dataset = train_dataset.shuffle(buffer_size=10000).batch(8) # 셔플 및 배치

# 예시: 검증 데이터셋 생성
val_dataset = tf.data.Dataset.from_tensor_slices((val_token_data, val_labels))
val_dataset = val_dataset.batch(8) # 검증 데이터는 보통 셔플하지 않음

In [34]:
lengths = [len(seq) for seq in train_token_data['input_ids']]
print(f"토크나이저 후 데이터 내 최대 시퀀스 길이: {max(lengths)}")

토크나이저 후 데이터 내 최대 시퀀스 길이: 512


In [35]:
for i in train_dataset:
    print(i)
    break

({'input_ids': <tf.Tensor: shape=(8, 512), dtype=int64, numpy=
array([[    2,   543, 11683, ...,     0,     0,     0],
       [    2,  3997,  2720, ...,     0,     0,     0],
       [    2,  3611,  1662, ...,     0,     0,     0],
       ...,
       [    2,  3660,   743, ...,     0,     0,     0],
       [    2,  1119,  6200, ...,     0,     0,     0],
       [    2,   801,  2116, ...,     0,     0,     0]])>, 'token_type_ids': <tf.Tensor: shape=(8, 512), dtype=int64, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(8, 512), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>}, <tf.Tensor: shape=(8,), dtype=int64, nu

### 모델

In [36]:
# 사전학습된 klue 모델의 bert 인코더만 불러오기 -> TFAutoModel 사용

bert_encoders = TFAutoModel.from_pretrained(model_name, num_labels=num_classes, from_pt=True)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'bert.embeddings.position_ids', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [37]:
# 4. 더미 입력으로 모델 빌드
dummy_input = get_token_data(["더미 텍스트"], tokenizer)
# bert_encoders(dummy_input)  # 모델 호출로 빌드

In [38]:
input_layers = {}
for name, tensor in dummy_input.items(): # 토크나이저의 출력값을 알기 위해서 더미데이터 출력값 가져옴
    shape = tensor.shape[1:]  # batch dimension 제외
    input_layers[name] = Input(shape=shape, dtype='int32', name=name) # input_layer 정의 ( tokneizer의 출력값 전부가 input으로 들어감 )
input_layers

{'input_ids': <KerasTensor: shape=(None, 512) dtype=int32 (created by layer 'input_ids')>,
 'token_type_ids': <KerasTensor: shape=(None, 512) dtype=int32 (created by layer 'token_type_ids')>,
 'attention_mask': <KerasTensor: shape=(None, 512) dtype=int32 (created by layer 'attention_mask')>}

In [39]:
# bert 인코더 통과 시키고
bert_outputs = bert_encoders(input_layers)
pooled_output = bert_outputs.pooler_output

In [40]:
# dense 층 추가
dense1 = Dense(1024, activation='relu')(pooled_output)
dropout1 = Dropout(0.5)(dense1)
dense2 = Dense(512, activation='relu')(dropout1)
dropout2 = Dropout(0.5)(dense2)
classifier1 = Dense(num_classes)(dropout2)

In [41]:
model = Model(inputs=input_layers, outputs=classifier1)

In [42]:
# 사전학습된 bert 인코더 훈련 X

bert_encoders.trainable = False

In [43]:
for layer in model.layers:
    print(f"Layer {layer.name} is trainable: {layer.trainable}")

Layer attention_mask is trainable: True
Layer input_ids is trainable: True
Layer token_type_ids is trainable: True
Layer tf_bert_model is trainable: False
Layer dense is trainable: True
Layer dropout_37 is trainable: True
Layer dense_1 is trainable: True
Layer dropout_38 is trainable: True
Layer dense_2 is trainable: True


In [44]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 attention_mask (InputLayer  [(None, 512)]                0         []                            
 )                                                                                                
                                                                                                  
 input_ids (InputLayer)      [(None, 512)]                0         []                            
                                                                                                  
 token_type_ids (InputLayer  [(None, 512)]                0         []                            
 )                                                                                                
                                                                                              

In [45]:
from tf_keras.optimizers import Adam  # tf_keras에서 Adam 가져오기
f1 = MacroF1Score(num_classes=num_classes)
optimizer = Adam(learning_rate=5e-5)
loss = tf_keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy', f1])

In [46]:
early_stopping_cb = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=2)

# # ModelCheckpoint 콜백 수정
# model_checkpoint_cb = keras.callbacks.ModelCheckpoint(
#     filepath='model_weight.h5', # 파일 확장자를 .keras (권장) 또는 .h5 로 지정
#     monitor='val_loss',
#     save_best_only=True,
#     save_weights_only=False,      # 전체 모델 저장 (기본값이므로 생략 가능)
#     verbose=1
# )

In [47]:
NUM_EPOCHS = 50

In [48]:
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=NUM_EPOCHS,
    verbose=1,
    callbacks=[early_stopping_cb] # 정의된 콜백 전달
    # callbacks=[early_stopping_cb, model_checkpoint_cb] # 정의된 콜백 전달
)

Epoch 1/50
  3/485 [..............................] - ETA: 3:31:55 - loss: 1.8598 - accuracy: 0.0833 - macro_f1_score: 0.0000e+00

KeyboardInterrupt: 

In [None]:
def predict(text):
    inputs = get_token_data([text], tokenizer)
    logits = model(inputs).logits
    return int(tf.argmax(logits, axis=1).numpy()[0])

In [None]:
test_df = pd.read_csv('test.csv', index_col=0)
submission = pd.read_csv('submission.csv', index_col=0)

In [None]:
submission['target'] = test_df['text'].apply(predict)