# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, GlobalMaxPool1D, LayerNormalization, Dropout, MultiHeadAttention
from tensorflow.keras.models import Model
from tensorflow.keras.layers import TextVectorization
import matplotlib.pyplot as plt

# 0. Special tokens used for the classification task.
# Only "[CLS]" and "[SEP]" are inserted below (as string literals); this list
# itself is not referenced anywhere else in the visible code.
SPECIAL_TOKENS = ["[CLS]", "[SEP]", "[PAD]"]

# 1. Data preparation: load the CSV and expose the label column as 'target'.
df = pd.read_csv('train_merged.csv').rename(columns={'class': 'target'})

# Wrap every conversation as "[CLS] <text> [SEP]". The cast to str guarantees
# the concatenation works even for non-string cells.
conversation_text = df['conversation'].astype(str)
df['conversation'] = "[CLS] " + conversation_text + " [SEP]"

# Encode the class labels as integer ids; `le` is kept so that the number of
# classes (and the inverse mapping) is available later.
le = LabelEncoder()
df['target'] = le.fit_transform(df['target'])

# Stratified 80/20 train/validation split with a fixed seed.
X = df['conversation'].values
y = df['target'].values
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 2. Text vectorization (set up with the special tokens in mind).
VOCAB_SIZE = 20000  # upper bound on the vocabulary learned by adapt()
MAX_LEN = 128       # every sample is padded / truncated to this many token ids

# standardize=None switches off the layer's built-in text standardization so
# that the bracketed tokens added during data preparation reach the
# vocabulary unchanged.
vectorize_layer = TextVectorization(
    standardize=None,
    max_tokens=VOCAB_SIZE,
    output_sequence_length=MAX_LEN,
    output_mode='int',
    name='text_vectorization',
)

# Build the vocabulary from the training split only.
vectorize_layer.adapt(X_train)

# 3. Positional embedding layer (new)
class PositionalEmbedding(tf.keras.layers.Layer):
    """Add a learned position embedding to a learned token embedding.

    Args:
        max_len: Number of rows in the position table. The last axis of the
            input must not be longer than this.
        embed_dim: Output width of both embedding tables.
        vocab_size: Number of rows in the token table. When omitted, the
            module-level ``VOCAB_SIZE`` is used, which matches the previous
            hard-coded behaviour.
        **kwargs: Standard ``Layer`` keyword arguments (``name``, ``dtype``,
            ...), forwarded to the base class.
    """

    def __init__(self, max_len, embed_dim, vocab_size=None, **kwargs):
        super().__init__(**kwargs)
        if vocab_size is None:
            vocab_size = VOCAB_SIZE
        # Keep the constructor arguments so get_config() can reproduce them.
        self.max_len = max_len
        self.embed_dim = embed_dim
        self.vocab_size = vocab_size
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=max_len, output_dim=embed_dim)

    def call(self, inputs):
        """Return token embeddings plus position embeddings for ``inputs``."""
        # One position index per element along the last axis of the input.
        positions = tf.range(start=0, limit=tf.shape(inputs)[-1], delta=1)
        position_embeddings = self.pos_emb(positions)
        token_embeddings = self.token_emb(inputs)
        # The position embeddings have no batch axis and are broadcast.
        return token_embeddings + position_embeddings

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            'max_len': self.max_len,
            'embed_dim': self.embed_dim,
            'vocab_size': self.vocab_size,
        })
        return config

# 4. Transformer block (feed-forward activation changed from ReLU to GELU)
class TransformerBlock(tf.keras.layers.Layer):
    """Self-attention followed by a feed-forward network, each with a
    dropout, residual connection and layer normalization.

    Args:
        embed_dim: Width of the block's input and output; also passed as
            ``key_dim`` to the attention layer.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after attention and after the
            feed-forward network.
        **kwargs: Standard ``Layer`` keyword arguments (``name``, ``dtype``,
            ...), forwarded to the base class.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can reproduce them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation='gelu'),
            Dense(embed_dim)  # project back so the residual add lines up
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=False):
        """Apply the block to ``inputs``.

        ``training`` switches the dropout layers on or off. No explicit
        attention mask is passed to the attention layer.
        """
        # Self-attention: the same tensor is used as query and value.
        attn_output = self.att(inputs, inputs, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
            'rate': self.rate,
        })
        return config

# 5. Model builder (uses PositionalEmbedding instead of a plain Embedding)
def create_gpt1_model(vocab_size, max_len, embed_dim, num_heads, ff_dim, num_layers, num_classes):
    """Build and compile the text classifier.

    Pipeline: raw string -> module-level ``vectorize_layer`` ->
    ``PositionalEmbedding`` -> ``num_layers`` x ``TransformerBlock`` ->
    global max pooling over the sequence axis -> softmax over
    ``num_classes``.

    Args:
        vocab_size: Accepted for interface compatibility; this function does
            not read it (the embedding layer is built from ``max_len`` and
            ``embed_dim`` only).
        max_len: Passed to ``PositionalEmbedding`` as the position-table size.
        embed_dim: Embedding / hidden width.
        num_heads: Attention heads per transformer block.
        ff_dim: Feed-forward hidden width per transformer block.
        num_layers: Number of stacked transformer blocks.
        num_classes: Size of the softmax output.

    Returns:
        A Keras ``Model`` compiled with Adam, sparse categorical
        cross-entropy and an accuracy metric.
    """
    text_input = Input(shape=(1,), dtype=tf.string)

    token_ids = vectorize_layer(text_input)
    hidden = PositionalEmbedding(max_len, embed_dim)(token_ids)

    # A fresh block per iteration, so the blocks do not share weights.
    for _layer_index in range(num_layers):
        block = TransformerBlock(embed_dim, num_heads, ff_dim)
        hidden = block(hidden)

    pooled = GlobalMaxPool1D()(hidden)
    class_probs = Dense(num_classes, activation='softmax')(pooled)

    classifier = Model(text_input, class_probs)
    classifier.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

# 6. Language-model wrapper used for pre-training (new)
def create_pretrain_model(base_model):
    """Build a next-token prediction model that shares ``base_model``'s encoder.

    The vocabulary head is attached to the output of the layer that feeds
    ``base_model``'s final layer, not to the final layer itself. Previously
    the head consumed the classifier's softmax output, so the language-model
    signal had to pass through a ``num_classes``-wide probability vector.

    Assumes ``base_model`` is a functional model whose last layer is the
    task-specific classification head (as built by ``create_gpt1_model``).

    Args:
        base_model: The classifier whose encoder layers are reused. The
            layers are shared, not copied, so training the returned model
            also updates ``base_model``'s encoder weights.

    Returns:
        A Keras ``Model`` mapping a raw string to a softmax over
        ``VOCAB_SIZE`` token ids, compiled with Adam, sparse categorical
        cross-entropy and an accuracy metric.
    """
    # Everything up to, but excluding, the classification head.
    encoder = Model(base_model.input, base_model.layers[-2].output)

    inputs = Input(shape=(1,), dtype=tf.string)
    features = encoder(inputs)
    outputs = Dense(VOCAB_SIZE, activation='softmax')(features)  # next-token prediction
    pretrain_model = Model(inputs, outputs)
    pretrain_model.compile(optimizer='adam',
                           loss='sparse_categorical_crossentropy',
                           metrics=['accuracy'])
    return pretrain_model

# 7. Training pipeline
class F1Callback(tf.keras.callbacks.Callback):
    """Record the validation F1 score at the end of every epoch.

    This file used ``F1Callback`` without defining or importing it, so the
    class is provided here. Macro averaging is used so every class counts
    equally; change ``average`` if a different convention is required.

    Args:
        X_val: Validation texts, in the same form that is passed to ``fit``.
        y_val: Integer class ids for ``X_val``.

    Attributes:
        val_f1: One F1 value per finished epoch, in order.
    """

    def __init__(self, X_val, y_val):
        super().__init__()
        self.X_val = tf.convert_to_tensor(X_val)
        self.y_val = np.asarray(y_val)
        self.val_f1 = []

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation set and store the resulting F1."""
        probabilities = self.model.predict(self.X_val, verbose=0)
        predictions = np.argmax(probabilities, axis=-1)
        score = f1_score(self.y_val, predictions, average='macro')
        self.val_f1.append(score)
        if logs is not None:
            # Also expose the value to Keras' history / other callbacks.
            logs['val_f1'] = score


base_model = create_gpt1_model(
    vocab_size=VOCAB_SIZE,
    max_len=MAX_LEN,
    embed_dim=128,
    num_heads=4,
    ff_dim=256,
    num_layers=2,
    num_classes=len(le.classes_)
)

# Pre-training stage (example only; a real run needs a large unlabeled corpus).
pretrain_model = create_pretrain_model(base_model)
# pretrain_model.fit(large_unlabeled_data, ...)  # enable in a real implementation

# Fine-tuning stage.
# Keep a reference to the callback: the plotting code reads
# `f1_callback.val_f1` after training.
f1_callback = F1Callback(X_val, y_val)
history = base_model.fit(
    tf.convert_to_tensor(X_train),
    tf.convert_to_tensor(y_train),
    validation_data=(tf.convert_to_tensor(X_val), tf.convert_to_tensor(y_val)),
    epochs=10,
    batch_size=32,
    callbacks=[f1_callback]
)

# 8. Plot the learning curves: accuracy, loss and F1 side by side.
def _draw_panel(position, title, y_label, curves):
    """Draw one of the three subplots.

    Args:
        position: 1-based index of the subplot within the 1x3 grid.
        title: Title of the subplot.
        y_label: Label of the y axis.
        curves: Iterable of ``(values, legend_label)`` pairs, drawn in order.
    """
    plt.subplot(1, 3, position)
    for values, legend_label in curves:
        plt.plot(values, label=legend_label)
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()
    plt.grid(True)


plt.figure(figsize=(15, 5))

_draw_panel(1, 'Accuracy', 'Accuracy', [
    (history.history['accuracy'], 'Train Acc'),
    (history.history['val_accuracy'], 'Val Acc'),
])

_draw_panel(2, 'Loss', 'Loss', [
    (history.history['loss'], 'Train Loss'),
    (history.history['val_loss'], 'Val Loss'),
])

# Per-epoch F1 values collected by the callback object named `f1_callback`.
_draw_panel(3, 'F1 Score', 'F1 Score', [
    (f1_callback.val_f1, 'F1'),
])

plt.tight_layout()
plt.show()
