# 트랜스포머 기초

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, LayerNormalization, Dropout, Add, Input
from tensorflow.keras.optimizers import Adam
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# !git clone https://github.com/taehojo/data.git

# CSV 파일 로드
dataframe = pd.read_csv("./data/sentiment_data.csv")

# 데이터와 라벨 추출
sentences = dataframe["sentence"].tolist()
labels = dataframe["label"].tolist()

# 임베딩 벡터 크기와 최대 문장 길이 설정
embedding_dim = 128
max_len = 10

# 토크나이저 초기화 및 텍스트를 시퀀스로 변환
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
word_index = tokenizer.word_index

# 패딩을 사용하여 시퀀스 길이를 동일하게 맞춤
data = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_len, padding="post")

# 데이터셋을 훈련 세트와 검증 세트로 분리
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, random_state=42)

2025-04-27 20:06:20.313526: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-27 20:06:20.326481: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745751980.337955  714596 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745751980.340808  714596 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745751980.349053  714596 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# 포지셔널 인코딩 함수


def get_positional_encoding(max_len, d_model):
    pos_enc = np.zeros((max_len, d_model))  # 포지셔널 인코딩 배열 초기화. max_len: 최대 시퀀스 길이, d_model:임베딩 벡터의 차원

    # 시퀀스의 각 위치에 대해 포지셔널 인코딩 값 계산
    for pos in range(max_len):
        for i in range(0, d_model, 2):
            pos_enc[pos, i] = np.sin(pos / (10000 ** (2 * i / d_model)))  # 짝수 인덱스
            if i + 1 < d_model:
                pos_enc[pos, i + 1] = np.cos(pos / (10000 ** (2 * (i + 1) / d_model)))  # 홀수 인덱스


# 포지셔널 인코딩 생성
positional_encoding = get_positional_encoding(max_len, embedding_dim)

In [3]:
# 멀티헤드 어텐션 레이어


class MultiHeadSelfAttentionLayer(tf.keras.layers.Layer):
    def __init__(self, num_heads, key_dim):
        super(MultiHeadSelfAttentionLayer, self).__init__()  # 레이어가 생성될 때 한 번 실행.
        self.mha = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=key_dim
        )  # 멀티헤드 어텐션 레이어 생성. num_heads: 어텐션 헤드의 수, key_dim은 각 헤드의 차원 수
        self.norm = LayerNormalization()  # 레이어 정규화

    def call(self, x):
        attn_output = self.mha(query=x, value=x, key=x)  # 멀티헤드 어텐션 적용
        attn_output = self.norm(attn_output + x)  # 잔차 연결 적용
        return attn_output

# 트랜스포머 응용

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, LayerNormalization, Dropout, Add, Input, Lambda
from tensorflow.keras.optimizers import Adam
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# !git clone https://github.com/taehojo/data.git

# CSV 파일 로드
dataframe = pd.read_csv("./data/sentiment_data.csv")

# 데이터와 라벨 추출
sentences = dataframe["sentence"].tolist()
labels = dataframe["label"].tolist()

# 임베딩 벡터 크기와 최대 문장 길이 설정
embedding_dim = 128
max_len = 10

# 토크나이저 초기화 및 텍스트를 시퀀스로 변환
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
word_index = tokenizer.word_index

# 패딩을 사용하여 시퀀스 길이를 동일하게 맞춤
data = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_len, padding="post")

# 데이터셋을 훈련 세트와 검증 세트로 분리
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, random_state=42)


# 포지셔널 인코딩 함수
def get_positional_encoding(max_len, d_model):
    pos_enc = np.zeros((max_len, d_model))
    for pos in range(max_len):
        for i in range(0, d_model, 2):
            pos_enc[pos, i] = np.sin(pos / (10000 ** (2 * i / d_model)))
            if i + 1 < d_model:
                pos_enc[pos, i + 1] = np.cos(pos / (10000 ** (2 * (i + 1) / d_model)))
    return pos_enc


# 포지셔널 인코딩 생성
positional_encoding = get_positional_encoding(max_len, embedding_dim)


# 멀티헤드 어텐션 레이어
class MultiHeadSelfAttentionLayer(tf.keras.layers.Layer):
    def __init__(self, num_heads, key_dim, masked=False):
        super(MultiHeadSelfAttentionLayer, self).__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)
        self.norm = LayerNormalization()
        self.masked = masked  # 새롭게 추가된 부분

    def call(self, x):
        if self.masked:
            # 마스크드 어텐션을 적용할 경우
            batch_size = tf.shape(x)[0]  # 입력 x의 배치 크기
            seq_len = tf.shape(x)[1]  # 입력 x의 시퀀스 길이

            # 마스크 행렬 생성
            mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)  # tf.linalg.band_part: 대각선 이전 값들을 1로 채우고 나머지는 0으로
            mask = tf.reshape(mask, (1, 1, seq_len, seq_len))
            # 마스크 행렬의 형태를 만들어 주는 부분 -->    # 만약 [1, 1, 4, 4] 라면
            # [[1, 0, 0, 0],
            #  [1, 1, 0, 0],
            #  [1, 1, 1, 0],
            #  [1, 1, 1, 1]] 이런 형태로 변환됨

            mask = tf.tile(mask, [batch_size, 1, 1, 1])  # 배치 크기만큼 마스크를 반복하여 확장

            # 마스크 행렬을 -무한대로 변경
            mask = mask * -1e9

            # 마스크를 사용한 멀티헤드 어텐션
            attn_output = self.mha(query=x, value=x, key=x, attention_mask=mask)
        else:
            # 마스크 없이 멀티헤드 어텐션을 적용할 경우
            attn_output = self.mha(query=x, value=x, key=x)

        attn_output = self.norm(attn_output + x)  # 잔차 연결 적용 후 레이어 정규화

        return attn_output


# 모델 설정
inputs = Input(shape=(max_len,))

# 1. 임베딩 레이어: 텍스트 데이터를 임베딩 벡터로 변환
embedding_layer = Embedding(input_dim=len(word_index) + 1, output_dim=embedding_dim, input_length=max_len)
embedded_sequences = embedding_layer(inputs)

# 2. 포지셔널 인코딩 추가
embedded_sequences_with_positional_encoding = embedded_sequences + positional_encoding

# 3. 멀티헤드 어텐션 레이어 추가
attention_layer = MultiHeadSelfAttentionLayer(num_heads=8, key_dim=embedding_dim)
attention_output = attention_layer(embedded_sequences_with_positional_encoding)

# 4. 잔차 연결
attention_output_with_residual = Add()([embedded_sequences_with_positional_encoding, attention_output])

# 5. 마스크드 멀티헤드 어텐션 레이어 추가
masked_attention_layer = MultiHeadSelfAttentionLayer(num_heads=8, key_dim=embedding_dim, masked=True)  # masked=True 적용
masked_attention_output = masked_attention_layer(attention_output_with_residual)

# 6. 잔차 연결
masked_attention_output_with_residual = Add()([attention_output_with_residual, masked_attention_output])

# 7. GlobalAveragePooling1D 레이어 추가
pooled_output = GlobalAveragePooling1D()(masked_attention_output_with_residual)

# 8. 피드 포워드 네트워크
dense_layer = Dense(128, activation="relu")(pooled_output)
dropout_layer = Dropout(0.5)(dense_layer)
output_layer = Dense(1, activation="sigmoid")(dropout_layer)

# 모델 생성
model = Model(inputs=inputs, outputs=output_layer)

# 모델 컴파일
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# 모델 학습
history = model.fit(X_train, np.array(y_train), epochs=10, batch_size=16, validation_data=(X_val, np.array(y_val)))

# 샘플 데이터 예측
sample_texts = ["I absolutely love this!", "I can't stand this product"]
sample_sequences = tokenizer.texts_to_sequences(sample_texts)
sample_data = tf.keras.preprocessing.sequence.pad_sequences(sample_sequences, maxlen=max_len, padding="post")
predictions = model.predict(sample_data)

for i, text in enumerate(sample_texts):
    print(f"Text: {text}")
    print(f"Prediction: {'Positive' if predictions[i] > 0.5 else 'Negative'}")

2025-04-27 20:09:18.568265: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-27 20:09:18.577987: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745752158.588531  716485 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745752158.591374  716485 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745752158.600260  716485 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Epoch 1/10


I0000 00:00:1745752165.097394  716643 service.cc:152] XLA service 0x7f556401ba00 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1745752165.097449  716643 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 4070 Ti SUPER, Compute Capability 8.9
2025-04-27 20:09:25.172963: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1745752165.505686  716643 cuda_dnn.cc:529] Loaded cuDNN version 90501













[1m 63/100[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.5273 - loss: 0.9174

I0000 00:00:1745752170.883503  716643 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.5219 - loss: 0.8672 - val_accuracy: 0.4975 - val_loss: 0.6478
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8243 - loss: 0.3650 - val_accuracy: 0.9975 - val_loss: 0.0152
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9881 - loss: 0.0636 - val_accuracy: 0.9975 - val_loss: 0.0090
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9890 - loss: 0.0591 - val_accuracy: 1.0000 - val_loss: 7.8207e-04
Epoch 5/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9959 - loss: 0.0339 - val_accuracy: 0.9975 - val_loss: 0.0075
Epoch 6/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m-2s[0m -16790us/step - accuracy: 0.9965 - loss: 0.0354 - val_accuracy: 1.0000 - val_loss: 0.0057
Epoch 7/10
[1m100/100[0m 




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Text: I absolutely love this!
Prediction: Negative
Text: I can't stand this product
Prediction: Negative
