In [4]:
# 11.2 단어 임베딩 이해하기
# 필요한 라이브러리 설치
!pip install tensorflow

# 단어 임베딩 이해하기
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 샘플 텍스트 데이터
sentences = [
    'I love machine learning',
    'Deep learning is fun',
    'NLP is a field of AI',
    'I enjoy learning new things'
]

# 토큰화 및 시퀀스 변환
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)
padded_sequences = pad_sequences(sequences, padding='post', maxlen=6)  # 입력 길이를 6으로 설정

# 임베딩 레이어 정의
model = Sequential()
model.add(Embedding(input_dim=100, output_dim=8, input_length=6))  # input_length를 6으로 설정
model.compile(optimizer='adam', loss='mse')

# 모델 요약
model.summary()

# 임베딩 결과 출력
embedding_output = model.predict(padded_sequences)
print("임베딩 벡터:\n", embedding_output)


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 6, 8)              800       
                                                                 
Total params: 800 (3.12 KB)
Trainable params: 800 (3.12 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
임베딩 벡터:
 [[[-0.00258049  0.02777132 -0.01028944 -0.04098171 -0.03875224
    0.00319958  0.0406366   0.01717348]
  [ 0.0220235  -0.00541878 -0.02449356 -0.04419569 -0.00654415
   -0.01048565  0.04022045 -0.04286895]
  [-0.03785087  0.03841039 -0.01380272 -0.00229568  0.03661151
    0.00168189  0.02020187  0.0339097 ]
  [ 0.0433468   0.03390931 -0.03314903  0.00118365 -0.04410258
   -0.03057426  0.02173462 -0.03498989]
  [ 0.04766205 -0.02434663 -0.0178985  -0.00818788 -0.02547421
    0.02391818 -0.01742922  0.03680075]
  [ 0.04766205 -0.02434663 -

In [8]:
# 11.3 단어 그룹을 표현하는 두 가지 방법: 집합과 시퀀스

# 필요한 라이브러리 설치
# !pip install tensorflow scikit-learn

# 집합 기반 접근 (Bag of Words)
from sklearn.feature_extraction.text import CountVectorizer

sentences = [
    'I love machine learning',
    'Deep learning is fun',
    'NLP is a field of AI',
    'I enjoy learning new things'
]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sentences)

print("Bag of Words:\n", X.toarray())
print("Feature names:\n", vectorizer.get_feature_names_out())

# 필요한 라이브러리 설치
# !pip install tensorflow

# 시퀀스 기반 접근 (Sequence Model)
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

sentences = [
    'I love machine learning',
    'Deep learning is fun',
    'NLP is a field of AI',
    'I enjoy learning new things'
]

tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
padded_sequences = pad_sequences(sequences, padding='post')

model = Sequential()
model.add(Embedding(input_dim=100, output_dim=8, input_length=10))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy')

model.summary()


Bag of Words:
 [[0 0 0 0 0 0 1 1 1 0 0 0 0]
 [0 1 0 0 1 1 1 0 0 0 0 0 0]
 [1 0 0 1 0 1 0 0 0 0 1 1 0]
 [0 0 1 0 0 0 1 0 0 1 0 0 1]]
Feature names:
 ['ai' 'deep' 'enjoy' 'field' 'fun' 'is' 'learning' 'love' 'machine' 'new'
 'nlp' 'of' 'things']
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 10, 8)             800       
                                                                 
 lstm_3 (LSTM)               (None, 32)                5248      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 6081 (23.75 KB)
Trainable params: 6081 (23.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [9]:
# 11-4. 트랜스포머 시퀀스

# 필요한 라이브러리 설치
# !pip install tensorflow transformers

# 트랜스포머 아키텍처 (BERT 사용 예제)
from transformers import TFBertForSequenceClassification, BertTokenizer
import tensorflow as tf

# 모델과 토크나이저 로드
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 샘플 텍스트
sentences = [
    "I love machine learning",
    "Deep learning is challenging but fun",
    "I dislike long training times",
    "NLP is fascinating"
]

# 토크나이저를 사용하여 텍스트 전처리
inputs = tokenizer(sentences, return_tensors="tf", padding=True, truncation=True)

# 모델 예측
outputs = model(inputs)
predictions = tf.nn.softmax(outputs.logits, axis=-1)

print("Predictions:\n", predictions)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Predictions:
 tf.Tensor(
[[0.42760822 0.57239175]
 [0.4320462  0.5679538 ]
 [0.41217655 0.5878234 ]
 [0.43977204 0.56022793]], shape=(4, 2), dtype=float32)


In [14]:
# 11.5 텍스트 분류를 넘어: 시퀀스-투-시퀀스 학습
# 필요한 라이브러리 설치
# !pip install tensorflow

# 시퀀스-투-시퀀스 모델 (기계 번역 예제)
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model
import numpy as np

# 데이터 전처리
input_texts = ["Hello", "How are you?", "Good morning"]
target_texts = ["\tHola", "\t¿Cómo estás?", "\tBuenos días"]

input_tokenizer = tf.keras.preprocessing.text.Tokenizer()
input_tokenizer.fit_on_texts(input_texts)
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
input_data = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, padding='post', maxlen=5)

target_tokenizer = tf.keras.preprocessing.text.Tokenizer()
target_tokenizer.fit_on_texts(target_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)
target_data = tf.keras.preprocessing.sequence.pad_sequences(target_sequences, padding='post', maxlen=7)

# 인코더
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=100, output_dim=64)(encoder_inputs)
encoder_lstm = LSTM(64, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# 디코더
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=100, output_dim=64)
decoder_embedding_output = decoder_embedding(decoder_inputs)
decoder_lstm = LSTM(64, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding_output, initial_state=encoder_states)
decoder_dense = Dense(100, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# 시퀀스-투-시퀀스 모델 정의
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# 모델 요약
model.summary()

# 모델 훈련
target_data = target_data.reshape(*target_data.shape, 1)  # 레이블에 맞게 형태 조정
model.fit([input_data, target_data], target_data, epochs=10)

# 인코더 모델 및 디코더 모델 정의 (추론용)
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(64,))
decoder_state_input_c = Input(shape=(64,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_inputs_single = Input(shape=(None,))
decoder_embedding_output = decoder_embedding(decoder_inputs_single)
decoder_lstm_output, state_h2, state_c2 = decoder_lstm(decoder_embedding_output, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_lstm_output)
decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2
)

# 예측 함수 정의 (단순화된 형태)
def predict_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.array([[target_tokenizer.word_index[' ']]])  # 시작 토큰
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = target_tokenizer.index_word[sampled_token_index]
        decoded_sentence += ' ' + sampled_char
        if sampled_char == '\n' or len(decoded_sentence) > 50:
            stop_condition = True
        target_seq = np.array([[sampled_token_index]])
        states_value = [h, c]
    return decoded_sentence

# 샘플 예측
sample_input = np.array([[input_tokenizer.word_index['hello']]])
predicted_seq = predict_sequence(sample_input)
print("Predicted sequence:", predicted_seq)



Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_17 (InputLayer)       [(None, None)]               0         []                            
                                                                                                  
 input_18 (InputLayer)       [(None, None)]               0         []                            
                                                                                                  
 embedding_15 (Embedding)    (None, None, 64)             6400      ['input_17[0][0]']            
                                                                                                  
 embedding_16 (Embedding)    (None, None, 64)             6400      ['input_18[0][0]']            
                                                                                           

KeyError: '\t'