In [17]:
import glob
import os
import re
import tensorflow as tf

tf.random.set_seed(1234)


txt_file_path = '/content/drive/MyDrive/aiffel/ex4/lyrics/*'

txt_list = glob.glob(txt_file_path)
print(f"가사 파일 수 : {len(txt_list)}")
raw_corpus = []

for txt_file in txt_list:
    with open(txt_file, "r") as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

print(f"가사의 줄 수 : {len(raw_corpus)}")

가사 파일 수 : 49
가사의 줄 수 : 187088


In [2]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z'?.!,¿]+", " ", sentence)
    sentence = sentence.strip()
    sentence = '<start> ' + sentence + ' <end>'
    return sentence

corpus = []

for sentence in raw_corpus:
    if len(sentence) == 0: continue
    if sentence[-1] == ":": continue
    if sentence.startswith('('): continue
    if sentence.startswith('['): continue
    preprocessed_sentence = preprocess_sentence(sentence)
    # if sentence.startswith('('):
    #     print(sentence, preprocessed_sentence)
    # if 'll' in preprocessed_sentence.split():
    #     print(sentence)
    # break
    # if preprocessed_sentence:
    corpus.append(preprocessed_sentence)
    
print(corpus[:5])

['<start> just before our love got lost you said <end>', '<start> i am as constant as a northern star and i said <end>', '<start> constantly in the darkness <end>', "<start> where's that at <end>", "<start> if you want me i'll be in the bar on the back of a cartoon coaster <end>"]


In [3]:
def tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=12000, 
        filters=' ',
        oov_token="<unk>",
    )
    tokenizer.fit_on_texts(corpus)
    tensor = tokenizer.texts_to_sequences(corpus)   
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post', maxlen=15)  
    
    return tensor, tokenizer
tensor, tokenizer = tokenize(corpus)

In [4]:
src_input = tensor[:, :-1]
tgt_input = tensor[:, 1:]
from sklearn.model_selection import train_test_split
enc_train, enc_val, dec_train, dec_val = \
	train_test_split(src_input, tgt_input, test_size=0.2, random_state=1234)

In [5]:
print("Source Train:", enc_train.shape)
print("Target Train:", dec_train.shape)

Source Train: (137132, 14)
Target Train: (137132, 14)


In [None]:
VOCAB_SIZE = tokenizer.num_words + 1
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout


class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size1, hidden_size2):
        super(TextGenerator, self).__init__()
        
        self.embedding = Embedding(vocab_size, embedding_size)
        self.rnn_1 = LSTM(hidden_size1, return_sequences=True)
        self.rnn_2 = LSTM(hidden_size2, return_sequences=True)
        self.linear = Dense(vocab_size)
        self.dropout = Dropout(0.2)
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.dropout(out)
        out = self.rnn_2(out)
        out = self.dropout(out)
        out = self.linear(out)
        
        return out
    
embedding_size = 256
hidden_size = 1024

hidden_sizes1 = [256, 512, 1024, 2048]
hidden_sizes2 = [256, 512, 1024, 2048]
best_val_loss = 100
best_hidden_size1 = 0
best_hidden_size2 = 0
from itertools import product
# hidden_sizes = [256]
for hidden_size1, hidden_size2 in product(hidden_sizes1, hidden_sizes2):
    model = TextGenerator(tokenizer.num_words + 1, embedding_size , hidden_size1, hidden_size2)
    optimizer = tf.keras.optimizers.Adam()

    loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    model.compile(loss=loss, optimizer=optimizer)
    history = model.fit(enc_train, dec_train, epochs=5, validation_data=(enc_val, dec_val), batch_size=4096)
    val_loss = history.history['val_loss'][0]
    print(f"hidden_size1 : {hidden_size1}, hidden_size2 : {hidden_size2}, val_loss : {val_loss}")
    if best_val_loss > val_loss:
        best_val_loss = val_loss
        best_hidden_size1 = hidden_size1
        best_hidden_size2 = hidden_size2


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
hidden_size1 : 256, hidden_size2 : 256, val_loss : 4.595523357391357
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
hidden_size1 : 256, hidden_size2 : 512, val_loss : 4.65460205078125
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
hidden_size1 : 256, hidden_size2 : 1024, val_loss : 4.367781162261963
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
hidden_size1 : 256, hidden_size2 : 2048, val_loss : 4.15635347366333
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
hidden_size1 : 512, hidden_size2 : 256, val_loss : 4.574446201324463
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
hidden_size1 : 512, hidden_size2 : 512, val_loss : 4.606404781341553
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
hidden_size1 : 512, hidden_size2 : 1024, val_loss : 4.4409332275390625
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
hidden_size1 : 512, hidden_size2 : 2048, val_loss : 4.070143699645996
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch

In [25]:
best_val_loss, best_hidden_size

(3.8896186351776123, 1024)

In [12]:
optimizer = tf.keras.optimizers.Adam()

loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

model.compile(loss=loss, optimizer=optimizer)
model.fit(enc_train, dec_train, epochs=10, validation_data=(enc_val, dec_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f5835998c50>

In [14]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
    # 테스트를 위해서 입력받은 init_sentence도 텐서로 변환합니다
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

    # 단어 하나씩 예측해 문장을 만듭니다
    #    1. 입력받은 문장의 텐서를 입력합니다
    #    2. 예측된 값 중 가장 높은 확률인 word index를 뽑아냅니다
    #    3. 2에서 예측된 word index를 문장 뒤에 붙입니다
    #    4. 모델이 <end>를 예측했거나, max_len에 도달했다면 문장 생성을 마칩니다
    while True:
        # 1
        predict = model(test_tensor) 
        # 2
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1] 
        # 3 
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        # 4
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""
    # tokenizer를 이용해 word index를 단어로 하나씩 변환합니다 
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated

In [16]:
generate_text(model, tokenizer, init_sentence="<start> i love", max_len=20)

'<start> i love you , baby , i love you so much <end> '