데이터 읽어오기

In [1]:
import glob  
import tensorflow as tf

print(tf.__version__)

2.6.0


In [2]:
import glob
import os, re 

# Glob pattern for every lyrics file under $HOME/aiffel/lyricist/data/lyrics.
# os.getenv('HOME') returns the value of the HOME environment variable.
txt_file_path = os.getenv('HOME')+'/aiffel/lyricist/data/lyrics/*'

# List of all paths that match the pattern.
txt_list = glob.glob(txt_file_path)

raw_corpus = []

# Read every text file and collect all of its lines into raw_corpus.
for txt_file in txt_list:
    # Decode explicitly as UTF-8 so the result does not depend on the
    # locale's default encoding of the machine running the notebook.
    with open(txt_file, "r", encoding="utf-8") as f:
        # read() returns the whole file as one string; splitlines() breaks it
        # into a list of lines without their line terminators.
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

print("데이터 크기:", len(raw_corpus))
print("Examples:\n", raw_corpus[:3])

데이터 크기: 187088
Examples:
 ["Now I've heard there was a secret chord", 'That David played, and it pleased the Lord', "But you don't really care for music, do you?"]


데이터 정제

In [3]:
# Preview the raw corpus: print the usable lines found among the first ten
# entries (indices 0-9) of raw_corpus.
for idx, sentence in enumerate(raw_corpus):
    # Skip blank lines.
    if len(sentence) == 0:
        continue
    # Skip lines that end with a colon.
    if sentence[-1] == ":":
        continue

    # Stop once we are past the tenth raw entry.
    if idx > 9:
        break

    print(sentence)

Now I've heard there was a secret chord
That David played, and it pleased the Lord
But you don't really care for music, do you?
It goes like this
The fourth, the fifth
The minor fall, the major lift
The baffled king composing Hallelujah Hallelujah
Hallelujah
Hallelujah
Hallelujah Your faith was strong but you needed proof


In [4]:
def preprocess_sentence(sentence):
    """Normalize one raw lyric line and wrap it in <start>/<end> markers.

    The line is lower-cased and stripped, then three substitutions run in
    order: punctuation marks (? . ! , ¿) get a space on each side, runs of
    spaces/double quotes collapse to one space, and every run of characters
    other than letters and that punctuation becomes a single space.

    Args:
        sentence: Raw text line.

    Returns:
        The cleaned text as ``'<start> ' + text + ' <end>'``.
    """
    cleaned = sentence.lower().strip()

    # (pattern, replacement) pairs, applied in this exact order.
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return '<start> ' + cleaned.strip() + ' <end>'

print(preprocess_sentence("This @_is ;;;sample        sentence."))

<start> this is sample sentence . <end>


In [5]:
# Drop blank lines and lines that end with ":" before any cleaning is done.
usable_lines = (line for line in raw_corpus if line and not line.endswith(":"))

# Clean each remaining line with preprocess_sentence() and keep only the
# sentences with at most 15 space-separated tokens (<start> and <end> count).
corpus = [
    cleaned
    for cleaned in map(preprocess_sentence, usable_lines)
    if len(cleaned.split(' ')) <= 15
]

# Inspect the first few cleaned sentences.
corpus[:10]

['<start> now i ve heard there was a secret chord <end>',
 '<start> that david played , and it pleased the lord <end>',
 '<start> but you don t really care for music , do you ? <end>',
 '<start> it goes like this <end>',
 '<start> the fourth , the fifth <end>',
 '<start> the minor fall , the major lift <end>',
 '<start> the baffled king composing hallelujah hallelujah <end>',
 '<start> hallelujah <end>',
 '<start> hallelujah <end>',
 '<start> hallelujah your faith was strong but you needed proof <end>']

In [6]:
# Number of sentences that survived cleaning and the token-count filter.
print(len(corpus))

156013


평가 데이터셋 분리

In [7]:
def tokenize(corpus, num_words=12000):
    """Fit a word-level tokenizer on ``corpus`` and encode it as padded ids.

    Args:
        corpus: List of preprocessed sentences whose tokens are separated by
            single spaces.
        num_words: Vocabulary cap passed to the Keras ``Tokenizer``; words
            that fall outside it are encoded as ``'<unk>'``. Defaults to
            12000, the value this notebook has always used.

    Returns:
        A ``(tensor, tokenizer)`` tuple: the 2-D array of token ids, padded
        at the end of each row, and the fitted tokenizer.
    """
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=num_words,
        # Sentences are already cleaned, so only the space is filtered out.
        filters=' ',
        oov_token="<unk>"
    )

    # Build the vocabulary, then turn every sentence into a list of ids.
    tokenizer.fit_on_texts(corpus)
    tensor = tokenizer.texts_to_sequences(corpus)
    # padding='post' appends the padding after the sentence rather than
    # before it, so all rows end up the same length.
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

    print(tensor,tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(corpus)

[[   2   50    4 ...    0    0    0]
 [   2   15 2967 ...    0    0    0]
 [   2   33    7 ...   46    3    0]
 ...
 [   2    4  118 ...    0    0    0]
 [   2  258  194 ...   12    3    0]
 [   2    7   34 ...    0    0    0]] <keras_preprocessing.text.Tokenizer object at 0x7f9e5d3c42b0>


In [8]:
# Show the first 10 token ids of the first 3 encoded sentences.
print(tensor[:3, :10])

[[   2   50    4   95  303   62   53    9  946 6263]
 [   2   15 2967  871    5    8   11 5739    6  374]
 [   2   33    7   40   16  164  288   28  333    5]]


In [9]:
# Walk the id -> word table and print entries until id 10 has been shown.
for word_id, word in tokenizer.index_word.items():
    print(word_id, ":", word)

    if word_id >= 10:
        break

1 : <unk>
2 : <start>
3 : <end>
4 : i
5 : ,
6 : the
7 : you
8 : and
9 : a
10 : to


In [10]:
# Source sequences drop the last column of tensor; target sequences drop the
# first column (the <start> id), so each target row is its source row shifted
# left by one position.
src_input, tgt_input = tensor[:, :-1], tensor[:, 1:]

# Show the first source/target pair.
for first_row in (src_input[0], tgt_input[0]):
    print(first_row)

[   2   50    4   95  303   62   53    9  946 6263    3    0    0    0]
[  50    4   95  303   62   53    9  946 6263    3    0    0    0    0]


In [11]:
from sklearn.datasets import load_iris # 샘플 데이터 로딩
from sklearn.model_selection import train_test_split


# Source rows drop the last column; target rows drop the leading <start> id.
src_input = tensor[:, :-1]
tgt_input = tensor[:, 1:]

X = src_input
y = tgt_input

# Hold out 20% of the sentence pairs for validation. A fixed random_state
# makes the split identical on every run, so results can be compared.
enc_train, enc_val, dec_train, dec_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [12]:
print(enc_train.shape, dec_train.shape)
print(enc_val.shape, dec_val.shape)

(124810, 14) (124810, 14)
(31203, 14) (31203, 14)


In [13]:
# Shuffle buffer as large as the training split, so the whole split is shuffled.
BUFFER_SIZE = len(enc_train)
BATCH_SIZE = 256
steps_per_epoch = len(enc_train) // BATCH_SIZE

# tokenizer.num_words plus one.
VOCAB_SIZE = tokenizer.num_words + 1

# Train only on the training split. Building the dataset from the full
# src_input/tgt_input would also feed the held-out validation pairs to the
# model, which makes any validation score meaningless.
dataset = tf.data.Dataset.from_tensor_slices((enc_train, dec_train))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

# Held-out batches from the validation split; no shuffling is needed for
# evaluation. Can be passed to model.fit(..., validation_data=val_dataset).
val_dataset = tf.data.Dataset.from_tensor_slices((enc_val, dec_val))
val_dataset = val_dataset.batch(BATCH_SIZE)

dataset

<BatchDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>

인공지능 만들기

In [14]:
class TextGenerator(tf.keras.Model):
    """Next-token language model: embedding, two stacked LSTMs, dense output.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output units of the final dense layer.
        embedding_size: Width of each token embedding vector.
        hidden_size: Number of units in each of the two LSTM layers.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        # return_sequences=True keeps an output for every time step, so a
        # prediction is produced at each position of the input sequence.
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        # No activation: the layer emits raw scores (logits) per vocabulary id.
        self.linear = tf.keras.layers.Dense(vocab_size)

    def call(self, x):
        """Run token ids ``x`` through the network and return the logits.

        Args:
            x: Batch of token-id sequences.

        Returns:
            One score per vocabulary id for every time step of every sequence.
        """
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)

        return out

# Width of each token embedding vector.
embedding_size = 256
# Number of units in each LSTM layer.
hidden_size = 1024
# Reuse the VOCAB_SIZE constant (tokenizer.num_words + 1) defined with the
# dataset instead of recomputing the same expression here.
model = TextGenerator(VOCAB_SIZE, embedding_size, hidden_size)

In [15]:
# Take one batch and run it through the model once so its weights get built.
# The loop variables deliberately do NOT reuse the names enc_val/dec_val:
# those hold the validation split from train_test_split and must not be
# overwritten by a batch of training data.
for sample_src, sample_tgt in dataset.take(1):
    break

model(sample_src)

<tf.Tensor: shape=(256, 14, 12001), dtype=float32, numpy=
array([[[ 5.62852038e-05, -5.74638725e-05, -5.54467151e-05, ...,
         -1.04926170e-04, -1.92459606e-06,  6.67352433e-05],
        [-2.50754656e-05, -1.41379511e-04,  1.27630790e-06, ...,
         -3.31160845e-04, -1.90512044e-04,  1.93272514e-04],
        [-1.85434968e-04, -6.57441633e-05,  1.92020205e-04, ...,
         -6.64999767e-04, -2.57447653e-04,  2.01440285e-04],
        ...,
        [ 5.66817529e-04,  1.67906587e-03,  1.47010793e-03, ...,
         -1.92827731e-03, -1.33572286e-03,  1.68988667e-03],
        [ 9.83903068e-04,  1.94801332e-03,  1.24063471e-03, ...,
         -1.81187200e-03, -1.55316794e-03,  2.00679689e-03],
        [ 1.36897736e-03,  2.15465738e-03,  9.79853445e-04, ...,
         -1.67081528e-03, -1.73514348e-03,  2.29967688e-03]],

       [[ 5.62852038e-05, -5.74638725e-05, -5.54467151e-05, ...,
         -1.04926170e-04, -1.92459606e-06,  6.67352433e-05],
        [ 2.47048330e-04,  2.55967403e-04,  4

In [16]:
# Print the per-layer parameter counts of the built model.
model.summary()

Model: "text_generator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  3072256   
_________________________________________________________________
lstm (LSTM)                  multiple                  5246976   
_________________________________________________________________
lstm_1 (LSTM)                multiple                  8392704   
_________________________________________________________________
dense (Dense)                multiple                  12301025  
Total params: 29,012,961
Trainable params: 29,012,961
Non-trainable params: 0
_________________________________________________________________


In [17]:
# The model's dense layer has no activation, so the loss is told to expect
# raw logits; targets are integer token ids (sparse labels).
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction='none',
    from_logits=True,
)
optimizer = tf.keras.optimizers.Adam()

# Wire up the optimizer and loss, then train for 10 passes over the dataset.
model.compile(optimizer=optimizer, loss=loss)
model.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9e404675e0>

In [18]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
    """Greedily extend ``init_sentence`` one token at a time.

    At every step the model is run on the sequence so far and the
    highest-scoring token at the last position is appended. Generation stops
    when ``<end>`` is produced or when the sequence holds ``max_len`` tokens.

    Args:
        model: Callable mapping a ``(1, length)`` tensor of token ids to
            per-position scores over the vocabulary.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``,
            ``word_index`` and ``index_word``.
        init_sentence: Seed text; defaults to ``"<start>"``.
        max_len: Maximum total sequence length, seed tokens included.

    Returns:
        The generated words joined by spaces, with a trailing space.

    Raises:
        ValueError: If ``init_sentence`` encodes to no tokens at all.
    """
    # Encode the seed text as a batch holding a single id sequence.
    test_input = tokenizer.texts_to_sequences([init_sentence])
    if not test_input[0]:
        # The model cannot predict a "last position" for an empty sequence.
        raise ValueError("init_sentence must contain at least one token")
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

    # Checking the length first means a seed that already has max_len tokens
    # is returned as-is instead of growing past the limit.
    while test_tensor.shape[1] < max_len:
        # 1. Score every vocabulary id at every position.
        predict = model(test_tensor)
        # 2. Pick the best id at the last position. argmax over the raw
        #    scores selects the same id as argmax over their softmax.
        predict_word = tf.argmax(predict, axis=-1)[:, -1]
        # 3. Append the chosen id to the running sequence.
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        # 4. Stop as soon as the model emits <end>.
        if predict_word.numpy()[0] == end_token:
            break

    # Map ids back to words. Ids with no entry in index_word (such as the
    # padding id 0, which the model can emit) are skipped rather than
    # raising KeyError.
    words = [
        tokenizer.index_word[word_index]
        for word_index in test_tensor[0].numpy()
        if word_index in tokenizer.index_word
    ]

    # Each word is followed by one space, matching the historical output.
    return "".join(word + " " for word in words)

In [19]:
# Second name for the trained model, used when generating lyrics.
lyricist = model

In [20]:
# Generate lyrics seeded with "<start> i love", capped at 20 tokens in total.
generate_text(lyricist, tokenizer, init_sentence="<start> i love", max_len=20)

'<start> i love you , i m a liability <end> '

In [21]:
#회고
#- 이번 프로젝트에서 어려웠던 점
#: 토크나이즈에서 결과값이 크게 좌우되었다. 되게 쉽게 보아 간과하고 있었다.
#- 프로젝트를 진행하면서 알아낸 점 혹은 아직 모호한 점
#: preprocessed_sentence에서 덜어낸 것과 sentence에서 덜어낸 것에 자수가 15자를 넘어가는 것에 특수문자에 포함된 것이냐가 크게 좌우했다.
#이에 따라 학습하는 데이터의 질이 좋지 못하게 되어 결과값도 풍부하지 않게 나왔다.
#- 루브릭 평가 지표를 맞추기 위해 시도한 것들:
#: if len(preprocessed_sentence.split(' '))<=15:
#        corpus.append(preprocessed_sentence)
#  if len(sentence.split(' '))>15:
#        continue
#  두 방안을 사용하여 토큰화를 시도했다.
#- 만약에 루브릭 평가 관련 지표를 달성 하지 못했을 때, 이유에 관한 추정:
#: 달성했다.
#- 자기 다짐:
#: 하면 된다.