# 연습

In [2]:
import os, re
import numpy as np
import tensorflow as tf

file_path = os.getenv('HOME') + '/aiffel/lyricist/data/shakespeare.txt'
with open(file_path, 'r') as f:
    raw_corpus = f.read().splitlines()
    
print(raw_corpus[:9])

['First Citizen:', 'Before we proceed any further, hear me speak.', '', 'All:', 'Speak, speak.', '', 'First Citizen:', 'You are all resolved rather to die than to famish?', '']


In [26]:
for index, sentence in enumerate(raw_corpus):
    if len(sentence) == 0:continue
    if sentence[-1] == ":":continue
        
    if index > 9:break
        
    print(sentence)

Before we proceed any further, hear me speak.
Speak, speak.
You are all resolved rather to die than to famish?


In [15]:
# def preprocess_sentence(sentence):
#     sentence = sentence.lower().strip() # 단어를 소문자로 바꾸고 공백삭제
#     sentence = re.sub(r"([?.!,¿])", r"\1", sentence) # 특수문자 양쪽에 공백??
#     sentence = re.sub(r'[" "]', " ", sentence) # 여러개의 공백은 하나의 공백으로 처리
#     sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence) # a-zA-Z?.!,¿가 아닌 문자는 공백으로 처리
#     sentence = sentence.strip() # 양쪽 공백 처리
#     sentence = "<start>" + sentence + '<end>' # 문장 처음에는 <start>, 끝에는 <end> 추가
#     return sentence

In [27]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip() # 1
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence) # 2
    sentence = re.sub(r'[" "]+', " ", sentence) # 3
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence) # 4
    sentence = sentence.strip() # 5
    sentence = '<start> ' + sentence + ' <end>' # 6
    return sentence

In [28]:
print(preprocess_sentence("This @_is ;;;sample        sentence."))

<start> this is sample sentence . <end>


In [29]:
corpus = []

for sentence in raw_corpus:
    if len(sentence) == 0: continue
    if sentence[-1] == ":": continue
        
    preprocessed_sentence = preprocess_sentence(sentence)
    corpus.append(preprocessed_sentence)
    
corpus[:10]

['<start> before we proceed any further , hear me speak . <end>',
 '<start> speak , speak . <end>',
 '<start> you are all resolved rather to die than to famish ? <end>',
 '<start> resolved . resolved . <end>',
 '<start> first , you know caius marcius is chief enemy to the people . <end>',
 '<start> we know t , we know t . <end>',
 '<start> let us kill him , and we ll have corn at our own price . <end>',
 '<start> is t a verdict ? <end>',
 '<start> no more talking on t let it be done away , away ! <end>',
 '<start> one word , good citizens . <end>']

In [30]:
def tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words = 7000,
        filters = ' ',
        oov_token = "<unk>"
    )
    
    tokenizer.fit_on_texts(corpus)
    
    tensor = tokenizer.texts_to_sequences(corpus)
    
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding = 'post')
    
    print(tensor, tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(corpus)

[[   2  143   40 ...    0    0    0]
 [   2  110    4 ...    0    0    0]
 [   2   11   50 ...    0    0    0]
 ...
 [   2  149 4553 ...    0    0    0]
 [   2   34   71 ...    0    0    0]
 [   2  945   34 ...    0    0    0]] <keras_preprocessing.text.Tokenizer object at 0x7f0bfb983190>


In [31]:
print(tensor[:3, :10])

[[   2  143   40  933  140  591    4  124   24  110]
 [   2  110    4  110    5    3    0    0    0    0]
 [   2   11   50   43 1201  316    9  201   74    9]]


In [32]:
for index in tokenizer.index_word:
    print(index, ":", tokenizer.index_word[index])
    
    if index >= 10: break

1 : <unk>
2 : <start>
3 : <end>
4 : ,
5 : .
6 : the
7 : and
8 : i
9 : to
10 : of


In [33]:
src_input = tensor[:,:-1]
tgt_input = tensor[:,1:]
print(src_input[0])
print(tgt_input[0])

[  2 143  40 933 140 591   4 124  24 110   5   3   0   0   0   0   0   0
   0   0]
[143  40 933 140 591   4 124  24 110   5   3   0   0   0   0   0   0   0
   0   0]


In [34]:
BUFFER_SIZE = len(src_input)
BATCH_SIZE = 256
steps_per_epoch = len(src_input)//BATCH_SIZE

VOCAB_SIZE = tokenizer.num_words + 1

dataset = tf.data.Dataset.from_tensor_slices((src_input, tgt_input))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder = True)
dataset

<BatchDataset shapes: ((256, 20), (256, 20)), types: (tf.int32, tf.int32)>

In [36]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences = True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences = True)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out
    
embedding_size = 256
hidden_size = 1024
model = TextGenerator(tokenizer.num_words + 1, embedding_size, hidden_size)

In [37]:
for src_sample, tgt_sample in dataset.take(1):break
    
model(src_sample)

<tf.Tensor: shape=(256, 20, 7001), dtype=float32, numpy=
array([[[-3.5814825e-04,  1.9253716e-04, -2.5566388e-04, ...,
         -2.8332704e-04, -5.3213895e-05, -8.1370636e-06],
        [-2.1109100e-04,  1.8291328e-04, -4.7705293e-04, ...,
         -4.3738744e-04,  3.0917006e-05, -1.6622123e-04],
        [ 6.3367719e-05,  2.5092167e-04, -7.2886981e-04, ...,
         -6.3748332e-04, -6.1941086e-05, -1.2380027e-04],
        ...,
        [ 2.3183892e-03,  1.0343073e-03,  1.2371280e-03, ...,
          1.4163012e-03, -1.4375453e-03, -1.9045782e-03],
        [ 2.5420196e-03,  1.1397250e-03,  1.5196903e-03, ...,
          1.5781472e-03, -1.6570971e-03, -2.0266911e-03],
        [ 2.7083051e-03,  1.2338535e-03,  1.7942002e-03, ...,
          1.7051207e-03, -1.8662640e-03, -2.1316516e-03]],

       [[-3.5814825e-04,  1.9253716e-04, -2.5566388e-04, ...,
         -2.8332704e-04, -5.3213895e-05, -8.1370636e-06],
        [-5.7064992e-04,  1.7936347e-04, -5.4900395e-04, ...,
         -2.1406755e-04, -

In [38]:
model.summary()

Model: "text_generator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  1792256   
_________________________________________________________________
lstm (LSTM)                  multiple                  5246976   
_________________________________________________________________
lstm_1 (LSTM)                multiple                  8392704   
_________________________________________________________________
dense (Dense)                multiple                  7176025   
Total params: 22,607,961
Trainable params: 22,607,961
Non-trainable params: 0
_________________________________________________________________


In [39]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits = True,
    reduction = 'none'
)

model.compile(loss = loss, optimizer = optimizer)
model.fit(dataset, epochs = 30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f0bfb580590>

# 미니 프로젝트

In [4]:
import glob
import os

txt_file_path = os.getenv('HOME') + '/aiffel//lyricist/data/lyrics/*'

txt_list = glob.glob(txt_file_path)

raw_corpus = []

for txt_file in txt_list:
    with open(txt_file,'r') as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)
        
print('데이터 크기 :', len(raw_corpus))
print('Examples :\n', raw_corpus[:3])

데이터 크기 : 187088
Examples :
 ['[Hook]', "I've been down so long, it look like up to me", 'They look up to me']


In [5]:
raw_corpus[:100]

['[Hook]',
 "I've been down so long, it look like up to me",
 'They look up to me',
 "I got fake people showin' fake love to me",
 'Straight up to my face, straight up to my face',
 "I've been down so long, it look like up to me",
 'They look up to me',
 "I got fake people showin' fake love to me",
 'Straight up to my face, straight up to my face [Verse 1]',
 "Somethin' ain't right when we talkin'",
 "Somethin' ain't right when we talkin'",
 "Look like you hidin' your problems",
 'Really you never was solid',
 'No, you can\'t "son" me',
 "You won't never get to run me",
 'Just when shit look out of reach',
 'I reach back like one, three',
 'Like one, three, yeah [Pre-Hook]',
 "That's when they smile in my face",
 'Whole time they wanna take my place',
 'Whole time they wanna take my place',
 'Whole time they wanna take my place',
 'Yeah, I know they wanna take my place',
 'I can tell that love is fake',
 "I don't trust a word you say",
 'How you wanna clique up after your mistakes?',
 

In [6]:
for index, sentence in enumerate(raw_corpus):
    if len(sentence) == 0: continue
    if index > 9: break
        
    print(sentence)

[Hook]
I've been down so long, it look like up to me
They look up to me
I got fake people showin' fake love to me
Straight up to my face, straight up to my face
I've been down so long, it look like up to me
They look up to me
I got fake people showin' fake love to me
Straight up to my face, straight up to my face [Verse 1]
Somethin' ain't right when we talkin'


In [8]:
# 문장 전처리하기
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub('\[.+?\]', '', sentence)
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[ ]+', " ", sentence)
    sentence = re.sub(r'[^a-zA-Z?.!,¿]', " ", sentence)
    sentence = sentence.strip()
    sentence = '<start> ' + sentence + ' <end>'
    return sentence

print(preprocess_sentence("Hold on, hold on, fuck that. Fuck that shit. Hold on, I got to start this mothafuckin' record over again, wait a minute. Fuck that shit. Still on this mothafuckin' record. I'ma play this mothafucka for y'all. Aye, y'all get some more drinks goin' on, I'll sound a whole lot better. [Verse 1]"))

<start> hold on , hold on , fuck that . fuck that shit . hold on , i got to start this mothafuckin  record over again , wait a minute . fuck that shit . still on this mothafuckin  record . i ma play this mothafucka for y all . aye , y all get some more drinks goin  on , i ll sound a whole lot better . <end>


In [11]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

corpus = []

for sentence in raw_corpus:
    
    if len(sentence) == 0: continue

    preprocessed_sentence = preprocess_sentence(sentence)
    
    if len(text_to_word_sequence(preprocessed_sentence)) >= 15: # 토큰의 길이가 15가 넘는 문장은 빼기
        continue
    
    else:
        corpus.append(preprocessed_sentence)

    
corpus[:100]

['<start>  <end>',
 '<start> i ve been down so long , it look like up to me <end>',
 '<start> they look up to me <end>',
 '<start> i got fake people showin  fake love to me <end>',
 '<start> straight up to my face , straight up to my face <end>',
 '<start> i ve been down so long , it look like up to me <end>',
 '<start> they look up to me <end>',
 '<start> i got fake people showin  fake love to me <end>',
 '<start> straight up to my face , straight up to my face <end>',
 '<start> somethin  ain t right when we talkin <end>',
 '<start> somethin  ain t right when we talkin <end>',
 '<start> look like you hidin  your problems <end>',
 '<start> really you never was solid <end>',
 '<start> no , you can t  son  me <end>',
 '<start> you won t never get to run me <end>',
 '<start> just when shit look out of reach <end>',
 '<start> i reach back like one , three <end>',
 '<start> like one , three , yeah <end>',
 '<start> that s when they smile in my face <end>',
 '<start> whole time they wanna ta

In [12]:
# 텐서화 시키기

def tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words = 12000,
        filters = '',
        oov_token = '<unk>'
    )
    
    tokenizer.fit_on_texts(corpus)
    
    tensor = tokenizer.texts_to_sequences(corpus)
    
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding = 'post')
    
    print(tensor, tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(corpus)

[[   2    3    0 ...    0    0    0]
 [   2    5   97 ...    0    0    0]
 [   2   40  132 ...    0    0    0]
 ...
 [   2  202    3 ...    0    0    0]
 [   2  424    9 ...    0    0    0]
 [   2    9 1564 ...    0    0    0]] <keras_preprocessing.text.Tokenizer object at 0x7fb93fef11d0>


In [13]:
print(tensor[:3, :])

[[  2   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  2   5  97 108  59  31 166   4  11 132  23  29  10  12   3   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  2  40 132  29  10  12   3   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]


In [14]:
# 구축된 단어사전 확인해보기
for index in tokenizer.index_word:
    print(index, " : ", tokenizer.index_word[index])
    
    if index >= 10: break

1  :  <unk>
2  :  <start>
3  :  <end>
4  :  ,
5  :  i
6  :  the
7  :  you
8  :  and
9  :  a
10  :  to


In [15]:
# input 데이터와 output 데이터 만들기

src_input = tensor[:, :-1]
tgt_input = tensor[:, 1:]

print(src_input[1])
print(tgt_input[1])
len(src_input)

[  2   5  97 108  59  31 166   4  11 132  23  29  10  12   3   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0]
[  5  97 108  59  31 166   4  11 132  23  29  10  12   3   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0]


155622

In [16]:
buffer_size = len(src_input)
batch_size = 256
steps_per_epoch = len(src_input)//batch_size

vocab_size = tokenizer.num_words + 1

dataset = tf.data.Dataset.from_tensor_slices((src_input, tgt_input))

dataset = dataset.shuffle(buffer_size)
dataset = dataset.batch(batch_size, drop_remainder = True)
dataset



<BatchDataset shapes: ((256, 31), (256, 31)), types: (tf.int32, tf.int32)>

In [17]:
# 학습데이터와 평가데이터로 분리하기

from sklearn.model_selection import train_test_split

enc_train, enc_val, dec_train, dec_val = train_test_split(src_input, tgt_input, test_size = 0.2, random_state = 42)

print("Source Train:", enc_train.shape)
print("Target Train:", dec_train.shape)



Source Train: (124497, 31)
Target Train: (124497, 31)


In [18]:
# 인공지능 학습시키기

class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out
    
embedding_size = 256
hidden_size = 1024
model = TextGenerator(tokenizer.num_words + 1, embedding_size , hidden_size)

In [19]:
for src_sample, tgt_sample in dataset.take(1):break
    
model(src_sample)

<tf.Tensor: shape=(256, 31, 12001), dtype=float32, numpy=
array([[[ 9.91955967e-05, -5.00889219e-05,  1.82585900e-05, ...,
          1.78373084e-05, -3.50100299e-06, -1.92250183e-04],
        [ 4.90466657e-04, -3.26571142e-04,  1.35671085e-04, ...,
          5.37909800e-05, -1.41416662e-04, -3.19969346e-04],
        [ 8.55844351e-04, -4.91262123e-04,  3.33294083e-05, ...,
         -1.22903060e-04, -1.99335816e-04, -3.70199559e-04],
        ...,
        [ 4.21123859e-03, -4.92468430e-03, -5.23592392e-03, ...,
         -2.19860769e-04,  3.16735078e-03, -3.57267307e-03],
        [ 4.33726236e-03, -5.04222233e-03, -5.38091827e-03, ...,
         -1.79692724e-04,  3.29125882e-03, -3.64975329e-03],
        [ 4.45285439e-03, -5.14252204e-03, -5.50540304e-03, ...,
         -1.40434175e-04,  3.40141193e-03, -3.71687417e-03]],

       [[ 9.91955967e-05, -5.00889219e-05,  1.82585900e-05, ...,
          1.78373084e-05, -3.50100299e-06, -1.92250183e-04],
        [-5.77989194e-05, -9.07840222e-05, -1

In [20]:
model.summary()

Model: "text_generator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  3072256   
_________________________________________________________________
lstm (LSTM)                  multiple                  5246976   
_________________________________________________________________
lstm_1 (LSTM)                multiple                  8392704   
_________________________________________________________________
dense (Dense)                multiple                  12301025  
Total params: 29,012,961
Trainable params: 29,012,961
Non-trainable params: 0
_________________________________________________________________


In [21]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits = True,
    reduction = 'none'
)

model.compile(loss = loss, optimizer = optimizer)
model.fit(dataset, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb9304ca190>

In [30]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
    # 테스트를 위해서 입력받은 init_sentence도 텐서로 변환합니다
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

    # 단어 하나씩 예측해 문장을 만듭니다
    #    1. 입력받은 문장의 텐서를 입력합니다
    #    2. 예측된 값 중 가장 높은 확률인 word index를 뽑아냅니다
    #    3. 2에서 예측된 word index를 문장 뒤에 붙입니다
    #    4. 모델이 <end>를 예측했거나, max_len에 도달했다면 문장 생성을 마칩니다
    while True:
        # 1
        predict = model(test_tensor) 
        # 2
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1] 
        # 3 
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        # 4
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""
    # tokenizer를 이용해 word index를 단어로 하나씩 변환합니다 
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated

In [33]:
generate_text(model, tokenizer, init_sentence="<start> i like", max_len=20)

'<start> i like the way how you re kissin me <end> '