### 패딩

In [18]:
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from tensorflow.keras.preprocessing.text import Tokenizer

In [19]:
# Sample English text; it is split into sentences and integer-encoded below.
text = "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain."

In [20]:
# Split the raw text into a list of sentences with NLTK's sentence tokenizer.
s_token = sent_tokenize(text)

In [21]:
# Cleaning step: in each sentence, replace every character that is not an
# ASCII letter or digit (punctuation, special characters) with a space.
sentence = [re.sub('[^a-zA-Z0-9]', ' ', raw) for raw in s_token]
print(sentence)

['A barber is a person ', 'a barber is good person ', 'a barber is huge person ', 'he Knew A Secret ', 'The Secret He Kept is huge secret ', 'Huge secret ', 'His barber kept his word ', 'a barber kept his word ', 'His barber kept his secret ', 'But keeping and keeping such a huge secret to himself was driving the barber crazy ', 'the barber went up a huge mountain ']


In [22]:
# Stopword removal: word-tokenize each cleaned sentence, lowercase every
# token, and keep only the tokens that are not NLTK English stopwords.
s_words = set(stopwords.words('english'))

pp_sentences = [
    [token for token in map(str.lower, word_tokenize(cleaned))
     if token not in s_words]
    for cleaned in sentence
]
print(pp_sentences)

[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]


In [25]:
# Version 1: no OOV token; every word in the corpus gets its own index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(pp_sentences)
print(tokenizer.texts_to_sequences(pp_sentences))

# Version 2: keep only the 5 most frequent words and map the rest to 'OOV'.
# num_words is vocab_size + 2 because index 0 is left for padding and
# index 1 is taken by the OOV token.
vocab_size = 5
tokenizer = Tokenizer(num_words=vocab_size + 2, oov_token='OOV')
tokenizer.fit_on_texts(pp_sentences)
encoded = tokenizer.texts_to_sequences(pp_sentences)
print(encoded)

[[1, 5], [1, 8, 5], [1, 3, 5], [9, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [7, 7, 3, 2, 10, 1, 11], [1, 12, 3, 13]]
[[2, 6], [2, 1, 6], [2, 4, 6], [1, 3], [3, 5, 4, 3], [4, 3], [2, 5, 1], [2, 5, 1], [2, 5, 3], [1, 1, 4, 3, 1, 2, 1], [2, 1, 4, 1]]


In [26]:
# Length of the longest encoded sentence; used as the padding target.
max_len = max(map(len, encoded))
print(max_len)

7


In [28]:
# Manual post-zero-padding: right-pad every sequence with 0 up to max_len.
# A padded copy is built instead of appending to the lists inside `encoded`,
# so `encoded` keeps its original variable-length sequences and can still be
# handed unpadded to other padding routines.
padded_np = np.array(
    [seq + [0] * (max_len - len(seq)) for seq in encoded]
)
padded_np

array([[2, 6, 0, 0, 0, 0, 0],
       [2, 1, 6, 0, 0, 0, 0],
       [2, 4, 6, 0, 0, 0, 0],
       [1, 3, 0, 0, 0, 0, 0],
       [3, 5, 4, 3, 0, 0, 0],
       [4, 3, 0, 0, 0, 0, 0],
       [2, 5, 1, 0, 0, 0, 0],
       [2, 5, 1, 0, 0, 0, 0],
       [2, 5, 3, 0, 0, 0, 0],
       [1, 1, 4, 3, 1, 2, 1],
       [2, 1, 4, 1, 0, 0, 0]])

### Keras를 사용하여 Padding 작업 실행

In [29]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [32]:
# Re-encode with the OOV tokenizer (top 5 words only) to get fresh,
# unpadded integer sequences as input for pad_sequences.
encoded = tokenizer.texts_to_sequences(pp_sentences)
print(encoded)

[[2, 6], [2, 1, 6], [2, 4, 6], [1, 3], [3, 5, 4, 3], [4, 3], [2, 5, 1], [2, 5, 1], [2, 5, 3], [1, 1, 4, 3, 1, 2, 1], [2, 1, 4, 1]]


In [33]:
# Keras pad_sequences pads with zeros at the front (pre-padding) by default.
padded = pad_sequences(encoded)
padded

array([[0, 0, 0, 0, 0, 2, 6],
       [0, 0, 0, 0, 2, 1, 6],
       [0, 0, 0, 0, 2, 4, 6],
       [0, 0, 0, 0, 0, 1, 3],
       [0, 0, 0, 3, 5, 4, 3],
       [0, 0, 0, 0, 0, 4, 3],
       [0, 0, 0, 0, 2, 5, 1],
       [0, 0, 0, 0, 2, 5, 1],
       [0, 0, 0, 0, 2, 5, 3],
       [1, 1, 4, 3, 1, 2, 1],
       [0, 0, 0, 2, 1, 4, 1]])

In [36]:
# For zeros at the end instead (post-padding), pass padding='post'.
padded_post = pad_sequences(encoded, padding='post')
padded_post


array([[2, 6, 0, 0, 0, 0, 0],
       [2, 1, 6, 0, 0, 0, 0],
       [2, 4, 6, 0, 0, 0, 0],
       [1, 3, 0, 0, 0, 0, 0],
       [3, 5, 4, 3, 0, 0, 0],
       [4, 3, 0, 0, 0, 0, 0],
       [2, 5, 1, 0, 0, 0, 0],
       [2, 5, 1, 0, 0, 0, 0],
       [2, 5, 3, 0, 0, 0, 0],
       [1, 1, 4, 3, 1, 2, 1],
       [2, 1, 4, 1, 0, 0, 0]])

In [39]:
# With padding='post', pad_sequences produces the same array as the manual
# NumPy padding above: show the element-wise comparison, then the overall one.
print(np.equal(padded_np, padded_post))
print(np.all(np.equal(padded_np, padded_post)))

[[ True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True]]
True


In [40]:
# Sequences do not have to be padded to max_len; maxlen can be set freely.
# Note: pad_sequences uses truncating='pre' by default, so a sequence longer
# than maxlen loses its LEADING values (only the last maxlen values are kept).
padded = pad_sequences(encoded,padding='pre', maxlen=5)
padded

array([[0, 0, 0, 2, 6],
       [0, 0, 2, 1, 6],
       [0, 0, 2, 4, 6],
       [0, 0, 0, 1, 3],
       [0, 3, 5, 4, 3],
       [0, 0, 0, 4, 3],
       [0, 0, 2, 5, 1],
       [0, 0, 2, 5, 1],
       [0, 0, 2, 5, 3],
       [4, 3, 1, 2, 1],
       [0, 2, 1, 4, 1]])

In [45]:
# truncating controls which end is cut when a sequence is longer than maxlen:
# 'pre' (the default) drops leading values, 'post' drops trailing values.
# NOTE(review): with maxlen=max_len no sequence exceeds maxlen, so nothing is
# truncated here; use a smaller maxlen (e.g. 5) to see the effect.
padded = pad_sequences(encoded,padding='post',maxlen=max_len, truncating='pre')
padded

array([[2, 6, 0, 0, 0, 0, 0],
       [2, 1, 6, 0, 0, 0, 0],
       [2, 4, 6, 0, 0, 0, 0],
       [1, 3, 0, 0, 0, 0, 0],
       [3, 5, 4, 3, 0, 0, 0],
       [4, 3, 0, 0, 0, 0, 0],
       [2, 5, 1, 0, 0, 0, 0],
       [2, 5, 1, 0, 0, 0, 0],
       [2, 5, 3, 0, 0, 0, 0],
       [1, 1, 4, 3, 1, 2, 1],
       [2, 1, 4, 1, 0, 0, 0]])

In [46]:
# To pad with a value other than 0, choose one that no word uses:
# one more than the number of entries in tokenizer.word_index
# (Keras word indices start at 1, so this is past the largest index).
val_last = len(tokenizer.word_index) + 1
val_last

15

In [47]:
# Pre-pad with val_last instead of 0 by passing it as the fill value.
padded = pad_sequences(encoded, padding='pre', value=val_last)
padded

array([[15, 15, 15, 15, 15,  2,  6],
       [15, 15, 15, 15,  2,  1,  6],
       [15, 15, 15, 15,  2,  4,  6],
       [15, 15, 15, 15, 15,  1,  3],
       [15, 15, 15,  3,  5,  4,  3],
       [15, 15, 15, 15, 15,  4,  3],
       [15, 15, 15, 15,  2,  5,  1],
       [15, 15, 15, 15,  2,  5,  1],
       [15, 15, 15, 15,  2,  5,  3],
       [ 1,  1,  4,  3,  1,  2,  1],
       [15, 15, 15,  2,  1,  4,  1]])