### Python Dictionary 자료형을 이용한 정수 인코딩


In [20]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re


In [21]:
# Sample corpus used by the cells below: eleven short sentences, written one
# literal per sentence and joined by implicit string concatenation.
text = (
    "A barber is a person. "
    "a barber is good person. "
    "a barber is huge person. "
    "he Knew A Secret! "
    "The Secret He Kept is huge secret. "
    "Huge secret. "
    "His barber kept his word. "
    "a barber kept his word. "
    "His barber kept his secret. "
    "But keeping and keeping such a huge secret to himself was driving the barber crazy. "
    "the barber went up a huge mountain."
)

In [26]:
# Sentence tokenization: split the raw text into sentences with sent_tokenize
s_token = sent_tokenize(text)
print(s_token)

['A barber is a person.', 'a barber is good person.', 'a barber is huge person.', 'he Knew A Secret!', 'The Secret He Kept is huge secret.', 'Huge secret.', 'His barber kept his word.', 'a barber kept his word.', 'His barber kept his secret.', 'But keeping and keeping such a huge secret to himself was driving the barber crazy.', 'the barber went up a huge mountain.']


In [29]:
# Cleaning: in each sentence, replace every character that is not an ASCII
# letter or digit (punctuation, special characters) with a single space.
sentence = [re.sub('[^a-zA-Z0-9]', ' ', raw) for raw in s_token]
print(sentence)

['A barber is a person ', 'a barber is good person ', 'a barber is huge person ', 'he Knew A Secret ', 'The Secret He Kept is huge secret ', 'Huge secret ', 'His barber kept his word ', 'a barber kept his word ', 'His barber kept his secret ', 'But keeping and keeping such a huge secret to himself was driving the barber crazy ', 'the barber went up a huge mountain ']


In [37]:
# Remove English stop words and count how often each remaining word occurs.
#   vocab        : word -> number of occurrences over all sentences
#   pp_sentences : one list of kept (lower-cased) tokens per sentence
vocab = {}
pp_sentences = []
s_words = set(stopwords.words('english'))

for cleaned in sentence:
    kept = []
    # Word tokenization of the cleaned sentence
    for token in word_tokenize(cleaned):
        token = token.lower()
        if token in s_words:
            continue  # stop word: neither kept nor counted
        kept.append(token)
        vocab[token] = vocab.get(token, 0) + 1
    pp_sentences.append(kept)
print(pp_sentences)

[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]


In [1]:
# 원래는 여기서 어간 추출과 표제어 추출 또한 진행해줘야겠지만
# 결과가 덜 정확하게 나오므로 통과

In [38]:
# Inspect the word frequencies
print(vocab)

{'barber': 8, 'person': 3, 'good': 1, 'huge': 5, 'knew': 1, 'secret': 6, 'kept': 4, 'word': 2, 'keeping': 2, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1}


In [39]:
# Order the (word, count) pairs from most to least frequent.
# list.sort is stable, so words with equal counts keep their insertion order.
vocab_sorted = list(vocab.items())
vocab_sorted.sort(key=lambda pair: pair[1], reverse=True)
print(vocab_sorted)

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3), ('word', 2), ('keeping', 2), ('good', 1), ('knew', 1), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)]


In [40]:
# Assign indices 1, 2, 3, ... in descending frequency order, leaving out the
# words that occur only once. vocab_sorted is already ordered by frequency,
# so enumeration order is rank order.
# (Without the frequency filter this reduces to
#  {word: rank for rank, (word, _) in enumerate(vocab_sorted, start=1)}.)
frequent_words = [w for w, f in vocab_sorted if f > 1]
w_index = {w: rank for rank, w in enumerate(frequent_words, start=1)}

print(w_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7}


In [41]:
# Keep only the 5 most frequent words (ranks 1..vocab_size)
vocab_size = 5
# w_frequency collects the words to drop, i.e. those ranked below the cut-off;
# they are gathered first so the dict is not modified while being iterated.
w_frequency = [token for token, rank in w_index.items() if rank > vocab_size]
for token in w_frequency:
    w_index.pop(token)
print(w_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}


In [42]:
# w_index will be used to convert the sentences into integers.
# Add an OOV (Out-Of-Vocabulary) entry for words that are not in w_index;
# it takes the next free index after the existing entries.
w_index['OOV']= len(w_index)+1
print(w_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'OOV': 6}


In [45]:
# Integer-encode every word in pp_sentences with w_index; words missing from
# w_index are mapped to the 'OOV' index.
# Comprehension variables stay local, so the module-level `sentence` list
# built in the cleaning step is not overwritten by a loop variable here.
oov_index = w_index['OOV']
e_sentences = [
    [w_index.get(token, oov_index) for token in tokens]
    for tokens in pp_sentences
]
print(e_sentences)

[[1, 5], [1, 6, 5], [1, 3, 5], [6, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [6, 6, 3, 2, 6, 1, 6], [1, 6, 3, 6]]


### Counter를 사용하여 정수 인코딩

In [46]:
from collections import Counter
# Show the preprocessed (stop-word-free, lower-cased) sentences again as the
# starting point for the Counter-based approach
print(pp_sentences)

[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]


In [47]:
# Flatten the per-sentence token lists into one list of words.
# A nested comprehension runs in linear time, whereas sum(lists, []) builds a
# new list for every sentence. np.hstack(pp_sentences) would also flatten the
# data (into a NumPy array rather than a list).
all_words_list = [word for tokens in pp_sentences for word in tokens]
print(all_words_list)

['barber', 'person', 'barber', 'good', 'person', 'barber', 'huge', 'person', 'knew', 'secret', 'secret', 'kept', 'huge', 'secret', 'huge', 'secret', 'barber', 'kept', 'word', 'barber', 'kept', 'word', 'barber', 'kept', 'secret', 'keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy', 'barber', 'went', 'huge', 'mountain']


In [48]:
# Count word frequencies with Python's Counter class
vocab = Counter(all_words_list)
print(vocab)

Counter({'barber': 8, 'secret': 6, 'huge': 5, 'kept': 4, 'person': 3, 'word': 2, 'keeping': 2, 'good': 1, 'knew': 1, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1})


In [49]:
vocab_size = 5
vocab = vocab.most_common(vocab_size) # keep only the 5 most frequent words
# vocab is now a list of (word, count) pairs; the bare expression displays it
vocab

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]

In [50]:
# Number the retained words starting from 1, in the order of `vocab`, which
# here is the list of (word, count) pairs returned by most_common().
word_to_index = {
    word: rank for rank, (word, frequency) in enumerate(vocab, start=1)
}

print(word_to_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}


### NLTK의 FreqDist를 사용하여 정수 인코딩

In [51]:
from nltk import FreqDist
import numpy as np
# Show the preprocessed sentences again as the starting point for the
# FreqDist-based approach
print(pp_sentences)

[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]


In [54]:
# Flatten the sentences with np.hstack and count word frequencies with FreqDist
vocab = FreqDist(np.hstack(pp_sentences))
# Look up the frequency of a single word
print(vocab["barber"])

8


In [55]:
vocab_size = 5
vocab = vocab.most_common(vocab_size) # keep only the 5 most frequent words
print(vocab)

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]


In [63]:
# Shorter encoding with enumerate: unpack each (word, count) pair and let
# enumerate start counting at 1 so the most frequent word gets index 1.
word_to_index = {
    token: rank for rank, (token, _count) in enumerate(vocab, start=1)
}
print(word_to_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}


### Keras의 텍스트 전처리

In [57]:
from tensorflow.keras.preprocessing.text import Tokenizer




In [59]:
# Build a Keras Tokenizer and fit it on the already tokenized sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(pp_sentences)

In [64]:
# word_index shows the index given to each word (more frequent words come first)
print(tokenizer.word_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7, 'good': 8, 'knew': 9, 'driving': 10, 'crazy': 11, 'went': 12, 'mountain': 13}


In [65]:
# word_counts shows how many times each word actually occurred
print(tokenizer.word_counts)

OrderedDict([('barber', 8), ('person', 3), ('good', 1), ('huge', 5), ('knew', 1), ('secret', 6), ('kept', 4), ('word', 2), ('keeping', 2), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)])


In [66]:
# Convert each sentence into its sequence of word indices
print(tokenizer.texts_to_sequences(pp_sentences))

[[1, 5], [1, 8, 5], [1, 3, 5], [9, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [7, 7, 3, 2, 10, 1, 11], [1, 12, 3, 13]]


In [67]:
# Re-create the tokenizer with a vocabulary limit
vocab_size = 5
tokenizer = Tokenizer(num_words = vocab_size + 1) # use only the top 5 words
tokenizer.fit_on_texts(pp_sentences)

In [69]:
# Even with the vocab_size limit above, word_index and word_counts are not
# restricted: both still list every word
print(tokenizer.word_index)
print(tokenizer.word_counts)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7, 'good': 8, 'knew': 9, 'driving': 10, 'crazy': 11, 'went': 12, 'mountain': 13}
OrderedDict([('barber', 8), ('person', 3), ('good', 1), ('huge', 5), ('knew', 1), ('secret', 6), ('kept', 4), ('word', 2), ('keeping', 2), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)])


In [70]:
# ...but the limit is applied when texts_to_sequences is called!
print(tokenizer.texts_to_sequences(pp_sentences))

[[1, 5], [1, 5], [1, 3, 5], [2], [2, 4, 3, 2], [3, 2], [1, 4], [1, 4], [1, 4, 2], [3, 2, 1], [1, 3]]


In [71]:
# To apply vocab_size to word_index and word_counts as well:
vocab_size = 5
tokenizer = Tokenizer(num_words = vocab_size + 1) # use only the top 5 words
tokenizer.fit_on_texts(pp_sentences)
# Collect the words whose index lies beyond vocab_size (the ones to remove)
w_frequency = [word for word, index in tokenizer.word_index.items() if index>=vocab_size+1]
w_frequency

['word', 'keeping', 'good', 'knew', 'driving', 'crazy', 'went', 'mountain']

In [72]:
# Remove every word collected in w_frequency from both tokenizer mappings
for word in w_frequency:
    del tokenizer.word_index[word]
    del tokenizer.word_counts[word]

print(tokenizer.word_index)
print(tokenizer.word_counts)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}
OrderedDict([('barber', 8), ('person', 3), ('huge', 5), ('secret', 6), ('kept', 4)])


In [76]:
# To additionally treat the words beyond vocab_size as OOV:
# the vocabulary size is vocab_size + 2, to account for the number 0 and OOV
vocab_size = 5
tokenizer = Tokenizer(num_words = vocab_size + 2, oov_token = 'OOV')
tokenizer.fit_on_texts(pp_sentences)
print(tokenizer.word_index) # when oov_token is used, OOV gets index 1

{'OOV': 1, 'barber': 2, 'secret': 3, 'huge': 4, 'kept': 5, 'person': 6, 'word': 7, 'keeping': 8, 'good': 9, 'knew': 10, 'driving': 11, 'crazy': 12, 'went': 13, 'mountain': 14}


In [78]:
# Encode the sentences with the OOV-aware tokenizer
print(tokenizer.texts_to_sequences(pp_sentences))

[[2, 6], [2, 1, 6], [2, 4, 6], [1, 3], [3, 5, 4, 3], [4, 3], [2, 5, 1], [2, 5, 1], [2, 5, 3], [1, 1, 4, 3, 1, 2, 1], [2, 1, 4, 1]]
