# Tokenizer (단어 단위로 잘라주기)

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Three short sample sentences used to fit the toy vocabulary below.
sentence = [
    'i love my dog',
    'I, love my cay',
    'You love my dog',
]

In [6]:
# Tokenizer selects words by frequency, up to the configured num_words,
# and exposes the learned word -> index mapping as a dict (word_index).
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentence)
word_index = tokenizer.word_index
print(word_index)

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cay': 5, 'you': 6}


# sequence

In [11]:
# pad_sequences brings every sequence to a common length (maxlen) and pads accordingly.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Four sample sentences of different lengths.
sentences = [
    'i love my dog',
    'I, love my cay',
    'You love my dog',
    'Do you think my dog is amazing?',
]
# oov_token reserves a vocabulary entry for unknown words ('<oov>' gets index 1 in the printed dict).
tokenizer = Tokenizer(num_words=100, oov_token='<oov>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

{'<oov>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cay': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}


In [12]:
# Encode each sentence as a list of integer word indices from the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(sentences)
sequences  # bare expression: displayed as the notebook cell output

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

In [15]:
# padding / truncating: 'pre' works on the front of each sequence, 'post' on the back.
# Here sequences shorter than 5 get trailing zeros and longer ones lose their tail.
padded = pad_sequences(
    sequences,
    maxlen=5,
    padding='post',
    truncating='post',
)
print(padded)

[[5 3 2 4 0]
 [5 3 2 7 0]
 [6 3 2 4 0]
 [8 6 9 2 4]]


## json 파일 불러와서 직접 tokenizing -> padding 해보기

In [48]:
import json

# Load the sarcasm dataset from disk; the records are consumed by the next cell.
with open('../../DATA/sarcasm.json', 'r', encoding='utf-8') as fp:
    datastore = json.load(fp)

In [49]:
import numpy as np

# Split each record of the datastore into three parallel collections:
# the headline text, its sarcasm label, and the article link.
sentences = [record['headline'] for record in datastore]
urls = [record['article_link'] for record in datastore]
# Labels are converted to a NumPy array so they can be passed to model.fit.
labels = np.array([record['is_sarcastic'] for record in datastore])

In [50]:
# Tokenizing: fit the vocabulary on the headlines; '<oov>' stands in for unknown words.
vocab_size = 20000
# Fixed sequence length. It is kept equal to the input_length=max_length (40)
# that the Embedding layer in the model cell is built with, so the padded
# matrix width no longer depends on whichever headline happens to be longest.
max_length = 40
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<oov>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Padding: encode the headlines, then pad/truncate at the end to exactly max_length tokens.
sequences = tokenizer.texts_to_sequences(sentences)
padded_sent = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)
print(padded_sent)

[[  308 15115   679 ...     0     0     0]
 [    4  8435  3338 ...     0     0     0]
 [  145   838     2 ...     0     0     0]
 ...
 [10735     9    68 ...     0     0     0]
 [ 1541   392  4164 ...     0     0     0]
 [    1  1647     6 ...     0     0     0]]


## model 학습

In [51]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D

In [57]:
embedding_dim = 100
max_length = 40

# Layer stack (shapes as reported by model.summary()):
#   Embedding              - first layer, receives the integer-encoded input.
#                            vocab_size is the vocabulary size, embedding_dim the size of
#                            each word vector, input_length the sentence length -> (None, 40, 100)
#   GlobalAveragePooling1D - averages over the sequence axis -> (None, 100)
#   Dense(6, relu)         - small hidden layer
#   Dense(1, sigmoid)      - single output unit
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    GlobalAveragePooling1D(),
    Dense(6, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 40, 100)           2000000   
_________________________________________________________________
global_average_pooling1d_4 ( (None, 100)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 6)                 606       
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 7         
Total params: 2,000,613
Trainable params: 2,000,613
Non-trainable params: 0
_________________________________________________________________


In [58]:
# Binary classification setup: sigmoid output trained with binary cross-entropy.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Train on the padded headlines for 20 epochs; the returned History is the cell output.
model.fit(padded_sent, labels, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f9849b82280>