# 데이터 준비

In [1]:
import pandas as pd 
# Read the zipped CSV of reviews into a DataFrame (path is relative to the working directory).
df = pd.read_csv('imdb.zip')

In [2]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [3]:
import joblib
# Restore the tokenizer object from disk (presumably already fitted elsewhere — confirm).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load tokenizer.pkl when it comes from a trusted source.
tk = joblib.load('tokenizer.pkl')

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
review_train, review_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [5]:
# Tokenize: convert each training review into a sequence of integer token ids.
seqs=tk.texts_to_sequences(review_train)

In [6]:
# Show the first training review as raw text.
review_train.iloc[0]

'It is an insane game.'

In [7]:
# Show the token ids produced for the first training review.
seqs[0]

[9, 6, 33, 1258, 214]

In [8]:
# Show the first three sequences; their lengths differ, which is why they are padded below.
seqs[:3]

[[9, 6, 33, 1258, 214],
 [178, 5, 28, 35, 23, 168, 713, 591, 3, 713, 1, 10, 1, 280],
 [206, 336, 4]]

# 순방향 순환신경망

In [9]:
import tensorflow as tf

In [10]:
# maxlen: None (the default) keeps the length of the longest sequence; an integer
#   such as 10 cuts/pads every sequence to that length.
# padding: 'pre' (the default) puts the zeros in front, 'post' puts them at the end;
#   the author's note recommends 'pre'.
# truncating: 'pre' (the default); it only has an effect when maxlen is set, so pick
#   the side after comparing model performance.
pads = tf.keras.preprocessing.sequence.pad_sequences(
    seqs,
    maxlen=None,
    padding='pre',
    truncating='pre',
)

In [11]:
# Check the number of padded sequences and the shape of the padded array.
len(pads), pads.shape

(800, (800, 73))

In [12]:
# Preparing the model: input dimension for the embedding layer, taken as the
# tokenizer's num_words plus one.
NUM_WORDS= tk.num_words + 1

In [13]:
# Forward LSTM sentiment classifier, assembled one layer at a time.
model = tf.keras.Sequential()
# mask_zero=True: ignore the zero-valued padding positions.
model.add(tf.keras.layers.Embedding(NUM_WORDS, 8, mask_zero=True))
# Recurrent layer with 8 units.
model.add(tf.keras.layers.LSTM(8))
# Single sigmoid output: sentiment 0 or 1.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [14]:
model.summary() # author's note: leading zeros get cut off, so it is better to put the padding in front

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 8)           16008     
                                                                 
 lstm (LSTM)                 (None, 8)                 544       
                                                                 
 dense (Dense)               (None, 1)                 9         
                                                                 
Total params: 16,561
Trainable params: 16,561
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the forward LSTM on the padded training sequences and their labels.
# The bare attribute reference `model.fit` never started training; it is now called
# with the same arguments the other fit cells in this notebook use.
model.fit(pads, y_train.values, epochs=10)

# 역방향 순환신경망

## pre padding 적용

In [27]:
# Author's note: post padding is the better choice here, while pre padding is
# recommended for the forward direction.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(NUM_WORDS, 8, mask_zero=True))
# go_backwards=True: the LSTM processes the input sequence in reverse order.
model.add(tf.keras.layers.LSTM(8, go_backwards=True))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))


In [28]:
# Same training configuration as the forward model: cross-entropy, Adam, accuracy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [29]:
# Train the backward LSTM for 10 epochs on the current (pre-padded) sequences.
model.fit(x=pads, y=y_train.values, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x28056fed580>

## post padding 적용

In [30]:
# Author's note: post padding does not always give better results, so treat it as a
# recommendation only.
pads = tf.keras.preprocessing.sequence.pad_sequences(
    seqs,
    padding='post',
)

In [33]:
# Fresh backward LSTM classifier, this time to be trained on the post-padded sequences.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(NUM_WORDS, 8, mask_zero=True))
# go_backwards=True: the LSTM processes the input sequence in reverse order.
model.add(tf.keras.layers.LSTM(8, go_backwards=True))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [35]:
# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [36]:
# Train the backward LSTM for 10 epochs on the post-padded sequences.
model.fit(x=pads, y=y_train.values, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2805c83dc40>

# 양방향 순환신경망

In [23]:
# Bidirectional classifier: the wrapper runs the 8-unit LSTM over the sequence in
# both directions (the summary reports a 16-wide output).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(NUM_WORDS, 8, mask_zero=True))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(8)))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [24]:
# Print the layer-by-layer summary of the bidirectional model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 8)           16008     
                                                                 
 bidirectional (Bidirectiona  (None, 16)               1088      
 l)                                                              
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 17,113
Trainable params: 17,113
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [26]:
# Train the bidirectional LSTM for 10 epochs on whichever padded array `pads` currently holds.
model.fit(x=pads, y=y_train.values, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2804eb785e0>