In [None]:
import numpy as np
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Toy corpus of ten short e-mail subject lines.
# The order matters: element i is paired with labels[i] in the next cell.
docs = [
    # labelled 1
    'additional income',
    'best price',
    'big bucks',
    'cash bonus',
    'earn extra cash',
    # labelled 0
    'spring savings certificate',
    'valero gas marketing',
    'all domestic employees',
    'nominations for oct',
    'confirmation from spinner',
]

In [None]:
# Class label for each entry of `docs`: five 1s followed by five 0s.
labels = np.array([1] * 5 + [0] * 5)

In [None]:
# Turn every document into a list of integer word indices using Keras'
# hashing-based `one_hot` helper with a hashing space of `vocab_size`.
vocab_size = 50
encoded_docs = []
for doc in docs:
    encoded_docs.append(one_hot(doc, vocab_size))
print(encoded_docs)

In [None]:
# Bring all encoded documents to a common length of `max_length`;
# padding='post' places the fill values after the real tokens.
max_length = 4
padded_docs = pad_sequences(
    encoded_docs,
    padding='post',
    maxlen=max_length,
)
print(padded_docs)

In [None]:
# Embedding -> Flatten -> single sigmoid unit: a minimal binary classifier
# over the padded word-index sequences.
model = Sequential([
    Embedding(vocab_size, 8, input_length=max_length),
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train silently, then report accuracy on the same (training) data.
model.fit(padded_docs, labels, epochs=50, verbose=0)
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('정확도=', accuracy)

In [None]:
# Score an unseen phrase with the trained classifier.
# Dedicated variable names keep the training arrays (`encoded_docs`,
# `padded_docs`) intact, so the training cell can still be re-run after
# this cell without fitting on the test phrase by accident.
test_doc = ['big income']
test_encoded = [one_hot(d, vocab_size) for d in test_doc]
test_padded = pad_sequences(test_encoded, maxlen=max_length, padding='post')
print(model.predict(test_padded))

# 다음 단어 예측하기

In [None]:
import numpy as np
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [None]:
# Training corpus for the next-word model: eight short lines of verse.
# Each line ends with an explicit "\n" escape in addition to the physical
# line break, so consecutive lines are separated by two newline characters.
# NOTE(review): "unhead" looks like a typo for "unheard"; it is left as-is
# because changing it would change the learned vocabulary.
text_data = """Soft as the voice of an angel\n
Breathing a lesson unhead\n
Hope with a gentle persuasion\n
Whispers her comforting word\n
Wait till the darkness is over\n
Wait till the tempest is done\n
Hope for sunshine tomorrow\n
After the shower"""

In [None]:
# Build the word index from the corpus, then encode the corpus itself as
# one flat list of word indices.
corpus = [text_data]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
(encoded,) = tokenizer.texts_to_sequences(corpus)
print(encoded)

In [None]:
# Show the word -> index mapping learned by the tokenizer.
print(tokenizer.word_index)

# One extra slot on top of the number of distinct words in the mapping.
vocab_size = 1 + len(tokenizer.word_index)
print("어휘 크기 : %d" % vocab_size)

In [None]:
# Pair every word index with its successor: [w0, w1], [w1, w2], ...
# Each pair later becomes one (input word, next word) training example.
sequences = [list(pair) for pair in zip(encoded, encoded[1:])]
print(sequences)
print("총 시퀀스 개수 : %d" % len(sequences))

In [None]:
# Convert the list of pairs to a 2-column array and split the columns:
# column 0 is the current word (input), column 1 the following word (target).
sequences = np.array(sequences)
X = sequences[:, 0]
y = sequences[:, 1]
print("X=", X)
print("y=", y)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN, LSTM
# Next-word predictor: one word index in, a softmax distribution over the
# whole vocabulary out.
model = Sequential([
    Embedding(vocab_size, 10, input_length=1),
    LSTM(50),
    Dense(vocab_size, activation='softmax'),
])

In [None]:
# Targets are integer word indices (not one-hot vectors), hence the
# sparse variant of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(X, y, epochs=500, verbose=2)

In [None]:
# Encode the query word with the fitted tokenizer and wrap the resulting
# index list in a NumPy array for `model.predict`.
test_text = "Wait"
encoded = np.array(tokenizer.texts_to_sequences([test_text])[0])


In [None]:
# Run the encoded query word through the network and show the raw
# softmax output.
onehot_output = model.predict(x=encoded)
print('onehot_output=', onehot_output)

In [None]:
# Index of the largest entry in the prediction = most likely next word.
output = onehot_output.argmax()
print('output=', output)

In [None]:
# Translate the predicted index back into a word.
# `tokenizer.index_word` is the tokenizer's own index -> word mapping, so a
# direct lookup replaces the linear scan over `word_index`. An index with
# no word (e.g. 0) prints an empty string, so the output line is always
# terminated.
print(test_text, "=> ", end="")
print(tokenizer.index_word.get(int(output), ""))

# 영화 리뷰 감성 판별하기

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

In [None]:
# Load the IMDB review dataset, restricted to 10000 word indices, and
# show the first encoded training review.
imdb = keras.datasets.imdb
train_split, test_split = imdb.load_data(num_words=10000)
x_train, y_train = train_split
x_test, y_test = test_split
print(x_train[0])

In [None]:
# Word -> integer index dictionary.
raw_word_index = imdb.get_word_index()

# The first few indices are reserved for special tokens, so every regular
# word is shifted up by 3.
word_to_index = {word: rank + 3 for word, rank in raw_word_index.items()}
word_to_index.update({
    "<PAD>": 0,     # filler used to pad sentences
    "<START>": 1,   # marks the start of a sequence
    "<UNK>": 2,     # unknown token
    "<UNUSED>": 3,
})

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *

In [None]:
# Force every review to exactly 100 indices (pad_sequences defaults) and
# record the vocabulary size used by the embedding layer below.
vocab_size = 10000
x_train, x_test = (
    pad_sequences(x_train, maxlen=100),
    pad_sequences(x_test, maxlen=100),
)

In [None]:
# Sentiment classifier: embedding -> flatten -> dense hidden layer with
# dropout -> single sigmoid output.
model = Sequential([
    Embedding(vocab_size, 64, input_length=100),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 20 epochs; the test split is passed as validation data, so
# validation metrics are reported after every epoch.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=20,
    batch_size=64,
    verbose=1,
)

In [None]:
# Final metrics on the test split, in the order the model was compiled
# with (loss first, then accuracy).
results = model.evaluate(
    x_test,
    y_test,
    verbose=2,
)
print(results)

In [None]:
# Hand-written sample review used to try the trained sentiment model.
# The literal spans several physical lines, so it contains newline
# characters between words; downstream cleaning has to treat them as
# whitespace.
review = """What can I say about this movie that was already said? It is my
favorite time travel sci-fi, adventure epic comedy in the 80's and I love
this movie to death! When I saw this movie I was thrown out by its theme. An
excellent sci-fi, adventure epic, I LOVE the 80s. It's simple the greatest time
travel movie ever happened in the history of world cinema. I love this movie to
death, I love, LOVE, love it!"""

In [None]:
import re

# Strip punctuation and lower-case the review.
# The character class keeps ALL whitespace (\s), not just the space
# character: the review literal contains newlines, and deleting them would
# glue the last word of one line to the first word of the next
# (e.g. "my\nfavorite" -> "myfavorite").
review = re.sub(r"[^0-9a-zA-Z\s]", "", review).lower()
review_encoding = []

In [None]:
# Encode the cleaned review with the same word -> index mapping as the
# training data.
unk_index = word_to_index["<UNK>"]  # the key includes the angle brackets
max_index = 10000                   # embedding accepts indices 0..9999 only

# Start from an empty list here so that re-running this cell does not
# append the review a second time.
review_encoding = []

# Iterate over every word of the review.
for w in review.split():
    index = word_to_index.get(w, unk_index)  # unknown word -> <UNK>
    # Words outside the 10000-index vocabulary are also mapped to <UNK>;
    # the comparison is strict because index 10000 itself is out of range.
    review_encoding.append(index if index < max_index else unk_index)

# pad_sequences expects a 2-D list (a batch), hence the extra brackets.
test_input = pad_sequences([review_encoding], maxlen=100)
value = model.predict(test_input)  # prediction

In [None]:
# A sigmoid output above 0.5 is reported as a positive review, anything
# else as a negative one.
print("긍정적인 리뷰입니다." if value > 0.5 else "부정적인 리뷰입니다.")

# LSTM과 CNN을 조합해 영화 리뷰 분류하기

In [None]:
from attention import Attention

In [None]:
# Reload IMDB for this section, this time restricted to 5000 word indices.
train_split, test_split = imdb.load_data(num_words=5000)
X_train, y_train = train_split
X_test, y_test = test_split

In [None]:
# Pad/truncate every review to 500 indices.
# `pad_sequences` is called directly because it is already imported above;
# the `sequence` module is only imported in a later section, so
# `sequence.pad_sequences` fails on a fresh kernel.
X_train = pad_sequences(X_train, maxlen=500)
X_test = pad_sequences(X_test, maxlen=500)

In [None]:
# Embedding -> dropout -> LSTM (full sequence output) -> attention.
model = Sequential()
model.add(Embedding(5000, 500))
# Dropout rate is 0.5; the original `Dropout(0, 5)` (comma instead of a
# decimal point) set the rate to 0 and passed 5 as `noise_shape`.
model.add(Dropout(0.5))
#model.add(Conv1D(64, 5, padding='valid', activation='relu', strides=1))
#model.add(MaxPooling1D(pool_size=4))
# return_sequences=True hands every time step to the attention layer.
model.add(LSTM(64, return_sequences=True))
model.add(Attention())

In [None]:
# Classification head: dropout, then a single sigmoid unit.
# Dropout rate is 0.5; `Dropout(0, 5)` (comma instead of a decimal point)
# set the rate to 0 and passed 5 as `noise_shape`.
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [None]:
# Configure training for binary sentiment classification.
# The stray `from keras.engine.training import optimizer` line was removed:
# the imported name was never used (only the `optimizer=` keyword is), and
# it reaches into a private Keras module path, which makes the cell fail
# wherever that path is unavailable.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Imported here because no earlier cell brings `EarlyStopping` into scope.
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once the validation loss has not improved for 3 epochs.
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=3)

In [None]:
# Train with 25% of the training data held out for validation; the
# early-stopping callback may end the run before the 100-epoch limit.
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=40,
    validation_split=0.25,
    callbacks=[early_stopping_callback],
)

In [None]:
# evaluate() returns [loss, accuracy]; only the accuracy is printed.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(test_accuracy)

In [None]:
# Per-epoch loss curves recorded by fit(): training and validation.
loss_log = history.history
y_loss = loss_log['loss']
y_vloss = loss_log['val_loss']

In [None]:
# Plot both loss curves against the epoch number.
x_len = np.arange(len(y_loss))
for curve, colour, name in (
    (y_vloss, 'red', 'Testset_loss'),
    (y_loss, 'blue', 'Trainset_loss'),
):
    plt.plot(x_len, curve, marker='.', c=colour, label=name)

plt.xlabel('epoch')
plt.ylabel('loss')
plt.grid()
plt.legend(loc='upper right')
plt.show()

# LSTM을 이용해 로이터 뉴스 카테고리 분석

In [None]:
!pip install attention

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.datasets import reuters
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Load the Reuters newswire dataset limited to 1000 word indices, with
# 20% of the samples reserved as the test split.
train_split, test_split = reuters.load_data(num_words=1000, test_split=0.2)
X_train, y_train = train_split
X_test, y_test = test_split

In [None]:
# Pad/truncate every newswire to 500 word indices.
X_train, X_test = (
    sequence.pad_sequences(X_train, maxlen=500),
    sequence.pad_sequences(X_test, maxlen=500),
)

In [None]:
# One-hot encode the topic labels.
# The class count is fixed at 46 to match the model's Dense(46) output.
# Without it, to_categorical infers the width from the largest label in
# each array separately, so the two splits could end up with different
# widths if one of them lacks the highest class.
# NOTE: not idempotent - re-running this cell encodes the already encoded
# labels again; reload the data first.
y_train = to_categorical(y_train, num_classes=46)
y_test = to_categorical(y_test, num_classes=46)

In [None]:
# Topic classifier: embedding -> LSTM -> 46-way softmax.
model = Sequential([
    Embedding(1000, 100),
    LSTM(100, activation='tanh'),
    Dense(46, activation='softmax'),
])

In [None]:
# Multi-class problem (46-way softmax, one-hot targets), so the loss must
# be categorical cross-entropy. Binary cross-entropy would score each of
# the 46 outputs as an independent yes/no decision and report a
# misleadingly high accuracy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Stop training once the validation loss has not improved for 5 epochs.
early_stopping_callback = EarlyStopping(
    patience=5,
    monitor='val_loss',
)

In [None]:
# Train with the test split as validation data; the early-stopping
# callback may end the run before the 200-epoch limit.
history = model.fit(
    X_train,
    y_train,
    epochs=200,
    batch_size=20,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping_callback],
)

In [None]:
# Per-epoch loss curves recorded by fit(): training and validation.
loss_log = history.history
y_loss = loss_log['loss']
y_vloss = loss_log['val_loss']

In [None]:
# Plot both loss curves against the epoch number.
x_len = np.arange(len(y_loss))
for curve, colour, name in (
    (y_vloss, 'red', 'Testset_loss'),
    (y_loss, 'blue', 'Trainset_loss'),
):
    plt.plot(x_len, curve, marker='.', c=colour, label=name)

plt.xlabel('epoch')
plt.ylabel('loss')
plt.grid()
plt.legend(loc='upper right')
plt.show()

# 양방향 RNN

In [None]:
# Imported here because no earlier cell imports `Bidirectional` by name.
from tensorflow.keras.layers import Bidirectional

# Bidirectional-RNN binary classifier.
model = Sequential()
model.add(Embedding(10000, 128, input_length=200))
# This section is about bidirectional RNNs, so the LSTM is wrapped in
# Bidirectional and reads the sequence in both directions.
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
# A single output unit needs sigmoid: softmax over one value is always
# exactly 1.0, so the model could never predict the negative class.
model.add(Dense(1, activation='sigmoid'))