In [28]:
import requests
import pandas as pd
import tensorflow as tf
import joblib
import random
import numpy as np

In [35]:
# 텐서 GPU 설정
import tensorflow as tf
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
import os

os.environ["CUDA_VISIBLE_DEVICES"]="0"
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        print(e)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6571839836309871380
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 2254123828
locality {
  bus_id: 1
  links {
  }
}
incarnation: 6395822226982723625
physical_device_desc: "device: 0, name: NVIDIA GeForce GTX 1650, pci bus id: 0000:01:00.0, compute capability: 7.5"
xla_global_id: 416903419
]


In [2]:
res = requests.get('https://github.com/euphoris/datasets/raw/master/imdb.zip')

with open('imdb.zip', 'wb') as f:
    f.write(res.content)

In [6]:
df = pd.read_csv('imdb.zip')
df.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
tk = tf.keras.preprocessing.text.Tokenizer(num_words=2000, oov_token='<unk>')

In [7]:
tk.fit_on_texts(df.review)

In [9]:
tk.word_index

{'<unk>': 1,
 'the': 2,
 'and': 3,
 'a': 4,
 'of': 5,
 'is': 6,
 'this': 7,
 'i': 8,
 'it': 9,
 'to': 10,
 'in': 11,
 'was': 12,
 'movie': 13,
 'film': 14,
 'that': 15,
 'for': 16,
 'as': 17,
 'but': 18,
 'with': 19,
 'one': 20,
 'on': 21,
 'you': 22,
 'are': 23,
 'not': 24,
 'bad': 25,
 "it's": 26,
 'very': 27,
 'all': 28,
 'just': 29,
 'so': 30,
 'good': 31,
 'at': 32,
 'an': 33,
 'be': 34,
 'there': 35,
 'about': 36,
 'have': 37,
 'by': 38,
 'like': 39,
 'from': 40,
 'if': 41,
 'acting': 42,
 'time': 43,
 'out': 44,
 'his': 45,
 'or': 46,
 'really': 47,
 'great': 48,
 'even': 49,
 'he': 50,
 'who': 51,
 'were': 52,
 'has': 53,
 'see': 54,
 'my': 55,
 'characters': 56,
 'well': 57,
 'most': 58,
 'how': 59,
 'more': 60,
 'no': 61,
 'only': 62,
 'when': 63,
 'ever': 64,
 '10': 65,
 'movies': 66,
 'plot': 67,
 'story': 68,
 'made': 69,
 'some': 70,
 'they': 71,
 'best': 72,
 'because': 73,
 'your': 74,
 'can': 75,
 'also': 76,
 "don't": 77,
 'films': 78,
 'than': 79,
 'its': 80,
 'scrip

In [10]:
tk.word_index['good']

31

In [11]:
tk.index_word[31]

'good'

In [13]:
joblib.dump(tk, 'tokenizer.pkl')

['tokenizer.pkl']

In [14]:
seqs = tk.texts_to_sequences(df.review)

In [15]:
seqs[0]

[4, 27, 27, 27, 287, 407, 1217, 13, 36, 4, 1218, 1219, 408, 142]

In [16]:
tk.index_word[1]

'<unk>'

In [18]:
tk.index_word[4], tk.index_word[27], tk.index_word[287]

('a', 'very', 'slow')

In [19]:
seq = seqs[0]

In [20]:
len(seq)

14

In [21]:
list(range(0, len(seq) - 4))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [23]:
data = []
for seq in seqs:
    for i in range(0, len(seq) - 4):
        data.append([seq[i:i+4], seq[i+4]])
data[:5]

[[[4, 27, 27, 27], 287],
 [[27, 27, 27, 287], 407],
 [[27, 27, 287, 407], 1217],
 [[27, 287, 407, 1217], 13],
 [[287, 407, 1217, 13], 36]]

In [24]:
seqs[0]

[4, 27, 27, 27, 287, 407, 1217, 13, 36, 4, 1218, 1219, 408, 142]

In [27]:
random.shuffle(data)
data[0]

[[1, 1, 30, 418], 173]

In [29]:
xs = np.array([x for x, y in data])
ys = np.array([y for x, y in data])

In [30]:
xs

array([[   1,    1,   30,  418],
       [1034,  734,  407,  282],
       [   1,    1,   29,  477],
       ...,
       [   4,  512,    3,    1],
       [  24,  168,   66,   44],
       [   2,  120,    6,   30]])

In [31]:
ys

array([173,   3, 260, ..., 379,  35,  25])

In [32]:
joblib.dump((xs, ys), 'lm-data.pkl')

['lm-data.pkl']

In [33]:
tk = joblib.load('tokenizer.pkl')
xs, ys = joblib.load('lm-data.pkl')

In [34]:
NUM_WORD = tk.num_words + 1

In [36]:
emb1 = tf.keras.layers.Embedding(
    input_dim = NUM_WORD,
    output_dim= 8
)

In [37]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D

In [40]:
lm = tf.keras.models.Sequential([
    emb1,
    GlobalAveragePooling1D(),
    Dense(8, activation='relu'),
    Dense(NUM_WORD)
])
lm.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 8)           16008     
                                                                 
 global_average_pooling1d_1   (None, 8)                0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_2 (Dense)             (None, 8)                 72        
                                                                 
 dense_3 (Dense)             (None, 2001)              18009     
                                                                 
Total params: 34,089
Trainable params: 34,089
Non-trainable params: 0
_________________________________________________________________


In [42]:
lm.compile('adam', 'sparse_categorical_crossentropy', ['accuracy'])
lm.fit(xs, ys, batch_size=128, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x1dd7f30a080>

In [43]:
lm.save('lm.krs')

INFO:tensorflow:Assets written to: lm.krs\assets


## 단어 임베딩

In [44]:
emb1

<keras.layers.core.embedding.Embedding at 0x1dd7f4bee60>

In [45]:
emb1.embeddings

<tf.Variable 'embedding/embeddings:0' shape=(2001, 8) dtype=float32, numpy=
array([[ 0.00636067, -0.02665342,  0.0174864 , ..., -0.01197475,
         0.01596201, -0.00140953],
       [-0.02561057,  0.0160512 , -0.00714912, ...,  0.01661459,
        -0.00476409, -0.02121273],
       [-0.00139477,  0.04128186,  0.0097001 , ...,  0.00030349,
        -0.00086606, -0.02254425],
       ...,
       [ 0.04123129, -0.00050108,  0.03758924, ..., -0.00742885,
        -0.0081973 ,  0.03211562],
       [-0.0392708 , -0.0067884 , -0.0138414 , ..., -0.00747928,
         0.03647743,  0.04619696],
       [-0.03103733, -0.0334244 , -0.03093401, ...,  0.00289081,
         0.01166707,  0.03673227]], dtype=float32)>

In [46]:
e = emb1.embeddings.numpy()
e

array([[ 0.00636067, -0.02665342,  0.0174864 , ..., -0.01197475,
         0.01596201, -0.00140953],
       [-0.02561057,  0.0160512 , -0.00714912, ...,  0.01661459,
        -0.00476409, -0.02121273],
       [-0.00139477,  0.04128186,  0.0097001 , ...,  0.00030349,
        -0.00086606, -0.02254425],
       ...,
       [ 0.04123129, -0.00050108,  0.03758924, ..., -0.00742885,
        -0.0081973 ,  0.03211562],
       [-0.0392708 , -0.0067884 , -0.0138414 , ..., -0.00747928,
         0.03647743,  0.04619696],
       [-0.03103733, -0.0334244 , -0.03093401, ...,  0.00289081,
         0.01166707,  0.03673227]], dtype=float32)

In [47]:
w = emb1.get_weights()[0]
w

array([[ 0.00636067, -0.02665342,  0.0174864 , ..., -0.01197475,
         0.01596201, -0.00140953],
       [-0.02561057,  0.0160512 , -0.00714912, ...,  0.01661459,
        -0.00476409, -0.02121273],
       [-0.00139477,  0.04128186,  0.0097001 , ...,  0.00030349,
        -0.00086606, -0.02254425],
       ...,
       [ 0.04123129, -0.00050108,  0.03758924, ..., -0.00742885,
        -0.0081973 ,  0.03211562],
       [-0.0392708 , -0.0067884 , -0.0138414 , ..., -0.00747928,
         0.03647743,  0.04619696],
       [-0.03103733, -0.0334244 , -0.03093401, ...,  0.00289081,
         0.01166707,  0.03673227]], dtype=float32)

In [48]:
np.array_equal(e, w)

True

In [49]:
np.savez('word-emb.npz', emb=2)

In [51]:
[tk.index_word[i] for i in xs[0]]

['<unk>', '<unk>', 'so', "i'll"]

In [58]:
x = xs[0].reshape(1,-1)
x

array([[  1,   1,  30, 418]])

In [63]:
logit = lm.predict(x.astype('float32'))
logit.shape



(1, 2001)

In [65]:
logit, len(logit[0])

(array([[-0.00641964, -0.00474065, -0.00600547, ..., -0.00690872,
         -0.00730503, -0.00834102]], dtype=float32),
 2001)

In [66]:
logit.sum()

-14.750704

In [67]:
p = tf.nn.softmax(logit).numpy()
p

array([[0.00050023, 0.00050107, 0.00050043, ..., 0.00049998, 0.00049978,
        0.00049927]], dtype=float32)

In [69]:
p.sum()

1.0

In [71]:
logit.argmax(), logit[0, 28]

(28, 0.0)

In [72]:
tk.index_word[28]

'all'

In [83]:
del model2

In [84]:
emb2 = Embedding(
    input_dim = tk.num_words + 1,
    output_dim= 8,
    embeddings_initializer = tf.keras.initializers.Constant(e) 
)

model2 = Sequential([
    emb2,
    GlobalAveragePooling1D(),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid')
])

model2.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 8)           16008     
                                                                 
 global_average_pooling1d_3   (None, 8)                0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_6 (Dense)             (None, 8)                 72        
                                                                 
 dense_7 (Dense)             (None, 1)                 9         
                                                                 
Total params: 16,089
Trainable params: 16,089
Non-trainable params: 0
_________________________________________________________________


In [85]:
pads = tf.keras.preprocessing.sequence.pad_sequences(seqs)

In [86]:
model2.compile('adam', 'binary_crossentropy', ['acc'])
model2.fit(pads, df.sentiment, batch_size=8, epochs=20, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1de0b6bb850>