In [0]:
%tensorflow_version 2.x

In [2]:
import tensorflow as tf

dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True,
)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [0]:
import os
import glob
import pandas as pd

def get_data(n):
    train_dn = os.path.join(dn, n)
    contents = []
    sentiment = []
    pos_fn = os.path.join(train_dn, "pos", "*.txt")
    for fn in glob.glob(pos_fn):
        with open(fn, "r", encoding="utf-8") as f:
            contents.append(f.read())
            sentiment.append(1)

    neg_fn = os.path.join(train_dn, "neg", "*.txt")
    for fn in glob.glob(neg_fn):
        with open(fn, "r", encoding="utf-8") as f:
            contents.append(f.read())
            sentiment.append(0)

    df = pd.DataFrame({
            "content": contents,
            "sentiment": sentiment
        }, columns=["content", "sentiment"])
    return df

In [4]:
dn = os.path.dirname(dataset)
dn = os.path.join(dn, "aclImdb")
# glob.glob("/root/.keras/datasets/aclImdb/train/pos/*")

train_df = get_data("train")
test_df = get_data("test")
test_df

Unnamed: 0,content,sentiment
0,The evil Professor Moriarty plots to gain cont...,1
1,"As a French, i found it very pleasant to be ab...",1
2,Mary Pickford plays Annie Rooney--the daughter...,1
3,I don't recall a film which so deftly shows th...,1
4,When is ART going to overcome racism? I believ...,1
...,...,...
24995,This film is a perfect example that a movie ca...,0
24996,"First, I should say that I've seen the '39 ver...",0
24997,Oh God. Why is it that Nickelodeon has such a ...,0
24998,"Lots of flames, thousands of extras in battle ...",0


In [0]:
# 記得先將嵌入層(Embedding)當作第一層
# 1. tokenize: 找出多少種詞
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=3000)

# 這像是以前CountVectorizer的fit(找出多少種)
tok.fit_on_texts(train_df["content"])


In [0]:
# 先把正向轉換和反向轉換準備好
index_2_word = tok.index_word
word_2_index = {v:k for k, v in tok.index_word.items()}

In [7]:
# 轉化成數字，transform
# 這些數字會在3000(Tokenizer(num_words=3000)<<<這裡設定的)的精選單詞內

x_train_seq = tok.texts_to_sequences(train_df["content"])
x_test_seq = tok.texts_to_sequences(test_df["content"])
pd.DataFrame(x_test_seq)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009
0,1,442,2493,1843,5.0,1137.0,4.0,196.0,2.0,1.0,998.0,60.0,97.0,1173.0,1.0,322.0,15.0,1.0,7.0,7.0,1475.0,989.0,5.0,1.0,265.0,14.0,2689.0,1604.0,2753.0,1250.0,2.0,881.0,8.0,11.0,438.0,114.0,733.0,2456.0,2825.0,1050.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,14,3,780,10,255.0,9.0,52.0,2209.0,5.0,27.0,499.0,5.0,459.0,30.0,1.0,152.0,60.0,6.0,90.0,4.0,780.0,37.0,12.0,30.0,46.0,4.0,30.0,2540.0,99.0,521.0,2.0,30.0,3.0,173.0,4.0,82.0,180.0,96.0,10.0,457.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1079,296,1,575,4.0,3.0,1000.0,12.0,453.0,8.0,1.0,1208.0,170.0,4.0,510.0,56.0,6.0,3.0,2677.0,2.0,182.0,758.0,4.0,556.0,1194.0,197.0,1686.0,2.0,34.0,1385.0,5.0,18.0,177.0,930.0,45.0,3.0,480.0,4.0,1814.0,7.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,10,89,2278,3,19.0,60.0,35.0,284.0,1.0,917.0,4.0,322.0,14.0,8.0,28.0,682.0,648.0,1.0,1148.0,4.0,1.0,19.0,6.0,1.0,197.0,2.0,2651.0,1.0,104.0,23.0,8.0,172.0,133.0,140.0,1.0,262.0,4.0,1.0,19.0,33.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,51,6,495,167,5.0,10.0,261.0,1.0,295.0,81.0,25.0,2066.0,53.0,234.0,11.0,17.0,13.0,90.0,4.0,3.0,174.0,2.0,225.0,11.0,1265.0,428.0,6.0,32.0,318.0,1252.0,4.0,3.0,84.0,1420.0,10.0,960.0,715.0,6.0,445.0,20.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,11,19,6,3,401.0,460.0,12.0,3.0,17.0,67.0,21.0,27.0,1109.0,16.0,3.0,309.0,349.0,582.0,42.0,574.0,12.0,47.0,13.0,3.0,173.0,4.0,55.0,2.0,778.0,80.0,11.0,1.0,745.0,6.0,2.0,895.0,1.0,813.0,215.0,96.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24996,83,10,141,132,12.0,204.0,107.0,1.0,307.0,30.0,219.0,1242.0,208.0,121.0,29.0,1.0,806.0,2.0,25.0,329.0,1.0,294.0,60.0,6.0,272.0,36.0,1.0,2.0,1359.0,4.0,1814.0,4.0,91.0,203.0,11.0,307.0,6.0,14.0,1032.0,14.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24997,446,557,135,6,9.0,12.0,45.0,138.0,3.0,251.0,55.0,57.0,3.0,317.0,539.0,17.0,10.0,380.0,11.0,17.0,235.0,25.0,74.0,49.0,18.0,9.0,13.0,7.0,7.0,3.0,96.0,343.0,500.0,244.0,2741.0,2.0,5.0,46.0,1145.0,1113.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24998,772,4,4,2260,8.0,982.0,136.0,772.0,4.0,304.0,728.0,10.0,89.0,101.0,1.0,111.0,138.0,3.0,1.0,62.0,97.0,25.0,74.0,577.0,227.0,50.0,2615.0,2.0,25.0,74.0,50.0,44.0,47.0,1168.0,35.0,73.0,8.0,1.0,362.0,1615.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [8]:
# 後面有接MLP, 一定要讓input是一樣的
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_train_padseq = pad_sequences(x_train_seq, maxlen=256)
x_test_padseq = pad_sequences(x_test_seq, maxlen=256)
pd.DataFrame(x_test_padseq)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,442,2493,1843,5,1137,4,196,2,1,998,60,97,1173,1,322,15,1,7,7,1475,989,5,...,526,12,1,84,13,3,224,4,3,281,30,480,7,7,11,19,60,6,445,52,20,788,8,2689,343,534,1062,1101,2,1062,1895,1474,2,1,541,4,2575,2,8,2076
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,459,85,4,603,1987,85,4,46,1416,90,16,2208,1,1324,192,5,27,2771,85,4,1744,209,85,22,261,8,1,102,521,2,42,3,1019,96,137,64,9,44,22,67
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,3,75,19,40,12,42,431,21,790,38,115,154,666,685,5,3,244,62,12,6,52,1180,20,18,21,259,1075,438,18,21,2959,825,891,22,23,3,663,334,4,1
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2155,97,27,256,1,19,6,21,8,270,39,55,18,244,8,1424,3,952,2,1130,322,19,1022,98,82,105,23,8,403,1517,12,26,59,719,11,5,3,322,19,6
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,15,1520,1336,2,384,175,21,853,1795,13,3,663,566,8,1,14,3,245,198,6,260,8,1,1205,587,2,72,141,29,4,1609,7,7,589,763,20,285,7,7,1464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,374,10,13,9,59,53,2,1363,10,283,1,61,28,342,1,81,10,432,16,194,9,13,354,589,103,1,2,19,44,332,167,5,103,98,11,13,3,435,4,55
24996,329,1,294,60,6,272,36,1,2,1359,4,1814,4,91,203,11,307,6,14,1032,14,3,2642,17,20,222,3,279,22,771,107,32,4,245,15,9,1793,5,718,1798,...,1,294,14,996,31,2604,56,13,32,252,34,122,4,645,806,12,6,128,353,2,56,283,749,1,1055,869,12,11,106,66,5,782,8,878,38,122,33,554,1,17
24997,13,7,7,3,96,343,500,244,2741,2,5,46,1145,1113,1147,7,7,83,4,29,1,2725,53,1,5,165,642,151,13,52,52,52,1147,2,52,9,45,1,2741,376,...,169,111,2,2,94,14,108,669,14,33,67,40,769,145,100,3,134,9,211,750,2,7,7,10,470,5,64,46,776,880,8,130,9,7,7,860,143,221,16,58
24998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,138,3,1,62,97,25,74,577,227,50,2615,2,25,74,50,44,47,1168,35,73,8,1,362,1615,99,2345,5,27,11,93,8,58,581,2,10,101,11,36,1,19


In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten, Dense, Dropout
# MLP: 0保留(padding)
INPUT_DIM = 3000 + 1
EMBEDDING_DIM = 64
INPUT_LENGTH = 256

model = Sequential()
model.add(Embedding(INPUT_DIM,
                    EMBEDDING_DIM,
                    mask_zero=True,
                    input_length=INPUT_LENGTH))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(2, activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 256, 64)           192064    
_________________________________________________________________
flatten (Flatten)            (None, 16384)             0         
_________________________________________________________________
dense (Dense)                (None, 128)               2097280   
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
Total params: 2,289,602
Trainable params: 2,289,602
Non-trainable params: 0
_________________________________________________________________


In [0]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
# 非必要先做one-hot encoding
# SparseCategoricalCrossentropy 會逐筆做one-hot encoding
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer="adam",
              metrics=["accuracy"])

In [11]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

y_train = train_df["sentiment"]
y_test = test_df["sentiment"]

stop_callback = EarlyStopping(patience=3, restore_best_weights=True)
save_callback = ModelCheckpoint("fashion.h5", save_best_only=True)
model.fit(x_train_padseq, 
          y_train,
          batch_size=200,
          epochs=100,
          validation_split=0.1,
          callbacks=[stop_callback, save_callback],
          verbose=2)

Epoch 1/100
113/113 - 2s - loss: 0.4999 - accuracy: 0.7343 - val_loss: 0.2928 - val_accuracy: 0.8760
Epoch 2/100
113/113 - 1s - loss: 0.1828 - accuracy: 0.9315 - val_loss: 0.3036 - val_accuracy: 0.8756
Epoch 3/100
113/113 - 1s - loss: 0.0465 - accuracy: 0.9883 - val_loss: 0.4660 - val_accuracy: 0.8412
Epoch 4/100
113/113 - 1s - loss: 0.0082 - accuracy: 0.9992 - val_loss: 0.5351 - val_accuracy: 0.8528


<tensorflow.python.keras.callbacks.History at 0x7f2e3c477198>

In [12]:
model.evaluate(x_test_padseq, y_test)



[0.3103237748146057, 0.8653200268745422]

In [22]:
model.layers[0].get_weights()

[array([[ 0.00153363, -0.00974064,  0.00249944, ..., -0.00606399,
         -0.00063847, -0.00299234],
        [-0.00416719,  0.0156522 ,  0.03949567, ...,  0.02481796,
         -0.02202154,  0.01939226],
        [-0.03241707, -0.02534448,  0.02557174, ..., -0.02335948,
          0.03019652, -0.02350821],
        ...,
        [-0.01927915,  0.00546745, -0.0081289 , ...,  0.01434679,
         -0.02470182, -0.04382801],
        [-0.0406917 , -0.07132855,  0.01000506, ...,  0.00239864,
         -0.01909867,  0.01462507],
        [-0.0308555 ,  0.03101644,  0.02087346, ...,  0.04514276,
          0.02163912, -0.01351703]], dtype=float32)]

In [16]:
infer = Sequential()
infer.add(Embedding(INPUT_DIM,
                    EMBEDDING_DIM))
w = model.layers[0].get_weights()
infer.set_weights(w)
infer.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 64)          192064    
Total params: 192,064
Trainable params: 192,064
Non-trainable params: 0
_________________________________________________________________


In [21]:
w = input("你要轉換成哪個詞")
data = [[word_2_index[w]]]
print("詞向量:", infer.predict(data))

你要轉換成哪個詞student
詞向量: [[[-0.00105914 -0.0020631  -0.01876591 -0.00755534  0.01989232
    0.01585402 -0.04183568 -0.02519992  0.00465348 -0.04024911
    0.01921668  0.00177531  0.03114193 -0.051178    0.0335753
   -0.00216047  0.03411678  0.04702888  0.03809971 -0.03935854
    0.02161213 -0.04510923 -0.01883809 -0.05124252 -0.00272941
    0.02040105  0.00492789 -0.02159984  0.03697717 -0.01429486
    0.00876276 -0.02496424  0.02202799 -0.052978   -0.00219642
    0.03504447 -0.00286206  0.0518571  -0.01913481  0.02293452
   -0.02173744  0.03125426  0.01658905 -0.00134882  0.04367308
   -0.03130725 -0.01770249 -0.02921824  0.02950785 -0.03690568
   -0.02457516 -0.00076917  0.02315505 -0.01729396 -0.07236654
   -0.03372939 -0.04626777  0.04544401 -0.01866171  0.04162861
   -0.0228951   0.01672207 -0.00525077 -0.03069523]]]
