<a href="https://colab.research.google.com/github/Shiveringapple/MechineLearing/blob/main/Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf

# Fetch the IMDB review archive and ask Keras to unpack it after download.
imdb_archive_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    origin=imdb_archive_url,
    fname="aclImdb.tar.gz",
    extract=True,
)
print("extract on:", dataset)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
extract on: /root/.keras/datasets/aclImdb.tar.gz


In [None]:
import glob
import os
import pandas as pd
def getdata(dataset, t):
  """Load one split of the extracted IMDB review corpus into a DataFrame.

  Args:
    dataset: Path of the downloaded archive; reviews are read from the
      "aclImdb" folder located in the same directory as this path.
    t: Name of the split sub-folder, e.g. "train" or "test".

  Returns:
    A DataFrame with a "content" column (raw review text) and a "sentiment"
    column (1 for files under "pos", 0 for files under "neg"). All positive
    rows come first; within each class rows are ordered by file path.
  """
  split_dir = os.path.join(os.path.dirname(dataset), "aclImdb", t)

  def list_reviews(label):
    # Both extensions are matched, but collected through a set: on a
    # case-insensitive filesystem "*.txt" and "*.TXT" hit the same files and
    # would otherwise duplicate every review. Sorting makes the row order
    # reproducible, because glob returns paths in arbitrary order.
    found = set()
    for pattern in ("*.txt", "*.TXT"):
      found.update(glob.glob(os.path.join(split_dir, label, pattern)))
    return sorted(found)

  pos = list_reviews("pos")
  neg = list_reviews("neg")
  contents = []
  for fn in pos + neg:
    with open(fn, encoding="utf_8") as f:
      contents.append(f.read())
  return pd.DataFrame({
      "content": contents,
      "sentiment": [1] * len(pos) + [0] * len(neg),
  })
# Load both corpus splits with the same loader, then show the training frame.
train_df, test_df = [getdata(dataset, split) for split in ("train", "test")]
train_df

Unnamed: 0,content,sentiment
0,This movie leaves the intellectual mind thinki...,1
1,"I love the movies and own the comics, the comi...",1
2,this movie I saw some 10 years ago (maybe more...,1
3,I've always liked Johnny Concho and I wish thi...,1
4,"Based on an actual mining disaster, this early...",1
...,...,...
24995,This was an impulse pick up for me from the lo...,0
24996,"As an Altman fan, I'd sought out this movie fo...",0
24997,Before watching this film I had very low expec...,0
24998,I have seen Dolemite and also (Avenging) Disco...,0


In [None]:
# Display the DataFrame that getdata() produced for the "test" split.
test_df

Unnamed: 0,content,sentiment
0,"In 1984, Edgar Reitz surprised film-lovers all...",1
1,Here's another of the 1940's westerns that I w...,1
2,"After seeing this DVD, I was floored. It is SO...",1
3,'Checking Out' is an extraordinary film that t...,1
4,"Michael Polish's hypnotic ""Northfork"" is a fil...",1
...,...,...
24995,Two days ago I got a chance to watch this movi...,0
24996,This screened at Sundance last night to a rece...,0
24997,... And it's a not very good documentary eithe...,0
24998,The person who wrote the summary and rave revi...,0


# NLP 預處理
## 第一步：Tokenizer

### 如果是中文：先用分詞工具斷詞，再用空白鍵 `" ".join()` 把詞接起來

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Fit the vocabulary on the training reviews only (the test split is not used here).
vocab_limit = 3000
tok = Tokenizer(num_words=vocab_limit)
tok.fit_on_texts(train_df.content)

In [None]:
# Token ids never include 0; 0 is reserved for padding.
# tok.word_index
# len(tok.word_index)

## 第二步：to sequence (真的把詞轉成數字)

In [None]:
# Convert each review into its list of token ids using the fitted tokenizer,
# then show the ragged training sequences as a frame.
review_columns = (train_df["content"], test_df["content"])
x_train_seq, x_test_seq = map(tok.texts_to_sequences, review_columns)
pd.DataFrame(x_train_seq)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1776,1777,1778,1779,1780,1781,1782,1783,1784,1785,1786,1787,1788,1789,1790,1791,1792,1793,1794,1795,1796,1797,1798,1799,1800,1801,1802,1803,1804,1805,1806,1807,1808,1809,1810,1811,1812,1813,1814,1815
0,11,17,886,1,2766,327,533,2,266.0,5.0,1.0,62.0,10.0,96.0,562.0,388.0,135.0,81.0,59.0,1153.0,11.0,17.0,7.0,7.0,44.0,22.0,23.0,3.0,1513.0,334.0,11.0,17.0,200.0,21.0,15.0,1203.0,7.0,7.0,11.0,17.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,10,116,1,99,2,202,1,1,23.0,272.0,92.0,1.0,17.0,18.0,128.0,471.0,199.0,9.0,155.0,43.0,4.0,155.0,9.0,13.0,1187.0,44.0,1.0,99.0,185.0,1624.0,1187.0,10.0,59.0,25.0,38.0,2.0,10.0,241.0,665.0,329.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,11,17,10,216,46,155,150,593,276.0,50.0,10.0,559.0,9.0,8.0,3.0,2353.0,2.0,112.0,255.0,9.0,5.0,815.0,57.0,8.0,780.0,1.0,127.0,6.0,52.0,1763.0,2.0,1086.0,10.0,59.0,37.0,52.0,73.0,5.0,103.0,9.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,204,207,420,1802,2,10,654,11,19.0,68.0,43.0,20.0,1853.0,2.0,285.0,1260.0,2387.0,405.0,28.0,4.0,1.0,88.0,1727.0,351.0,8.0,24.0,608.0,8.0,11.0,28.0,7.0,7.0,51.0,72.0,83.0,906.0,1260.0,8.0,1.0,594.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,445,20,32,776,1685,11,399,1121,16.0,628.0,2568.0,128.0,1284.0,28.0,4.0,1.0,88.0,1128.0,123.0,811.0,1883.0,108.0,695.0,1618.0,153.0,405.0,3.0,2847.0,646.0,4.0,1.0,1854.0,12.0,1.0,1918.0,2997.0,4.0,60.0,496.0,4.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,11,13,32,1259,53,15,69,36,1.0,716.0,371.0,1129.0,89.0,94.0,1.0,169.0,1319.0,10.0,119.0,11.0,17.0,6.0,2331.0,914.0,2.0,1225.0,355.0,1.0,411.0,197.0,1.0,182.0,2322.0,2.0,24.0,1696.0,6.0,569.0,859.0,395.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24996,14,32,334,471,43,11,17,15,150.0,533.0,12.0,16.0,138.0,3.0,84.0,174.0,9.0,59.0,25.0,5.0,27.0,30.0,219.0,526.0,7.0,7.0,191.0,1319.0,7.0,7.0,11.0,6.0,28.0,4.0,191.0,174.0,1373.0,292.0,2.0,39.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24997,156,146,11,19,10,66,52,361,1396.0,2.0,432.0,5.0,40.0,64.0,1.0,1876.0,849.0,10.0,57.0,167.0,15.0,12.0,279.0,111.0,6.0,217.0,695.0,2973.0,106.0,940.0,6.0,695.0,2973.0,35.0,108.0,1805.0,2.0,35.0,73.0,8.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24998,10,25,107,2,79,104,82,475,492.0,4.0,1.0,36.0,260.0,461.0,1534.0,2564.0,18.0,11.0,19.0,1.0,403.0,2545.0,238.0,77.0,207.0,1068.0,3.0,315.0,270.0,8.0,58.0,480.0,15.0,2098.0,580.0,4.0,2705.0,8.0,19.0,362.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## 第三步：padding 長度固定(截長truncating補短padding)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Force every sequence to the same length so the splits become rectangular arrays;
# both splits share one set of padding options.
pad_kwargs = {"maxlen": 512}
x_train_pad = pad_sequences(x_train_seq, **pad_kwargs)
x_test_pad = pad_sequences(x_test_seq, **pad_kwargs)
pd.DataFrame(x_train_pad)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,35,315,496,299,7,7,1,367,154,6,318,1,225,6,446,35,49,10,188,852,5,76,1,812,7,7,9,886,126,642,37,1810,1,2547,45,2813,673,431,21,3
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,39,124,21,37,369,39,22,121,40,1339,2137,44,10,97,1577,20,58,86,8,116,2,86,108,208,10,25,329,1,10,59,518,43,4,15,249,18,1395,47,6,207
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2353,2,112,255,9,5,815,57,8,780,1,127,6,52,1763,2,1086,10,59,37,52,73,5,103,9,171,85,10,101,42,14,14,1,278,3,337,272,240,4,17
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,37,41,1802,6,1,1326,32,28,118,2,462,23,16,900,40,132,10,261,1802,13,1520,5,309,2,3,15,48,22,78,51,442,2903,3,985,177,8,29,1160,2,658
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1209,98,2262,2,131,346,2,369,8,693,61,28,1200,5,604,453,2,5,166,65,1481,964,50,71,150,300,128,45,1,563,2,746,3,461,8,356,6,3,461,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,761,3,173,50,37,3,360,17,173,29,4,1,290,102,23,2,447,2345,5,1125,628,16,2525,2459,1030,550,51,22,64,11,28,20,1,4,126,716,371,1129,398,1282
24996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,61,15,1,2007,236,4,34,1,29,184,38,18,82,71,12,794,30,29,2200,60,6,181,773,5,78,42,112,74,622,20,371,5,58,1860,2,91,1875,25,1,4
24997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,35,725,2,603,2970,12,32,856,504,97,1,2158,4,1,766,75,113,75,111,75,638,75,17,7,7,89,64,9,294,302,2,126,15,700,1277,1876,206,1197,3,75
24998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2663,9,6,21,7,7,403,73,37,1,201,6,32,19,4,18,30,219,42,250,2,431,22,25,5,199,1107,5,131,81,15,1,778,40,21,12,73,354,16,58


In [None]:
from tensorflow.keras.layers import Embedding, Lambda, GlobalAveragePooling1D, Dense
from tensorflow.keras.models import Sequential
# Embedding input_dim: 3000 tokenizer words + 1, because index 0 is taken by padding.
# Embedding output_dim: how many learned features ("sentiments") each word is mapped to.
embedding_input_dim = 3000 + 1
embedding_output_dim = 256
layers = [
    Embedding(embedding_input_dim, embedding_output_dim, mask_zero=True, input_length=512),
    GlobalAveragePooling1D(),
    Dense(2, activation="softmax")
]
# Embedding params = input_dim * output_dim = 3001 * 256 = 768256
model = Sequential(layers)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 512, 128)          384128    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 2)                 258       
Total params: 384,386
Trainable params: 384,386
Non-trainable params: 0
_________________________________________________________________


In [None]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
# Labels are plain integers (0/1) and the model ends in a 2-way softmax,
# hence the sparse categorical cross-entropy loss.
loss_fn = SparseCategoricalCrossentropy()
model.compile(
    optimizer="adam",
    loss=loss_fn,
    metrics=["accuracy"],
)

In [None]:
import numpy as np
# Pull the integer sentiment labels out of each frame as standalone NumPy arrays.
y_train = train_df["sentiment"].to_numpy(copy=True)
y_test = test_df["sentiment"].to_numpy(copy=True)

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
# getdata() returns every positive review followed by every negative one, and
# Keras carves the validation split from the *end* of the arrays before any
# shuffling. Without reordering, the 10% validation set would contain only
# negative reviews, which distorts val_loss and therefore early stopping and
# checkpoint selection. Shuffle once with a fixed seed so the split is mixed
# and reproducible.
shuffle_idx = np.random.default_rng(42).permutation(len(x_train_pad))
x_train_shuffled = x_train_pad[shuffle_idx]
y_train_shuffled = y_train[shuffle_idx]

callbacks = [
    # Keep only the best model seen so far on disk.
    ModelCheckpoint("sentiment.h5", save_best_only=True),
    # Stop after 5 epochs without improvement and roll back to the best weights.
    EarlyStopping(patience=5, restore_best_weights=True)
]
model.fit(x_train_shuffled,
     y_train_shuffled,
     validation_split=0.1,
     batch_size=200,
     epochs=100,
     verbose=2,
     callbacks=callbacks)

Epoch 1/100
113/113 - 3s - loss: 0.6345 - accuracy: 0.6376 - val_loss: 0.6326 - val_accuracy: 0.6608
Epoch 2/100
113/113 - 3s - loss: 0.4675 - accuracy: 0.8244 - val_loss: 0.4809 - val_accuracy: 0.7784
Epoch 3/100
113/113 - 3s - loss: 0.3631 - accuracy: 0.8661 - val_loss: 0.3682 - val_accuracy: 0.8448
Epoch 4/100
113/113 - 3s - loss: 0.3131 - accuracy: 0.8814 - val_loss: 0.3612 - val_accuracy: 0.8412
Epoch 5/100
113/113 - 3s - loss: 0.2835 - accuracy: 0.8913 - val_loss: 0.3157 - val_accuracy: 0.8624
Epoch 6/100
113/113 - 3s - loss: 0.2640 - accuracy: 0.8994 - val_loss: 0.3391 - val_accuracy: 0.8528
Epoch 7/100
113/113 - 3s - loss: 0.2502 - accuracy: 0.9045 - val_loss: 0.3443 - val_accuracy: 0.8496
Epoch 8/100
113/113 - 3s - loss: 0.2392 - accuracy: 0.9092 - val_loss: 0.3472 - val_accuracy: 0.8520
Epoch 9/100
113/113 - 3s - loss: 0.2309 - accuracy: 0.9111 - val_loss: 0.3566 - val_accuracy: 0.8520
Epoch 10/100
113/113 - 3s - loss: 0.2240 - accuracy: 0.9140 - val_loss: 0.3278 - val_accura

<tensorflow.python.keras.callbacks.History at 0x7f0240646748>

In [None]:
# Score the trained model on the held-out test split.
model.evaluate(x=x_test_pad, y=y_test)

In [None]:
# Build a one-token model that contains only an embedding layer, copy the
# trained embedding weights into it, and use it to look up a single word vector.
newl = [
    Embedding(3001, 256, mask_zero=True, input_length=1)
]
newmodel = Sequential(newl)
w = model.layers[0].get_weights()
newmodel.layers[0].set_weights(w)
# The word -> id mapping is the `word_index` attribute of the fitted tokenizer
# (`tok_word_index` is not a defined name and raised NameError).
n = tok.word_index["the"]
# Pass an explicit (1, 1) array: one sample holding one token id.
print("the長這樣：", newmodel.predict(np.array([[n]])))

我只懂3000個詞
看一本書
只要看到3000詞其中一字就打個圈圈
打出512圈圈叫做length
變成256個情緒
# 化繁為簡 化抽象為具體

### 語意由上下文構成 -> 克漏字
### W2V：用上下文的詞預測中間的詞
#### W2V應用：商品推薦、行程推薦、音樂推薦
分析你喜歡的 推薦跟你一樣類型的人的