In [4]:
import pandas as pd
import numpy as np
import jieba
from gensim.models import Word2Vec
import tensorflow as tf
from tensorflow  import keras
from tensorflow.keras import layers, optimizers

In [5]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [7]:
data = pd.read_csv('https://raw.githubusercontent.com/SophonPlus/ChineseNlpCorpus/master/datasets/waimai_10k/waimai_10k.csv')

In [8]:
jieba.lcut(data.review[0])

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.925 seconds.
Prefix dict has been built successfully.


['很快', '，', '好吃', '，', '味道', '足', '，', '量', '大']

In [9]:
# Tokenization
data['text'] = data.review.apply(jieba.lcut)

In [10]:
data

Unnamed: 0,label,review,text
0,1,很快，好吃，味道足，量大,"[很快, ，, 好吃, ，, 味道, 足, ，, 量, 大]"
1,1,没有送水没有送水没有送水,"[没有, 送水, 没有, 送水, 没有, 送水]"
2,1,非常快，态度好。,"[非常, 快, ，, 态度, 好, 。]"
3,1,方便，快捷，味道可口，快递给力,"[方便, ，, 快捷, ，, 味道, 可口, ，, 快, 递给, 力]"
4,1,菜味道很棒！送餐很及时！,"[菜, 味道, 很棒, ！, 送餐, 很, 及时, ！]"
...,...,...,...
11982,0,以前几乎天天吃，现在调料什么都不放，,"[以前, 几乎, 天天, 吃, ，, 现在, 调料, 什么, 都, 不放, ，]"
11983,0,昨天订凉皮两份，什么调料都没有放，就放了点麻油，特别难吃，丢了一份，再也不想吃了,"[昨天, 订, 凉皮, 两份, ，, 什么, 调料, 都, 没有, 放, ，, 就, 放, ..."
11984,0,"凉皮太辣,吃不下都","[凉皮, 太辣, ,, 吃不下, 都]"
11985,0,本来迟到了还自己点！！！,"[本来, 迟到, 了, 还, 自己, 点, ！, ！, ！]"


In [11]:
myWord2Vec = Word2Vec(data.text, vector_size=250, epochs=10, sg=1, min_count=1) # using skip-gramm algorithm
print(myWord2Vec)

Word2Vec(vocab=11008, vector_size=250, alpha=0.025)


In [12]:
myWord2Vec.wv.key_to_index

{'，': 0,
 '了': 1,
 '的': 2,
 '！': 3,
 '。': 4,
 ',': 5,
 '很': 6,
 '都': 7,
 '是': 8,
 '我': 9,
 '也': 10,
 '不': 11,
 '还': 12,
 '好': 13,
 '味道': 14,
 '送餐': 15,
 '好吃': 16,
 '吃': 17,
 '送': 18,
 '就': 19,
 '不错': 20,
 '小时': 21,
 '给': 22,
 '没有': 23,
 '没': 24,
 '？': 25,
 '点': 26,
 '送到': 27,
 '说': 28,
 '…': 29,
 '速度': 30,
 '就是': 31,
 '等': 32,
 '才': 33,
 '太': 34,
 '外卖': 35,
 '到': 36,
 '快': 37,
 '难吃': 38,
 '在': 39,
 '菜': 40,
 '一个': 41,
 '啊': 42,
 '一般': 43,
 '送来': 44,
 '太慢': 45,
 '非常': 46,
 '饭': 47,
 '还是': 48,
 '时间': 49,
 '多': 50,
 '凉': 51,
 '和': 52,
 '有': 53,
 '有点': 54,
 '吧': 55,
 '慢': 56,
 '很快': 57,
 '可以': 58,
 '个': 59,
 '配送': 60,
 '百度': 61,
 '～': 62,
 '特别': 63,
 '但是': 64,
 '两个': 65,
 '粥': 66,
 '态度': 67,
 '肉': 68,
 '要': 69,
 '少': 70,
 '而且': 71,
 '不是': 72,
 '什么': 73,
 '这': 74,
 '打电话': 75,
 '电话': 76,
 '差': 77,
 '让': 78,
 '服务': 79,
 '人': 80,
 '再': 81,
 '分钟': 82,
 '你': 83,
 '吗': 84,
 '又': 85,
 '小哥': 86,
 '能': 87,
 '怎么': 88,
 '以后': 89,
 '饼': 90,
 '快递': 91,
 '量': 92,
 '米饭': 93,
 '东西': 94,
 '结果': 95,
 '这么': 9

In [13]:
myWord2Vec.wv.key_to_index.get('，')

0

In [14]:
# 将所有index加1， 把index 0留给补全空位做align sequence
embedding_matrix = myWord2Vec.wv.vectors
embedding_matrix.shape

(11008, 250)

In [15]:
embedding_matrix=np.vstack((np.array(np.zeros(250)),embedding_matrix))
embedding_matrix.shape
# total number of seperate words

(11009, 250)

In [16]:
# prepare training data
X_train=np.zeros([len(data.text),30],dtype='float64')
X_train.shape
# total number of comments

(11987, 30)

In [17]:
for i in range(len(data.text)):
    for j in range(min(len(data.text[i]),30)):
        if myWord2Vec.wv.key_to_index.get(data.text[i][j]) is None:
            X_train[i,j] = 0
        else:
            X_train[i,j]=1 + myWord2Vec.wv.key_to_index.get(data.text[i][j])

In [18]:
X_train[1]

array([  24., 3419.,   24., 3419.,   24., 3419.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.])

In [19]:
data.text[1]

['没有', '送水', '没有', '送水', '没有', '送水']

In [20]:
y_train = data.label

In [21]:
model = keras.Sequential(name='RNN')
model.add(layers.Embedding(input_dim=len(myWord2Vec.wv.key_to_index)+1,output_dim=250))
model.add(layers.SimpleRNN(64))
model.add(layers.Dense(2,activation='softmax'))
model.summary()

2022-03-06 15:57:24.213052: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-06 15:57:24.905673: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22312 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:b5:00.0, compute capability: 8.6


Model: "RNN"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 250)         2752250   
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                20160     
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 2,772,540
Trainable params: 2,772,540
Non-trainable params: 0
_________________________________________________________________


In [22]:
# using word2vec framework, so the embedding layer is not trainable
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False
model.summary()

Model: "RNN"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 250)         2752250   
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                20160     
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 2,772,540
Trainable params: 20,290
Non-trainable params: 2,752,250
_________________________________________________________________


In [29]:
model.compile(optimizer='Adam',loss=keras.metrics.sparse_categorical_crossentropy,metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fbad01803a0>