In [7]:
import pandas as pd
import numpy as np
import json
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dropout, Conv1D,MaxPooling1D, LSTM, Dense, Bidirectional,SimpleRNN
from keras.utils import to_categorical

In [42]:
data = pd.read_csv('dataset/ai-challenger/sentiment_analysis_trainingset.csv')

In [43]:
data.head()

Unnamed: 0,id,content,location_traffic_convenience,location_distance_from_business_district,location_easy_to_find,service_wait_time,service_waiters_attitude,service_parking_convenience,service_serving_speed,price_level,...,environment_decoration,environment_noise,environment_space,environment_cleaness,dish_portion,dish_taste,dish_look,dish_recommendation,others_overall_experience,others_willing_to_consume_again
0,0,"""吼吼吼，萌死人的棒棒糖，中了大众点评的霸王餐，太可爱了。一直就好奇这个棒棒糖是怎么个东西，...",-2,-2,-2,-2,1,-2,-2,-2,...,-2,-2,-2,-2,-2,-2,1,-2,1,-2
1,1,"""第三次参加大众点评网霸王餐的活动。这家店给人整体感觉一般。首先环境只能算中等，其次霸王餐提...",-2,-2,-2,-2,-2,-2,-2,0,...,0,0,0,0,1,-2,-2,-2,1,-2
2,2,"""4人同行 点了10个小吃\n榴莲酥 榴莲味道不足 松软 奶味浓\n虾饺 好吃 两颗大虾仁\...",-2,-2,-2,-2,0,-2,1,0,...,-2,-2,1,-2,0,1,-2,-2,0,-2
3,3,"""之前评价了莫名其妙被删 果断继续差评！ 换了菜单 价格更低 开始砸牌子 但套餐还是有150...",-2,-2,-2,-2,-2,-2,-2,0,...,-2,-2,-2,-2,-2,-1,-2,-2,-1,-1
4,4,"""出乎意料地惊艳，椰子鸡清热降火，美容养颜，大大满足了爱吃火锅怕上火星人。椰子冻是帅帅的老板...",-2,-2,-2,-2,-2,-2,-2,-2,...,-2,-2,-2,-2,-2,1,1,-2,1,-2


In [44]:
data["content"].head()

0    "吼吼吼，萌死人的棒棒糖，中了大众点评的霸王餐，太可爱了。一直就好奇这个棒棒糖是怎么个东西，...
1    "第三次参加大众点评网霸王餐的活动。这家店给人整体感觉一般。首先环境只能算中等，其次霸王餐提...
2    "4人同行 点了10个小吃\n榴莲酥 榴莲味道不足 松软 奶味浓\n虾饺 好吃 两颗大虾仁\...
3    "之前评价了莫名其妙被删 果断继续差评！ 换了菜单 价格更低 开始砸牌子 但套餐还是有150...
4    "出乎意料地惊艳，椰子鸡清热降火，美容养颜，大大满足了爱吃火锅怕上火星人。椰子冻是帅帅的老板...
Name: content, dtype: object

In [45]:
labels = data["others_overall_experience"]
labels

0         1
1         1
2         0
3        -1
4         1
         ..
104995    1
104996    1
104997    0
104998    1
104999    0
Name: others_overall_experience, Length: 105000, dtype: int64

In [46]:
tokenizer = Tokenizer(filters='\t\n', char_level=True)
tokenizer

<keras_preprocessing.text.Tokenizer at 0x1fd4c3599a0>

In [47]:
contents = data["content"]

In [48]:
tokenizer.fit_on_texts(contents)

In [49]:
word_dict_file = "build/dictionary.json"

In [50]:
if not os.path.exists(os.path.dirname(word_dict_file)):
    os.makedirs(os.path.dirname(word_dict_file))
with open (word_dict_file,"w",encoding="utf-8") as outfile:
    json.dump(tokenizer.word_index,outfile,ensure_ascii=False)

num_words = len(tokenizer.word_index)+1

In [51]:
num_words

8216

In [52]:
# 将文字使用字段转成数字
contents_tokenizer = tokenizer.texts_to_sequences(contents)
# contents_tokenizer

In [53]:
# 将文本填充到相同的长度
x_processed = sequence.pad_sequences(contents_tokenizer,maxlen=512, value=0)

In [54]:
x_processed[5]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [55]:
train_size = int(data.shape[0] * .75)

In [56]:
train_size

78750

In [57]:
x_train,x_test = x_processed[:train_size],x_processed[train_size:]

In [58]:
x_train

array([[   0,    0,    0, ...,  196,    3,   24],
       [   0,    0,    0, ...,  452,  206,   24],
       [   0,    0,    0, ...,  522,   50,   24],
       ...,
       [   0,    0,    0, ...,   26,   26,   24],
       [   0,    0,    0, ...,   37,    3,   24],
       [  35, 1354, 1097, ...,    2,    3,   24]])

In [59]:
x_test

array([[ 49, 140, 451, ...,   3,   3,  24],
       [  0,   0,   0, ...,  46,  26,  24],
       [  0,   0,   0, ...,  29,  90,  24],
       ...,
       [200, 111, 200, ...,   3,   3,  24],
       [  0,   0,   0, ..., 165,  20,  24],
       [  0,   0,   0, ...,  46,   3,  24]])

In [60]:
labels = labels+2
labels = to_categorical(labels)
labels[5]

array([0., 0., 0., 1.], dtype=float32)

In [61]:
y_train,y_test = labels[:train_size],labels[train_size:]

In [62]:
x_train,x_test

(array([[   0,    0,    0, ...,  196,    3,   24],
        [   0,    0,    0, ...,  452,  206,   24],
        [   0,    0,    0, ...,  522,   50,   24],
        ...,
        [   0,    0,    0, ...,   26,   26,   24],
        [   0,    0,    0, ...,   37,    3,   24],
        [  35, 1354, 1097, ...,    2,    3,   24]]),
 array([[ 49, 140, 451, ...,   3,   3,  24],
        [  0,   0,   0, ...,  46,  26,  24],
        [  0,   0,   0, ...,  29,  90,  24],
        ...,
        [200, 111, 200, ...,   3,   3,  24],
        [  0,   0,   0, ..., 165,  20,  24],
        [  0,   0,   0, ...,  46,   3,  24]]))

In [63]:
y_train,y_test

(array([[0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.]], dtype=float32),
 array([[0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.],
        ...,
        [0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.]], dtype=float32))

### 模型
- 使用双向RNN
- Embedding词嵌入层 将词向量映射到维度更低且能有词与词之间关系。
- Bidirectional(SimpleRNN(units=512)) 双向RNN
- Dropout 训练中每次更新输入单元的按比率随机设置为0 防止过拟合
- Dense 全连接层 输出4个标签的概率
- LSTM 长短期记忆网络
- Conv1D一维卷积 MaxPooling1D 一维池化
- categorical_crossentropy 多分类损失函数 二分类使用binary_crossentropy 优化器adam

### 双向RNN

In [64]:
# 双向CNN
model1 = Sequential()
model1.add(Embedding(num_words,32,input_length=512))
model1.add(Dropout(0.2))
model1.add(Bidirectional(SimpleRNN(units=512)))
model1.add(Dropout(0.2))
model1.add(Dense(4,activation='sigmoid'))
model1.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])

In [65]:
model1.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 512, 32)           262912    
_________________________________________________________________
dropout_3 (Dropout)          (None, 512, 32)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 1024)              558080    
_________________________________________________________________
dropout_4 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 4100      
Total params: 825,092
Trainable params: 825,092
Non-trainable params: 0
_________________________________________________________________


In [66]:
model1.fit(x_train,y_train, validation_split=0.25, epochs=20, batch_size=512)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1fd4c38b4f0>

In [67]:
score,acc = model1.evaluate(x_test,y_test, verbose=1, batch_size=1024)



In [68]:
score

0.7277776002883911

In [69]:
acc

0.7027809619903564

### LSTM

In [70]:
model2 = Sequential()
model2.add(Embedding(num_words,32,input_length=512))
model2.add(Dropout(0.2))
model2.add(LSTM(64,recurrent_dropout=0.5))
model2.add(Dropout(0.2))
model2.add(Dense(4,activation='sigmoid'))
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [71]:
model2.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 512, 32)           262912    
_________________________________________________________________
dropout_5 (Dropout)          (None, 512, 32)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dropout_6 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 4)                 260       
Total params: 288,004
Trainable params: 288,004
Non-trainable params: 0
_________________________________________________________________


In [73]:
model2.fit(x_train,y_train,validation_split=0.25, epochs=2, batch_size=512)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1fd2a2406a0>

In [74]:
score, acc = model2.evaluate(x_test, y_test, verbose=1, batch_size=1024)



### LSTM +CNN

In [75]:
model3 =Sequential()
model3.add(Embedding(num_words,32,input_length=512))
model3.add(Dropout(0.2))
model3.add(Conv1D(64,5,activation='relu'))
model3.add(MaxPooling1D(pool_size=4))
model3.add(LSTM(64,recurrent_dropout=0.2))
model3.add(Dropout(0.2))
model3.add(Dense(4,activation='sigmoid'))
model3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [76]:
model3.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 512, 32)           262912    
_________________________________________________________________
dropout_7 (Dropout)          (None, 512, 32)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 508, 64)           10304     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 127, 64)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dropout_8 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 4)                

In [77]:
model3.fit(x_train, y_train, validation_split=0.25, epochs=2, batch_size=512)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1fd7cea0670>

In [78]:
score, acc = model3.evaluate(x_test, y_test, verbose=1, batch_size=1024)



In [79]:
score,acc

(0.5942938923835754, 0.7602666616439819)

#### 在序列模型中加入卷积操作能加快网络训练时间