In [1]:
# 读取数据
import pandas as pd

train_df = pd.read_csv('.\data\data_train.csv')
test_df = pd.read_csv('.\data\data_test.csv')
train_df

Unnamed: 0,影评内容,真实标签
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1
...,...,...
24995,"Towards the end of the movie, I felt it was to...",0
24996,This is the kind of movie that my enemies cont...,0
24997,I saw 'Descent' last night at the Stockholm Fi...,0
24998,Some films that you pick up for a pound turn o...,0


In [2]:
# 对数据进行处理：分词，全为小写，只要英文
import re
def text2wordlist(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    text = ' '.join(words)
    return text
# pd.apply 会返回一个列表，自己不会变

train_reviews = []
for review in train_df['影评内容'].values.tolist():
    train_reviews.append(text2wordlist(review))
train_df['影评内容'] = train_reviews   
train_label = train_df['真实标签'].values

test_reviews = []
for review in test_df['影评内容'].values.tolist():
    test_reviews.append(text2wordlist(review))
test_df['影评内容'] = test_reviews 
test_label = test_df['真实标签'].values

# test_reviews = test_df['影评内容'].values.tolist()
train_df

Unnamed: 0,影评内容,真实标签
0,bromwell high is a cartoon comedy it ran at th...,1
1,homelessness or houselessness as george carlin...,1
2,brilliant over acting by lesley ann warren bes...,1
3,this is easily the most underrated film inn th...,1
4,this is not the typical mel brooks film it was...,1
...,...,...
24995,towards the end of the movie i felt it was too...,0
24996,this is the kind of movie that my enemies cont...,0
24997,i saw descent last night at the stockholm film...,0
24998,some films that you pick up for a pound turn o...,0


In [3]:
# 词化
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
token = Tokenizer(num_words=10000)                # 创建词化器
token.fit_on_texts(train_reviews+test_reviews)    # 根据文本训练词化器

train_sequences = token.texts_to_sequences(train_reviews)  # 将文本转变为向量
test_sequences = token.texts_to_sequences(test_reviews)

# 将每一句话的长度固定
x_train = pad_sequences(train_sequences, padding='post', truncating='post', maxlen=500)
x_test = pad_sequences(test_sequences, padding='post', truncating='post', maxlen=500)

In [4]:
# 构建SimpleRNN模型
from keras.models import Sequential
from keras.layers import SimpleRNN, Embedding, Dense

vocabulary = 10000   # 词向量维度
word_num = 500       # 句子长度
embedding_dim = 32   # 嵌入维度
state_dim = 32       # 状态维度

model1 = Sequential()
model1.add(Embedding(vocabulary, embedding_dim, input_length=word_num))
model1.add(SimpleRNN(state_dim, return_sequences=False))
model1.add(Dense(1, activation='sigmoid'))
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 32)           320000    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 32)                2080      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 322,113
Trainable params: 322,113
Non-trainable params: 0
_________________________________________________________________


In [16]:
# 构建双向LSTM模型
import numpy as np 
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

vocabulary = 10000
embedding_dim = 32
word_num = 500
hidden_dim = 32

model = Sequential()
model.add(Embedding(vocabulary, embedding_dim, input_length=word_num))  # embedding层
model.add(Bidirectional(LSTM(hidden_dim, return_sequences=False, dropout=0.2)))        # 隐层
# model.add(Dense(hidden_dim, activation='relu'))   # 全连接层                         
model.add(Dense(1, activation='sigmoid'))         # 输出层

model.summary()  # 可以打印模型的参数信息

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 32)           320000    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 64)                16640     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 336,705
Trainable params: 336,705
Non-trainable params: 0
_________________________________________________________________


In [7]:
from tensorflow import keras
import numpy as np
# adam = keras.optimizers.Adam(lr=0.0001, epsilon=1e-08, decay=0.0)
RMS = keras.optimizers.RMSprop(lr=0.001)

model1.compile(optimizer=RMS,
             loss='binary_crossentropy',
             metrics=['acc'])

index = np.arange(len(train_reviews))
np.random.shuffle(index)

batch_size = 32
epoch = 3
history = model1.fit(x_train[index], train_label[index], 
                   validation_split=0.3, epochs=epoch, batch_size=batch_size,
                   verbose=1, shuffle=True)

Epoch 1/3
Epoch 2/3
Epoch 3/3
 53/547 [=>............................] - ETA: 47s - loss: 0.6924 - acc: 0.5165

KeyboardInterrupt: 

In [13]:
test_loss, test_acc = model.evaluate(x_test, test_label, verbose=1)    
print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))    

Test Loss: 0.3420987129211426
Test Accuracy: 0.8702399730682373
