In [24]:
import sys
import pandas as pd
import numpy as np
import bz2
import keras
from functools import *
import collections

from keras.utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import *
from keras.preprocessing.sequence import *

### 数据预处理
0. 载入数据集，建立模型参数
1. 获取数据集词典，并限制词量
2. 数据集中词映射为词典编号
3. 根据样本词集合载入词向量
4. 对短句填充至最大句长

In [32]:
# NOTE(review): `data` is first assigned by the read_csv cell that follows, so
# on a fresh kernel (Restart & Run All) this line raises NameError. The
# recorded output below (14914) also does not look like a DataFrame display —
# confirm whether this cell is still needed.
data

14914

In [33]:
# 0. Load the dataset; only the first 1000 rows are kept for this run.
data = (
    pd.read_csv("data_featured.csv", encoding='utf_8_sig')
    .head(1000)
)
# Longest sentence length found in the loaded rows.
print(data['length'].max())


26


In [26]:
# Central place for the tunable model / preprocessing parameters.
model_config = dict(
    max_length=60,        # maximum sentence length (in tokens)
    max_words=25000,      # maximum vocabulary size
    embedding_dim=300,    # length of each word vector
    model_path='baseline_sentiment_model.h5',
    words_dict_path='words_dict.npy',
)

In [35]:
# 1. Load the vocabulary; a word's position in the list is its id.
words_dict_list = np.load(model_config['words_dict_path']).tolist()

# Build the word -> id lookup once so each token costs a hash lookup instead
# of a linear `list.index` scan. `setdefault` keeps the FIRST position of a
# repeated word, which is what `list.index` returned.
word_to_index = {}
for position, vocab_word in enumerate(words_dict_list):
    word_to_index.setdefault(vocab_word, position)


def _parse_cell(item):
    """Turn a stringified Python literal from the CSV back into an object.

    Values that are already lists are returned unchanged, so re-running this
    cell on an already-converted column does not fail.

    NOTE(review): this uses `eval` on text read from the CSV file, which
    executes arbitrary expressions. If the file can come from an untrusted
    source, switch to `ast.literal_eval`.
    """
    return item if isinstance(item, list) else eval(item)


# 2. Map every word of every sample to its vocabulary id.
# A word missing from the vocabulary raises KeyError (the previous
# `list.index` lookup raised ValueError in the same situation).
data['words_index'] = data.words.apply(
    lambda item: [word_to_index[word] for word in _parse_cell(item)])
data['words_flag'] = data.words_flag.apply(_parse_cell)
data['words_length'] = data.words_length.apply(_parse_cell)

In [8]:
# 3. Load pretrained word vectors for the words present in the vocabulary.
# File format (word followed by its vector): word 1 2 3 4
# Row i of embedding_matrix holds the vector of the word with vocabulary id i;
# words without a pretrained vector keep an all-zero row.
embedding_matrix = np.zeros(
    (model_config['max_words'], model_config['embedding_dim']))

# Word -> first position in words_dict_list, built once so that each line of
# the vector file needs a hash lookup rather than two linear list scans
# (`in` followed by `.index`).
vocab_index = {}
for position, vocab_word in enumerate(words_dict_list):
    vocab_index.setdefault(vocab_word, position)

word_vector_count = 0
# The context manager closes the archive even if parsing fails part-way.
with bz2.open('sgns.sogounews.bigram-char.bz2', mode='r') as file:
    # The first line is a header: "<word count> <vector size>".
    first_line = next(file, None)
    if first_line is not None:
        header = first_line.split()
        print("word count : %s" % header[0])
        print("vector size : %s" % header[1])
    for line in file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # The file is read in binary mode, so tokens are bytes.
        word = values[0].decode('utf-8')
        index = vocab_index.get(word)
        if index is None:
            continue
        embedding_matrix[index] = np.asarray(values[1:], dtype='float32')
        word_vector_count += 1

print('Found %s word vectors.' % word_vector_count)

word count : b'365113'
vector size : b'300'
Found 21109 word vectors.


In [36]:
# 4. Pad short sentences up to the configured maximum sentence length.
ids = data.event_id.values
# NOTE(review): the pad id equals max_words, which is one past the last row of
# an Embedding created with input_dim=max_words — confirm this matches how the
# saved checkpoint was trained before changing it.
words_data = pad_sequences(
    data['words_index'].values,
    maxlen=model_config['max_length'],
    value=model_config['max_words'],
)
print('Shape of words_data tensor:', words_data.shape)

Shape of words_data tensor: (1000, 60)


In [29]:
# 2. Build the model:
# token ids -> embedding -> spatial dropout -> per-token dense -> BiLSTM
# -> dense head with batch norm and dropout -> 3 output scores.
input_tensor = Input(shape=(None,))
# Keep a handle on the Embedding layer so it can be frozen directly instead of
# being looked up by position through model.layers[1].
embedding = Embedding(
    model_config['max_words'],
    model_config['embedding_dim'],
    input_length=model_config['max_length'],
)
embedding_layer = embedding(input_tensor)
dropout_layer = SpatialDropout1D(0.12)(embedding_layer)
X_sentence = TimeDistributed(Dense(128, activation='relu'))(dropout_layer)
L_sentence = Bidirectional(LSTM(128))(X_sentence)
L_sentence = Dropout(0.5)(L_sentence)

L_sentence = Dense(64, activation='relu')(L_sentence)
L_sentence = BatchNormalization()(L_sentence)
L_sentence = Dropout(0.5)(L_sentence)
# NOTE(review): sigmoid outputs are paired with categorical_crossentropy
# below; softmax is the usual pairing for a single-label 3-class head. Left
# unchanged because the checkpoint loaded later was presumably trained with
# this head — confirm before changing.
output_tensor = Dense(3, activation='sigmoid')(L_sentence)
model = Model(input_tensor, output_tensor)

# The Embedding layer does not take part in training.
# Initialising it from the pretrained vectors (rows indexed by vocabulary id)
# is left disabled, as before:
# embedding.set_weights([embedding_matrix])
embedding.trainable = False

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit an extra "None" line).
model.summary()

# Optimiser, loss and metric.
model.compile(optimizer='adamax', loss='categorical_crossentropy', metrics=['accuracy'])

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 60, 300)           7500000   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 60, 300)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 60, 128)           38528     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               263168    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                1644

In [30]:
# Load the trained weights. The path comes from model_config so the
# checkpoint location is defined in one place only.
model.load_weights(model_config['model_path'])

In [38]:
# 6. 模型预测
def get_sentiment(scores):
    """Convert one row of class scores into a sentiment label.

    The position of the highest score is returned unchanged, except that
    position 2 is reported as -1.
    """
    best = scores.argmax()
    return -1 if best == 2 else best
# Score every padded sentence, then pair each event id with its label.
predict_raw = model.predict(words_data)
test_predict = pd.DataFrame({
    'event_id': ids,
    'predict_sentiment': [get_sentiment(row) for row in predict_raw],
})

In [39]:
# Show the prediction table (event_id alongside predict_sentiment).
test_predict

Unnamed: 0,event_id,predict_sentiment
0,E8D6FD90FCB51DF95C30AB20D16A4DB3,0
1,288B9B734B201EFEA4E9B0D6FFC0A67A,0
2,BFAAF179CAA5C190991E5ED7EF30EDE5,0
3,17EB6E8B8B31ED7A7C5677F3E9574DF8,0
4,8D25659863A4986EBD7875D17E5A5314,1
...,...,...
995,760471EDEE98B8E6DE5D50DB2C92A73E,1
996,14694C45627DCE843FB4E698D899D059,0
997,2214483E09FEA823992104B43EA74D5C,0
998,E7DF7D097581A0009A9DC3B2CF26D85E,0


In [None]:
# Evaluate the compiled model on the padded inputs. `evaluate` expects the
# model inputs and the targets; the previous call passed the label column as
# input and a `data.pred` column that is never created.
# NOTE(review): assumes data.sentiment holds labels in {-1, 0, 1}, with -1
# stored as class index 2 (the inverse of get_sentiment) — confirm against the
# training notebook.
sentiment_classes = data.sentiment.replace(-1, 2).values
model.evaluate(words_data, to_categorical(sentiment_classes, num_classes=3))

In [41]:
# Attach the predicted labels to the source rows, matching on event_id.
res = data.merge(test_predict, on='event_id')
# Optional export of the merged table (disabled):
# res.to_csv("predict_sentiment.csv", header=True, encoding='utf_8_sig', index=None)

In [47]:
# Accuracy: share of merged rows whose predicted label equals the true label.
n_correct = int((res.sentiment == res.predict_sentiment).sum())
print("accurancy : %s" % (n_correct / len(res)))

accurancy : 0.568
