In [74]:
import keras

from keras.models import Model
from keras.models import Sequential
from keras.models import load_model

from keras.layers import Activation
from keras.layers import Input, Dense, Embedding, Conv1D, Conv2D, MaxPooling1D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.layers import SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.recurrent import LSTM,SimpleRNN

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam

from keras.utils.vis_utils import plot_model
from keras.utils.np_utils import to_categorical

import jieba
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [1]:
# 导入自定义库
from utils.data_utils import clean_str
from utils.data_utils import build_vocab
from utils.data_utils import get_tokens

ModuleNotFoundError: No module named 'utils'

In [3]:
# 加载训练集合
df_train_dataset = pd.read_csv('./data/training-inspur.csv', encoding='utf-8')
# 加载测试集
df_test_dataset = pd.read_csv('./data/Preliminary-texting-1.csv', encoding='utf-8')
# 查看数据集合 shape
df_test_dataset.shape

(102024, 2)

In [4]:
# 提取数据集中所需的字段
df_train_dataset = df_train_dataset[['COMMCONTENT', 'COMMLEVEL']]
df_test_dataset = df_test_dataset[['COMMCONTENT']]

In [10]:
# 合并数据集用于构建词汇表
df_all_dataset = pd.concat([df_train_dataset, df_test_dataset], ignore_index=True)
df_all_dataset.shape

(122024, 2)

In [72]:
BASE_DIR = '/Users/tsw/ScenicSpotReviews'

W2V_DIR = BASE_DIR + '/embeddings/'

TEXT_DATA_DIR = BASE_DIR + '/data/'

MAX_NUM_WORDS = 33950

MAX_SEQUENCE_LENGTH = 150 # 每篇文章选取150个词

MAX_NB_WORDS = 80000 # 将字典设置为含有1万个词84480

EMBEDDING_DIM = 300 # 词向量维度，300维

VALIDATION_SPLIT = 0.1 # 测试集大小，全部数据的10%

BATCH_SIZE = 128

NUM_LABELS = 3

In [73]:
def seg_corpus(corpus):
    seg_corpus = []
    for line in corpus:
        line = str(line).strip()
        seg_list = jieba.cut(line, cut_all=False)
        # 过滤空字符
        seg_list = [w for w in seg_list if w != ' ']
        seg_corpus.append(" ".join(seg_list))
    return seg_corpus

In [25]:
# 对所有文本分词
seged_text = seg_corpus(df_all_dataset['COMMCONTENT'])

In [26]:
len(seged_text)

122024

In [27]:
# 将分词后的数据并入 df_all_dataset
df_all_dataset['COMMCONTENT_SEG'] = pd.DataFrame(seged_text,columns=['COMMCONTENT_SEG'])
df_all_dataset.head()

Unnamed: 0,COMMCONTENT,COMMLEVEL,COMMCONTENT_SEG
0,普通公园一个只是多了几个泉而已，人不多，适合老人孩子闲逛，买票的话还是贵了，人家说6.30之...,1.0,普通 公园 一个 只是 多 了 几个 泉 而已 ， 人不多 ， 适合 老人 孩子 闲逛 ， ...
1,跟儿子在里面玩了一天，非常好！跟儿子在里面玩了一天，非常好！真的很不错哦，有空还要去,1.0,跟 儿子 在 里面 玩 了 一天 ， 非常 好 ！ 跟 儿子 在 里面 玩 了 一天 ， 非...
2,这已经是第五次来这里玩了。每次孩子都很喜欢，不愿意从水里出来。有机会还会再来。还有比我更忠诚...,1.0,这 已经 是 第五次 来 这里 玩 了 。 每次 孩子 都 很 喜欢 ， 不 愿意 从水里 ...
3,当天在携程上定的票，打温泉度假村咨询电话和携程客服都说次日生效，但到酒店后，票能用。请客服人...,1.0,当天 在 携程 上定 的 票 ， 打 温泉 度假村 咨询电话 和 携程 客服 都 说 次日 ...
4,烟台历史的一部分，非常值得推荐去看看！海边景色也很漂亮！,1.0,烟台 历史 的 一部分 ， 非常 值得 推荐 去 看看 ！ 海边 景色 也 很漂亮 ！


In [28]:
# text_corpus
text_corpus = df_all_dataset['COMMCONTENT_SEG']
# 传入我们词向量的字典
tokenizer = Tokenizer(num_words=MAX_NB_WORDS) 
# 传入我们的训练数据，得到训练数据中出现的词的字典
tokenizer.fit_on_texts(text_corpus) 

In [34]:
'rows:',tokenizer.document_count

('rows:', 122024)

In [31]:
tokenizer.num_words

80000

In [36]:
# 根据训练数据中出现的词的字典，将训练数据转换为sequences
dataset_sequences = tokenizer.texts_to_sequences(text_corpus) 

In [37]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 100134 unique tokens.


In [40]:
padded_dataset_sequences = pad_sequences(dataset_sequences, maxlen=MAX_SEQUENCE_LENGTH)
padded_dataset_sequences.shape

(122024, 150)

In [43]:
# 划分训练集和测试集
from sklearn.model_selection import train_test_split
train_X,valid_X,train_y,valid_y =train_test_split(padded_dataset_sequences[:df_train_dataset.shape[0]], 
                                                  df_all_dataset['COMMLEVEL'][:df_train_dataset.shape[0]], 
                                                  test_size=0.1)

In [47]:
# label one-hot 表示
labels = df_all_dataset['COMMLEVEL'].dropna().map(int)#.values.tolist()
labels = to_categorical(labels-1) 


In [54]:
vocab,vocab_freqs = build_vocab(df_all_dataset['COMMCONTENT_SEG'])
vocab_size = min(MAX_NB_WORDS, len(vocab_freqs)) + 2
word2index = {x[0]: i+2 for i, x in enumerate(vocab_freqs.most_common(MAX_NB_WORDS))}
word2index["PAD"] = 0
word2index["UNK"] = 1
index2word = {v:k for k, v in word2index.items()}
len(word2index)

80002

In [52]:
print('Indexing word embeddings.')  
embeddings_index = {}
with open('./embeddings/sgns.weibo.word','r') as f:
    f = f.readlines()
    for i in f[1:]:
        values = i.strip().split(' ')
        word = str(values[0])
        embedding = np.asarray(values[1:],dtype='float')
        embeddings_index[word] = embedding
print('word embedding',len(embeddings_index))

Indexing word embeddings.
word embedding 195201


In [55]:
nb_words = min(MAX_NB_WORDS,len(word2index))
nb_words

80000

In [56]:
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))

In [58]:
word_embedding_matrix.shape

(80001, 300)

In [59]:
for word, i in word2index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(str(word).upper())
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

In [60]:
word_embedding_matrix[:10]

array([[ 0.27259 ,  0.244615,  0.032857, ..., -0.199684, -0.084092,
         0.060737],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.094386, -0.200944, -0.030828, ...,  0.003085,  0.023796,
        -0.201742],
       ...,
       [ 0.190794, -0.037967,  0.1013  , ..., -0.302136, -0.126407,
        -0.178464],
       [ 0.175443,  0.239842,  0.210521, ...,  0.071008,  0.177222,
        -0.062866],
       [-0.230501, -0.152982,  0.207998, ...,  0.007232, -0.494047,
        -0.179105]])

In [61]:
pre_embedding_layer =  Embedding(input_dim = nb_words+1, 
                             output_dim = EMBEDDING_DIM, 
                            weights=[word_embedding_matrix], 
                             input_length=MAX_SEQUENCE_LENGTH, 
                             mask_zero=True,
                             trainable=False
                            )

In [67]:
embedding_layer =  Embedding(input_dim = nb_words+1, 
                             output_dim = EMBEDDING_DIM, 
                             input_length=MAX_SEQUENCE_LENGTH, 
                             mask_zero=True,
                             trainable=True
                            )

In [76]:
print('Build model...')
model = Sequential()
model.add(pre_embedding_layer)
model.add(Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1, return_sequences=True)))
model.add(Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1)))
model.add(Dense(NUM_LABELS, activation='softmax'))

Build model...


In [77]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 300)          24000300  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 150, 512)          855552    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 512)               1181184   
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 1539      
Total params: 26,038,575
Trainable params: 2,038,275
Non-trainable params: 24,000,300
_________________________________________________________________


In [78]:
# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [79]:
print('Train...')
batch_size = 256
epochs = 2

model.fit(x=train_X, y=to_categorical(train_y-1, num_classes=3), 
                    validation_data=(valid_X, to_categorical(valid_y-1, num_classes=3)[:]), 
                    batch_size=batch_size, 
                    epochs=epochs,
          shuffle=True,
                    verbose=1
         )

Train...
Train on 18000 samples, validate on 2000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x16db3b358>

In [82]:
df_test_dataset_seg = df_all_dataset['COMMCONTENT_SEG'][20000:]
test_dataset_sequences = tokenizer.texts_to_sequences(df_test_dataset_seg)
padded_test_dataset_sequences = pad_sequences(test_dataset_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [83]:
padded_test_dataset_sequences.shape

(102024, 150)

In [84]:
all_test_preds = model.predict(padded_test_dataset_sequences, batch_size=256)
w2v6 = np.argmax(all_test_preds,axis=1)[:]+1

KeyboardInterrupt: 

In [None]:
pd.Series(w2v6).value_counts(normalize=True)

In [None]:
# print('Build LSTM Model...')
# model = Sequential()
# model.add(pre_embedding_layer)
# model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.1))  # try using a GRU instead, for fun
# model.add(Dense(3))
# model.add(Activation('tanh'))
# model.add(Dense(len(np.unique(valid_y)), activation='softmax'))