In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model
from keras.models import Sequential


from keras.layers import Input, Dense, Embedding, Conv1D, Conv2D, MaxPooling1D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.layers import SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.recurrent import LSTM,SimpleRNN
from keras.layers import Activation

from keras.callbacks import Callback
from keras.optimizers import Adam

from keras.callbacks import ModelCheckpoint, EarlyStopping

from keras.models import load_model

from keras.utils.vis_utils import plot_model

from keras.utils.np_utils import to_categorical

import jieba
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# 导入自定义库
from utils.data_utils import clean_str
from utils.data_utils import build_vocab
from utils.data_utils import get_tokens

### 基本参数配置

In [None]:
# Project root on the author's machine.
# NOTE(review): absolute, user-specific path; the data/embedding cells in this
# notebook read from the relative './data' and './embeddings' folders instead.
BASE_DIR = '/Users/tsw/ScenicSpotReviews'

# Directory for pretrained word vectors, derived from BASE_DIR.
W2V_DIR = BASE_DIR + '/embeddings/'

# Directory for the raw text data, derived from BASE_DIR.
TEXT_DATA_DIR = BASE_DIR + '/data/'

# Length every token-id sequence is padded/truncated to (passed as `maxlen`
# to pad_sequences and as `input_length` to the Embedding layers).
MAX_SEQUENCE_LENGTH = 100

# NOTE(review): MAX_NUM_WORDS is not referenced in the visible cells;
# MAX_NB_WORDS is the cap actually used for the vocabulary and the Tokenizer.
MAX_NUM_WORDS = 33950
MAX_NB_WORDS = 30000

# Width of one word vector, i.e. the number of columns of the embedding matrix.
EMBEDDING_DIM = 300

# NOTE(review): not referenced in the visible cells; the split cell passes
# train_ratio=0.8 explicitly.
VALIDATION_SPLIT = 0.2

# NOTE(review): not referenced in the visible cells; the training cells set
# their own lowercase `batch_size`.
BATCH_SIZE = 32

### 加载数据

In [None]:
df_dataset = pd.read_csv('./data/training-inspur.csv', encoding='utf-8')

### 数据预处理

In [None]:
# Tokens removed after segmentation. Defined once (it never changes between
# rows) and stored as a set so the membership test is O(1).
stopwords = {" ", "!", "...................................................................."}

COMMCONTENT_SEG = []

# fillna('') keeps missing comments empty instead of letting str(NaN) turn
# them into the literal token "nan"; empty rows are filtered out afterwards.
for sent in df_dataset['COMMCONTENT'].fillna(''):
    # Normalise the raw text, then segment it with jieba (accurate mode).
    sent = clean_str(str(sent).strip())
    seg_list = [tok for tok in jieba.cut(sent, cut_all=False) if tok not in stopwords]
    # Store the segmentation as one space-separated string per comment.
    COMMCONTENT_SEG.append(" ".join(seg_list))

# Assign the list positionally. Wrapping it in a DataFrame would make pandas
# align on index labels and silently produce NaN for a non-default index.
df_dataset['COMMCONTENT_SEG'] = COMMCONTENT_SEG

In [None]:
# Drop comments that became empty after cleaning/segmentation.
df_dataset = df_dataset[df_dataset['COMMCONTENT_SEG'] != ""]
# drop=True discards the old index instead of inserting it as an "index"
# column; without it, re-running this cell raises because that column
# already exists.
df_dataset = df_dataset.reset_index(drop=True)

In [None]:
df_dataset.head()

### 构建 Vocab 、word2index、index2word

In [None]:
vocab,vocab_freqs = build_vocab(df_dataset['COMMCONTENT_SEG'])

In [None]:
# Ids 0 and 1 are reserved for padding / unknown words, so the most frequent
# word gets id 2, the next one id 3, and so on.
vocab_size = 2 + min(MAX_NB_WORDS, len(vocab_freqs))
word2index = {
    entry[0]: rank
    for rank, entry in enumerate(vocab_freqs.most_common(MAX_NB_WORDS), start=2)
}
word2index.update({"PAD": 0, "UNK": 1})
# Reverse lookup: id -> word.
index2word = dict((idx, word) for word, idx in word2index.items())

In [None]:
len(word2index)

In [None]:
def split_dataset(X, y, train_ratio=0.8, seed=None):
    """Shuffle (X, y) together and split them into train / validation parts.

    Args:
        X: sequence of samples (list, pandas Series, ndarray, ...).
        y: sequence of labels, same length as ``X``.
        train_ratio: fraction of the samples that goes into the training part.
        seed: optional int; when given, the shuffle is reproducible. When
            ``None`` the global NumPy random state is used.

    Returns:
        ``(train_X, train_y, valid_X, valid_y)`` as NumPy arrays.

    Raises:
        ValueError: if ``X`` and ``y`` have different lengths.
    """
    # Convert BOTH inputs to arrays so the fancy indexing below is positional.
    # Indexing a pandas Series with an integer array is label-based, and a
    # plain list cannot be indexed with an array at all.
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(
            "X and y must have the same length, got %d and %d" % (len(X), len(y)))
    data_size = len(X)

    # Shuffle samples and labels with the same permutation
    rng = np.random if seed is None else np.random.RandomState(seed)
    shuffle_indices = rng.permutation(np.arange(data_size))
    X, y = X[shuffle_indices], y[shuffle_indices]

    # Split into train and validation set
    train_end_index = int(train_ratio * data_size)
    train_X, valid_X = X[:train_end_index], X[train_end_index:]
    train_y, valid_y = y[:train_end_index], y[train_end_index:]

    return train_X, train_y, valid_X, valid_y

In [None]:
train_X,train_y,valid_X,valid_y = split_dataset(df_dataset['COMMCONTENT_SEG'], 
                                                df_dataset['COMMLEVEL'], 
                                                train_ratio=0.8)

In [None]:
train_X

### 将词转换为 index 向量

In [None]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

tokenizer.fit_on_texts(df_dataset['COMMCONTENT_SEG'])

In [None]:
train_X[12]

In [None]:
tokenizer.texts_to_sequences([train_X[12]])

In [None]:
train_sequences = tokenizer.texts_to_sequences(train_X)
test_sequences = tokenizer.texts_to_sequences(valid_X)

### Padding

In [None]:
padded_train_sequences = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
padded_test_sequences = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
padded_test_sequences[0].shape

In [None]:
padded_train_sequences.shape

### embedding layer

In [None]:
print('Indexing word embeddings.')
embeddings_index = {}
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding (assumes the vectors file is UTF-8 - TODO confirm).
with open('./embeddings/zhihu.vec', 'r', encoding='utf-8') as f:
    for line in f:
        # rstrip() removes the trailing newline (and any trailing space) that
        # would otherwise end up in the last field and break float parsing.
        values = line.rstrip().split(' ')
        # Keep only rows shaped "<word> <EMBEDDING_DIM floats>". This skips the
        # optional "<count> <dim>" header of .vec files and malformed rows,
        # which would otherwise be stored as wrong-sized vectors.
        if len(values) != EMBEDDING_DIM + 1:
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float')
print('word embedding', len(embeddings_index))

In [None]:
nb_words = min(MAX_NB_WORDS,len(word2index))
nb_words

In [None]:
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))

In [None]:
# Row i of the matrix must hold the vector of the word whose id is i in the
# sequences fed to the model. Those sequences come from
# `tokenizer.texts_to_sequences`, so the ids to use are `tokenizer.word_index`
# (1-based, by frequency). The separately built `word2index` starts at 2 and
# would shift the pretrained vectors onto the wrong words.
for word, i in tokenizer.word_index.items():
    # Ids beyond the matrix have no row; skip them.
    if i >= word_embedding_matrix.shape[0]:
        continue
    embedding_vector = embeddings_index.get(str(word).upper())
    if embedding_vector is not None:
        # Words missing from the pretrained file keep their all-zero row.
        word_embedding_matrix[i] = embedding_vector

In [None]:
# word_embedding_matrix[4]

In [None]:
# Shared embedding lookup, initialised from the pretrained matrix and
# fine-tuned during training (trainable=True).
embedding_layer = Embedding(
    input_dim=nb_words + 1,
    output_dim=EMBEDDING_DIM,
    weights=[word_embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    # mask_zero=True,
    trainable=True,
)

In [None]:
padded_train_sequences[0]

In [None]:
import keras
#写一个LossHistory类，保存loss和acc
class LossHistory(keras.callbacks.Callback):
    """Keras callback that records loss and accuracy per batch and per epoch.

    After training, ``losses``, ``accuracy``, ``val_loss`` and ``val_acc`` are
    dicts with the keys ``'batch'`` and ``'epoch'``, each mapping to the list
    of values seen so far. Metrics missing from ``logs`` are stored as None.
    """

    def on_train_begin(self, logs=None):
        """Reset all recorded histories at the start of a training run."""
        self.losses = {'batch': [], 'epoch': []}
        self.accuracy = {'batch': [], 'epoch': []}
        self.val_loss = {'batch': [], 'epoch': []}
        self.val_acc = {'batch': [], 'epoch': []}

    def _record(self, scope, logs):
        """Append the metrics found in ``logs`` to the ``scope`` histories."""
        # `logs` defaults to None rather than a shared mutable dict.
        logs = logs or {}
        self.losses[scope].append(logs.get('loss'))
        # The accuracy key is 'acc' or 'accuracy' depending on the Keras
        # version and on how the metric was named in compile(); accept both.
        self.accuracy[scope].append(logs.get('acc', logs.get('accuracy')))
        self.val_loss[scope].append(logs.get('val_loss'))
        self.val_acc[scope].append(logs.get('val_acc', logs.get('val_accuracy')))

    def on_batch_end(self, batch, logs=None):
        """Record the metrics reported at the end of one batch."""
        self._record('batch', logs)

    def on_epoch_end(self, batch, logs=None):
        """Record the metrics reported at the end of one epoch."""
        self._record('epoch', logs)

    def loss_plot(self, loss_type):
        """Plot the recorded curves.

        Args:
            loss_type: ``'batch'`` or ``'epoch'``. Validation curves are only
                drawn for ``'epoch'``.
        """
        iters = range(len(self.losses[loss_type]))
        # Create a new figure
        plt.figure()
        # Training accuracy
        plt.plot(iters, self.accuracy[loss_type], 'r', label='train acc')
        # Training loss
        plt.plot(iters, self.losses[loss_type], 'g', label='train loss')
        if loss_type == 'epoch':
            # Validation accuracy
            plt.plot(iters, self.val_acc[loss_type], 'b', label='val acc')
            # Validation loss
            plt.plot(iters, self.val_loss[loss_type], 'k', label='val loss')
        plt.grid(True)
        plt.xlabel(loss_type)
        plt.ylabel('acc-loss')
        plt.legend(loc="upper right")
        plt.show()

In [None]:
print("build model...")

In [None]:
def cnn_model(self, params):
    """
    Build un-compiled model of shallow-and-wide CNN.

    Several Conv1D branches with different kernel sizes run in parallel over
    the same input, are max-pooled over time and concatenated, then passed
    through a dense hidden layer and a dense output layer.

    Args:
        self: object exposing ``n_classes`` (number of output units).
        params: dictionary of parameters for NN. Keys read here:
            ``text_size``, ``embedding_size``, ``kernel_sizes_cnn``,
            ``filters_cnn``, ``coef_reg_cnn``, ``dropout_rate``,
            ``dense_size``, ``coef_reg_den`` and, optionally,
            ``last_layer_activation`` (defaults to ``"sigmoid"``).
    Returns:
        Un-compiled model
    """
    # Imported locally because neither name is imported by the notebook's
    # import cell; without them the function raises NameError.
    from keras.layers import BatchNormalization
    from keras.regularizers import l2

    inp = Input(shape=(params['text_size'], params['embedding_size']))

    outputs = []
    for kernel_size in params['kernel_sizes_cnn']:
        output_i = Conv1D(params['filters_cnn'], kernel_size=kernel_size,
                          activation=None,
                          kernel_regularizer=l2(params['coef_reg_cnn']),
                          padding='same')(inp)
        # Activation is applied after batch normalisation, hence activation=None above.
        output_i = BatchNormalization()(output_i)
        output_i = Activation('relu')(output_i)
        output_i = GlobalMaxPooling1D()(output_i)
        outputs.append(output_i)

    # concatenate() needs at least two tensors; a single branch is used as-is.
    output = outputs[0] if len(outputs) == 1 else concatenate(outputs, axis=1)

    output = Dropout(rate=params['dropout_rate'])(output)
    output = Dense(params['dense_size'], activation=None,
                   kernel_regularizer=l2(params['coef_reg_den']))(output)
    output = BatchNormalization()(output)
    output = Activation('relu')(output)
    output = Dropout(rate=params['dropout_rate'])(output)
    output = Dense(self.n_classes, activation=None,
                   kernel_regularizer=l2(params['coef_reg_den']))(output)
    output = BatchNormalization()(output)
    act_output = Activation(params.get("last_layer_activation", "sigmoid"))(output)
    model = Model(inputs=inp, outputs=act_output)
    return model

In [None]:
# NOTE(review): cnn_model is declared as cnn_model(self, params), so this
# zero-argument call raises TypeError. Its docstring also says it returns an
# un-compiled model, and its Input is (text_size, embedding_size) rather than
# integer id sequences - confirm how this cell was meant to be wired up.
model = cnn_model()
batch_size = 256
epochs = 8

# model.fit(x_train, y_train,
#           validation_split=0.1,
#           batch_size=batch_size,
#           epochs=epochs,
#           shuffle=True)
# Create a LossHistory instance to record the training curves
history = LossHistory()

# Labels are shifted by -1 before one-hot encoding
# (presumably because COMMLEVEL starts at 1 - TODO confirm).
model.fit(x=padded_train_sequences[:], y=to_categorical(train_y-1, num_classes=None)[:], 
                    validation_data=(padded_test_sequences[:], to_categorical(valid_y-1, num_classes=None)[:]), 
                    batch_size=batch_size, 
                    #callbacks=[checkpoint],
                    callbacks=[history],
                    epochs=epochs,
                    verbose=1
         )

In [None]:
def text_cnn(maxlen=MAX_SEQUENCE_LENGTH, max_features=2000, embed_size=32):
    """Build and compile a TextCNN classifier on top of the pretrained embeddings.

    Parallel Conv1D branches (kernel sizes 2-5, 100 filters each) are
    max-pooled over the whole sequence, concatenated, and fed through
    dropout and two dense layers.

    Args:
        maxlen: length of the padded input sequences. It sizes the input
            layer, the embedding and the pooling windows, so all three agree.
        max_features: unused; kept for backward compatibility. The vocabulary
            size comes from ``nb_words``.
        embed_size: unused; kept for backward compatibility. The embedding
            width comes from ``EMBEDDING_DIM`` / ``word_embedding_matrix``.

    Returns:
        A compiled Keras ``Model`` (categorical cross-entropy, Adadelta).
    """
    # Inputs - sized by `maxlen`, the same value the pooling windows use, so a
    # non-default `maxlen` no longer yields mismatched shapes.
    sequence_input = Input(shape=(maxlen,), dtype='int32')

    # Embedding layer: frozen pretrained vectors
    emb_comment = Embedding(input_dim=nb_words + 1,
                            output_dim=EMBEDDING_DIM,
                            weights=[word_embedding_matrix],
                            input_length=maxlen,
                            mask_zero=False,
                            trainable=False)(sequence_input)

    # Convolution branches, one per n-gram width
    convs = []
    filter_sizes = [2, 3, 4, 5]
    for fsz in filter_sizes:
        l_conv = Conv1D(filters=100, kernel_size=fsz, activation='relu')(emb_comment)
        # A 'valid' convolution leaves maxlen - fsz + 1 steps; pooling over all
        # of them is a max-over-time pooling.
        l_pool = MaxPooling1D(maxlen - fsz + 1)(l_conv)
        l_pool = Flatten()(l_pool)
        convs.append(l_pool)

    merge = concatenate(convs, axis=1)

    out = Dropout(0.5)(merge)
    output = Dense(32, activation='relu')(out)
    # One output unit per distinct label found in the validation labels.
    output = Dense(len(np.unique(valid_y)), activation='softmax')(output)

    model = Model(sequence_input, output)

    # Adadelta is used here; other optimizers work as well.
    model.compile(loss='categorical_crossentropy', optimizer='Adadelta', metrics=['accuracy'])
    return model

In [None]:
model = text_cnn()
model.summary()

In [None]:
batch_size = 256
epochs = 8

# One-hot encode both splits with the SAME width. With num_classes=None each
# call infers the width from its own maximum label, so a split that lacks the
# highest class would get a narrower matrix than the other.
# Labels are shifted by -1 first, so the width equals the largest label.
num_classes = int(max(np.max(train_y), np.max(valid_y)))
train_targets = to_categorical(train_y - 1, num_classes=num_classes)
valid_targets = to_categorical(valid_y - 1, num_classes=num_classes)

# Callback instance that records the loss/accuracy curves
history = LossHistory()

model.fit(x=padded_train_sequences,
          y=train_targets,
          validation_data=(padded_test_sequences, valid_targets),
          batch_size=batch_size,
          callbacks=[history],
          epochs=epochs,
          verbose=1)

In [None]:
score, acc = model.evaluate(padded_test_sequences[:],to_categorical(valid_y-1, num_classes=None)[:], batch_size=batch_size)
#     print(score, acc)
print('test_loss: %f, accuracy: %f' % (score, acc))

In [None]:
history.loss_plot('epoch')

In [None]:
plot_model(model, 
           to_file='./textcnn_model.png', 
           show_shapes=True, 
           show_layer_names=True)


Model
![](./textcnn_model.png)

In [None]:
# Three parallel 1D-convolution branches (kernel widths 5, 4 and 6) over the
# same embedded sequence, merged and classified.
#
# Built with the functional API: Sequential models cannot be merged by
# passing them to Concatenate(...), and the earlier version ended up training
# the third branch only.
#
# padding='same' plus a final global max-pool keep every branch valid for
# MAX_SEQUENCE_LENGTH = 100. With 'valid' convolutions and the previous fixed
# pool sizes, the sequence shrinks below the kernel width before the third
# convolution.
def _conv_branch(embedded, kernel_size, pool_size):
    """Conv/pool, conv/pool, conv, then max over time: one 128-d vector per sample."""
    x = embedded
    for _ in range(2):
        x = Conv1D(128, kernel_size, activation='tanh', padding='same')(x)
        x = MaxPooling1D(pool_size)(x)
    x = Conv1D(128, kernel_size, activation='tanh', padding='same')(x)
    return GlobalMaxPooling1D()(x)


cnn_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
cnn_embedded = embedding_layer(cnn_input)

# (kernel width, pool size) per branch - same values as the original three stacks.
branch_outputs = [_conv_branch(cnn_embedded, kernel_size, pool_size)
                  for kernel_size, pool_size in ((5, 5), (4, 4), (6, 3))]

# Concatenate the three branch vectors into one feature vector.
merged = concatenate(branch_outputs)

# Fully connected layer
hidden = Dense(128, activation='tanh')(merged)

# softmax: probability of each class (one unit per distinct validation label)
class_probs = Dense(len(np.unique(valid_y)), activation='softmax')(hidden)

model = Model(cnn_input, class_probs)

In [None]:
# main_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input')
# preds = Dense(len(np.unique(valid_y)), activation='softmax')(x)

# model = Model()
# merged.summary()

In [None]:
model.summary()

In [None]:
# 优化器我这里用了adadelta，也可以使用其他方法
model.compile(loss='categorical_crossentropy',
              optimizer='Adadelta',
              metrics=['accuracy'])

In [None]:
# happy learning!
# model.fit(x_train, y_train, validation_data=(x_val, y_val),
#           nb_epoch=2, batch_size=128)
batch_size = 256
epochs = 5
model.fit(x=padded_train_sequences[:], y=to_categorical(train_y-1, num_classes=None)[:], 
                    validation_data=(padded_test_sequences[:], to_categorical(valid_y-1, num_classes=None)[:]), 
                    batch_size=128, 
                    #callbacks=[checkpoint], 
                    epochs=2,
                    verbose=1
         )


In [None]:
score, acc = model.evaluate(padded_test_sequences[:],to_categorical(valid_y-1, num_classes=None)[:], batch_size=batch_size)
print(score, acc)

In [None]:
model.summary()

In [None]:
from keras import layers

In [None]:
# Left branch: 784 -> 50 with ReLU
model_l = Sequential()
model_l.add(Dense(50, input_shape=(784,)))
model_l.add(Activation('relu'))

# Right branch: 784 -> 50 with ReLU
model_r = Sequential()
model_r.add(Dense(50, input_shape=(784,)))
model_r.add(Activation('relu'))

# concatenate() works on tensors, not on Sequential objects, and
# Sequential.add() only accepts layers. Merge the two branch outputs with the
# functional API instead.
merged_lr = concatenate([model_l.output, model_r.output])
model = Model(inputs=[model_l.input, model_r.input], outputs=merged_lr)

In [None]:
# Three-hop attention ("memory network" style) classifier over an aspect
# vector and its context words, ending in a 3-way softmax.
#
# NOTE(review): this cell cannot run as-is in the visible notebook:
#   * Lambda, RepeatVector, TimeDistributed, Permute, merge and l2 are not
#     imported in the visible cells;
#   * get_aspect, get_context, get_R, emb and maxlen are not defined in the
#     visible cells;
#   * `x` below is the Embedding layer object itself - it is never called on
#     `main_input`, so Dropout is applied to a layer rather than a tensor.
# NOTE(review): W_regularizer=, merge(..., mode=...) and Model(input=, output=)
# look like Keras 1 spellings - confirm against the installed Keras version.
main_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input')
x =Embedding(nb_words + 1,
                 EMBEDDING_DIM,
                 weights=[word_embedding_matrix],
                 input_length=MAX_SEQUENCE_LENGTH,
                 mask_zero=True,
                 trainable=False)

drop_out = Dropout(0.1, name='dropout')(x)

# Split the embedded sequence into the aspect part and the context part
# (presumably one aspect vector plus maxlen-1 context vectors - TODO confirm
# against get_aspect / get_context).
w_aspect = Lambda(get_aspect, output_shape=(emb,), name="w_aspect")(drop_out)

w_context = Lambda(get_context, output_shape=(maxlen-1,emb), name="w_context")(drop_out)

w_aspect = Dense(emb, W_regularizer=l2(0.01), name="w_aspect_1")(w_aspect)

# hop 1
# Repeat the aspect vector once per context position and pair it with the context.
w_aspects = RepeatVector(maxlen-1, name="w_aspects1")(w_aspect)

merged = merge([w_context, w_aspects], name='merged1', mode='concat')

# One tanh score per position, then a softmax layer over the positions.
distributed = TimeDistributed(Dense(1, W_regularizer=l2(0.01), activation='tanh'), name="distributed1")(merged)

flat_alpha = Flatten(name="flat_alpha1")(distributed)

alpha = Dense(maxlen-1, activation='softmax', name="alpha1")(flat_alpha)

w_context_trans = Permute((2, 1), name="w_context_trans1")(w_context)

# Combine transposed context and weights via get_R
# (presumably an attention-weighted sum - TODO confirm).
r_ = merge([w_context_trans, alpha], output_shape=(emb, 1), name="r_1", mode=get_R)

r = Reshape((emb,), name="r1")(r_)

w_aspect_linear = Dense(emb, W_regularizer=l2(0.01), activation='linear')(w_aspect)

# Sum of the attended context and the linearly transformed aspect feeds the next hop.
merged = merge([r, w_aspect_linear], mode='sum')

w_aspect = Dense(emb, W_regularizer=l2(0.01), name="w_aspect_2")(merged)

# hop 2 (same structure as hop 1, separate layers)
w_aspects = RepeatVector(maxlen-1, name="w_aspects2")(w_aspect)
merged = merge([w_context, w_aspects], name='merged2', mode='concat')
distributed = TimeDistributed(Dense(1, W_regularizer=l2(0.01), activation='tanh'), name="distributed2")(merged)
flat_alpha = Flatten(name="flat_alpha2")(distributed)
alpha = Dense(maxlen-1, activation='softmax', name="alpha2")(flat_alpha)
w_context_trans = Permute((2, 1), name="w_context_trans2")(w_context)
r_ = merge([w_context_trans, alpha], output_shape=(emb, 1), name="r_2", mode=get_R)
r = Reshape((emb,), name="r2")(r_)
w_aspect_linear = Dense(emb, W_regularizer=l2(0.01), activation='linear')(w_aspect)
merged = merge([r, w_aspect_linear], mode='sum')

w_aspect = Dense(emb, W_regularizer=l2(0.01), name="w_aspect_3")(merged)

# hop 3 (same structure as hop 1, separate layers)
w_aspects = RepeatVector(maxlen-1, name="w_aspects3")(w_aspect)
merged = merge([w_context, w_aspects], name='merged3', mode='concat')
distributed = TimeDistributed(Dense(1, W_regularizer=l2(0.01), activation='tanh'), name="distributed3")(merged)
flat_alpha = Flatten(name="flat_alpha3")(distributed)
alpha = Dense(maxlen-1, activation='softmax', name="alpha3")(flat_alpha)
w_context_trans = Permute((2, 1), name="w_context_trans3")(w_context)
r_ = merge([w_context_trans, alpha], output_shape=(emb, 1), name="r_3", mode=get_R)
r = Reshape((emb,), name="r3")(r_)
w_aspect_linear = Dense(emb, W_regularizer=l2(0.01), activation='linear')(w_aspect)
merged = merge([r, w_aspect_linear], mode='sum')

# Final representation -> tanh -> 3-way softmax.
h_ = Activation('tanh')(merged)

out = Dense(3, activation='softmax')(h_)

output = out

model = Model(input=[main_input], output=output)

In [None]:
# Embedding -> LSTM(200) -> Dense(120)+ReLU -> softmax over the label set.
model = Sequential([
    embedding_layer,
    LSTM(200, dropout=0.2, recurrent_dropout=0.2),
    Dense(120),
    Activation('relu'),
    # One output unit per distinct label found in the validation labels.
    Dense(len(np.unique(valid_y)), activation='softmax'),
])

In [None]:
model.summary()

In [None]:
# optimizer
model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

In [None]:
# filepath="./weights-improvement-{epoch:02d}-{val_acc:.3f}.hdf5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
batch_size = 256
epochs = 5
model.fit(x=padded_train_sequences[:], 
                    y=to_categorical(train_y-1, num_classes=None)[:], 
                    validation_data=(padded_test_sequences[:], to_categorical(valid_y-1, num_classes=None)[:]), 
                    batch_size=batch_size, 
                    #callbacks=[checkpoint], 
                    epochs=epochs,
                    verbose=1     
         )

In [None]:
 score, acc = model.evaluate(padded_test_sequences[:],to_categorical(valid_y-1, num_classes=None)[:], batch_size=batch_size)
    print(score, acc)

In [None]:
print(score, acc)

In [None]:
# Input of padded token ids and its embedded representation.
# The premature `preds = Dense(...)(x)` line was removed: `x` is only built by
# the convolution cells that follow, and `preds` is defined after them.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

In [None]:
# Three conv/pool stages. padding='same' keeps the length unchanged through
# each convolution, so only the pooling layers shrink the sequence:
#   L -> L//5 -> L//25 -> 1   (L = MAX_SEQUENCE_LENGTH, needs L >= 25)
# With 'valid' convolutions and L = 100 the sequence is only 3 steps long
# before the third width-5 convolution, which raises a negative-dimension error.
x = Conv1D(128, 5, activation='relu', padding='same')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu', padding='same')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu', padding='same')(x)
# Pool over all remaining steps (global max pooling); the result stays 3-D so
# the Flatten() in the next cell still applies.
x = MaxPooling1D(MAX_SEQUENCE_LENGTH // 25)(x)


In [None]:
x = Flatten()(x)
x = Dense(128, activation='relu')(x)

In [None]:
preds = Dense(len(np.unique(valid_y)), activation='softmax')(x)

In [None]:
model = Model(sequence_input, preds)

In [None]:
model.summary()

In [None]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [None]:
# happy learning!
# model.fit(x_train, y_train, validation_data=(x_val, y_val),
#           nb_epoch=2, batch_size=128)
model.fit(x=padded_train_sequences[:], y=to_categorical(train_y-1, num_classes=None)[:], 
                    validation_data=(padded_test_sequences[:], to_categorical(valid_y-1, num_classes=None)[:]), 
                    batch_size=128, 
                    #callbacks=[checkpoint], 
                    epochs=2,
                    verbose=1
         )

In [None]:
# Embedding -> LSTM(50) -> dropout -> 3-way softmax.
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(units=50, activation='tanh'))
model.add(Dropout(0.5))
# Fully connected output layer with 3 units. The softmax is applied once,
# here; the extra Activation('softmax') that followed re-normalised values
# that were already probabilities and flattened the output distribution.
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
model.fit(x=padded_train_sequences[:], y=to_categorical(train_y-1, num_classes=None)[:], 
                    validation_data=(padded_test_sequences[:], to_categorical(valid_y-1, num_classes=None)[:]), 
                    batch_size=128, 
                    #callbacks=[checkpoint], 
                    epochs=2,
                    verbose=1
         )

In [None]:
##定义网络结构
# Define, train, evaluate and save the LSTM network
def train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test,
               batch_size=32, n_epoch=4):
    """Train a small LSTM classifier and save its architecture and weights.

    Args:
        n_symbols: vocabulary size (number of rows of ``embedding_weights``).
        embedding_weights: 2-D array of pretrained vectors; its second
            dimension is used as the embedding width.
        x_train, y_train: padded id sequences and one-hot labels (3 classes)
            used for training.
        x_test, y_test: the same for evaluation.
        batch_size: batch size for fit/evaluate.
        n_epoch: number of training epochs.

    Returns:
        The value returned by ``model.evaluate`` on the test data.

    Side effects:
        Writes ``../model/lstm.yml`` (model architecture as YAML text) and
        ``../model/lstm.h5`` (weights).
    """
    # Derive the sizes from the arguments instead of relying on globals that
    # this notebook never defines.
    vocab_dim = np.shape(embedding_weights)[1]
    input_length = np.shape(x_train)[1]

    print('Defining a Simple Keras Model...')
    model = Sequential()
    model.add(Embedding(output_dim=vocab_dim,
                        input_dim=n_symbols,
                        mask_zero=True,
                        weights=[embedding_weights],
                        input_length=input_length))
    model.add(LSTM(units=50, activation='tanh'))
    model.add(Dropout(0.5))
    # Fully connected output layer, 3 classes. Softmax is applied exactly once.
    model.add(Dense(3, activation='softmax'))

    print('Compiling the Model...')
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])

    print("Train...")
    model.fit(x_train, y_train, batch_size=batch_size, epochs=n_epoch, verbose=1)

    print("Evaluate...")
    score = model.evaluate(x_test, y_test, batch_size=batch_size)

    # to_yaml() already returns YAML text, so it is written as-is. The file
    # can be loaded back with keras.models.model_from_yaml(f.read()).
    with open('../model/lstm.yml', 'w') as outfile:
        outfile.write(model.to_yaml())
    model.save_weights('../model/lstm.h5')
    print('Test score: %s' % (score,))
    return score
In [None]:
padded_train_sequences

In [None]:
MAX_LENGTH = 588


def get_simple_rnn_model():
    """Build and compile a bidirectional-GRU classifier with 3 softmax outputs.

    The embedding starts from random weights (MAX_NB_WORDS x 300) and is
    trainable. The GRU states are summarised by average and max pooling over
    time before the output layer.
    """
    emb_width = 300
    init_weights = np.random.random((MAX_NB_WORDS, emb_width))

    tokens_in = Input(shape=(MAX_LENGTH, ))
    embedded = Embedding(input_dim=MAX_NB_WORDS,
                         output_dim=emb_width,
                         input_length=MAX_LENGTH,
                         weights=[init_weights],
                         trainable=True)(tokens_in)
    dropped = SpatialDropout1D(0.3)(embedded)
    states = Bidirectional(GRU(100, return_sequences=True))(dropped)

    # Summarise the sequence of states two ways and join them.
    mean_over_time = GlobalAveragePooling1D()(states)
    max_over_time = GlobalMaxPooling1D()(states)
    pooled = concatenate([mean_over_time, max_over_time])

    probs = Dense(input_dim=100, activation="softmax", units=3)(pooled)

    net = Model(inputs=tokens_in, outputs=probs)
    net.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
    return net


rnn_simple_model = get_simple_rnn_model()

def get_simple_rnn_model(n_classes=3):
    """Build and compile a bidirectional-GRU classifier on the pretrained embeddings.

    Args:
        n_classes: number of softmax output units.

    Returns:
        A compiled Keras ``Model``. The loss is sparse categorical
        cross-entropy, which takes integer class ids in
        ``0 .. n_classes - 1`` as labels (not one-hot vectors).
    """
    # The input length must equal the Embedding's input_length; both use
    # MAX_SEQUENCE_LENGTH, the length the sequences are padded to.
    inp = Input(shape=(MAX_SEQUENCE_LENGTH, ))

    x = Embedding(nb_words + 1,
                  EMBEDDING_DIM,
                  weights=[word_embedding_matrix],
                  input_length=MAX_SEQUENCE_LENGTH,
                  trainable=False)(inp)

    # RNN cell. The stray `Sequential()` / `model.add()` pair was removed:
    # add() without a layer raises TypeError, and the model is built with the
    # functional API anyway.
    x = Bidirectional(GRU(100, return_sequences=True))(x)

    # Summarise the sequence of states by its mean and its max over time.
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])

    # A softmax over a single unit always outputs 1.0, so the output layer
    # has one unit per class.
    outp = Dense(n_classes, activation="softmax")(conc)

    model = Model(inputs=inp, outputs=outp)

    # optimizer
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [None]:
# Reload a saved checkpoint and export thresholded predictions for the
# validation sequences.
model = load_model('./weights-improvement-01-0.360.hdf5')

# NOTE(review): columns=['prediction'] only fits a prediction array with a
# single column - confirm the checkpointed model has one output unit.
y_pred_rnn_simple = pd.DataFrame(
    model.predict(padded_test_sequences, verbose=1, batch_size=2048),
    columns=['prediction'],
)
# Scores of 0.5 and above become 1, everything else 0.
y_pred_rnn_simple['prediction'] = (y_pred_rnn_simple['prediction'] >= 0.5).astype(int)
y_pred_rnn_simple.to_csv('./y_pred_rnn_simple.csv', index=False)

In [None]:
from sklearn.metrics import accuracy_score, auc, roc_auc_score
y_pred_rnn_simple = pd.read_csv('./y_pred_rnn_simple.csv')
print(accuracy_score(valid_y, y_pred_rnn_simple))

In [None]:
rnn_simple_model = get_simple_rnn_model()

In [None]:
# `rnn_simple_model` is the model returned by get_simple_rnn_model().
# The `rnn_simple_model = model()` line was removed: `model` is a model
# instance, and calling it without inputs raises instead of building anything.
plot_model(rnn_simple_model,
           to_file='./rnn_simple_model.png',
           show_shapes=True,
           show_layer_names=True)
Model
![](./rnn_simple_model.png)

Model
![](./rnn_simple_model.png)

In [None]:
import pydot
import h5py

In [None]:
# Save the model to disk whenever the monitored validation accuracy improves;
# the epoch number and the score are embedded in the file name.
filepath="./weights-improvement-{epoch:02d}-{val_acc:.3f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

batch_size = 256
epochs = 2

# NOTE(review): the labels are passed here as raw train_y / valid_y, whereas
# the other training cells pass to_categorical(y - 1). Confirm that the loss
# of the model bound to `model` at this point expects this label format.
history = model.fit(x=padded_train_sequences, 
                    y=train_y, 
                    validation_data=(padded_test_sequences, valid_y), 
                    batch_size=batch_size, 
                    callbacks=[checkpoint], 
                    epochs=epochs,
                    verbose=1) 

In [None]:
# Reload the best checkpoint and export thresholded predictions for the
# validation sequences.
best_rnn_simple_model = load_model('./weights-improvement-01-0.318.hdf5')

# NOTE(review): columns=['prediction'] only fits a prediction array with a
# single column - confirm the checkpointed model has one output unit.
y_pred_rnn_simple = pd.DataFrame(
    best_rnn_simple_model.predict(padded_test_sequences, verbose=1, batch_size=2048),
    columns=['prediction'],
)
# Scores of 0.5 and above become 1, everything else 0.
y_pred_rnn_simple['prediction'] = (y_pred_rnn_simple['prediction'] >= 0.5).astype(int)
y_pred_rnn_simple.to_csv('./y_pred_rnn_simple.csv', index=False)

In [None]:
from sklearn.metrics import accuracy_score, auc, roc_auc_score
y_pred_rnn_simple = pd.read_csv('./y_pred_rnn_simple.csv')
print(accuracy_score(valid_y, y_pred_rnn_simple))

### Data Preprocessing

In [None]:
data = pd.read_csv('./data/tweets.csv', encoding='latin1', usecols=['Sentiment', 'SentimentText'])
data.columns = ['sentiment', 'text']
data = data.sample(frac=1, random_state=42)
print(data.shape)

In [None]:
for row in data.head(10).iterrows():
    print(row[1]['sentiment'], row[1]['text']) 

推文是有噪声的，让我们通过删除url（网址）、hashtag（主题标签）和user mentions（用户提及）来清除它们。

In [None]:
def tokenize(tweet):
    """Clean one tweet and split it into lower-cased word tokens.

    URLs, hashtags, user mentions and punctuation are removed before the
    text is passed to ``word_tokenize``.

    Args:
        tweet: raw tweet text.

    Returns:
        List of tokens; an empty list when ``tweet`` is not a string
        (e.g. a missing value).
    """
    # Imported locally: `re` is not imported by the notebook's import cell.
    import re

    # Missing values reach this function as NaN/None; they carry no tokens.
    if not isinstance(tweet, str):
        return []

    tweet = re.sub(r'http\S+', '', tweet)    # URLs
    tweet = re.sub(r"#(\w+)", '', tweet)     # hashtags
    tweet = re.sub(r"@(\w+)", '', tweet)     # user mentions
    tweet = re.sub(r'[^\w\s]', '', tweet)    # remaining punctuation
    tweet = tweet.strip().lower()
    # NOTE(review): word_tokenize is not imported in the visible cells
    # (presumably nltk.tokenize.word_tokenize - confirm).
    tokens = word_tokenize(tweet)
    return tokens

In [None]:
# Plain .map(): `progress_map` only exists after tqdm.pandas() has been
# registered, which this notebook never does.
data['tokens'] = data.text.map(tokenize)
data['cleaned_text'] = data['tokens'].map(lambda tokens: ' '.join(tokens))
# index=False keeps the row index out of the file, so reading it back does
# not add an extra "Unnamed: 0" column.
data[['sentiment', 'cleaned_text']].to_csv('./data/cleaned_text.csv', index=False)

data = pd.read_csv('./data/cleaned_text.csv')
# Tweets that were cleaned down to nothing are read back as NaN; restore the
# empty string so downstream text vectorisers receive strings only.
data['cleaned_text'] = data['cleaned_text'].fillna('')
print(data.shape)

In [None]:
data.head()

In [None]:
x_train, x_test, y_train, y_test = train_test_split(data['cleaned_text'],
                  data['sentiment'],
                  test_size=0.1,
                   random_state=42,
                   stratify=data['sentiment'])

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

In [None]:
pd.DataFrame(y_test).to_csv('./predictions/y_true.csv', index=False, encoding='utf-8')

### 基于词ngrams的词袋模型

In [None]:
# TF-IDF over word uni- and bi-grams, capped at the 40,000 most frequent
# features. Terms in fewer than 5 documents or in more than half of them
# are dropped.
vectorizer_word = TfidfVectorizer(max_features=40000,
                                  min_df=5,
                                  max_df=0.5,
                                  analyzer='word',
                                  stop_words='english',
                                  ngram_range=(1, 2))

# fit() accepts the documents only; the former `leave=False` is a tqdm
# argument and raised TypeError. fit_transform learns the vocabulary and
# vectorises the training set in one pass.
tfidf_matrix_word_train = vectorizer_word.fit_transform(x_train)
tfidf_matrix_word_test = vectorizer_word.transform(x_test)

In [None]:
lr_word = LogisticRegression(solver='sag', verbose=2)
lr_word.fit(tfidf_matrix_word_train, y_train)

In [None]:
joblib.dump(lr_word, './models/lr_word_ngram.pkl')

y_pred_word = lr_word.predict(tfidf_matrix_word_test)
pd.DataFrame(y_pred_word, columns=['y_pred']).to_csv('./predictions/lr_word_ngram.csv', 
index=False)

In [None]:
y_pred_word = pd.read_csv('./predictions/lr_word_ngram.csv')
print(accuracy_score(y_test, y_pred_word))

### 基于字符ngrams的词袋模型

In [None]:
class SentimentLSTM:
    """Binary sentiment classifier: jieba segmentation + bidirectional LSTM.

    Written for Python 3 (text files are opened as UTF-8, ``print`` is a
    function).

    Args:
        num_words: vocabulary size for the tokenizer and the embedding.
            Defaults to the notebook-level ``vocab_size``.
        max_len: length every id sequence is padded to. Defaults to the
            notebook-level ``MAX_SEQUENCE_LENGTH``.
        weights_path: file that ``predict_text`` loads weights from when no
            model has been trained in this session. Defaults to the path
            ``train`` saves to.
    """

    # Number of leading corpus rows used for training; the rest is the test set.
    TRAIN_SIZE = 500000

    def __init__(self, num_words=None, max_len=None, weights_path='model/keras.model'):
        self.num_words = vocab_size if num_words is None else num_words
        self.max_len = MAX_SEQUENCE_LENGTH if max_len is None else max_len
        self.weights_path = weights_path
        self.tokenizer = Tokenizer(num_words=self.num_words)
        self.stop_words = []
        self.model = None

    def load_stop_word(self, path='dict/stop_word.txt'):
        """Append the stop words listed in ``path`` (one per line) to ``self.stop_words``."""
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                self.stop_words.append(line.strip())

    def jieba_cut(self, line):
        """Segment ``line`` with jieba, drop stop words, return a space-joined string."""
        lcut = jieba.lcut(line)
        cut = [x for x in lcut if x not in self.stop_words]
        return " ".join(cut)

    def load_cuted_corpus(self, dir, input):
        """Read a pre-segmented corpus and fit the tokenizer on it.

        Each non-empty line is ``<rating> <token> <token> ...``. Ratings 0 and
        3 are skipped, ratings below 3 become label 0, ratings above 3 become
        label 1.

        Args:
            dir: directory containing the corpus file.
            input: corpus file name.

        Returns:
            ``(texts, labels)`` - space-joined token strings and 0/1 labels.
        """
        texts = []
        labels = []
        # `with` closes the file even when a line fails to parse.
        with open(dir + '/' + input, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Blank line: nothing to parse.
                    continue
                rate = int(fields[0])
                if rate == 0 or rate == 3:
                    continue
                rate = 0 if rate < 3 else 1
                texts.append(" ".join(fields[1:]))
                labels.append(rate)

        self.tokenizer.fit_on_texts(texts)
        return texts, labels

    def load_data(self):
        """Load ``corpus/review.csv`` and return ``((x_train, y_train), (x_test, y_test))``."""
        x, y = self.load_cuted_corpus('corpus', 'review.csv')
        x = self.tokenizer.texts_to_sequences(x)
        x = pad_sequences(x, maxlen=self.max_len)
        y = to_categorical(y, num_classes=2)
        n = self.TRAIN_SIZE
        return ((x[0:n], y[0:n]), (x[n:], y[n:]))

    def train(self, epochs=50):
        """Build, train, save and evaluate the model; returns the evaluation score."""
        print('building model ...')
        self.model = SentimentLSTM.build_model(self.num_words, self.max_len)

        print('loading data ...')
        (text_train, rate_train), (text_test, rate_test) = self.load_data()

        print('training model ...')
        self.model.fit(text_train, rate_train, batch_size=1000, epochs=epochs)
        self.model.save('model/keras.model')
        score = self.model.evaluate(text_test, rate_test)
        print(score)
        return score

    def load_trained_model(self, path):
        """Build the architecture and load trained weights from ``path``."""
        model = SentimentLSTM.build_model(self.num_words, self.max_len)
        model.load_weights(path)
        return model

    def predict_text(self, text):
        """Return the predicted class id (0 or 1) for one raw text, as a 1-element array."""
        if self.model is None:
            self.model = self.load_trained_model(self.weights_path)
            self.load_stop_word()
            # Re-fit the tokenizer so word ids match the ones used in training.
            self.load_cuted_corpus('corpus', 'review.csv')

        vect = self.jieba_cut(text)
        vect = self.tokenizer.texts_to_sequences([vect, ])
        print(vect)
        # Pad to the length the model was built with.
        padded = pad_sequences(vect, maxlen=self.max_len)
        # argmax over the class probabilities; works on every Keras version.
        return np.argmax(self.model.predict(padded), axis=-1)

    @staticmethod
    def build_model(num_words=None, max_len=None):
        """Build and compile the network.

        Args:
            num_words: embedding vocabulary size; defaults to the
                notebook-level ``vocab_size``.
            max_len: input sequence length; defaults to ``MAX_SEQUENCE_LENGTH``.
        """
        if num_words is None:
            num_words = vocab_size
        if max_len is None:
            max_len = MAX_SEQUENCE_LENGTH
        model = Sequential()
        model.add(Embedding(num_words, 256, input_length=max_len))
        model.add(Bidirectional(LSTM(128, implementation=2)))
        model.add(Dropout(0.5))
        # Categorical cross-entropy needs a probability distribution, so the
        # output layer uses softmax rather than ReLU.
        model.add(Dense(2, activation='softmax'))
        model.compile('RMSprop', 'categorical_crossentropy', metrics=['accuracy'])
        return model

In [None]:
def main():
    """Train the sentiment model, then classify texts typed by the user until 'quit'."""
    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    lstm = SentimentLSTM()
    lstm.train(10)
    while True:
        # `text` instead of `input`, so the builtin is not shadowed.
        text = read_line('Please input text:')
        if text == 'quit':
            break
        print(lstm.predict_text(text))

if __name__=="__main__":
    main()