In [1]:
import keras
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

Using TensorFlow backend.


In [63]:
from nltk.stem.snowball import SnowballStemmer

# Tab-separated phrase files for the training and test splits.
train_file = '../task1/sentiment-analysis-on-movie-reviews/train.tsv'
test_file = '../task1/sentiment-analysis-on-movie-reviews/test.tsv'
train = pd.read_csv(train_file, sep='\t')
test = pd.read_csv(test_file, sep='\t')

# English Snowball stemmer, shared by the train and test preprocessing below.
stemmer = SnowballStemmer("english")


def stem_phrases(phrases):
    """Tokenize each phrase, stem every token and re-join the stems with spaces.

    Args:
        phrases: pandas Series of phrase strings (lower-cased by the caller).

    Returns:
        pandas Series with the same index as ``phrases``; each entry is the
        space-joined sequence of stemmed tokens for that phrase.
    """
    def _stem_one(text):
        # Tokenize first so the stemmer sees one word at a time.
        return ' '.join(stemmer.stem(token) for token in nltk.word_tokenize(text))

    return phrases.apply(_stem_one)


# Same pipeline for both splits: lower-case the raw phrase, then tokenize + stem
# (stemming collapses inflected forms onto one token, which reduces noise).
train['Phrase'] = train['Phrase'].str.lower()
train['tokenize'] = stem_phrases(train['Phrase'])

test['Phrase'] = test['Phrase'].str.lower()
test['tokenize'] = stem_phrases(test['Phrase'])

In [74]:
from keras.layers import Input, Conv1D, Dense, Embedding, MaxPooling1D,Flatten, LSTM
from keras.models import Model
# Token count of every training phrase, in ascending order.
lenlist = sorted(len(phrase.split()) for phrase in train.tokenize)

# Distinct whitespace-separated tokens seen in the training phrases.
wordset = set()
for phrase in train.tokenize:
    wordset.update(phrase.split())

# Report the length found 80% of the way through the sorted counts,
# and the number of distinct tokens.
cutoff_position = int(0.8 * len(lenlist))
print("80%句子长度小于：", lenlist[cutoff_position])
print("词表总数：", len(wordset))

80%句子长度小于： 11
词表总数： 11872


In [37]:
# Hyper-parameters shared by the CNN and LSTM models.
embedding_size = 256  # dimensionality of each word vector
# Fixed sequence length: the token count 80% of the way through the sorted lengths.
sen_len = lenlist[int(len(lenlist) * 0.8)]
# Number of distinct whitespace-separated training tokens; passed to Embedding as its input size.
# NOTE(review): the ids fed to the models come from a Keras Tokenizer, not from this
# whitespace split -- confirm that every id it produces is < vocab_size.
vocab_size = len(wordset)

In [72]:
################################CNN Model##############################################
# Input: a fixed-length sequence of sen_len integer token ids.
inputs = Input(shape=(sen_len,), dtype='int32')
# Trainable embedding that maps each id to an embedding_size-dimensional vector.
inputs_embedding = Embedding(vocab_size, embedding_size, input_length = sen_len)(inputs)
# 10 filters of width 3. Each kernel spans 3 positions x embedding_size channels (the channel
# size is inferred from the input; "1D" means the kernel slides along the sequence / n-gram
# axis only). With stride 1 and 'same' padding the sequence length is preserved, so the
# output has sen_len positions with 10 channels each.
conv1 = Conv1D(10,3,strides = 1, padding = 'same', activation='relu')(inputs_embedding)
pooling1 = MaxPooling1D(2)(conv1)
conv2 = Conv1D(10,3,strides = 1, padding = 'same', activation='relu')(pooling1)
pooling2 = MaxPooling1D(2)(conv2)
conv3 = Conv1D(10,3,strides = 1, padding = 'same', activation='relu')(pooling2)
pooling3 = MaxPooling1D(2)(conv3)
flat = Flatten()(pooling3)
dense = Dense(128, activation='relu')(flat)
# Five mutually exclusive sentiment classes: softmax yields a probability distribution
# over them, which is what sparse_categorical_crossentropy expects (a per-unit sigmoid
# does not sum to 1). This also matches the output layer of the LSTM model.
preds = Dense(5, activation='softmax')(dense)
model = Model(inputs,preds)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # sparse variant: integer labels, no one-hot encoding needed
              metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 11)                0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 11, 256)           3039232   
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 11, 10)            7690      
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 5, 10)             0         
_________________________________________________________________
conv1d_20 (Conv1D)           (None, 5, 10)             310       
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 2, 10)             0         
_________________________________________________________________
conv1d_21 (Conv1D)           (None, 2, 10)             310       
__________

In [76]:
################################LSTM Model##############################################
# Input: a fixed-length sequence of sen_len integer token ids.
inputs = Input(dtype='int32', shape=(sen_len,))

# Embedding weights start from random initialisation here; alternatively they could be
# initialised from word vectors trained with gensim, or from pre-trained vectors such as GloVe.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=sen_len)
inputs_embedding = embedding_layer(inputs)

# To support variable-length sequences the LSTM would need masking.
lstm_layer = LSTM(units=128, dropout=0.2, recurrent_dropout=0.2)
lstm = lstm_layer(inputs_embedding)

# One output unit per sentiment class.
preds = Dense(units=5, activation='softmax')(lstm)

lstm_model = Model(inputs=inputs, outputs=preds)
lstm_model.compile(
    loss='sparse_categorical_crossentropy',  # sparse variant: integer labels, no one-hot encoding needed
    optimizer='adam',
    metrics=['acc'],
)
lstm_model.summary()

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        (None, 11)                0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 11, 256)           3039232   
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dense_10 (Dense)             (None, 5)                 645       
Total params: 3,236,997
Trainable params: 3,236,997
Non-trainable params: 0
_________________________________________________________________


In [64]:
# Prepare the model inputs
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Stemmed phrase text for both splits, plus the training labels.
xtrain, ytrain = train['tokenize'], train['Sentiment']
xtest = test['tokenize']

In [66]:
# Build the word index from the training phrases only, then reuse it for the test phrases.
tk = Tokenizer()
tk.fit_on_texts(xtrain.values)

# Convert each phrase into a list of integer word ids.
train_sequences = tk.texts_to_sequences(xtrain)
test_sequences = tk.texts_to_sequences(xtest)

# Pad / truncate every sequence to exactly sen_len ids.
# NOTE(review): the Embedding layers are sized with len(wordset), not with this
# tokenizer's vocabulary -- confirm len(tk.word_index) < vocab_size.
x_train = pad_sequences(train_sequences, maxlen=sen_len)
x_test = pad_sequences(test_sequences, maxlen=sen_len)

In [73]:
# Train the CNN classifier on the padded id sequences and their sentiment labels.
model.fit(
    x=x_train,
    y=ytrain,
    epochs=7,
    batch_size=128,
    verbose=1,
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x1a4d522080>

In [77]:
# Train the LSTM classifier on the same inputs and settings as the CNN.
lstm_model.fit(
    x=x_train,
    y=ytrain,
    epochs=7,
    batch_size=128,
    verbose=1,
)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x1a4e16f0b8>