## 亚马逊商城服装商品评论的情感分析

In [18]:
# First import all the necessary packages
from __future__ import print_function
import time
import numpy as np
import pandas
import codecs
import operator
import requests
import pickle

from keras.preprocessing import sequence, text
from keras.models import Model, Sequential, load_model
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, Input, merge, concatenate, BatchNormalization
from keras.preprocessing.text import Tokenizer

import os
from os import listdir
from os.path import isfile, join

In [2]:
#Set GPU[0] as default training device
import cntk
cntk.try_set_default_device(cntk.device.gpu(0))

True

In [3]:
np.random.seed(1337)  # for reproducibility

In [4]:
max_features = 5000 # maximum number of words in your dictionary
max_len = 100  # cut texts after this number of words (among top max_features most common words)

In [5]:
#import data from csv file
data = pandas.read_csv("Data/dataset_clothing.csv", delimiter=',',quotechar='"',quoting=0,names=['review','sentiment'],header=None)
X = data['review'].apply(str).values
Y = data['sentiment'].values

#Split data into training data(80%) and test data(20%)
arr_rand = np.random.rand(X.shape[0])
split = arr_rand < np.percentile(arr_rand, 80)
X_train = X[split]
Y_train = Y[split]
X_test =  X[~split]
Y_test = Y[~split]

print('x:')
print(X_train[:1])
print(X_test[:1])
print('X_train.length: ', len(X_train))
print('X_test.length: ', len(X_test))
print('y:')
print(Y_train[:1])
print(Y_test[:1])
print('Y_train.length: ', len(Y_train))
print('Y_test.length: ', len(Y_test))

x:
[ "We've test-buy several similar items of different brands..... THIS ONE WAS THE BEST - (by my husband opinion) - so we've got him several more - couple days later."]
[ "I have been very happy with this brand of shorts. They are comfortable and seem to wear very well. No complaints at all. I like that they are a little longer than most. They don't provide any support, so you wouldn't want to wear them jogging, but for hiking they are great."]
X_train.length:  42648
X_test.length:  10662
y:
[1]
[1]
Y_train.length:  42648
Y_test.length:  10662


In [6]:
#Tokenize works to list of integers where each integer is a key to a word, filter all the punctuation.
reviewTokenizer = Tokenizer(num_words=max_features, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")
reviewTokenizer.fit_on_texts(X_train)

#Print top 20 words 
#Note zero is reserved for non frequent words
for word, value in reviewTokenizer.word_index.items():
    if value < 20:
        print(value, word)

15 are
14 but
19 on
6 it
11 in
10 this
12 they
2 i
13 my
18 that
8 for
4 a
9 of
5 to
16 not
1 the
7 is
17 these
3 and


In [7]:
#Create int to word dictionary
intToWord = {}
for word, value in reviewTokenizer.word_index.items():
    intToWord[value] = word

#Add a symbol for null placeholder
intToWord[0] = "!!!NA!!!"

print(intToWord[1])
print(intToWord[2])
print(intToWord[32])

the
i
wear


In [8]:
print(X_train[0])

#Convert word strings to integer sequence lists
X_train = reviewTokenizer.texts_to_sequences(X_train)
X_test = reviewTokenizer.texts_to_sequences(X_test)

print(X_train[0])
for value in X_train[0]:
    print(intToWord[value])

We've test-buy several similar items of different brands..... THIS ONE WAS THE BEST - (by my husband opinion) - so we've got him several more - couple days later.
[2112, 1554, 89, 297, 625, 592, 9, 198, 648, 10, 37, 20, 1, 256, 143, 13, 247, 954, 24, 2112, 96, 401, 297, 50, 380, 300, 720]
we've
test
buy
several
similar
items
of
different
brands
this
one
was
the
best
by
my
husband
opinion
so
we've
got
him
several
more
couple
days
later


In [9]:
# Pad the data to a solid length
X_train = sequence.pad_sequences(X_train, maxlen=max_len, padding='post')
X_test = sequence.pad_sequences(X_test, maxlen=max_len, padding='post')
Y_train = np.array(Y_train)
Y_test = np.array(Y_test)

print("Pad sequences (samples x time)")
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

Pad sequences (samples x time)
X_train shape: (42648, 100)
X_test shape: (10662, 100)


In [10]:
#Set model hyper parameters
epochs = 4
embedding_neurons = 128
lstm_neurons = 64
batch_size = 32

In [11]:
# Bi-directional LSTM

# This is the placeholder tensor for the input sequences
sequence = Input(shape=(max_len,), dtype='int32')

# This embedding layer will transform the sequences of integers into vectors of size embedding
# Embedding layer converts dense int input to one-hot in real time to save memory
embedded = Embedding(max_features, embedding_neurons, input_length=max_len)(sequence)

# Normalize embeddings by input/word in sentence
bnorm = BatchNormalization()(embedded)

# Create Bidirectional LSTM model
merged = Bidirectional(LSTM(lstm_neurons, dropout=0.2, recurrent_dropout=0.2), merge_mode='concat')(embedded)

after_dp = Dropout(0.5)(merged)
output = Dense(1, activation='sigmoid')(after_dp)

model_bidir_atom = Model(inputs=sequence, outputs=output)

# Review model structure
print(model_bidir_atom.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 128)          640000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 738,945
Trainable params: 738,945
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
# Bi-directional Atom

# Using optimizers of rmsprop
model_bidir_atom.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
start_time = time.time()

# Train the model
history_bidir_atom = model_bidir_atom.fit(X_train, Y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_data=[X_test, Y_test], 
                    verbose=2)

end_time = time.time()
average_time_per_epoch = (end_time - start_time) / epochs
print("avg sec per epoch:", average_time_per_epoch)

Train...
Train on 42648 samples, validate on 10662 samples
Epoch 1/4
222s - loss: 0.3285 - acc: 0.8669 - val_loss: 0.2539 - val_acc: 0.8983
Epoch 2/4
222s - loss: 0.2479 - acc: 0.9055 - val_loss: 0.2434 - val_acc: 0.9054
Epoch 3/4
223s - loss: 0.2227 - acc: 0.9143 - val_loss: 0.2285 - val_acc: 0.9106
Epoch 4/4
223s - loss: 0.2045 - acc: 0.9225 - val_loss: 0.2302 - val_acc: 0.9137
avg sec per epoch: 223.0854321718216


In [13]:
# Save model
model_bidir_atom.save("Model/bidir.hdf5")
model_json_string=model_bidir_atom.to_json()
open('Model/bidir_architecture.json','w+').write(model_json_string)
model_bidir_atom.save_weights("Model/bidir_weights.h5")

# Save tokenizer
with open('Model/tokenizer.pickle', 'wb') as handle:
    pickle.dump(reviewTokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print('saved!')

saved!


In [14]:
# Load model and tokenizer
with open('Model/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

model = load_model("Model/bidir.hdf5")

In [19]:
test = "This is my second pair, I wore the first pair for over 5 years and still have them (as a back up).  They are sturdy and fashionable."
test = tokenizer.texts_to_sequences([test])
test = sequence.pad_sequences(test, maxlen=max_len)

#model_bidir_atom.predict(test)
model.predict(test)

array([[ 0.83434963]], dtype=float32)