In [1]:
# preprocessing imports
import pandas as pd
import numpy as np
from time import time
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import text_to_word_sequence

Using TensorFlow backend.


In [2]:
# functions we implemented
from custom_functions import init_embeddings_map, get_embed_and_pad_func, get_embed_aspects

In [3]:
emb_size = 50
embedding_map = init_embeddings_map("glove.6B." + str(emb_size) + "d.txt")

In [4]:
raw_data = pd.read_csv("data/unembedded_grouped_cleaned_data.csv")

In [23]:
# Train/test split for our model is unique, we need to hold out a
# set of users and movies so that our network never learns those 
test_size = 0.005

# get test_size percentage of users
unique_users = raw_data.loc[:, "reviewerID"].unique()
users_size = len(unique_users)
print('users_size:', users_size)
np.random.seed(2019)
test_idx = np.random.choice(users_size,
                              size=int(users_size * test_size),
                              replace=False)

# get test users
test_users = unique_users[test_idx]
print('test_users_size:', len(test_users))

# everyone else is a training user
train_users = np.delete(unique_users, test_idx)

test = raw_data[raw_data["reviewerID"].isin(test_users)]
train = raw_data[raw_data["reviewerID"].isin(train_users)]

unique_test_movies = test["asin"].unique()

# drop the movies that also appear in our test set. In order to be
# a true train/test split, we are forced to discard some data entirely
train = train.where(np.logical_not(train["asin"].isin(unique_test_movies))).dropna()

users_size: 2928
test_users_size: 14


In [6]:
user_seq_sizes = raw_data.loc[:, "userReviews"].apply(lambda x: x.split()).apply(len)
item_seq_sizes = raw_data.loc[:, "movieReviews"].apply(lambda x: x.split()).apply(len)

In [7]:
u_ptile = 40
i_ptile = 15
u_seq_len = int(np.percentile(user_seq_sizes, u_ptile))
i_seq_len = int(np.percentile(item_seq_sizes, i_ptile))

In [8]:
embedding_fn = get_embed_and_pad_func(i_seq_len, u_seq_len, np.array([0.0] * emb_size), embedding_map)

train_embedded = train.apply(embedding_fn, axis=1)
test_embedded = test.apply(embedding_fn, axis=1)

Aspects 超参数设置 与 Embedding映射

In [9]:
# aspects = ['price', 'quality', 'battery', 'oil', 'car', 'tire', 'paint', 'light', 'engine']
aspects = ['price', 'quality', 'oil', 'paint', 'engine']
embed_aspects = get_embed_aspects(aspects, embedding_map)
print(embed_aspects.shape)

(5, 50)


粘贴来的one_step_of_attention

# Define part of the attention layer gloablly so as to
# share the same layers for each attention step.
def softmax(x):
    return K.softmax(x, axis=1)

at_repeat = RepeatVector(Tx)
at_concatenate = Concatenate(axis=-1)
at_dense1 = Dense(8, activation="tanh")
at_dense2 = Dense(1, activation="relu")
at_softmax = Activation(softmax, name='attention_weights')
at_dot = Dot(axes=1)

def one_step_of_attention(h_prev, a):
    """
    Get the context.
    
    Input:
    h_prev - Previous hidden state of a RNN layer (m, n_h)
    a - Input data, possibly processed (m, Tx, n_a)
    
    Output:
    context - Current context (m, Tx, n_a)
    """
    # Repeat vector to match a's dimensions
    h_repeat = at_repeat(h_prev)
    # Calculate attention weights
    i = at_concatenate([a, h_repeat])
    i = at_dense1(i)
    i = at_dense2(i)
    attention = at_softmax(i)
    # Calculate the context
    context = at_dot([attention, a])
    
    return context

# Recommendation Model

In [10]:
# modeling imports
from keras.models import Model
from keras.callbacks import EarlyStopping, TensorBoard, ReduceLROnPlateau
from keras.layers import LSTM
from keras.layers import Input, Dense, Permute, Reshape, RepeatVector, Activation, Lambda
from keras.activations import tanh, softmax
from keras.layers.merge import Add, Dot, Concatenate, Multiply
from MyLayer import WeightedAdd

In [11]:
class MyModel():
    def __init__(self, embedding_size, hidden_size, rnn_hidden_size, u_seq_len, m_seq_len, n_aspects, filters=2, kernel_size=8,
                 strides=6):
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.rnn_hidden_size = rnn_hidden_size
        self.filters = filters
        self.kernel_size = kernel_size
        self.n_aspects = n_aspects
        self.embed_aspects = Input(shape=(self.n_aspects, self.embedding_size))
        self.inputU, self.towerU = self.create_deepconn_tower(u_seq_len)
        self.inputM, self.towerM = self.create_deepconn_tower(m_seq_len)
        # self.joined = Concatenate()([self.towerU, self.towerM])
        # self.outNeuron = Dense(1)(self.joined)
    
    def aspect_attention_block(self, lstm_inputs, embed_aspect_repeat, LSTM_1, Dense_1, Dense_2, Dense_3, WeightedAdd_1, Dense_4):
        lstm_out = LSTM_1(lstm_inputs)
        wh = Dense_1(lstm_out)
        wv = Dense_2(embed_aspect_repeat)
        m = Concatenate(axis=-1)([wh, wv])
        m = Activation(tanh)(m)
        a_probs = Dense_3(m)
        output_attention_mul = Dot(axes=1)([lstm_out, a_probs])
        # name='attention_mul'
        output_attention_mul = Reshape((self.rnn_hidden_size, ))(output_attention_mul)
        aspect_rating = Dense_4(output_attention_mul)
        
        return aspect_rating
        
    def create_deepconn_tower(self, max_seq_len):
        input_layer = Input(shape=(max_seq_len, self.embedding_size))
        # 用于变量的动态命名
        createVar = locals()
        
        # 不同aspect间的参数共享
        LSTM_1 = LSTM(self.rnn_hidden_size, activation="tanh", return_sequences=True)
        Dense_1 = Dense(self.rnn_hidden_size, kernel_initializer='glorot_normal')
        Dense_2 = Dense(self.embedding_size, kernel_initializer='glorot_normal')
        Dense_3 = Dense(1, activation="softmax")
        # name='attention_vec'
        WeightedAdd_1 = WeightedAdd(1)
        Dense_4 = Dense(1, activation="softmax")
        
        for n in range(self.n_aspects):
            one_embed_aspects = Lambda(lambda x: x[:, n])(self.embed_aspects)
            
            embed_aspect_repeat = RepeatVector(max_seq_len)(one_embed_aspects)
            # 问题出在[:,:,]，要用Lambda层
            lstm_inputs = Concatenate(axis=-1)([input_layer, embed_aspect_repeat])
            createVar['tower'+str(n)] = self.aspect_attention_block(lstm_inputs, embed_aspect_repeat, LSTM_1, Dense_1, Dense_2, Dense_3, WeightedAdd_1, Dense_4)

        for n in range(self.n_aspects-1):
           createVar['tower0'] = Concatenate(axis=-1)([createVar['tower0'], createVar['tower'+str(n+1)]])
        tower = createVar['tower0']
        print("user/item aspect feature:", tower.shape)
        
        return input_layer, tower

    def create_deepconn_dp(self):
        dotproduct = Dot(axes=1)([self.towerU, self.towerM])
        # output = Add()([self.outNeuron, dotproduct])
        show = Concatenate(axis=-1)([self.towerU, self.towerM])
        output = Dense(1, use_bias=True)(dotproduct)
        self.model = Model(inputs=[self.embed_aspects, self.inputU, self.inputM], outputs=[output])
        self.model.compile(optimizer='Adam', loss='mse')
        
    def train(self, train_data, embed_aspects, batch_size, epochs=3500):
        tensorboard = TensorBoard(log_dir="tf_logs/{}".format(time()))
        self.create_deepconn_dp()
        print(self.model.summary())
        
        user_reviews = np.array(list(train_data.loc[:, "userReviews"]))
        movie_reviews = np.array(list(train_data.loc[:, "movieReviews"]))
        
        # 将aspects扩充至训练集的大小
        embed_aspects = np.expand_dims(embed_aspects, axis=0)
        embed_aspects = np.repeat(embed_aspects, user_reviews.shape[0], axis=0)
        
        self.train_inputs = [embed_aspects, user_reviews, movie_reviews]
        self.train_outputs = train_data.loc[:, "overall"]
        
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=2, mode='auto')
        early_stopping = EarlyStopping(monitor='val_loss', patience=3, mode='min')
        self.history = self.model.fit(self.train_inputs,
                                      self.train_outputs,
                                      callbacks=[tensorboard, reduce_lr, early_stopping],
                                      validation_split=0.05,
                                      batch_size=batch_size,
                                      epochs=epochs)
        

In [12]:
hidden_size = 64
rnn_hidden_size = 64
n_aspects = len(aspects)
mymodel = MyModel(emb_size, hidden_size, rnn_hidden_size, u_seq_len, i_seq_len, n_aspects)

batch_size = 32
mymodel.train(train_embedded, embed_aspects, batch_size, epochs=20)

mymodel.model.save("mymodel.h5")

user/item aspect feature: (?, 5)
user/item aspect feature: (?, 5)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 5, 50)        0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 50)           0           input_1[0][0]                    
__________________________________________________________________________________________________
lambda_2 (Lambda)               (None, 50)           0           input_1[0][0]                    
__________________________________________________________________________________________________
lambda_6 (Lambda)               (None, 50)           0           input_1[0][0]                    
___________________________________________

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


In [26]:
user_reviews = np.array(list(test_embedded.loc[:, "userReviews"]))
movie_reviews = np.array(list(test_embedded.loc[:, "movieReviews"]))

test_embed_aspects = np.expand_dims(embed_aspects, axis=0)
test_embed_aspects = np.repeat(test_embed_aspects, user_reviews.shape[0], axis=0)

test_inputs = [test_embed_aspects, user_reviews, movie_reviews]

dat = pd.DataFrame(test_inputs)
dat.to_csv("data/test_data.csv")

true_rating = np.array(list(test_embedded.loc[:, "overall"])).reshape((-1, 1))
print(true_rating)

predictions = mymodel.model.predict(test_inputs)
print(predictions)

error = np.square(predictions - true_rating)

print("MSE:", np.average(error))

[[ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 4.]
 [ 3.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 2.]
 [ 4.]
 [ 5.]
 [ 5.]
 [ 4.]
 [ 4.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 4.]
 [ 4.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 1.]
 [ 4.]
 [ 4.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 4.]
 [ 5.]
 [ 5.]
 [ 4.]
 [ 4.]
 [ 4.]
 [ 5.]
 [ 5.]
 [ 4.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 4.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 4.]
 [ 5.]
 [ 3.]
 [ 4.]
 [ 5.]
 [ 4.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 3.]
 [ 4.]
 [ 4.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 4.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 4.]
 [ 5.]
 [ 3.]
 [ 4.]
 [ 5.]
 [ 4.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 4.]
 [ 5.]
 [ 5.]
 [ 1.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 2.]
 [ 5.]
 [ 1.]
 [ 4.]
 [ 5.]
 [ 5.]
 [ 4.]
 [ 5.]
 [ 4.]
 [ 5.]
 [ 1.]
 [ 5.]
 [ 3.]
 [ 5.]
 [ 1.]
 [ 1.]
 [ 4.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 4.]
 [ 4.]
 [ 5.]
 [ 5.]
 [ 5.]
 [ 4.]]
[[ 4.45679092]
 [ 4.45679092]
 [ 

获取某一层的输出或参数

In [17]:
# 获取某一层的输出
def get_layer_output(mymodel, layer_name, input_data):
    layer_model = Model(inputs=mymodel.model.input, output=mymodel.model.get_layer(layer_name).output)
    # predict 默认batch size是32
    layer_out = layer_model.predict(input_data)
    return layer_out

# 获取某一层的权重和偏置
def get_layer_parameter(mymodel, layer_name):
    weights, bias = mymodel.model.get_layer(layer_name).get_weights()
    return weights,bias

In [25]:
layer_out = get_layer_output(mymodel, 'concatenate_28', test_inputs)
print(layer_out)

  This is separate from the ipykernel package so we can avoid doing imports until


[[ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  