In [1]:
# preprocessing imports
import pandas as pd
import numpy as np
from time import time
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import text_to_word_sequence

Using TensorFlow backend.


In [2]:
# functions we implemented
from custom_functions import init_embeddings_map, get_embed_and_pad_func, get_embed_aspects

In [3]:
# Width of the pre-trained GloVe vectors; it also selects which GloVe file is read.
emb_size = 50
glove_file = "glove.6B.{}d.txt".format(emb_size)
embedding_map = init_embeddings_map(glove_file)

In [4]:
# Review table used throughout the notebook. Later cells read the columns
# "reviewerID", "asin", "userReviews", "movieReviews" and "overall".
raw_data_path = "data/unembedded_grouped_cleaned_data.csv"
raw_data = pd.read_csv(raw_data_path)

In [5]:
# Train/test split for our model is unique: we hold out a set of users AND
# every movie those users reviewed, so the network never sees either side
# of a test (user, movie) pair during training.
test_size = 0.005  # fraction (not percent) of unique users to hold out

# Sample the held-out users reproducibly.
unique_users = raw_data.loc[:, "reviewerID"].unique()
users_size = len(unique_users)

np.random.seed(2019)
test_idx = np.random.choice(users_size,
                            size=int(users_size * test_size),
                            replace=False)

# Held-out users; everyone else is a training user.
test_users = unique_users[test_idx]
train_users = np.delete(unique_users, test_idx)

test = raw_data[raw_data["reviewerID"].isin(test_users)]
train = raw_data[raw_data["reviewerID"].isin(train_users)]

unique_test_movies = test["asin"].unique()

# Drop the movies that also appear in our test set. In order to be a true
# train/test split we are forced to discard some data entirely.
# Boolean indexing is used instead of `.where(...).dropna()`: the latter
# converts the filtered rows to NaN first, which upcasts integer columns to
# float and also drops any training row that already had a missing value in
# an unrelated column.
train = train[~train["asin"].isin(unique_test_movies)]

# Guard against leakage between the two sets.
assert not train["reviewerID"].isin(test_users).any(), "test user leaked into train"
assert not train["asin"].isin(unique_test_movies).any(), "test movie leaked into train"

In [6]:
# Number of whitespace-separated tokens in each row's review text
# (computed over the full table, not only the training rows).
user_seq_sizes = raw_data["userReviews"].apply(lambda text: len(text.split()))
item_seq_sizes = raw_data["movieReviews"].apply(lambda text: len(text.split()))

In [7]:
# Percentiles of the token-count distributions used as sequence-length cut-offs.
u_ptile, i_ptile = 40, 15

# Fixed sequence lengths (truncated to int) for the user and item review inputs.
u_seq_len, i_seq_len = (
    int(np.percentile(sizes, ptile))
    for sizes, ptile in ((user_seq_sizes, u_ptile), (item_seq_sizes, i_ptile))
)

In [8]:
# All-zero vector of embedding width handed to the helper
# (presumably the padding token -- confirm in custom_functions).
pad_vector = np.array([0.0] * emb_size)
embedding_fn = get_embed_and_pad_func(i_seq_len, u_seq_len, pad_vector, embedding_map)

# Apply the row-wise embedding function to the train rows, then the test rows.
train_embedded, test_embedded = (
    frame.apply(embedding_fn, axis=1) for frame in (train, test)
)

Aspect hyperparameter settings and embedding mapping (Aspects 超参数设置与 Embedding 映射)

In [9]:
# Aspect words whose embeddings are fed to the model as an extra input.
aspects = ['automotive']
# Look the aspect words up in the embedding map; the printed result below is
# a list holding one float32 vector per aspect.
embed_aspects = get_embed_aspects(aspects, embedding_map)
print(embed_aspects)

[array([-4.5280e-01,  3.6372e-01,  7.0773e-01,  1.0992e+00, -5.4294e-01,
       -1.6191e-01, -9.5348e-01, -9.8411e-01,  3.6076e-01,  2.8066e-02,
        9.2158e-01, -2.7851e-01, -1.0479e+00,  2.2303e-01, -3.2145e-01,
        2.9460e-01, -6.2475e-01,  1.6479e+00,  6.0934e-01, -1.3257e+00,
        1.2286e+00, -9.7205e-02, -1.4367e+00, -1.6709e-01, -2.6060e-01,
       -3.3898e-01, -1.2955e+00, -2.6721e-01, -3.3512e-01,  7.0806e-01,
        1.5725e+00, -1.0747e-01,  7.0784e-01, -1.1352e-03,  4.6224e-01,
       -1.1650e-01, -6.0889e-01,  8.8951e-01,  9.8686e-01,  7.7583e-01,
        1.5276e-01, -3.1497e-01,  2.8402e-01, -5.5208e-01,  7.5648e-01,
        4.3723e-01, -1.1144e-01,  1.2754e+00, -1.3541e-01,  1.8779e-01],
      dtype=float32)]


Pasted reference implementation of one_step_of_attention (粘贴来的 one_step_of_attention)

# Define the attention sub-layers globally so that the same layer
# instances (and their weights) are shared by every attention step.
def softmax(x):
    """Apply softmax along axis 1 of ``x`` using the backend object ``K``.

    NOTE(review): ``K`` is not imported in any cell visible here -- presumably
    ``keras.backend``; confirm before executing this snippet.
    """
    return K.softmax(x, axis=1)

# NOTE(review): `Tx` is not defined in any cell visible here -- presumably the
# number of time steps of the attended sequence; confirm before executing.
at_repeat = RepeatVector(Tx)                 # tiles the previous hidden state Tx times
at_concatenate = Concatenate(axis=-1)        # joins inputs along the last axis
at_dense1 = Dense(8, activation="tanh")      # hidden layer of the scoring network
at_dense2 = Dense(1, activation="relu")      # one score per time step
at_softmax = Activation(softmax, name='attention_weights')  # uses the axis-1 softmax defined above
at_dot = Dot(axes=1)                         # contracts axis 1 of its two inputs

def one_step_of_attention(h_prev, a):
    """
    Compute one attention context from the shared ``at_*`` layers.

    Input:
    h_prev - Previous hidden state of a RNN layer (m, n_h)
    a - Input data, possibly processed (m, Tx, n_a)

    Output:
    context - Attention-weighted combination of ``a``. ``at_dot`` contracts
              axis 1 (the Tx axis), so the result is (m, 1, n_a).
    """
    # Tile the hidden state so it can be paired with every time step of `a`.
    tiled_state = at_repeat(h_prev)
    # Score each time step from [a ; h_prev] with the two-layer scoring network.
    energies = at_dense2(at_dense1(at_concatenate([a, tiled_state])))
    # Normalise the scores with the shared softmax activation.
    weights = at_softmax(energies)
    # Collapse the time axis: weighted sum of `a` under the attention weights.
    return at_dot([weights, a])

# DeepCoNN Recommendation Model

In [10]:
# modeling imports
from keras.models import Model
from keras.callbacks import EarlyStopping, TensorBoard
from keras.layers import LSTM
from keras.layers import Input, Dense, Permute, Reshape, RepeatVector, Activation, Lambda
from keras.activations import tanh, softmax
from keras.layers.merge import Add, Dot, Concatenate, Multiply
from WeightedAdd import WeightedAdd

In [13]:
class DeepCoNN():
    """Two-tower rating regressor built from LSTM encoders with aspect attention.

    One tower encodes the user review sequence and the other the movie review
    sequence. Both towers receive the same aspect embedding (a separate model
    input) repeated along the time axis. The prediction is a ``Dense(1)`` over
    the concatenated tower vectors plus the dot product of the two tower
    vectors, and the model is trained with mean squared error.
    """

    def __init__(self, embedding_size, hidden_size, rnn_hidden_size, u_seq_len, m_seq_len, filters=2, kernel_size=8,
                 strides=6):
        """Build the aspect input, both towers and the linear output neuron.

        Args:
            embedding_size: width of one word / aspect embedding vector.
            hidden_size: width of the final Dense layer of each tower.
            rnn_hidden_size: number of LSTM units in each tower.
            u_seq_len: number of time steps of the user review input.
            m_seq_len: number of time steps of the movie review input.
            filters: stored on the instance; not used by any layer built here.
            kernel_size: stored on the instance; not used by any layer built here.
            strides: accepted for backward compatibility; unused.
        """
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.rnn_hidden_size = rnn_hidden_size
        self.filters = filters
        self.kernel_size = kernel_size
        # Aspect embedding is a model input shared by both towers.
        self.embed_aspects = Input(shape=(self.embedding_size,))
        self.inputU, self.towerU = self.create_deepconn_tower(u_seq_len)
        self.inputM, self.towerM = self.create_deepconn_tower(m_seq_len)
        self.joined = Concatenate()([self.towerU, self.towerM])
        self.outNeuron = Dense(1)(self.joined)

    def aspect_attention_block(self, input_layer, max_seq_len, embed_aspect_repeat):
        """Encode a sequence with an LSTM and pool it with aspect-aware attention.

        Args:
            input_layer: tensor fed to the LSTM, laid out as (batch, time, features).
            max_seq_len: number of time steps of ``input_layer``.
            embed_aspect_repeat: aspect embedding repeated ``max_seq_len`` times.

        Returns:
            Tensor of shape (batch, rnn_hidden_size) combining the attention-
            pooled LSTM states with the last LSTM state via ``WeightedAdd``.
        """
        lstm_out = LSTM(self.rnn_hidden_size, activation="tanh", return_sequences=True)(input_layer)
        # lstm_out.shape = (None, time_steps, rnn_hidden_size)
        wh = Dense(int(lstm_out.shape[2]))(lstm_out)
        wv = Dense(self.embedding_size)(embed_aspect_repeat)
        m = Concatenate(axis=-1)([wh, wv])
        m = Activation(tanh)(m)

        # One unnormalised score per time step: (batch, time, 1).
        scores = Dense(1)(m)
        # Normalise across the TIME axis. A softmax applied directly to the
        # Dense(1) output acts on the last axis (size 1) and therefore yields
        # a weight of exactly 1.0 for every step, i.e. no attention at all.
        scores = Reshape((max_seq_len,))(scores)
        a_probs = Activation("softmax")(scores)
        a_probs = Reshape((max_seq_len, 1))(a_probs)

        # Attention-weighted sum of the LSTM states: (batch, rnn_hidden_size, 1).
        output_attention_mul = Dot(axes=1)([lstm_out, a_probs])

        # Last LSTM state, reshaped to line up with the attention output.
        lstm_last_output = Lambda(lambda x: x[:, -1])(lstm_out)
        lstm_last_output = Reshape((self.rnn_hidden_size, 1))(lstm_last_output)
        h_ = Concatenate(axis=-1)([output_attention_mul, lstm_last_output])
        # WeightedAdd is a project-local layer -- presumably a learned weighted
        # sum over the last axis; confirm in WeightedAdd.py.
        h_ = WeightedAdd(1)(h_)
        h_ = Reshape((self.rnn_hidden_size,))(h_)

        return h_

    def create_deepconn_tower(self, max_seq_len):
        """Create one tower: sequence input -> aspect attention block -> Dense.

        Args:
            max_seq_len: number of time steps of this tower's review input.

        Returns:
            Tuple ``(input_layer, tower)`` with the tower's Keras input and
            its (batch, hidden_size) output tensor.
        """
        input_layer = Input(shape=(max_seq_len, self.embedding_size))
        embed_aspect_repeat = RepeatVector(max_seq_len)(self.embed_aspects)

        # Append the aspect vector to every time step before the LSTM.
        lstm_inputs = Concatenate(axis=-1)([input_layer, embed_aspect_repeat])
        tower = self.aspect_attention_block(lstm_inputs, max_seq_len, embed_aspect_repeat)

        tower = Dense(self.hidden_size, activation="relu")(tower)
        return input_layer, tower

    def create_deepconn_dp(self):
        """Assemble and compile ``self.model`` (linear head + tower dot product)."""
        dotproduct = Dot(axes=1)([self.towerU, self.towerM])
        output = Add()([self.outNeuron, dotproduct])
        self.model = Model(inputs=[self.embed_aspects, self.inputU, self.inputM], outputs=[output])
        self.model.compile(optimizer='Adam', loss='mse')

    def train(self, train_data, embed_aspects, batch_size, epochs=3500):
        """Compile the model and fit it on ``train_data``.

        Args:
            train_data: table with "userReviews", "movieReviews" and "overall"
                columns; the review cells are stacked into arrays.
            embed_aspects: aspect embedding(s), repeated along axis 0 once per
                training row.
            batch_size: mini-batch size passed to ``fit``.
            epochs: number of training epochs.

        Side effects:
            Sets ``self.model``, ``self.train_inputs``, ``self.train_outputs``
            and ``self.history``; writes TensorBoard logs under ``tf_logs/``.
        """
        tensorboard = TensorBoard(log_dir="tf_logs/{}".format(time()))
        self.create_deepconn_dp()
        # summary() prints by itself; wrapping it in print() adds a stray "None".
        self.model.summary()

        user_reviews = np.array(list(train_data.loc[:, "userReviews"]))
        movie_reviews = np.array(list(train_data.loc[:, "movieReviews"]))
        print(user_reviews.shape)
        # Expand the aspect embedding to one row per training sample.
        aspect_inputs = np.repeat(embed_aspects, user_reviews.shape[0], axis=0)
        print(aspect_inputs.shape)

        self.train_inputs = [aspect_inputs, user_reviews, movie_reviews]
        self.train_outputs = train_data.loc[:, "overall"]

        self.history = self.model.fit(self.train_inputs,
                                      self.train_outputs,
                                      callbacks=[tensorboard],
                                      validation_split=0.05,
                                      batch_size=batch_size,
                                      epochs=epochs)
        

In [None]:
hidden_size = 64
rnn_hidden_size = 64
# The aspect embedding is a model *input* (supplied to train() below), not a
# constructor argument; passing it positionally here would bind it to the
# constructor's `filters` parameter.
deepconn = DeepCoNN(emb_size, hidden_size, rnn_hidden_size, u_seq_len, i_seq_len)

batch_size = 32
deepconn.train(train_embedded, embed_aspects, batch_size, epochs=20)

# Persist the trained model (architecture + weights) as HDF5.
deepconn.model.save("lstm.h5")

(?, 64, 2) (2, 1)
(?, 64, 2) (2, 1)
Instructions for updating:
keep_dims is deprecated, use keepdims instead
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 318, 50)      0                                            
__________________________________________________________________________________________________
repeat_vector_2 (RepeatVector)  (None, 318, 50)      0           input_3[0][0]                    
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 329, 50)      0                                            


(17981, 318, 50)
(17981, 50)
Train on 17081 samples, validate on 900 samples
Epoch 1/20


In [None]:
# Stack the per-row embedded reviews of the held-out set into arrays.
user_reviews = np.array(list(test_embedded.loc[:, "userReviews"]))
movie_reviews = np.array(list(test_embedded.loc[:, "movieReviews"]))

# One copy of the aspect embedding per test row, mirroring DeepCoNN.train().
test_embed_aspects = np.repeat(embed_aspects, user_reviews.shape[0], axis=0)

test_inputs = [test_embed_aspects, user_reviews, movie_reviews]

# Persist the test inputs. The three arrays have different ranks and trailing
# shapes, so they cannot be written as one reloadable CSV table through a
# DataFrame; an .npz archive keeps each array intact under its own key.
np.savez_compressed("data/test_data.npz",
                    embed_aspects=test_embed_aspects,
                    user_reviews=user_reviews,
                    movie_reviews=movie_reviews)

# Column vector of ground-truth ratings, matching the (n, 1) model output.
true_rating = np.array(list(test_embedded.loc[:, "overall"])).reshape((-1, 1))

predictions = deepconn.model.predict(test_inputs)

error = np.square(predictions - true_rating)

print("MSE:", np.average(error))