In [1]:
# preprocessing imports
import pandas as pd
import numpy as np
from time import time
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import text_to_word_sequence

Using TensorFlow backend.


In [2]:
# functions we implemented
from custom_functions import init_embeddings_map, get_embed_and_pad_func, get_embed_aspects

In [3]:
# Dimensionality of the word vectors; it also selects which
# glove.6B.<dim>d.txt file is handed to init_embeddings_map.
emb_size = 50
glove_path = "glove.6B.{}d.txt".format(emb_size)
embedding_map = init_embeddings_map(glove_path)

In [4]:
# Load the grouped review table. Later cells read its "reviewerID", "asin",
# "userReviews" and "movieReviews" columns.
raw_data_path = "data/unembedded_grouped_cleaned_data.csv"
raw_data = pd.read_csv(raw_data_path)

In [5]:
# Train/test split for our model is unique: we hold out a set of users AND
# every movie those users reviewed, so the network never learns either.
test_size = 0.005  # fraction (not percent) of the unique users to hold out

# sample test_size of the unique users
unique_users = raw_data.loc[:, "reviewerID"].unique()
users_size = len(unique_users)

# fixed seed so the same users are held out on every run
np.random.seed(2019)
test_idx = np.random.choice(users_size,
                              size=int(users_size * test_size),
                              replace=False)

# get test users
test_users = unique_users[test_idx]

# everyone else is a training user
train_users = np.delete(unique_users, test_idx)

test = raw_data[raw_data["reviewerID"].isin(test_users)]
train = raw_data[raw_data["reviewerID"].isin(train_users)]

unique_test_movies = test["asin"].unique()

# Drop the movies that also appear in our test set. In order to be
# a true train/test split, we are forced to discard some data entirely.
# A boolean mask is used instead of `.where(...).dropna()`: the NaN round-trip
# upcasts integer columns to float and also throws away rows that merely have
# a missing value in some unrelated column (which `test` never did).
train = train[~train["asin"].isin(unique_test_movies)]

# Sanity checks: no held-out user or movie may leak into the training frame.
assert not train["reviewerID"].isin(test_users).any(), "test user leaked into train"
assert not train["asin"].isin(unique_test_movies).any(), "test movie leaked into train"

In [6]:
def _token_count(text):
    """Return the number of whitespace-separated tokens in `text`."""
    return len(text.split())


# Token counts of the concatenated review text, per row of raw_data.
user_seq_sizes = raw_data["userReviews"].map(_token_count)
item_seq_sizes = raw_data["movieReviews"].map(_token_count)

In [7]:
# Sequence lengths are taken as a percentile of the observed token counts:
# the 40th percentile for user text, the 15th for movie text.
u_ptile, i_ptile = 40, 15
u_seq_len, i_seq_len = (
    int(np.percentile(sizes, ptile))
    for sizes, ptile in ((user_seq_sizes, u_ptile), (item_seq_sizes, i_ptile))
)

In [8]:
# All-zero vector of length emb_size, passed to get_embed_and_pad_func
# (presumably used as the padding vector - see custom_functions).
pad_vector = np.zeros(emb_size)
embedding_fn = get_embed_and_pad_func(i_seq_len, u_seq_len, pad_vector, embedding_map)

# Run the row-wise embedding function over both splits (train first, then test).
train_embedded, test_embedded = (
    split.apply(embedding_fn, axis=1) for split in (train, test)
)

Aspect hyperparameter settings and embedding mapping

In [9]:
# Aspect word(s) the model is conditioned on. get_embed_aspects looks them up
# in embedding_map; for the single aspect below the printed result is a
# 1 x emb_size array.
aspects = ["price"]
embed_aspects = get_embed_aspects(aspects, embedding_map)
print(embed_aspects)

[[-0.44953999  0.11784     0.65070999 -0.042841    0.58203    -0.12502
  -0.17475    -0.79378998  0.20936     0.67821997 -0.07809     0.21738
  -0.38139001 -0.99193001  1.1904      0.84296    -0.077351   -0.59403002
  -0.56399    -1.32500005  1.35899997 -0.63020998 -0.21871001 -0.49675
  -0.50682998 -1.17770004 -0.142       0.13053     0.85829997  0.93814999
   3.19269991 -0.070536    0.97513002  0.86562002  0.78035003 -1.28419995
  -0.19745    -0.19072001  0.23972    -0.61404002 -0.085975    0.54900998
   0.48752001  0.15015     0.24876    -0.24506     0.24964     0.45262
   0.40169001  0.67246997]]


# DeepCoNN Recommendation Model

In [10]:
# modeling imports
from keras.models import Model
from keras.callbacks import EarlyStopping, TensorBoard
from keras.layers import LSTM
from keras.layers import Input, Dense, Permute, Reshape, RepeatVector, Activation, Lambda
from keras.activations import tanh, softmax
from keras.layers.merge import Add, Dot, Concatenate, Multiply
from MyLayer import WeightedAdd

In [11]:
class DeepCoNN():
    """Two-tower, aspect-conditioned LSTM rating model.

    One tower reads the embedded user text and the other the embedded movie
    text. Both towers share a single aspect-embedding input. The prediction is
    ``Dense(1)(concat(towerU, towerM)) + dot(towerU, towerM)`` and the model is
    compiled with the Adam optimizer and MSE loss.
    """

    def __init__(self, embedding_size, hidden_size, rnn_hidden_size, u_seq_len, m_seq_len, filters=2, kernel_size=8,
                 strides=6):
        """Build the shared aspect input, both towers and the linear output neuron.

        Args:
            embedding_size: length of each word vector and of the aspect vector.
            hidden_size: width of the final ReLU Dense layer of each tower.
            rnn_hidden_size: number of LSTM units in each tower.
            u_seq_len: number of time steps of the user-text input.
            m_seq_len: number of time steps of the movie-text input.
            filters, kernel_size, strides: stored but not used by any layer in
                this class; kept so existing callers keep working.
        """
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.rnn_hidden_size = rnn_hidden_size
        self.filters = filters
        self.kernel_size = kernel_size
        self.strides = strides
        # One aspect vector per sample, shared by both towers.
        self.embed_aspects = Input(shape=(self.embedding_size,))
        self.inputU, self.towerU = self.create_deepconn_tower(u_seq_len)
        self.inputM, self.towerM = self.create_deepconn_tower(m_seq_len)
        self.joined = Concatenate()([self.towerU, self.towerM])
        self.outNeuron = Dense(1)(self.joined)

    def aspect_attention_block(self, input_layer, max_seq_len, embed_aspect_repeat):
        """Run an LSTM over `input_layer` and pool its outputs with aspect attention.

        Args:
            input_layer: tensor of shape (batch, max_seq_len, features).
            max_seq_len: number of time steps; needed to normalise the
                attention scores across time.
            embed_aspect_repeat: the aspect vector repeated for every time
                step, shape (batch, max_seq_len, embedding_size).

        Returns:
            Tensor of shape (batch, rnn_hidden_size).
        """
        lstm_out = LSTM(self.rnn_hidden_size, activation="tanh", return_sequences=True)(input_layer)
        # lstm_out: (batch, max_seq_len, rnn_hidden_size)
        wh = Dense(self.rnn_hidden_size)(lstm_out)
        wv = Dense(self.embedding_size)(embed_aspect_repeat)
        m = Concatenate(axis=-1)([wh, wv])
        m = Activation(tanh)(m)

        # One unnormalised score per time step: (batch, max_seq_len, 1).
        scores = Dense(1)(m)
        # The softmax must run ACROSS time steps. Applying it on the trailing
        # size-1 axis (as `Dense(1, activation="softmax")` does) yields 1.0 for
        # every step, i.e. no attention at all. Flatten to (batch, max_seq_len),
        # normalise over that axis, then restore the trailing axis for Dot.
        scores = Reshape((max_seq_len,))(scores)
        a_probs = Activation(softmax)(scores)
        a_probs = Reshape((max_seq_len, 1))(a_probs)

        # Attention-weighted sum over time: (batch, rnn_hidden_size, 1).
        output_attention_mul = Dot(axes=1)([lstm_out, a_probs])

        # LSTM output at the last time step, shaped to match the attention summary.
        lstm_last_output = Lambda(lambda x: x[:, -1])(lstm_out)
        lstm_last_output = Reshape((self.rnn_hidden_size, 1))(lstm_last_output)
        h_ = Concatenate(axis=-1)([output_attention_mul, lstm_last_output])
        # WeightedAdd comes from the project's MyLayer module; presumably it
        # combines the two stacked columns into one - confirm against MyLayer.
        h_ = WeightedAdd(1)(h_)
        h_ = Reshape((self.rnn_hidden_size,))(h_)

        return h_

    def create_deepconn_tower(self, max_seq_len):
        """Create one tower and return ``(input_layer, tower_output)``.

        The aspect vector is repeated `max_seq_len` times and concatenated to
        every word vector before the LSTM; the attention block output is then
        projected to `hidden_size` with a ReLU Dense layer.
        """
        input_layer = Input(shape=(max_seq_len, self.embedding_size))
        embed_aspect_repeat = RepeatVector(max_seq_len)(self.embed_aspects)

        lstm_inputs = Concatenate(axis=-1)([input_layer, embed_aspect_repeat])
        tower = self.aspect_attention_block(lstm_inputs, max_seq_len, embed_aspect_repeat)

        tower = Dense(self.hidden_size, activation="relu")(tower)
        return input_layer, tower

    def create_deepconn_dp(self):
        """Assemble and compile `self.model` (inputs: aspect, user text, movie text)."""
        dotproduct = Dot(axes=1)([self.towerU, self.towerM])
        output = Add()([self.outNeuron, dotproduct])
        self.model = Model(inputs=[self.embed_aspects, self.inputU, self.inputM], outputs=[output])
        self.model.compile(optimizer='Adam', loss='mse')

    @staticmethod
    def _tile_aspect(embed_aspects, n_samples):
        """Repeat the single aspect vector so there is one row per sample.

        Accepts a 1-D vector or a (1, embedding_size) array.

        Raises:
            ValueError: if more than one aspect vector is supplied, since the
                model's aspect input takes exactly one vector per sample.
        """
        aspect = np.atleast_2d(np.asarray(embed_aspects))
        if aspect.ndim != 2 or aspect.shape[0] != 1:
            raise ValueError(
                "expected exactly one aspect embedding (shape (1, d) or (d,)), "
                "got shape {}".format(np.shape(embed_aspects)))
        return np.repeat(aspect, n_samples, axis=0)

    def train(self, train_data, embed_aspects, batch_size, epochs=3500):
        """Build the model and fit it on `train_data`.

        Args:
            train_data: DataFrame with "userReviews" and "movieReviews" columns
                (each cell array-like; the cells are stacked into one array)
                and an "overall" column used as the regression target.
            embed_aspects: the single aspect embedding, shape (1, d) or (d,).
            batch_size: mini-batch size passed to ``fit``.
            epochs: number of epochs passed to ``fit``.

        Side effects: sets `self.model`, `self.train_inputs`,
        `self.train_outputs` and `self.history`; writes TensorBoard logs under
        ``tf_logs/<timestamp>``.
        """
        tensorboard = TensorBoard(log_dir="tf_logs/{}".format(time()))
        self.create_deepconn_dp()
        # summary() prints the table itself and returns None, so do not print() it.
        self.model.summary()

        user_reviews = np.array(list(train_data.loc[:, "userReviews"]))
        movie_reviews = np.array(list(train_data.loc[:, "movieReviews"]))
        print(user_reviews.shape)
        # Tile the aspect embedding to the size of the training set.
        embed_aspects = self._tile_aspect(embed_aspects, user_reviews.shape[0])
        print(embed_aspects.shape)

        self.train_inputs = [embed_aspects, user_reviews, movie_reviews]
        self.train_outputs = train_data.loc[:, "overall"]

        # 5% of the samples are held out by Keras for validation.
        self.history = self.model.fit(self.train_inputs,
                                      self.train_outputs,
                                      callbacks=[tensorboard],
                                      validation_split=0.05,
                                      batch_size=batch_size,
                                      epochs=epochs)
        

In [12]:
hidden_size = 64
rnn_hidden_size = 64
# The aspect embedding is a model *input* (it is fed in train()), not a
# constructor argument. Passing it as a 6th positional argument bound it to
# the constructor's `filters` parameter, so it is not passed here.
deepconn = DeepCoNN(emb_size, hidden_size, rnn_hidden_size, u_seq_len, i_seq_len)

batch_size = 32
deepconn.train(train_embedded, embed_aspects, batch_size, epochs=10)

# Persist the trained model (architecture + weights) in HDF5 format.
deepconn.model.save("lstm.h5")

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 318, 50)      0                                            
__________________________________________________________________________________________________
repeat_vector_1 (RepeatVector)  (None, 318, 50)      0           input_1[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 329, 50)      0                                            
__________________________________________________________________________________________________
repeat_vec

Train on 17081 samples, validate on 900 samples
Epoch 1/10
 1152/17081 [=>............................] - ETA: 5:29 - loss: 29.3684

KeyboardInterrupt: 

In [None]:
# Evaluate the trained model on the held-out split.
user_reviews = np.array(list(test_embedded.loc[:, "userReviews"]))
movie_reviews = np.array(list(test_embedded.loc[:, "movieReviews"]))

# One copy of the aspect embedding per test sample.
test_embed_aspects = np.repeat(embed_aspects, user_reviews.shape[0], axis=0)

test_inputs = [test_embed_aspects, user_reviews, movie_reviews]

true_rating = np.array(list(test_embedded.loc[:, "overall"])).reshape((-1, 1))

# The inputs are one 2-D and two 3-D arrays of different shapes, which cannot
# be represented as a DataFrame / CSV. Store them losslessly as a compressed
# .npz archive instead (reload with np.load("data/test_data.npz")).
np.savez_compressed("data/test_data.npz",
                    embed_aspects=test_embed_aspects,
                    user_reviews=user_reviews,
                    movie_reviews=movie_reviews,
                    overall=true_rating)

predictions = deepconn.model.predict(test_inputs)

# Squared error per sample; its mean is the test MSE.
error = np.square(predictions - true_rating)

print("MSE:", np.average(error))