In [1]:
# preprocessing imports
import pandas as pd
import numpy as np
from time import time
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import text_to_word_sequence

Using TensorFlow backend.


In [2]:
# functions we implemented
from custom_functions import init_embeddings_map, get_embed_and_pad_func, get_embed_aspects

In [3]:
emb_size = 50
embedding_map = init_embeddings_map("glove.6B." + str(emb_size) + "d.txt")

In [4]:
raw_data = pd.read_csv("data/unembedded_grouped_cleaned_data.csv")

In [5]:
# Train/test split for our model is unique, we need to hold out a
# set of users and movies so that our network never learns those 
test_size = 0.005

# get test_size percentage of users
unique_users = raw_data.loc[:, "reviewerID"].unique()
users_size = len(unique_users)

np.random.seed(2019)
test_idx = np.random.choice(users_size,
                              size=int(users_size * test_size),
                              replace=False)

# get test users
test_users = unique_users[test_idx]

# everyone else is a training user
train_users = np.delete(unique_users, test_idx)

test = raw_data[raw_data["reviewerID"].isin(test_users)]
train = raw_data[raw_data["reviewerID"].isin(train_users)]

unique_test_movies = test["asin"].unique()

# drop the movies that also appear in our test set. In order to be
# a true train/test split, we are forced to discard some data entirely
train = train.where(np.logical_not(train["asin"].isin(unique_test_movies))).dropna()

In [6]:
user_seq_sizes = raw_data.loc[:, "userReviews"].apply(lambda x: x.split()).apply(len)
item_seq_sizes = raw_data.loc[:, "movieReviews"].apply(lambda x: x.split()).apply(len)

In [7]:
u_ptile = 40
i_ptile = 15
u_seq_len = int(np.percentile(user_seq_sizes, u_ptile))
i_seq_len = int(np.percentile(item_seq_sizes, i_ptile))

In [8]:
embedding_fn = get_embed_and_pad_func(i_seq_len, u_seq_len, np.array([0.0] * emb_size), embedding_map)
    
train_embedded = train.apply(embedding_fn, axis=1)
test_embedded = test.apply(embedding_fn, axis=1)

# IAN

In [9]:
# modeling imports
from keras import regularizers
from keras.models import Model
from keras.callbacks import EarlyStopping, TensorBoard
from keras.layers import LSTM
from keras.layers import Input, Dense, Permute, Reshape, RepeatVector, Activation, Lambda, GlobalAveragePooling1D
from keras.activations import tanh, softmax
from keras.layers.merge import Add, Dot, Concatenate, Multiply
from keras.backend import mean
from MyLayer import AttentionScore

In [10]:
class IAN():
    def __init__(self, embedding_size, hidden_size, rnn_hidden_size, u_seq_len, m_seq_len, filters=2, kernel_size=8,
                 strides=6):
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.rnn_hidden_size = rnn_hidden_size
        self.filters = filters
        self.kernel_size = kernel_size
        self.u_seq_len = u_seq_len
        self.m_seq_len = m_seq_len
        self.inputU, self.lstm_outU, self.mean_lstmoutU = self.create_deepconn_tower(self.u_seq_len)
        self.inputM, self.lstm_outM, self.mean_lstmoutM = self.create_deepconn_tower(self.m_seq_len)
    
    def compute_attention_score(self, h, t, max_seq_len):
        score = AttentionScore()([h, t])
        alpha = Activation('softmax')(score)
        tower = Dot(axes=1)([h, alpha])
        return tower
        
    
    def create_deepconn_tower(self, max_seq_len):
        input_layer = Input(shape=(max_seq_len, self.embedding_size))
        lstm_out = LSTM(self.rnn_hidden_size, activation="tanh", return_sequences=True)(input_layer)
        # lstm_out.shape = (None, time_steps, input_dim)
        print(lstm_out)
        mean_lstm_out = GlobalAveragePooling1D()(lstm_out)
        print(mean_lstm_out.shape)
        return input_layer, lstm_out, mean_lstm_out

    def create_deepconn_dp(self):
        towerU = self.compute_attention_score(self.lstm_outU, self.mean_lstmoutM, self.u_seq_len)
        towerM = self.compute_attention_score(self.lstm_outM, self.mean_lstmoutU, self.m_seq_len)
        output = Concatenate()([towerU, towerM])
        output = Dense(1, activation='tanh', use_bias=True, kernel_regularizer=regularizers.l2(0.001))(output)
        
        dotproduct = Dot(axes=1)([towerU, towerM])
        output = Add()([output, dotproduct])
        # output = Activation('softmax')(output)
        self.model = Model(inputs=[self.inputU, self.inputM], outputs=[output])
        self.model.compile(optimizer='Adam', loss='mse')
        
    def train(self, train_data, batch_size, epochs=3500):
        tensorboard = TensorBoard(log_dir="tf_logs/{}".format(time()))
        self.create_deepconn_dp()
        print(self.model.summary())
        
        user_reviews = np.array(list(train_data.loc[:, "userReviews"]))
        movie_reviews = np.array(list(train_data.loc[:, "movieReviews"]))
       
        self.train_inputs = [user_reviews, movie_reviews]
        self.train_outputs = train_data.loc[:, "overall"]

        early_stopping = EarlyStopping(monitor='val_loss', patience=3, mode='min')
        self.history = self.model.fit(self.train_inputs,
                                      self.train_outputs,
                                      callbacks=[tensorboard, early_stopping],
                                      validation_split=0.05,
                                      batch_size=batch_size,
                                      epochs=epochs)
        

In [11]:
hidden_size = 64
rnn_hidden_size = 64
ian = IAN(emb_size, hidden_size, rnn_hidden_size, u_seq_len, i_seq_len)

batch_size = 32
ian.train(train_embedded, batch_size, epochs=20)

ian.model.save("lstm.h5")

Tensor("lstm_1/transpose_1:0", shape=(?, ?, 64), dtype=float32)
(?, 64)
Tensor("lstm_2/transpose_1:0", shape=(?, ?, 64), dtype=float32)
(?, 64)
AttentionScore input shape:
 [(None, 318, 64), (None, 64)]
AttentionScore input shape:
 [(None, 329, 64), (None, 64)]
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 318, 50)      0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 329, 50)      0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 318, 64)      29440       input_1[0][0]                    
_____________________________________________

In [12]:
user_reviews = np.array(list(test_embedded.loc[:, "userReviews"]))
movie_reviews = np.array(list(test_embedded.loc[:, "movieReviews"]))

test_inputs = [user_reviews, movie_reviews]

dat = pd.DataFrame(test_inputs)
dat.to_csv("data/test_data.csv")

true_rating = np.array(list(test_embedded.loc[:, "overall"])).reshape((-1, 1))

predictions = ian.model.predict(test_inputs)

error = np.square(predictions - true_rating)

print("MSE:", np.average(error))

MSE: 0.931364871244


In [13]:
#检查对于softmax的使用