In [1]:
import tensorflow as tf
import numpy as np
import random
import math
import os

# Dataset and evaluation protocols reused from
# https://github.com/hexiangnan/neural_collaborative_filtering
from Dataset import Dataset
from evaluate import evaluate_model

In [2]:
def generate_instances(train_mat, positive_size=2, negative_time=8, is_sparse=False):
    data = []
    users_num,items_num = train_mat.shape
    
    if is_sparse:
        indptr = train_mat.indptr
        indices = train_mat.indices
    for u in range(users_num):
        if is_sparse:
            rated_items = indices[indptr[u]:indptr[u+1]] # the rated id of user u
        else:
            rated_items = np.where(train_mat[u,:]>0)[0]
        
        for item0 in rated_items:
            for item1 in np.random.choice(rated_items, size=positive_size):
                data.append([u,item0,item1,1.])
            for _ in range(positive_size*negative_time):
                item1 = np.random.randint(items_num) # no matter item1 is positive or negtive
                item2 = np.random.randint(items_num)
                while item2 in rated_items:
                    item2 = np.random.randint(items_num)
                data.append([u,item1,item2,0.])
    return data

In [26]:
def setup_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)


def evaluate(model, test_ratings, test_negatives, K=10):
    """Helper that calls evaluate from the NCF libraries."""
    (hits, ndcgs) = evaluate_model(model, test_ratings, test_negatives, K=K, num_thread=1)
    return np.array(hits).mean(), np.array(ndcgs).mean()


def get_similar_items(item_mat, idx, topk=5):
    m,k = item_mat.shape
    target_item = item_mat[idx,:]
    target_mat = np.reshape(np.tile(target_item,m),(-1,k))
    sim = [np.dot(target_mat[i], item_mat[i])/(np.linalg.norm(target_mat[i])*np.linalg.norm(item_mat[i])) 
           for i in range(m)] 
    sorted_items = np.argsort(-np.array(sim))
    return sorted_items[:topk+1] # the most similar is itself


def get_key(item_dict, value):
    key = -1
    for (k, v) in item_dict.items():
        if v == value:
            key = k
    return key


def id_name(file_dir):
    id_name_dict = {}
    f = open(file_dir, 'r', encoding='latin-1')
    for line in f.readlines():
        movie_id, movie_name = line.split('|')[:2]
        id_name_dict[int(movie_id)] = movie_name
        
    return id_name_dict


# read original records
def get_item_dict(file_dir):
    # output: 
    # N: the number of user;
    # M: the number of item
    # data: the list of rating information
    user_ids_dict, rated_item_ids_dict = {},{}
    N, M, u_idx, i_idx = 0,0,0,0 
    data_rating = []
    data_time = []
    f = open(file_dir)
    for line in f.readlines():
        if '::' in line:
            u, i, r = line.split('::')[:3]
        elif ',' in line:
            u, i, r = line.split(',')[:3]
        else:
            u, i, r = line.split()[:3]
    
        if u not in user_ids_dict:
            user_ids_dict[u]=u_idx
            u_idx+=1
        if i not in rated_item_ids_dict:
            rated_item_ids_dict[i]=i_idx
            i_idx+=1
        data_rating.append([user_ids_dict[u],rated_item_ids_dict[i],float(r)])
    
    f.close()
    N = u_idx
    M = i_idx

    return rated_item_ids_dict

In [4]:
class CoNet():
    def __init__(self,               
                 users_num = None,               # user number
                 items_num = None,               # item number
                 batch_size = 1024,               # batch size
                 embedding_size_users = 64,      # the embedding size of user
                 embedding_size_items = 64,      # the embedding size of item
                 hidden_size = [64,64],     # hidden size of all layers
                 learning_rate = 1e-3,           # learning rate
                 lamda_regularizer = 1e-5,       # regularizer
                 is_attention = True,           # is attention layer
                 seed = 42,
                 model_path = 'model'            # the save path for trained model
                ):
        self.users_num = users_num
        self.items_num = items_num
        self.batch_size = batch_size
        self.embedding_size_users = embedding_size_users
        self.embedding_size_items = embedding_size_items
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate
        self.lamda_regularizer = lamda_regularizer
        self.is_attention = is_attention
        self.seed = seed
        self.model_path = model_path

        # loss records
        self.train_loss_records = []  
        self.build_graph()   

        
    def build_graph(self):
        self.graph = tf.Graph()
        with self.graph.as_default():
            setup_seed(self.seed) # set seed for training
            
            # _________ input data _________
            self.user_inputs = tf.placeholder(tf.int32, shape = [None,1], name='user_inputs')
            self.item_inputs = tf.placeholder(tf.int32, shape = [None, 2], name='item_inputs')
            self.train_labels = tf.placeholder(tf.float32, shape = [None,1], name='train_labels') 
            
            # _________ variables _________
            self.weights = self._initialize_weights()
            
            # _________ train _____________
            self.y_ = self.inference(user_inputs=self.user_inputs, 
                                     item_inputs=self.item_inputs, 
                                     is_attention=self.is_attention)
            self.loss_train = self.loss_function(true_labels=self.train_labels, 
                                                 predicted_labels=tf.reshape(self.y_,shape=[-1,1]),
                                                 lamda_regularizer=self.lamda_regularizer,
                                                 loss_type='cross_entropy')
            self.train_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate,beta1=0.9, beta2=0.999, epsilon=1e-08).minimize(self.loss_train) 

            # _________ prediction _____________
            self.predictions = self.inference(user_inputs=self.user_inputs, 
                                              item_inputs=self.item_inputs, 
                                              is_attention=self.is_attention)
        
            # init
            self.saver = tf.train.Saver() #  
            init = tf.global_variables_initializer()
            self.sess = self._init_session()
            self.sess.run(init)
    
    
    def _init_session(self):
        # adaptively growing memory
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        return tf.Session(config=config)
    
    
    def _initialize_weights(self):
        all_weights = dict()

        # -----embedding layer------
        all_weights['embedding_users'] = tf.Variable(tf.random_normal([self.users_num, self.embedding_size_users], 
                                                                      0, 0.1),name='embedding_users')
        all_weights['embedding_items'] = tf.Variable(tf.random_normal([self.items_num, self.embedding_size_items], 
                                                                      0, 0.1),name='embedding_items') 
        
        # ------hidden layer------
        all_weights['weight_0'] = tf.Variable(tf.random_normal([self.embedding_size_users+self.embedding_size_items,
                                                                self.hidden_size[0]], 0.0, 0.1),name='weight_0')
        all_weights['bias_0'] = tf.Variable(tf.zeros([self.hidden_size[0]]), name='bias_0')
        all_weights['weight_1'] = tf.Variable(tf.random_normal([self.hidden_size[0],self.hidden_size[1]], 
                                                               0.0, 0.1), name='weight_1')
        all_weights['bias_1'] = tf.Variable(tf.zeros([self.hidden_size[1]]), name='bias_1')
        all_weights['weight_2'] = tf.Variable(tf.random_normal([self.hidden_size[1],self.hidden_size[2]], 
                                                               0.0, 0.1), name='weight_1')
        all_weights['bias_2'] = tf.Variable(tf.zeros([self.hidden_size[2]]), name='bias_1')
        
        # ------output layer-----
        all_weights['weight_n'] = tf.Variable(tf.random_normal([self.hidden_size[-1], 1], 0, 0.1), name='weight_n')
        all_weights['bias_n'] = tf.Variable(tf.zeros([1]), name='bias_n')

        return all_weights
        
    
    def fit(self, data_sequence):
        train_size = len(data_sequence)
        
        np.random.shuffle(data_sequence)
        batch_size = self.batch_size
        total_batch = math.ceil(train_size/batch_size)

        for batch in range(total_batch):
            start = (batch*batch_size)% train_size
            end = min(start+batch_size, train_size)
            data_array = np.array(data_sequence[start:end])

            feed_dict = {self.user_inputs: np.reshape(data_array[:,0],(-1,1)), 
                         self.item_inputs: data_array[:,1:3],
                         self.train_labels: np.reshape(data_array[:,-1],(-1,1))}  
            loss, opt = self.sess.run([self.loss_train,self.train_op], feed_dict=feed_dict)
            self.train_loss_records.append(loss)
            
        return self.train_loss_records

        
    # forward propagation
    def inference(self, user_inputs, item_inputs, is_attention=False):
        embed_users = tf.reshape(tf.nn.embedding_lookup(self.weights['embedding_users'], user_inputs),
                                 shape=[-1, self.embedding_size_users])
        embed_items0 = tf.reshape(tf.nn.embedding_lookup(self.weights['embedding_items'], item_inputs[:,0]),
                                 shape=[-1, self.embedding_size_items])
        embed_items1 = tf.reshape(tf.nn.embedding_lookup(self.weights['embedding_items'], item_inputs[:,1]),
                                 shape=[-1, self.embedding_size_items])
        embed_items = (embed_items0 + embed_items1)/2.
        
        if is_attention:
            score0 = tf.reshape(tf.reduce_sum(embed_users*embed_items0,1), shape=[-1,1])
            score1 = tf.reshape(tf.reduce_sum(embed_users*embed_items1,1), shape=[-1,1])
            alpha = tf.nn.sigmoid(score0-score1)
            embed_items = alpha*embed_items0 + (1.-alpha)*embed_items1
        
        layer0 = tf.nn.relu(tf.matmul(tf.concat([embed_users,embed_items],1), self.weights['weight_0']) + self.weights['bias_0'])
        layer1 = tf.nn.relu(tf.matmul(layer0, self.weights['weight_1']) + self.weights['bias_1'])
        layer2 = tf.nn.relu(tf.matmul(layer1, self.weights['weight_2']) + self.weights['bias_2'])
        y_ = tf.matmul(layer2,self.weights['weight_n']) +self.weights['bias_n']
        return y_         
        
        
    def loss_function(self, true_labels, predicted_labels,lamda_regularizer=1e-5, loss_type='cross_entropy'):   
        if loss_type =='cross_entropy':
            cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=true_labels, logits=predicted_labels))
        else:
            cost = tf.reduce_mean(tf.square(1.+true_labels)*tf.square(true_labels-tf.sigmoid(predicted_labels)))
            #mse = tf.losses.mean_squared_error(true_labels, tf.sigmoid(predicted_labels))
        if lamda_regularizer>0:
            regularizer_1 = tf.contrib.layers.l2_regularizer(lamda_regularizer)
            regularization = regularizer_1(
                self.weights['embedding_users']) + regularizer_1(
                self.weights['embedding_items'])+ regularizer_1(
                self.weights['weight_0']) + regularizer_1(
                self.weights['weight_1']) + regularizer_1(
                self.weights['weight_n']) + regularizer_1(
                self.weights['weight_2'])
            cost = cost + regularization

        return cost    
 
    
    # save model
    def save_model(self, save_path):
        if os.path.isfile(save_path):
            raise RuntimeError('the save path should be a dir')
        if not os.path.exists(save_path):
            os.mkdir(save_path)

        tf_path = os.path.join(save_path, 'trained_model') 
        if os.path.exists(tf_path):
            os.remove(tf_path)
            
        self.saver.save(self.sess,tf_path)
        
        
    def evaluate(self, test_sequence, topK=10):
        score = np.zeros([self.users_num, self.items_num])
        users = np.array([u for u in range(self.users_num)])
        items = np.array([i for i in range(self.items_num)])
  
        for u in range(self.users_num):
            user_ids = np.reshape(u * np.ones([self.items_num]),(-1,1))
            feed_dict = {self.user_inputs: user_ids,self.item_inputs:np.c_[items,items]}
            out = self.sess.run([self.predictions],feed_dict=feed_dict)
            score[u,:] = np.reshape(out,(-1,self.items_num))
            
        ranklist = get_topk(prediction=score,test_sequence=np.array(test_sequence), topK=topK)
        #print(ranklist)
        hits,ndcgs = hit_ndcg(test_sequence=np.array(test_sequence), ranklist=ranklist)
        hr,ndcg = np.array(hits).mean(),np.array(ndcgs).mean()
        return hr,ndcg
    
    
    def predict(self, pairs, batch_size, verbose):
        """Computes predictions for a given set of user-item pairs.
        Args:
          pairs: A pair of lists (users, items) of the same length.
          batch_size: unused.
          verbose: unused.
        Returns:
          predictions: A list of the same length as users and items, such that
          predictions[i] is the models prediction for (users[i], items[i]).
        """
        del batch_size, verbose
        num_examples = len(pairs[0])
        assert num_examples == len(pairs[1])
        predictions = np.empty(num_examples)
        pairs = np.array(pairs, dtype=np.int16)
        for i in range(num_examples):
            feed_dict = {self.user_inputs:np.reshape(pairs[0][i],(-1,1)),
                         self.item_inputs:np.c_[pairs[1][i],pairs[1][i]]}
            out = self.sess.run([self.predictions],feed_dict=feed_dict)
            predictions[i] = np.reshape(out,(-1))
            #predictions[i] = self._predict_one(pairs[0][i], pairs[1][i])
        return predictions
    
    
    def get_embeddings(self):
        embeddings = self.sess.run(self.weights['embedding_items'])
        return embeddings

In [8]:
def train(model, train_mat, test_ratings, test_negatives, users_num, items_num, 
          train_list=None, test_list=None, positive_size=2, negative_time=16, epochs=100, topK=10, mode='hr'):
    
    if train_list!=None:
        train_mat= sequence2mat(sequence=train_list, N=users_num, M=items_num) # train data : user-item matrix
        is_sparse = False
    
    hr_list=[]
    ndcg_list=[]
    hr, ndcg = evaluate(model, test_ratings, test_negatives, K=topK)
    embeddings = model.get_embeddings()
    #hr,ndcg = model.evaluate(test_sequence=test_list, topK=topK)
    hr_list.append(hr)
    ndcg_list.append(ndcg)
    print('Init: HR = %.4f, NDCG = %.4f' %(hr, ndcg))
    best_hr, best_ndcg = hr, ndcg
    for epoch in range(epochs):
        data_sequence = generate_instances(
            train_mat,positive_size=positive_size, negative_time=negative_time,is_sparse=True)
        loss_records = model.fit(data_sequence=data_sequence)
        # Evaluation
        hr, ndcg = evaluate(model, test_ratings, test_negatives, K=topK)
        #hr,ndcg = model.evaluate(test_sequence=test_list, topK=topK)
        hr_list.append(hr)
        ndcg_list.append(ndcg)
        print('epoch=%d, loss=%.4f, HR=%.4f, NDCG=%.4f' %(epoch,loss_records[-1],hr,ndcg))
        
        mlist = hr_list
        if mode == 'ndcg':
            mlist = ndcg_list
        if (len(mlist) > 10) and (mlist[-2] < mlist[-3] > mlist[-1]):
            best_hr, best_ndcg = hr_list[-3], ndcg_list[-3]
            embeddings = model.get_embeddings()
            break
        best_hr, best_ndcg = hr, ndcg
        embeddings = model.get_embeddings()
            
    print("End. Best HR = %.4f, NDCG = %.4f. " %(best_hr, best_ndcg))
    #model.save_model(save_path=model_path)
    return embeddings

In [11]:
dataset_path = 'data/100k'

# Load the dataset
dataset = Dataset(dataset_path)
train_mat, test_ratings, test_negatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives
print('Dataset: #user=%d, #item=%d, #train_pairs=%d, #test_pairs=%d' % (
    dataset.num_users, dataset.num_items, train_mat.nnz, len(test_ratings)))

batch_size = 1024                # batch size
embedding_size_users = 64        # the embedding size of user
embedding_size_items = 64        # the embedding size of item
hidden_size = [64,64,32]       # the size of all hidden layers
learning_rate = 1e-3             # learning rate
lamda_regularizer = 1e-5         # regularizer for all learning weights
is_attention = True
model_path = 'model'             # the path for trained model
positive_size = 2
negative_time = 8
epochs = 64
topK = 10
mode = 'hr'
seed = 7

# Initialize the model
model = CoNet(users_num = dataset.num_users,
              items_num = dataset.num_items,
              batch_size = batch_size,
              embedding_size_users = embedding_size_users,
              embedding_size_items = embedding_size_items,
              hidden_size = hidden_size,
              learning_rate = learning_rate,
              lamda_regularizer = lamda_regularizer,
              is_attention = is_attention,
              seed = seed)

# Train and evaluate model
embeddings = train(model=model, 
                   train_mat=train_mat.tocsr(), 
                   test_ratings=test_ratings, 
                   test_negatives=test_negatives, 
                   users_num=dataset.num_users, 
                   items_num=dataset.num_items,  
                   positive_size=positive_size,
                   negative_time=negative_time,
                   epochs=epochs, 
                   topK=topK,
                   mode=mode)
print('----------------------------------------------------------')

Dataset: #user=943, #item=1682, #train_pairs=99057, #test_pairs=943
Init: HR = 0.0710, NDCG = 0.0360
epoch=0, loss=0.1942, HR=0.4666, NDCG=0.2695
epoch=1, loss=0.1930, HR=0.6161, NDCG=0.3485
epoch=2, loss=0.1706, HR=0.6469, NDCG=0.3659
epoch=3, loss=0.1385, HR=0.6787, NDCG=0.3875
epoch=4, loss=0.1067, HR=0.6840, NDCG=0.3984
epoch=5, loss=0.0673, HR=0.6840, NDCG=0.4065
epoch=6, loss=0.1608, HR=0.6957, NDCG=0.4098
epoch=7, loss=0.0977, HR=0.6808, NDCG=0.4128
epoch=8, loss=0.1272, HR=0.6840, NDCG=0.4043
epoch=9, loss=0.0896, HR=0.6861, NDCG=0.4173
epoch=10, loss=0.1159, HR=0.6946, NDCG=0.4094
epoch=11, loss=0.1007, HR=0.6882, NDCG=0.4167
epoch=12, loss=0.0790, HR=0.6957, NDCG=0.4156
epoch=13, loss=0.0858, HR=0.7116, NDCG=0.4213
epoch=14, loss=0.0585, HR=0.6903, NDCG=0.4163
epoch=15, loss=0.0798, HR=0.6935, NDCG=0.4140
End. Best HR = 0.7116, NDCG = 0.4213. 
----------------------------------------------------------


In [27]:
file_dir = 'datasets/ml-100k/u.item'
id_name_dict = id_name(file_dir) # original id : movie name

file_dir = 'datasets/ml-100k/u.data'
item_dict = get_item_dict(file_dir) # original id : new id

In [34]:
movieid_list = [174, 127, 449]
for movieid in movieid_list:
    print('MovieID:', movieid, '; MovieName:', id_name_dict[movieid])

MovieID: 174 ; MovieName: Raiders of the Lost Ark (1981)
MovieID: 127 ; MovieName: Godfather, The (1972)
MovieID: 449 ; MovieName: Star Trek: The Motion Picture (1979)


In [36]:
movieid_list = [174, 127, 449]

for movieid in movieid_list:
    print('MovieID:', movieid, '; MovieName:', id_name_dict[movieid])
    original_id = str(movieid)
    target_item = item_dict[original_id]

    top5 = get_similar_items(embeddings, idx=target_item)
    movie_list = [get_key(item_dict=item_dict, value=i) for i in top5]
    rec_list = [id_name_dict[int(movie_id)] for movie_id in movie_list[1:]]
    for i in range(len(rec_list)):
        print('\n{0}: {1}'.format(i+1, rec_list[i]))
    print('------------------------------------------------------------------')

MovieID: 174 ; MovieName: Raiders of the Lost Ark (1981)

1: Empire Strikes Back, The (1980)

2: Fugitive, The (1993)

3: Indiana Jones and the Last Crusade (1989)

4: Back to the Future (1985)

5: Silence of the Lambs, The (1991)
------------------------------------------------------------------
MovieID: 127 ; MovieName: Godfather, The (1972)

1: Star Wars (1977)

2: Fargo (1996)

3: Return of the Jedi (1983)

4: GoodFellas (1990)

5: Fugitive, The (1993)
------------------------------------------------------------------
MovieID: 449 ; MovieName: Star Trek: The Motion Picture (1979)

1: Star Trek VI: The Undiscovered Country (1991)

2: Star Trek V: The Final Frontier (1989)

3: Star Trek IV: The Voyage Home (1986)

4: Star Trek III: The Search for Spock (1984)

5: Star Trek: The Wrath of Khan (1982)
------------------------------------------------------------------
