In [1]:
import tensorflow as tf
import numpy as np
import random
import math
import os

# Dataset and evaluation protocols reused from
# https://github.com/hexiangnan/neural_collaborative_filtering
from Dataset import Dataset
from evaluate import evaluate_model

In [2]:
# generate data sequence from user-item matrix
def generate_instances(train_mat, positive_size=1, negative_time=4, is_sparse=False):
    """Sample ``[user, item, item, label]`` training rows from a user-item matrix.

    For every observed ``(u, item0)`` interaction this emits

    * ``positive_size`` rows ``[u, item0, item1, 1.]`` where ``item1`` is drawn
      with replacement from the items ``u`` has interacted with, and
    * ``positive_size * negative_time`` rows ``[u, item2, item1, 0.]`` where
      ``item2`` is an item ``u`` has NOT interacted with and ``item1`` is any
      item (interacted or not).

    Sampling uses NumPy's global random state (``np.random``).

    Args:
        train_mat: 2-D interaction matrix of shape ``(users_num, items_num)``.
            With ``is_sparse=True`` it must expose CSR-style ``indptr`` and
            ``indices`` attributes; otherwise it is indexed as ``train_mat[u, :]``
            and entries ``> 0`` count as interactions.
        positive_size: number of positive rows per observed interaction.
        negative_time: number of negative rows per positive row.
        is_sparse: whether ``train_mat`` is read through ``indptr``/``indices``.

    Returns:
        A list of 4-element lists ``[user, item_a, item_b, label]``.
    """
    data = []
    users_num, items_num = train_mat.shape

    if is_sparse:
        indptr = train_mat.indptr
        indices = train_mat.indices

    negatives_per_item = positive_size * negative_time
    for u in range(users_num):
        if is_sparse:
            # Column ids of the entries stored for row u.
            rated_items = indices[indptr[u]:indptr[u + 1]]
        else:
            rated_items = np.where(train_mat[u, :] > 0)[0]

        # Build the membership set once per user: O(1) rejection tests instead
        # of scanning the array on every draw.
        rated_set = set(np.asarray(rated_items).tolist())
        # A user who interacted with every item has no negative to draw; the
        # rejection loop below would never terminate, so skip negatives.
        can_sample_negative = len(rated_set) < items_num

        for item0 in rated_items:
            for item1 in np.random.choice(rated_items, size=positive_size):
                data.append([u, item0, item1, 1.])
            if not can_sample_negative:
                continue
            for _ in range(negatives_per_item):
                # item1 may be positive or negative for this user.
                item1 = np.random.randint(items_num)
                # item2 is resampled until it is a true negative.
                item2 = np.random.randint(items_num)
                while item2 in rated_set:
                    item2 = np.random.randint(items_num)
                data.append([u, item2, item1, 0.])
    return data

# read data sequence from file generated by generate_instances function
def read_list(file_dir):
    """Load a whitespace-separated instance file into a list of rows.

    Each non-blank line must start with three integer fields followed by one
    float field; any further fields are ignored. Blank lines are skipped.

    Args:
        file_dir: path of the UTF-8 text file to read.

    Returns:
        A list of ``[int, int, int, float]`` rows in file order.

    Raises:
        IndexError: if a non-blank line has fewer than four fields.
        ValueError: if a field cannot be parsed as int/float.
    """
    data = []
    with open(file_dir, "r", encoding='utf-8') as f:
        for line in f:
            fields = line.split()  # split once per line
            if not fields:
                continue  # tolerate blank / trailing empty lines
            data.append([int(fields[0]), int(fields[1]), int(fields[2]), float(fields[3])])
    return data

In [3]:
def setup_seed(seed):
    """Seed Python's ``random``, NumPy's global RNG and TensorFlow with ``seed``."""
    random.seed(seed)
    np.random.seed(seed)
    # Graph-level seed of the TF 1.x API; applies to the current default graph.
    tf.set_random_seed(seed)

def evaluate(model, test_ratings, test_negatives, K=10):
    """Run the NCF ``evaluate_model`` protocol and average its per-case metrics.

    Args:
        model: object handed to ``evaluate_model`` unchanged.
        test_ratings: held-out ratings passed through to ``evaluate_model``.
        test_negatives: negative candidates passed through to ``evaluate_model``.
        K: cut-off forwarded as ``K``.

    Returns:
        ``(mean_hit_ratio, mean_ndcg)`` — the means of the two sequences
        returned by ``evaluate_model`` (called with ``num_thread=1``).
    """
    per_case_hits, per_case_ndcgs = evaluate_model(
        model, test_ratings, test_negatives, K=K, num_thread=1)
    mean_hit = np.array(per_case_hits).mean()
    mean_ndcg = np.array(per_case_ndcgs).mean()
    return mean_hit, mean_ndcg


def get_similar_items(item_mat, idx, topk=5):
    """Return the indices of the rows most cosine-similar to row ``idx``.

    Args:
        item_mat: 2-D array-like of shape ``(m, k)``, one embedding per row.
        idx: index of the query row.
        topk: number of neighbours wanted in addition to the query itself.

    Returns:
        1-D integer array with at most ``topk + 1`` row indices sorted by
        decreasing cosine similarity. The query row is normally first, since
        its similarity to itself is 1. Rows whose similarity is undefined
        (zero norm on either side) are ranked last.
    """
    item_mat = np.asarray(item_mat, dtype=float)
    target_item = item_mat[idx, :]

    # One matrix-vector product replaces the tiled copy + Python loop.
    dots = np.dot(item_mat, target_item)
    norms = np.linalg.norm(item_mat, axis=1) * np.linalg.norm(target_item)

    # Undefined similarities (zero norm) get -inf instead of NaN so they sort
    # to the end without emitting a divide warning.
    sim = np.full(dots.shape, -np.inf)
    np.divide(dots, norms, out=sim, where=norms > 0)

    sorted_items = np.argsort(-sim)
    return sorted_items[:topk + 1]  # the most similar is itself

def get_key(item_dict, value):
    """Reverse lookup: find a key of ``item_dict`` whose value equals ``value``.

    When several keys map to ``value`` the last one in iteration order wins;
    when none does, ``-1`` is returned.
    """
    matching_keys = [k for k, v in item_dict.items() if v == value]
    return matching_keys[-1] if matching_keys else -1


# read original records
def get_item_dict(file_dir):
    """Build the raw-item-id -> contiguous-index mapping from a ratings file.

    Every non-blank line must hold at least ``user, item, rating`` fields,
    separated by ``'::'``, ``','`` or whitespace (checked in that order).
    Items are numbered from 0 in order of first appearance.

    Args:
        file_dir: path of the ratings file.

    Returns:
        dict mapping each raw item id (as the string read from the file) to
        its integer index.

    Raises:
        ValueError: if a non-blank line has fewer than three fields or its
            rating field is not numeric.
    """
    rated_item_ids_dict = {}
    # Context manager closes the file even when a line fails to parse.
    with open(file_dir) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank / trailing empty lines
            if '::' in line:
                u, i, r = line.split('::')[:3]
            elif ',' in line:
                u, i, r = line.split(',')[:3]
            else:
                u, i, r = line.split()[:3]

            # The rating is not stored, but parsing it keeps malformed records
            # (e.g. a header row) failing loudly instead of polluting the map.
            float(r)

            if i not in rated_item_ids_dict:
                rated_item_ids_dict[i] = len(rated_item_ids_dict)

    return rated_item_ids_dict

In [11]:
class Net():
    """CNN-based recommender over stacked user/item embeddings (TensorFlow 1.x graph mode).

    Two heads share the same user and item embedding tables:

    * ``net_uij`` stacks ``[item0, user, item1]`` embeddings into a
      ``3 x embedding_size`` single-channel image and scores it with a
      convolution followed by a linear layer;
    * ``net_ui`` does the same for the ``2 x embedding_size`` stack
      ``[user, item0]`` with a fixed 2x2 kernel.

    The training loss is ``cost_uij + alpha * cost_ui + L2 regularization``.
    The graph and session are created in the constructor.
    """

    def __init__(self,
                 users_num = None, # number of users
                 items_num = None, # number of items
                 batch_size = 1024, # mini-batch size
                 embedding_size = 64, # embedding dimension
                 out_channels = 32,
                 kernel_size = 3,
                 learning_rate = 1e-3, # Adam learning rate
                 lamda_regularizer = 1e-6, # L2 regularization coefficient
                 alpha = 1.0,
                 seed = 2
                ):
        """Store the hyper-parameters, then build the graph and start a session.

        Args:
            users_num: number of rows of the user embedding table.
            items_num: number of rows of the item embedding table.
            batch_size: mini-batch size used by ``train``.
            embedding_size: width of every embedding vector.
            out_channels: number of convolution filters in each conv layer.
            kernel_size: side of the square kernel of the (u, i, j) conv stack.
                The flattened output size computed in ``_initialize_weights``
                matches the conv output only when the stack reduces the 3-row
                input to height 1 (one 3x3 conv, or two 2x2 convs).
            learning_rate: learning rate of the Adam optimizer.
            lamda_regularizer: L2 coefficient; regularization is skipped if <= 0.
            alpha: weight of the (u, i) loss term in the total loss.
            seed: seed passed to ``setup_seed`` when the graph is built.
        """
        self.users_num = users_num
        self.items_num = items_num
        self.batch_size = batch_size
        self.embedding_size = embedding_size
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.learning_rate = learning_rate
        self.lamda_regularizer = lamda_regularizer
        self.alpha = alpha
        self.seed = seed
        self.padding = 0 # no padding (only used in the output-size arithmetic)
        self.stride = 1

        # loss records
        self.train_loss_records = []
        self.build_graph()


    def build_graph(self):
        """Create the graph (placeholders, weights, loss, optimizer) and initialize a session."""
        self.graph = tf.Graph()
        with self.graph.as_default():
            setup_seed(self.seed)

            # _________ input data _________
            self.user_inputs = tf.placeholder(tf.int32, shape = [None, 1], name='user_inputs')
            self.item_inputs = tf.placeholder(tf.int32, shape = [None, 2], name='item_inputs')
            self.train_labels = tf.placeholder(tf.float32, shape = [None, 1], name='train_labels')

            # _________ variables _________
            self.weights = self._initialize_weights()

            # _________ train _____________
            self.y_uij, self.y_ui = self.inference(user_inputs=self.user_inputs, item_inputs=self.item_inputs)
            self.loss_train = self.loss_function(true_labels=self.train_labels,
                                                 y_uij=tf.reshape(self.y_uij,shape=[-1, 1]),
                                                 y_ui=tf.reshape(self.y_ui,shape=[-1, 1]),
                                                 lamda_regularizer=self.lamda_regularizer)
            self.train_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate,beta1=0.9, beta2=0.999, epsilon=1e-08).minimize(self.loss_train)

            # _________ prediction _____________
            # Reuse the (u, i, j) output built above instead of adding a second,
            # identical inference sub-graph over the same placeholders/weights.
            self.predictions = self.y_uij

            # variable initialization
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            self.sess = self._init_session()
            self.sess.run(init)


    def _init_session(self):
        """Return a ``tf.Session`` configured to grow GPU memory on demand."""
        # adaptively growing memory
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        return tf.Session(config=config)


    def _initialize_weights(self):
        """Create every trainable variable and return them in a name -> variable dict.

        Also sets ``self.out_size_ui`` and ``self.out_size``, the flattened
        feature sizes fed to the final linear layer of each head.
        """
        all_weights = dict()

        # -----embeddings------
        all_weights['embedding_users'] = tf.Variable(tf.random_normal([self.users_num, self.embedding_size], 0, 0.1),name='embedding_users')
        all_weights['embedding_items'] = tf.Variable(tf.random_normal([self.items_num, self.embedding_size], 0, 0.1),name='embedding_items')

        # ------CNN for ui------
        all_weights['cnn_ui'] = tf.Variable(tf.random_normal(
            [2, 2, 1, self.out_channels], 0, 0.1),name='cnn_ui')
        all_weights['bias_ui'] = tf.Variable(tf.zeros([self.out_channels]), name='bias_ui')
        # Width after a 2-wide kernel, times the channel count (height collapses to 1).
        self.out_size_ui = int(((self.embedding_size - 2 + 2 * self.padding)/self.stride + 1) * self.out_channels)
        all_weights['linear_0'] = tf.Variable(tf.random_normal([self.out_size_ui, 1], 0, 0.1), name='linear_0')
        all_weights['bias_0'] = tf.Variable(tf.zeros([1]), name='bias_0')


        # ------CNN for uij------
        # filter=[filter_height,filter_width,in_channels,out_channels]
        all_weights['cnn_1'] = tf.Variable(tf.random_normal(
            [self.kernel_size, self.kernel_size, 1, self.out_channels], 0, 0.1),name='cnn_1')
        all_weights['bias_1'] = tf.Variable(tf.zeros([self.out_channels]), name='bias_1')

        input_size = self.embedding_size
        if self.kernel_size == 2:
            # A 2x2 kernel needs a second conv layer to collapse the 3-row input.
            all_weights['cnn_2'] = tf.Variable(tf.random_normal(
                [self.kernel_size, self.kernel_size, self.out_channels, self.out_channels], 0, 0.1),name='cnn_2')
            all_weights['bias_2'] = tf.Variable(tf.zeros([self.out_channels]), name='bias_2')
            # Width remaining after the first conv layer.
            input_size = (self.embedding_size - self.kernel_size + 2 * self.padding)/self.stride + 1

        self.out_size = int(((input_size - self.kernel_size + 2 * self.padding)/self.stride + 1) * self.out_channels)
        all_weights['linear'] = tf.Variable(tf.random_normal([self.out_size, 1], 0, 0.1), name='linear')
        all_weights['bias'] = tf.Variable(tf.zeros([1]), name='bias')

        return all_weights


    def train(self, data_sequence):
        """Run one pass of mini-batch updates over ``data_sequence``.

        Args:
            data_sequence: sequence of ``[user, item0, item1, label]`` rows.
                It is shuffled IN PLACE with ``np.random.shuffle``.

        Returns:
            ``self.train_loss_records`` — the per-batch losses accumulated over
            every call to ``train`` so far (not only this call).
        """
        train_size = len(data_sequence)

        np.random.shuffle(data_sequence)
        batch_size = self.batch_size
        total_batch = math.ceil(train_size/batch_size)

        for batch in range(total_batch):
            start = batch * batch_size
            end = min(start + batch_size, train_size)
            data_array = np.array(data_sequence[start:end])

            feed_dict = {self.user_inputs: np.reshape(data_array[:,0],(-1,1)),
                         self.item_inputs: data_array[:,1:3],
                         self.train_labels: np.reshape(data_array[:,-1],(-1,1))}
            loss, opt = self.sess.run([self.loss_train,self.train_op], feed_dict=feed_dict)
            self.train_loss_records.append(loss)

        return self.train_loss_records


    # CNN for (u,i)
    def net_ui(self, embed_users, embed_items):
        """Score (user, item) pairs: 2-row stack -> 2x2 conv -> ReLU -> linear."""
        connection = tf.reshape(tf.concat([embed_users, embed_items], 1), shape=[-1, 2, self.embedding_size, 1])
        conv = tf.nn.conv2d(input=connection, filter=self.weights['cnn_ui'], strides=[1, self.stride, self.stride, 1], padding='VALID')
        out = tf.nn.relu(tf.nn.bias_add(conv, self.weights['bias_ui']))
        out = tf.reshape(out, [-1,self.out_size_ui])
        y_ = tf.matmul(out, self.weights['linear_0']) + self.weights['bias_0']
        return y_


    # CNN for (u,i,j)
    def net_uij(self, embed_users, embed_items0, embed_items1):
        """Score (user, item0, item1) triples: 3-row stack -> conv(s) -> ReLU -> linear."""
        # Rows are ordered item0, user, item1.
        connection = tf.reshape(tf.concat([embed_items0, embed_users, embed_items1], 1), shape=[-1, 3, self.embedding_size, 1])
        conv = tf.nn.conv2d(input=connection, filter=self.weights['cnn_1'], strides=[1, self.stride, self.stride, 1], padding='VALID')
        # input shape: [ batch, in_height, in_width, in_channel ]
        out = tf.nn.relu(tf.nn.bias_add(conv, self.weights['bias_1']))

        if self.kernel_size == 2:
            conv = tf.nn.conv2d(input=out, filter=self.weights['cnn_2'], strides=[1, 1, 1, 1], padding='VALID')
            out = tf.nn.relu(tf.nn.bias_add(conv, self.weights['bias_2']))

        out = tf.reshape(out, [-1,self.out_size])
        y_ = tf.matmul(out, self.weights['linear']) + self.weights['bias']
        return y_


    # forward pass of the network
    def inference(self, user_inputs, item_inputs):
        """Look up the embeddings and return ``(y_uij, y_ui)`` for a batch.

        ``item_inputs[:, 0]`` feeds both heads; ``item_inputs[:, 1]`` only the
        (u, i, j) head.
        """
        embed_users = tf.reshape(tf.nn.embedding_lookup(self.weights['embedding_users'], user_inputs),
                                 shape=[-1, self.embedding_size])
        embed_items0 = tf.reshape(tf.nn.embedding_lookup(self.weights['embedding_items'], item_inputs[:,0]),
                                 shape=[-1, self.embedding_size])
        embed_items1 = tf.reshape(tf.nn.embedding_lookup(self.weights['embedding_items'], item_inputs[:,1]),
                                 shape=[-1, self.embedding_size])

        y_uij = self.net_uij(embed_users, embed_items0, embed_items1)
        y_ui = self.net_ui(embed_users, embed_items0)

        return y_uij, y_ui


    def loss_function(self, true_labels, y_uij, y_ui,lamda_regularizer=1e-6, loss_type='mse'):
        """Return ``cost_uij + alpha * cost_ui + L2 regularization``.

        ``cost_ui`` is always a mean squared error. ``cost_uij`` is a mean
        squared error unless ``loss_type == 'cross_entropy'``, in which case it
        is the mean sigmoid cross-entropy with ``y_uij`` as logits.
        """
        cost_ui = tf.losses.mean_squared_error(true_labels, y_ui)
        cost_uij = tf.losses.mean_squared_error(true_labels, y_uij)
        if loss_type == 'cross_entropy':
            cost_uij = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=true_labels, logits=y_uij))

        regularization = 0.0
        if lamda_regularizer > 0:
            regularizer_1 = tf.contrib.layers.l2_regularizer(lamda_regularizer)
            regularization = regularizer_1(
                self.weights['embedding_users']) + regularizer_1(
                self.weights['embedding_items'])+ regularizer_1(
                self.weights['cnn_1']) + regularizer_1(
                self.weights['linear'])+ regularizer_1(
                self.weights['cnn_ui']) + regularizer_1(
                self.weights['linear_0'])
            if self.kernel_size == 2:
                regularization = regularization + regularizer_1(self.weights['cnn_2'])

        cost = cost_uij + self.alpha * cost_ui + regularization
        return cost


    def evaluate(self, test_sequence, topK=10):
        """Score every (user, item) pair and return mean ``(hit ratio, NDCG)``.

        NOTE(review): relies on ``get_topk`` and ``hit_ndcg``, which are not
        defined in the visible part of this notebook — confirm they are in
        scope before calling this method.
        """
        score = np.zeros([self.users_num, self.items_num])
        items = np.array([i for i in range(self.items_num)])

        for u in range(self.users_num):
            user_ids = np.reshape(u * np.ones([self.items_num]),(-1,1))
            # The same item id is fed in both item slots.
            feed_dict = {self.user_inputs: user_ids, self.item_inputs:np.c_[items,items]}
            out = self.sess.run([self.predictions], feed_dict=feed_dict)
            score[u,:] = np.reshape(out,(-1, self.items_num))

        ranklist = get_topk(prediction=score, test_sequence=np.array(test_sequence), topK=topK)
        hits,ndcgs = hit_ndcg(test_sequence=np.array(test_sequence), ranklist=ranklist)
        hr,ndcg = np.array(hits).mean(),np.array(ndcgs).mean()
        return hr,ndcg

    def predict(self, pairs, batch_size, verbose):
        """Computes predictions for a given set of user-item pairs.
        Args:
          pairs: A pair of lists (users, items) of the same length.
          batch_size: unused.
          verbose: unused.
        Returns:
          predictions: A float64 array of the same length as users and items,
          such that predictions[i] is the models prediction for
          (users[i], items[i]). Each item id is fed in both item slots.
        """
        del batch_size, verbose
        num_examples = len(pairs[0])
        assert num_examples == len(pairs[1])
        if num_examples == 0:
            return np.empty(0)
        # int32 matches the placeholders' dtype; int16 would overflow for ids
        # above 32767.
        users = np.asarray(pairs[0], dtype=np.int32).reshape(-1, 1)
        items = np.asarray(pairs[1], dtype=np.int32).reshape(-1)
        feed_dict = {self.user_inputs: users,
                     self.item_inputs: np.c_[items, items]}
        # Score all pairs in a single session call rather than one call per pair.
        out = self.sess.run(self.predictions, feed_dict=feed_dict)
        return np.reshape(out, (-1,)).astype(np.float64)

    def get_embeddings(self):
        """Return the current item embedding table as a NumPy array."""
        embeddings = self.sess.run(self.weights['embedding_items'])
        return embeddings

In [5]:
def train(model, train_mat, test_ratings, test_negatives, users_num, items_num, train_list=None, test_list=None,
          positive_size=1, negative_time=4, epochs=128, topK=10, mode='ndcg'):
    """Train ``model`` epoch by epoch with early stopping and report the best metrics.

    Each epoch resamples training instances with ``generate_instances``, runs
    ``model.train`` on them and evaluates with ``evaluate``. Training stops
    early once more than 10 evaluations exist and the monitored metric was
    lower in each of the last two epochs than in the one before them.

    Args:
        model: object exposing ``train(data_sequence=...)`` and whatever
            ``evaluate`` needs from it.
        train_mat: user-item interaction matrix. Objects exposing ``tocsr()``
            are converted to CSR and sampled through ``indptr``/``indices``;
            anything else is sampled as a dense array.
        test_ratings, test_negatives: forwarded to ``evaluate``.
        users_num, items_num: matrix dimensions, used only with ``train_list``.
        train_list: optional interaction sequence; when given it replaces
            ``train_mat`` via ``sequence2mat``.
        test_list: unused; kept for interface compatibility.
        positive_size, negative_time: forwarded to ``generate_instances``.
        epochs: maximum number of epochs.
        topK: ranking cut-off forwarded to ``evaluate``.
        mode: ``'ndcg'`` monitors NDCG; any other value monitors HR.

    Returns:
        ``(best_hr, best_ndcg)`` — HR and NDCG of the evaluation at which the
        monitored metric was highest (the initial evaluation included).
    """
    if train_list is not None:
        # NOTE(review): sequence2mat is not defined in the visible part of
        # this notebook — confirm it is in scope before passing train_list.
        train_mat = sequence2mat(sequence=train_list, N=users_num, M=items_num) # train data : user-item matrix

    # Decide the sampling path from the matrix itself instead of hard-coding it.
    if hasattr(train_mat, 'tocsr'):
        train_mat = train_mat.tocsr()
        is_sparse = True
    else:
        is_sparse = False

    hr_list = []
    ndcg_list = []
    hr, ndcg = evaluate(model, test_ratings, test_negatives, K=topK)
    hr_list.append(hr)
    ndcg_list.append(ndcg)
    print('Init: HR = %.4f, NDCG = %.4f' %(hr, ndcg))

    monitored = ndcg_list if mode == 'ndcg' else hr_list
    for epoch in range(epochs):
        data_sequence = generate_instances(
            train_mat, positive_size=positive_size, negative_time=negative_time, is_sparse=is_sparse)
        loss_records = model.train(data_sequence=data_sequence)
        # Evaluation
        hr, ndcg = evaluate(model, test_ratings, test_negatives, K=topK)
        hr_list.append(hr)
        ndcg_list.append(ndcg)
        # No batch was run if there were no training instances.
        last_loss = loss_records[-1] if len(loss_records) > 0 else float('nan')
        print('epoch=%d, loss=%.4f, HR=%.4f, NDCG=%.4f' %(epoch,last_loss,hr,ndcg))

        # Early stop: two consecutive epochs below the one preceding them.
        if (len(monitored) > 10) and (monitored[-2] < monitored[-3] > monitored[-1]):
            break

    # Report the evaluation with the highest monitored metric, so "Best" is
    # accurate even when the loop ends without an early stop.
    best_idx = int(np.argmax(monitored))
    best_hr, best_ndcg = hr_list[best_idx], ndcg_list[best_idx]

    print("End. Best HR = %.4f, NDCG = %.4f. " %(best_hr, best_ndcg))
    return best_hr, best_ndcg

In [12]:
# ---- data ----
dataset_path = 'data/100k'
dataset = Dataset(dataset_path)
train_mat = dataset.trainMatrix
test_ratings = dataset.testRatings
test_negatives = dataset.testNegatives
print('Dataset: #user=%d, #item=%d, #train_pairs=%d, #test_pairs=%d' % (dataset.num_users, dataset.num_items, train_mat.nnz, len(test_ratings)))

# ---- model hyper-parameters ----
# The two lists are swept as a grid by the loops below.
embedding_size = [32]
lamda_regularizer = [1e-6]
out_channels = 64
kernel_size = 3
alpha = 1.0
learning_rate = 5e-3  # learning rate

# ---- training / evaluation settings ----
positive_size = 1
negative_time = 4
epochs = 64
batch_size = 1024  # mini-batch size
topK = 10
mode = 'hr'

# ---- grid: one fresh model per (embedding size, regularizer) pair ----
for e in embedding_size:
    for l in lamda_regularizer:
        # build the model
        model = Net(
            users_num=dataset.num_users,
            items_num=dataset.num_items,
            batch_size=batch_size,
            embedding_size=e,
            out_channels=out_channels,
            kernel_size=kernel_size,
            learning_rate=learning_rate,
            lamda_regularizer=l,
            alpha=alpha,
        )

        # Train and evaluate model; the matrix is handed over in CSR form.
        train(
            model=model,
            train_mat=train_mat.tocsr(),
            test_ratings=test_ratings,
            test_negatives=test_negatives,
            users_num=dataset.num_users,
            items_num=dataset.num_items,
            positive_size=positive_size,
            negative_time=negative_time,
            epochs=epochs,
            topK=topK,
            mode=mode,
        )

Dataset: #user=943, #item=1682, #train_pairs=99057, #test_pairs=943
Init: HR = 0.1029, NDCG = 0.0400
epoch=0, loss=0.1745, HR=0.5440, NDCG=0.3044
epoch=1, loss=0.1667, HR=0.6607, NDCG=0.3766
epoch=2, loss=0.1459, HR=0.6755, NDCG=0.3904
epoch=3, loss=0.1344, HR=0.6734, NDCG=0.3902
epoch=4, loss=0.1250, HR=0.6734, NDCG=0.3977
epoch=5, loss=0.1358, HR=0.6935, NDCG=0.3947
epoch=6, loss=0.1295, HR=0.6988, NDCG=0.4001
epoch=7, loss=0.1412, HR=0.7063, NDCG=0.4137
epoch=8, loss=0.1202, HR=0.6978, NDCG=0.4160
epoch=9, loss=0.1273, HR=0.7105, NDCG=0.4192
epoch=10, loss=0.1304, HR=0.6893, NDCG=0.4099
epoch=11, loss=0.1197, HR=0.6861, NDCG=0.4061
End. Best HR = 0.7105, NDCG = 0.4192. 
