In [1]:
import numpy as np
import random
import math
import os
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Flatten, Conv2D, Embedding

# Dataset and evaluation protocols reused from
# https://github.com/hexiangnan/neural_collaborative_filtering
from Dataset import Dataset
from evaluate import evaluate_model

In [2]:
def generate_instances(train_mat, positive_size=2, negative_time=8, is_sparse=False):
    """Sample (user, item, item, label) training rows from an interaction matrix.

    For every item ``item0`` a user has interacted with, this emits
      * ``positive_size`` rows ``[u, item0, item1, 1.]`` where ``item1`` is drawn
        (with replacement) from the same user's interacted items, and
      * ``positive_size * negative_time`` rows ``[u, item2, item1, 0.]`` where
        ``item2`` is a random item the user has NOT interacted with and ``item1``
        is any random item.

    Args:
      train_mat: user x item matrix. With ``is_sparse=True`` only its ``shape``,
        ``indptr`` and ``indices`` attributes are read (CSR layout); otherwise it
        is indexed as a dense 2-D array and entries ``> 0`` count as interactions.
      positive_size: number of positive rows per interacted item.
      negative_time: negatives-per-positive multiplier.
      is_sparse: selects how the interacted items of a user are looked up.

    Returns:
      A list of ``[user, item, item, label]`` rows.
    """
    data = []
    users_num, items_num = train_mat.shape

    if is_sparse:
        indptr = train_mat.indptr
        indices = train_mat.indices
    for u in range(users_num):
        if is_sparse:
            # Column ids of the items user u has interacted with.
            rated_items = indices[indptr[u]:indptr[u+1]]
        else:
            rated_items = np.where(train_mat[u, :] > 0)[0]

        # A set gives O(1) membership tests in the rejection loop below.
        rated_set = set(rated_items.tolist())
        # If the user interacted with every item there is nothing to sample as
        # a negative; rejection sampling would spin forever, so skip negatives.
        has_negatives = len(rated_set) < items_num

        for item0 in rated_items:
            for item1 in np.random.choice(rated_items, size=positive_size):
                data.append([u, item0, item1, 1.])
            if not has_negatives:
                continue
            for _ in range(positive_size*negative_time):
                # item1 is unconstrained: it may be positive or negative.
                item1 = np.random.randint(items_num)
                item2 = np.random.randint(items_num)
                while item2 in rated_set:
                    item2 = np.random.randint(items_num)
                data.append([u, item2, item1, 0.])
    return data

In [3]:
def setup_seed(seed):
    """Seed every random number source this notebook relies on.

    Args:
      seed: integer seed applied to Python's ``random``, NumPy's global RNG and
        TensorFlow's global RNG; it is also written to ``PYTHONHASHSEED``.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here presumably only affects child processes -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)         # Python random module.
    np.random.seed(seed)      # NumPy global generator.
    tf.random.set_seed(seed)  # TensorFlow global generator.
        

def evaluate(model, test_ratings, test_negatives, K=10):
    """Run the NCF ``evaluate_model`` helper and average its per-case metrics.

    Args:
      model: object passed straight through to ``evaluate_model``.
      test_ratings: held-out test ratings, passed through unchanged.
      test_negatives: negative candidates, passed through unchanged.
      K: cut-off forwarded as ``K``.

    Returns:
      ``(mean_hit_ratio, mean_ndcg)`` over all values returned by
      ``evaluate_model`` (called single-threaded).
    """
    hits, ndcgs = evaluate_model(model, test_ratings, test_negatives, K=K, num_thread=1)
    mean_hr = np.array(hits).mean()
    mean_ndcg = np.array(ndcgs).mean()
    return mean_hr, mean_ndcg


def get_similar_items(item_mat, idx, topk=5):
    """Return the indices of the rows most cosine-similar to row ``idx``.

    Args:
      item_mat: 2-D array-like of shape (m, k), one embedding per row.
      idx: row index of the query item.
      topk: number of neighbours wanted in addition to the query itself.

    Returns:
      A 1-D integer array with at most ``topk + 1`` row indices ordered by
      decreasing cosine similarity. The query row is normally first because its
      similarity to itself is 1.
    """
    item_mat = np.asarray(item_mat)
    target_item = item_mat[idx, :]

    # One pass for all row norms and all dot products instead of tiling the
    # target m times and looping in Python.
    norms = np.linalg.norm(item_mat, axis=1)
    denom = norms * norms[idx]
    dots = item_mat @ target_item

    # Rows with a zero norm have an undefined cosine; rank them last (-inf)
    # rather than producing NaN and a divide warning.
    sim = np.full(dots.shape, -np.inf, dtype=float)
    np.divide(dots, denom, out=sim, where=denom > 0)

    sorted_items = np.argsort(-sim)
    return sorted_items[:topk+1]  # the most similar is itself

def get_key(item_dict, value):
    """Reverse-lookup a dictionary key by its value.

    Args:
      item_dict: mapping to search.
      value: value to look for.

    Returns:
      The key of the LAST entry (in iteration order) whose value equals
      ``value``, or ``-1`` when no entry matches.
    """
    matching_keys = [k for k, v in item_dict.items() if v == value]
    return matching_keys[-1] if matching_keys else -1


# read original records
def get_item_dict(file_dir):
    """Map each raw item id in a ratings file to a contiguous integer index.

    Every non-blank line must start with ``user, item, rating`` separated by
    ``::``, by ``,`` or by whitespace (checked in that order). Items are
    numbered 0, 1, 2, ... in order of first appearance.

    Args:
      file_dir: path of the ratings file.

    Returns:
      dict mapping the raw item id (as the string read from the file) to its
      integer index.

    Raises:
      ValueError: if a non-blank line has fewer than three fields or its
        rating field is not numeric.
    """
    rated_item_ids_dict = {}
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(file_dir) as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            if '::' in line:
                u, i, r = line.split('::')[:3]
            elif ',' in line:
                u, i, r = line.split(',')[:3]
            else:
                u, i, r = line.split()[:3]

            if i not in rated_item_ids_dict:
                rated_item_ids_dict[i] = len(rated_item_ids_dict)
            # The rating value is not stored; it is parsed only so that a
            # malformed record is rejected exactly as before.
            float(r)

    return rated_item_ids_dict


def id_name(file_dir):
    """Read a ``|``-separated item file into an ``{id: name}`` dictionary.

    The first field of each non-blank line is parsed as an integer id and the
    second field is stored as its name. The file is decoded as latin-1.

    Args:
      file_dir: path of the item file.

    Returns:
      dict mapping integer id to name string.

    Raises:
      ValueError: if a non-blank line has fewer than two fields or a
        non-integer id.
    """
    id_name_dict = {}
    # `with` closes the handle; the original left it open.
    with open(file_dir, 'r', encoding='latin-1') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines
            movie_id, movie_name = line.split('|')[:2]
            id_name_dict[int(movie_id)] = movie_name

    return id_name_dict

In [4]:
class UInet(Model):
    """User-item scoring branch.

    The user embedding and the first item embedding are concatenated, reshaped
    to a ``(batch, 2, embedding_size, 1)`` image, passed through one ReLU
    ``Conv2D`` layer, flattened, and projected to ``n_class`` outputs.
    The embedding layers are supplied by the caller.
    """

    def __init__(self, embedding_user, embedding_item, embedding_size=16, out_channels=64, kernel_size=2, stride=1, padding='valid', n_class=1):
        super(UInet, self).__init__()
        self.embedding_size = embedding_size
        self.embedding_user, self.embedding_item = embedding_user, embedding_item
        self.cnn = Conv2D(filters=out_channels, kernel_size=kernel_size,
                          strides=stride, padding=padding, activation='relu')
        self.flatten = Flatten()
        # Dense infers its input width on first call, so the flattened size
        # does not need to be computed by hand.
        self.linear = Dense(n_class)

    def call(self, x):
        """Score a batch of rows ``[user, item0, item1]``; column 2 is ignored.

        Args:
          x: integer tensor whose columns 0 and 1 hold user and item ids.
        Returns:
          Tensor produced by the final Dense layer (``n_class`` units per row).
        """
        embed_users = self.embedding_user(x[:,0])
        embed_items0 = self.embedding_item(x[:,1])
        # [batch, in_height, in_width, in_channels]
        out = tf.reshape(tf.concat([embed_users, embed_items0],1), (-1, 2, self.embedding_size, 1))
        out = self.cnn(out)
        out = self.flatten(out)
        out = self.linear(out)
        return out

    def predict(self, pairs, batch_size, verbose):
        """Computes predictions for a given set of user-item pairs.
        Args:
          pairs: A pair of lists (users, items) of the same length.
          batch_size: number of pairs scored per forward pass; a falsy value
            scores everything in one pass.
          verbose: unused.
        Returns:
          predictions: A float array of the same length as users and items,
          such that predictions[i] is the models prediction for
          (users[i], items[i]).
        """
        del verbose
        num_examples = len(pairs[0])
        assert num_examples == len(pairs[1])
        # int32 rather than int16: ids above 32767 must not overflow.
        users = np.asarray(pairs[0], dtype=np.int32)
        items = np.asarray(pairs[1], dtype=np.int32)
        # The item id is duplicated into column 2 to match the 3-column input.
        triples = np.c_[users, items, items]
        predictions = np.empty(num_examples)
        step = max(int(batch_size or num_examples), 1)
        # Score whole chunks instead of one pair per forward pass.
        for start in range(0, num_examples, step):
            x = tf.convert_to_tensor(triples[start:start + step])
            out = self.call(x)
            predictions[start:start + step] = tf.reshape(out, [-1]).numpy()
        return predictions

In [5]:
class UIInet(Model):
    """User-item-item scoring branch.

    The embeddings ``[item0, user, item1]`` are concatenated and reshaped to a
    ``(batch, 3, embedding_size, 1)`` image, passed through one ReLU ``Conv2D``
    layer (and a second one when ``kernel_size == 2``), flattened, and
    projected to ``n_class`` outputs. The embedding layers are supplied by the
    caller.
    """

    def __init__(self, embedding_user, embedding_item, embedding_size=8, out_channels=64, kernel_size=2, stride=1, padding='valid', n_class=1):
        super(UIInet, self).__init__()
        self.embedding_size, self.kernel_size = embedding_size, kernel_size
        self.embedding_user, self.embedding_item = embedding_user, embedding_item
        self.cnn1 = Conv2D(filters=out_channels, kernel_size=kernel_size,
                           strides=stride, padding=padding, activation='relu')
        # A second convolution is only stacked for the 2x2 kernel setting.
        if self.kernel_size == 2:
            self.cnn2 = Conv2D(filters=out_channels, kernel_size=kernel_size,
                               strides=stride, padding=padding, activation='relu')
        self.flatten = Flatten()
        self.linear = Dense(n_class)

    def call(self, x):
        """Score a batch of rows ``[user, item0, item1]``.

        Args:
          x: integer tensor whose columns 0, 1, 2 hold user, item0, item1 ids.
        Returns:
          Tensor produced by the final Dense layer (``n_class`` units per row).
        """
        embed_users = self.embedding_user(x[:,0])
        embed_items0 = self.embedding_item(x[:,1])
        embed_items1 = self.embedding_item(x[:,2])
        # The user row sits between the two item rows: [item0, user, item1].
        out = tf.reshape(tf.concat([embed_items0, embed_users, embed_items1],1), (-1, 3, self.embedding_size, 1))
        out = self.cnn1(out)
        if self.kernel_size == 2:
            out = self.cnn2(out)
        out = self.flatten(out)
        out = self.linear(out)
        return out

    def predict(self, pairs, batch_size, verbose):
        """Computes predictions for a given set of user-item pairs.
        Args:
          pairs: A pair of lists (users, items) of the same length.
          batch_size: number of pairs scored per forward pass; a falsy value
            scores everything in one pass.
          verbose: unused.
        Returns:
          predictions: A float array of the same length as users and items,
          such that predictions[i] is the models prediction for
          (users[i], items[i]).
        """
        del verbose
        num_examples = len(pairs[0])
        assert num_examples == len(pairs[1])
        # int32 rather than int16: ids above 32767 must not overflow.
        users = np.asarray(pairs[0], dtype=np.int32)
        items = np.asarray(pairs[1], dtype=np.int32)
        # The same item id is used for both item columns at prediction time.
        triples = np.c_[users, items, items]
        predictions = np.empty(num_examples)
        step = max(int(batch_size or num_examples), 1)
        # Score whole chunks instead of one pair per forward pass.
        for start in range(0, num_examples, step):
            x = tf.convert_to_tensor(triples[start:start + step])
            out = self.call(x)
            predictions[start:start + step] = tf.reshape(out, [-1]).numpy()
        return predictions

In [6]:
class Net(Model):
    """Joint model holding a ``UInet`` and a ``UIInet`` branch.

    Both branches share one user ``Embedding`` and one item ``Embedding``
    created here. ``call`` returns both branch outputs; ``predict`` scores with
    the ``UInet`` output only.
    """

    def __init__(self, users_num, items_num, embedding_size=16, out_channels=64, kernel_size=2, stride=1, padding='valid', n_class=1):
        super(Net, self).__init__()
        self.embedding_size, self.kernel_size, self.items_num, self.users_num = embedding_size, kernel_size, items_num, users_num
        self.embedding_user  = Embedding(self.users_num, self.embedding_size)
        self.embedding_item = Embedding(self.items_num, self.embedding_size)

        # The user-item branch always uses a 2x2 kernel; only the
        # user-item-item branch honours the kernel_size argument.
        self.net_ui = UInet(embedding_user=self.embedding_user,
                            embedding_item=self.embedding_item,
                            embedding_size=self.embedding_size,
                            out_channels=out_channels,
                            kernel_size=2,
                            stride=stride,
                            padding=padding,
                            n_class=n_class)
        self.net_uii = UIInet(embedding_user=self.embedding_user,
                              embedding_item=self.embedding_item,
                              embedding_size=self.embedding_size,
                              out_channels=out_channels,
                              kernel_size=self.kernel_size,
                              stride=stride,
                              padding=padding,
                              n_class=n_class)

    def call(self, x):
        """Run both branches on rows ``[user, item0, item1]``.

        Returns:
          ``(out_ui, out_uii)``: the UInet output and the UIInet output.
        """
        out1 = self.net_ui(x)
        out2 = self.net_uii(x)
        return out1, out2

    def predict(self, pairs, batch_size, verbose):
        """Computes predictions for a given set of user-item pairs.
        Args:
          pairs: A pair of lists (users, items) of the same length.
          batch_size: number of pairs scored per forward pass; a falsy value
            scores everything in one pass.
          verbose: unused.
        Returns:
          predictions: A float array of the same length as users and items,
          such that predictions[i] is the models prediction for
          (users[i], items[i]).
        """
        del verbose
        num_examples = len(pairs[0])
        assert num_examples == len(pairs[1])
        # int32 rather than int16: ids above 32767 must not overflow.
        users = np.asarray(pairs[0], dtype=np.int32)
        items = np.asarray(pairs[1], dtype=np.int32)
        # The same item id is used for both item columns at prediction time.
        triples = np.c_[users, items, items]
        predictions = np.empty(num_examples)
        step = max(int(batch_size or num_examples), 1)
        # Score whole chunks instead of one pair per forward pass.
        for start in range(0, num_examples, step):
            x = tf.convert_to_tensor(triples[start:start + step])
            # Only the user-item branch output is used for ranking.
            out, _ = self.call(x)
            predictions[start:start + step] = tf.reshape(out, [-1]).numpy()
        return predictions

    def get_embeddings(self):
        """Return the item embedding vectors for item ids ``0 .. items_num-1``."""
        idx = tf.convert_to_tensor(list(range(self.items_num)), dtype=tf.int32)
        embeddings = self.embedding_item(idx)
        return embeddings

In [7]:
def train(model, train_mat, test_ratings, test_negatives, users_num, items_num, train_list=None, test_list=None,
          learning_rate = 1e-2, weight_decay=1e-6, alpha=1., positive_size=1, negative_time=4, epochs=64, 
          batch_size=1024, topK=10, mode='hr'):
    """Train ``model`` with Adam and evaluate HR/NDCG after every epoch.

    Each epoch re-samples training rows with ``generate_instances``, shuffles
    them, and minimises ``MSE(out_uii, y) + alpha * MSE(out_ui, y)`` over
    mini-batches. Training stops early once more than 10 evaluations exist and
    the monitored metric from two evaluations ago is strictly greater than both
    later ones.

    Args:
      model: object whose call returns ``(out_ui, out_uii)`` and which provides
        ``get_embeddings()`` and ``trainable_variables``.
      train_mat: training interaction matrix; read through its CSR attributes
        unless ``train_list`` is given.
      test_ratings, test_negatives: forwarded to ``evaluate``.
      users_num, items_num: only used to build the matrix from ``train_list``.
      train_list: optional interaction sequence; when given it replaces
        ``train_mat`` via ``sequence2mat`` and the result is treated as dense.
      test_list: accepted but unused.
      learning_rate: Adam learning rate.
      weight_decay: accepted but unused.
      alpha: weight of the user-item branch loss.
      positive_size, negative_time: forwarded to ``generate_instances``.
      epochs: maximum number of epochs.
      batch_size: mini-batch size.
      topK: ranking cut-off forwarded to ``evaluate``.
      mode: ``'ndcg'`` monitors NDCG for early stopping; anything else, HR.

    Returns:
      The item embeddings snapshotted at the evaluation whose metrics are
      printed as "Best" (the last epoch unless early stopping triggered).
    """
    optimizer = tf.keras.optimizers.Adam(learning_rate)

    # train_mat is read through its CSR attributes unless rebuilt from a list.
    is_sparse = True
    if train_list is not None:
        # NOTE(review): sequence2mat is not defined in the visible notebook
        # cells; it must be provided elsewhere for this path to work.
        train_mat = sequence2mat(sequence=train_list, N=users_num, M=items_num)  # user-item matrix
        is_sparse = False

    hr_list = []
    ndcg_list = []
    hr, ndcg = evaluate(model, test_ratings, test_negatives, K=topK)
    hr_list.append(hr)
    ndcg_list.append(ndcg)
    # Embedding snapshots for the most recent evaluations (at most three are
    # kept), so early stopping can return the embeddings that belong to the
    # reported best metrics.
    snapshots = [model.get_embeddings()]
    embeddings = snapshots[-1]
    print('Init: HR = %.4f, NDCG = %.4f' %(hr, ndcg))
    best_hr, best_ndcg = hr, ndcg
    for epoch in range(epochs):
        data_sequence = generate_instances(train_mat, positive_size=positive_size, negative_time=negative_time, is_sparse=is_sparse)

        train_size = len(data_sequence)
        np.random.shuffle(data_sequence)
        # Convert once per epoch instead of once per mini-batch.
        data_array_all = np.array(data_sequence)
        total_batch = math.ceil(train_size/batch_size)

        for batch in range(total_batch):
            start = batch*batch_size
            end = min(start+batch_size, train_size)
            data_array = data_array_all[start:end]
            x = tf.convert_to_tensor(data_array[:,:3], dtype=tf.int32)
            y = tf.reshape(tf.convert_to_tensor(data_array[:,-1], dtype=tf.float32),(-1,1))
            with tf.GradientTape() as tape:  # records operations for autodiff
                y1, y2 = model(x)
                # tf.losses.MSE reduces over the last axis only, so `loss` is a
                # per-example vector; tape.gradient differentiates its sum.
                loss = tf.losses.MSE(tf.cast(y2, dtype=tf.float32), y) + alpha * tf.losses.MSE(tf.cast(y1, dtype=tf.float32), y)
            train_vars = model.trainable_variables
            grads = tape.gradient(loss, train_vars)  # compute gradients
            optimizer.apply_gradients(grads_and_vars = zip(grads, train_vars))

        # Evaluation
        hr, ndcg = evaluate(model, test_ratings, test_negatives, K=topK)
        hr_list.append(hr)
        ndcg_list.append(ndcg)
        snapshots.append(model.get_embeddings())
        snapshots = snapshots[-3:]
        print('epoch=%d, HR=%.4f, NDCG=%.4f' %(epoch, hr, ndcg))

        mlist = ndcg_list if mode == 'ndcg' else hr_list
        if (len(mlist) > 10) and (mlist[-2] < mlist[-3] > mlist[-1]):
            # The evaluation two steps back was a local peak: report it and
            # return the embeddings captured at that same point.
            best_hr, best_ndcg = hr_list[-3], ndcg_list[-3]
            embeddings = snapshots[-3]
            break
        best_hr, best_ndcg = hr, ndcg
        embeddings = snapshots[-1]

    print("End. Best HR = %.4f, NDCG = %.4f. " %(best_hr, best_ndcg))
    return embeddings

In [8]:
# Driver cell: load the dataset, set the hyper-parameters, seed the RNGs,
# build the model and train it.
dataset_path = 'data/lastfm'

# Load the dataset
dataset = Dataset(dataset_path)
train_mat, test_ratings, test_negatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives
print('Dataset: #user=%d, #item=%d, #train_pairs=%d, #test_pairs=%d' 
      % (dataset.num_users, dataset.num_items, train_mat.nnz, len(test_ratings)))

# Model hyper-parameters.
embedding_size = 32  # original note: e=32,o=32 or 64 (presumably embedding/out_channels settings tried -- confirm)
out_channels = 8
learning_rate = 1e-2
weight_decay = 1e-6  # forwarded to train(), which accepts it but never reads it
alpha = 1.0
kernel_size = 2

# Sampling / training-loop settings.
positive_size = 1
negative_time = 4
epochs = 64
batch_size = 1024  # mini-batch size
topK = 10
mode = 'hr'

# Seed before the model is built so that weight initialisation is seeded too.
setup_seed(1)
# Initialize the model
model = Net(users_num=dataset.num_users, items_num=dataset.num_items, 
            embedding_size=embedding_size, out_channels=out_channels, kernel_size=kernel_size)

# Train and evaluate model
# train() reads train_mat through its CSR attributes, hence the .tocsr() call.
embeddings = train(model=model, 
                  train_mat=train_mat.tocsr(), 
                  test_ratings=test_ratings, 
                  test_negatives=test_negatives, 
                  users_num=dataset.num_users, 
                  items_num=dataset.num_items,  
                  learning_rate=learning_rate,
                  weight_decay=weight_decay,
                  alpha=alpha,
                  positive_size=positive_size,
                  negative_time=negative_time,
                  epochs=epochs,
                  batch_size=batch_size,
                  topK=topK,
                  mode=mode)
print('----------------------------------------------------------')

Dataset: #user=518, #item=3488, #train_pairs=45654, #test_pairs=518
Init: HR = 0.1236, NDCG = 0.0585
epoch=0, HR=0.5019, NDCG=0.3113
epoch=1, HR=0.6776, NDCG=0.4635
epoch=2, HR=0.7317, NDCG=0.5076
epoch=3, HR=0.7201, NDCG=0.5331
epoch=4, HR=0.7355, NDCG=0.5232
epoch=5, HR=0.7432, NDCG=0.5291
epoch=6, HR=0.7529, NDCG=0.5410
epoch=7, HR=0.7645, NDCG=0.5394
epoch=8, HR=0.7587, NDCG=0.5410
epoch=9, HR=0.7606, NDCG=0.5386
End. Best HR = 0.7645, NDCG = 0.5394. 
----------------------------------------------------------


In [13]:
# Export `model` with tf.saved_model.save to ./model/002. The weights written
# are whatever the model holds when this cell runs.
tf.saved_model.save(model, './model/002')

INFO:tensorflow:Assets written to: ./model/002\assets
