In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import numpy as np
import random
import math
import os

# Dataset and evaluation protocols reused from
# https://github.com/hexiangnan/neural_collaborative_filtering
from Dataset import Dataset
from evaluate import evaluate_model

In [2]:
def generate_instances(train_mat, positive_size=2, negative_time=8, is_sparse=False):
    data = []
    users_num,items_num = train_mat.shape
    
    if is_sparse:
        indptr = train_mat.indptr
        indices = train_mat.indices
    for u in range(users_num):
        if is_sparse:
            rated_items = indices[indptr[u]:indptr[u+1]] #用户u中有评分项的id
        else:
            rated_items = np.where(train_mat[u,:]>0)[0]
        
        for item0 in rated_items:
            for item1 in np.random.choice(rated_items, size=positive_size):
                data.append([u,item0,item1,1.])
            for _ in range(positive_size*negative_time):
                item1 = np.random.randint(items_num) # no matter item1 is positive or negtive
                item2 = np.random.randint(items_num)
                while item2 in rated_items:
                    item2 = np.random.randint(items_num)
                data.append([u,item2,item1,0.])
    return data

In [3]:
def setup_seed(seed):
    np.random.seed(seed)  # Numpy module.
    random.seed(seed)  # Python random module.
    os.environ['PYTHONHASHSEED'] = str(seed) # 为了禁止hash随机化，使得实验可复现。
    torch.manual_seed(seed) # 为CPU设置随机种子
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed) # 为当前GPU设置随机种子
        torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU，为所有GPU设置随机种子
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        

def evaluate(model, test_ratings, test_negatives, K=10):
    """Helper that calls evaluate from the NCF libraries."""
    (hits, ndcgs) = evaluate_model(model, test_ratings, test_negatives, K=K, num_thread=1)
    return np.array(hits).mean(), np.array(ndcgs).mean()


def get_similar_items(item_mat, idx, topk=5):
    m,k = item_mat.shape
    target_item = item_mat[idx,:]
    target_mat = np.reshape(np.tile(target_item,m),(-1,k))
    sim = [np.dot(target_mat[i], item_mat[i])/(np.linalg.norm(target_mat[i])*np.linalg.norm(item_mat[i])) 
           for i in range(m)] 
    sorted_items = np.argsort(-np.array(sim))
    return sorted_items[:topk+1] # the most similar is itself

def get_key(item_dict, value):
    key = -1
    for (k, v) in item_dict.items():
        if v == value:
            key = k
    return key


# read original records
def get_item_dict(file_dir):
    # output: 
    # N: the number of user;
    # M: the number of item
    # data: the list of rating information
    user_ids_dict, rated_item_ids_dict = {},{}
    N, M, u_idx, i_idx = 0,0,0,0 
    data_rating = []
    data_time = []
    f = open(file_dir)
    for line in f.readlines():
        if '::' in line:
            u, i, r = line.split('::')[:3]
        elif ',' in line:
            u, i, r = line.split(',')[:3]
        else:
            u, i, r = line.split()[:3]
    
        if u not in user_ids_dict:
            user_ids_dict[u]=u_idx
            u_idx+=1
        if i not in rated_item_ids_dict:
            rated_item_ids_dict[i]=i_idx
            i_idx+=1
        data_rating.append([user_ids_dict[u],rated_item_ids_dict[i],float(r)])
    
    f.close()
    N = u_idx
    M = i_idx

    return rated_item_ids_dict

# read id and its name
def id_name(file_dir):
    id_name_dict = {}
    f = open(file_dir, 'r', encoding='latin-1')
    for line in f.readlines():
        movie_id, movie_name = line.split('|')[:2]
        id_name_dict[int(movie_id)] = movie_name
        
    return id_name_dict

In [4]:
class CoNet(nn.Module):
    def __init__(self, users_num, items_num, embedding_size_users=64, embedding_size_items = 64, 
                 hidden_size = [64,32,16,8], is_attention = False):
        super(CoNet, self).__init__()
        self.embedding_size_users, self.embedding_size_items= embedding_size_users, embedding_size_items 
        self.items_num, self.users_num = items_num, users_num
        self.hidden_size, self.is_attention = hidden_size, is_attention
        self.embedding_user  = nn.Embedding(self.users_num, self.embedding_size_users)
        self.embedding_item = nn.Embedding(self.items_num, self.embedding_size_items)
        self.layer1 = nn.Linear(self.embedding_size_users + self.embedding_size_items, self.hidden_size[0])
        self.layers = [nn.Sequential(nn.Linear(self.hidden_size[i], self.hidden_size[i+1]), nn.ReLU()) for i in range(len(self.hidden_size) - 1)]
        self.linear = nn.Linear(self.hidden_size[-1], 1)
 
    def forward(self, x):
        embed_users = self.embedding_user(x[:,0])
        embed_items0 = self.embedding_item(x[:,1])
        embed_items1 = self.embedding_item(x[:,2])
        
        embed_items = (embed_items0 + embed_items1)/2.
        if self.is_attention:
            score0 = torch.reshape(torch.sum(embed_users * embed_items0, 1), shape=[-1,1])
            score1 = torch.reshape(torch.sum(embed_users * embed_items1, 1), shape=[-1,1])
            alpha = torch.sigmoid(score0 - score1)
            embed_items = alpha * embed_items0 + (1. - alpha) * embed_items1
            
        out = torch.cat([embed_users, embed_items],1)
        out = self.layer1(out)
        for layer in self.layers:
            out = layer(out)
        out = self.linear(out) 
        return out
    
    def predict(self, pairs, batch_size, verbose):
        """Computes predictions for a given set of user-item pairs.
        Args:
          pairs: A pair of lists (users, items) of the same length.
          batch_size: unused.
          verbose: unused.
        Returns:
          predictions: A list of the same length as users and items, such that
          predictions[i] is the models prediction for (users[i], items[i]).
        """
        del batch_size, verbose
        num_examples = len(pairs[0])
        assert num_examples == len(pairs[1])
        predictions = np.empty(num_examples)
        pairs = np.array(pairs, dtype=np.int16)
        for i in range(num_examples):
            x = np.c_[pairs[0][i],pairs[1][i],pairs[1][i]]
            x = torch.from_numpy(x).long()
            out = self.forward(x)
            predictions[i] = out.reshape(-1).data.numpy()
        return predictions
    
    def get_embeddings(self):
        idx = torch.LongTensor([i for i in range(self.items_num)])
        embeddings = self.embedding_item(idx)
        return embeddings

In [5]:
def train(model, train_mat, test_ratings, test_negatives, users_num, items_num, train_list=None, test_list=None,
          learning_rate = 1e-2, weight_decay=0.01, positive_size=1, negative_time=4, epochs=64, 
          batch_size=1024, topK=10, mode='hr'):
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = nn.MSELoss()
    
    if train_list!=None:
        train_mat= sequence2mat(sequence=train_list, N=users_num, M=items_num) # train data : user-item matrix
        is_sparse = False
    
    hr_list=[]
    ndcg_list=[]
    hr, ndcg = evaluate(model, test_ratings, test_negatives, K=topK)
    embeddings = model.get_embeddings()
    hr_list.append(hr)
    ndcg_list.append(ndcg)
    print('Init: HR = %.4f, NDCG = %.4f' %(hr, ndcg))
    best_hr, best_ndcg = hr, ndcg
    for epoch in range(epochs):
        data_sequence = generate_instances(train_mat, positive_size=positive_size, negative_time=negative_time, is_sparse=True)
        #data_sequence = read_list("output/" + str(epoch) + ".txt")
        
        train_size = len(data_sequence)
        np.random.shuffle(data_sequence)
        batch_size = batch_size
        total_batch = math.ceil(train_size/batch_size)

        for batch in range(total_batch):
            start = (batch*batch_size)% train_size
            end = min(start + batch_size, train_size)
            data_array = np.array(data_sequence[start:end])
            x = torch.from_numpy(data_array[:,:3]).long()
            y = torch.from_numpy(data_array[:,-1]).reshape(-1,1)
            y_ = model(x)
            loss = criterion(y_.float(), y.float())
            optimizer.zero_grad()              # clear gradients for this training step
            loss.backward()                    # backpropagation, compute gradients
            optimizer.step()                   # apply gradients
            
        # Evaluation
        hr, ndcg = evaluate(model, test_ratings, test_negatives, K=topK)
        hr_list.append(hr)
        ndcg_list.append(ndcg)
        print('epoch=%d, loss=%.4f, HR=%.4f, NDCG=%.4f' %(epoch, loss, hr, ndcg))
        
        mlist = hr_list
        if mode == 'ndcg':
            mlist = ndcg_list
        if (len(mlist) > 20) and (mlist[-2] < mlist[-3] > mlist[-1]):
            best_hr, best_ndcg = hr_list[-3], ndcg_list[-3]
            embeddings = model.get_embeddings()
            break
        best_hr, best_ndcg = hr, ndcg
        embeddings = model.get_embeddings()
            
    print("End. Best HR = %.4f, NDCG = %.4f. " %(best_hr, best_ndcg))
    return embeddings

In [6]:
dataset_path = 'data/100k'

# Load the dataset
dataset = Dataset(dataset_path)
train_mat, test_ratings, test_negatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives
print('Dataset: #user=%d, #item=%d, #train_pairs=%d, #test_pairs=%d' 
      % (dataset.num_users, dataset.num_items, train_mat.nnz, len(test_ratings)))

embedding_size_users = 64
embedding_size_items = 64
hidden_size = [64,32,16]
is_attention = True
learning_rate = 1e-3
weight_decay = 1e-5

positive_size = 2
negative_time = 8
epochs = 64
batch_size = 1024 #batch大小
topK = 10
mode = 'hr'
seed = 18

setup_seed(seed)
# Initialize the model
model = CoNet(users_num=dataset.num_users, items_num=dataset.num_items, embedding_size_users=embedding_size_users, 
              embedding_size_items=embedding_size_items, hidden_size=hidden_size, is_attention=is_attention)

if torch.cuda.is_available():
    model = model.cuda()

# Train and evaluate model
embeddings = train(model=model, 
                  train_mat=train_mat.tocsr(), 
                  test_ratings=test_ratings, 
                  test_negatives=test_negatives, 
                  users_num=dataset.num_users, 
                  items_num=dataset.num_items,  
                  learning_rate=learning_rate,
                  weight_decay=weight_decay,
                  positive_size=positive_size,
                  negative_time=negative_time,
                  epochs=epochs,
                  batch_size=batch_size,
                  topK=topK,
                  mode=mode)
print('----------------------------------------------------------')

Dataset: #user=943, #item=1682, #train_pairs=99057, #test_pairs=943
Init: HR = 0.1039, NDCG = 0.0442
epoch=0, loss=0.0815, HR=0.4634, NDCG=0.2646
epoch=1, loss=0.0529, HR=0.4793, NDCG=0.2622
epoch=2, loss=0.0545, HR=0.5599, NDCG=0.3253
epoch=3, loss=0.0571, HR=0.6267, NDCG=0.3489
epoch=4, loss=0.0575, HR=0.6225, NDCG=0.3455
epoch=5, loss=0.0561, HR=0.6384, NDCG=0.3611
epoch=6, loss=0.0314, HR=0.6288, NDCG=0.3707
epoch=7, loss=0.0498, HR=0.6660, NDCG=0.3743
epoch=8, loss=0.0507, HR=0.6469, NDCG=0.3766
epoch=9, loss=0.0426, HR=0.6702, NDCG=0.3866
epoch=10, loss=0.0341, HR=0.6744, NDCG=0.3874
epoch=11, loss=0.0460, HR=0.6691, NDCG=0.3808
epoch=12, loss=0.0381, HR=0.6861, NDCG=0.3948
epoch=13, loss=0.0350, HR=0.6723, NDCG=0.3886
epoch=14, loss=0.0406, HR=0.6776, NDCG=0.3898
epoch=15, loss=0.0388, HR=0.6596, NDCG=0.3804
epoch=16, loss=0.0401, HR=0.6914, NDCG=0.4002
epoch=17, loss=0.0340, HR=0.6872, NDCG=0.4116
epoch=18, loss=0.0558, HR=0.6819, NDCG=0.3953
epoch=19, loss=0.0439, HR=0.6957, N

In [7]:
file_dir = 'datasets/ml-100k/u.item'
id_name_dict = id_name(file_dir) # original id : movie name

file_dir = 'datasets/ml-100k/u.data'
item_dict = get_item_dict(file_dir) # original id : new id

In [8]:
movieid_list = [174, 195, 449]
    
for movieid in movieid_list:
    print('MovieID:', movieid, '; MovieName:', id_name_dict[movieid])
    original_id = str(movieid)
    target_item = item_dict[original_id]

    top5 = get_similar_items(embeddings.data.numpy(), idx=target_item)
    movie_list = [get_key(item_dict=item_dict, value=i) for i in top5]
    rec_list = [id_name_dict[int(movie_id)] for movie_id in movie_list[1:]]
    for i in range(len(rec_list)):
        print('\n{0}: {1}'.format(i+1, rec_list[i]))
    print('------------------------------------------------------------------')

MovieID: 174 ; MovieName: Raiders of the Lost Ark (1981)

1: Back to the Future (1985)

2: Fugitive, The (1993)

3: Empire Strikes Back, The (1980)

4: Blues Brothers, The (1980)

5: Dances with Wolves (1990)
------------------------------------------------------------------
MovieID: 195 ; MovieName: Terminator, The (1984)

1: Fugitive, The (1993)

2: Batman (1989)

3: One Flew Over the Cuckoo's Nest (1975)

4: Terminator 2: Judgment Day (1991)

5: Forrest Gump (1994)
------------------------------------------------------------------
MovieID: 449 ; MovieName: Star Trek: The Motion Picture (1979)

1: Star Trek III: The Search for Spock (1984)

2: Star Trek V: The Final Frontier (1989)

3: Die Hard 2 (1990)

4: Escape from New York (1981)

5: Amityville Horror, The (1979)
------------------------------------------------------------------
