In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import numpy as np
import random
import math
import os

# Dataset and evaluation protocols reused from
# https://github.com/hexiangnan/neural_collaborative_filtering
from Dataset import Dataset
from evaluate import evaluate_model

In [2]:
def generate_instances(train_mat, positive_size=2, negative_time=8, is_sparse=False):
    data = []
    users_num,items_num = train_mat.shape
    
    if is_sparse:
        indptr = train_mat.indptr
        indices = train_mat.indices
    for u in range(users_num):
        if is_sparse:
            rated_items = indices[indptr[u]:indptr[u+1]] #用户u中有评分项的id
        else:
            rated_items = np.where(train_mat[u,:]>0)[0]
        
        for item0 in rated_items:
            for item1 in np.random.choice(rated_items, size=positive_size):
                data.append([u,item0,item1,1.])
            for _ in range(positive_size*negative_time):
                item1 = np.random.randint(items_num) # no matter item1 is positive or negtive
                item2 = np.random.randint(items_num)
                while item2 in rated_items:
                    item2 = np.random.randint(items_num)
                data.append([u,item2,item1,0.])
    return data

In [28]:
def setup_seed(seed):
    np.random.seed(seed)  # Numpy module.
    random.seed(seed)  # Python random module.
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed) # CPU seed
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed) # GPU
        torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        

def evaluate(model, test_ratings, test_negatives, K=10):
    """Helper that calls evaluate from the NCF libraries."""
    (hits, ndcgs) = evaluate_model(model, test_ratings, test_negatives, K=K, num_thread=1)
    return np.array(hits).mean(), np.array(ndcgs).mean()


def get_similar_items(item_mat, idx, topk=5):
    m,k = item_mat.shape
    target_item = item_mat[idx,:]
    target_mat = np.reshape(np.tile(target_item,m),(-1,k))
    sim = [np.dot(target_mat[i], item_mat[i])/(np.linalg.norm(target_mat[i])*np.linalg.norm(item_mat[i])) 
           for i in range(m)] 
    sorted_items = np.argsort(-np.array(sim))
    return sorted_items[:topk+1] # the most similar is itself

def get_key(item_dict, value):
    key = -1
    for (k, v) in item_dict.items():
        if v == value:
            key = k
    return key


# read original records
def get_item_dict(file_dir):
    # output: 
    # N: the number of user;
    # M: the number of item
    # data: the list of rating information
    user_ids_dict, rated_item_ids_dict = {},{}
    N, M, u_idx, i_idx = 0,0,0,0 
    data_rating = []
    data_time = []
    f = open(file_dir)
    for line in f.readlines():
        if '::' in line:
            u, i, r = line.split('::')[:3]
        elif ',' in line:
            u, i, r = line.split(',')[:3]
        else:
            u, i, r = line.split()[:3]
    
        if u not in user_ids_dict:
            user_ids_dict[u]=u_idx
            u_idx+=1
        if i not in rated_item_ids_dict:
            rated_item_ids_dict[i]=i_idx
            i_idx+=1
        data_rating.append([user_ids_dict[u],rated_item_ids_dict[i],float(r)])
    
    f.close()
    N = u_idx
    M = i_idx

    return rated_item_ids_dict


def id_name(file_dir):
    id_name_dict = {}
    f = open(file_dir, 'r', encoding='latin-1')
    for line in f.readlines():
        movie_id, movie_name = line.split('|')[:2]
        id_name_dict[int(movie_id)] = movie_name
        
    return id_name_dict

In [4]:
class UInet(nn.Module):
    def __init__(self, embedding_user, embedding_item, embedding_size=16, out_channels=64, kernel_size=2, stride=1, padding=0, n_class=1):
        super(UInet, self).__init__()
        self.embedding_size = embedding_size
        self.embedding_user, self.embedding_item = embedding_user, embedding_item
        self.cnn = nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(int(((self.embedding_size-kernel_size+2*padding)/stride+1)*out_channels), n_class)
 
    def forward(self, x):
        embed_users = self.embedding_user(x[:,0])
        embed_items0 = self.embedding_item(x[:,1])
        embed_items1 = self.embedding_item(x[:,2])
        out = torch.cat([embed_users, embed_items0],1).reshape(-1, 1, 2, self.embedding_size)
        out = self.cnn(out)          
        out = self.relu(out)
        out = torch.flatten(out, 1)
        out = self.linear(out) 
        return out
    
    def predict(self, pairs, batch_size, verbose):
        """Computes predictions for a given set of user-item pairs.
        Args:
          pairs: A pair of lists (users, items) of the same length.
          batch_size: unused.
          verbose: unused.
        Returns:
          predictions: A list of the same length as users and items, such that
          predictions[i] is the models prediction for (users[i], items[i]).
        """
        del batch_size, verbose
        num_examples = len(pairs[0])
        assert num_examples == len(pairs[1])
        predictions = np.empty(num_examples)
        pairs = np.array(pairs, dtype=np.int16)
        for i in range(num_examples):
            x = np.c_[pairs[0][i],pairs[1][i],pairs[1][i]]
            x = torch.from_numpy(x).long()
            out = self.forward(x)
            predictions[i] = out.reshape(-1).data.numpy()
        return predictions

In [5]:
class UIInet(nn.Module):
    def __init__(self, embedding_user, embedding_item, embedding_size=16, out_channels=64, kernel_size=2, stride=1, padding=0, n_class=1):
        super(UIInet, self).__init__()
        self.embedding_size, self.kernel_size = embedding_size, kernel_size
        self.embedding_user, self.embedding_item = embedding_user, embedding_item
        self.cnn1 = nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
        self.cnn2 = nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
        self.relu = nn.ReLU()
        if self.kernel_size == 2:
            input_size = (self.embedding_size - self.kernel_size + 2 * padding)/stride + 1
            channel_num = out_channels
        else:
            input_size = self.embedding_size
            channel_num = out_channels
        self.linear = nn.Linear(int(((input_size - self.kernel_size + 2 * padding)/stride + 1) * channel_num), n_class)
 
    def forward(self, x):
        embed_users = self.embedding_user(x[:,0])
        embed_items0 = self.embedding_item(x[:,1])
        embed_items1 = self.embedding_item(x[:,2])
        out = torch.cat([embed_items0, embed_users, embed_items1],1).reshape(-1, 1, 3, self.embedding_size)
        out = self.cnn1(out)          
        out = self.relu(out)
        if self.kernel_size == 2: 
            out = self.cnn2(out)          
            out = self.relu(out)
        out = torch.flatten(out, 1)
        out = self.linear(out) 
        return out
    
    def predict(self, pairs, batch_size, verbose):
        """Computes predictions for a given set of user-item pairs.
        Args:
          pairs: A pair of lists (users, items) of the same length.
          batch_size: unused.
          verbose: unused.
        Returns:
          predictions: A list of the same length as users and items, such that
          predictions[i] is the models prediction for (users[i], items[i]).
        """
        del batch_size, verbose
        num_examples = len(pairs[0])
        assert num_examples == len(pairs[1])
        predictions = np.empty(num_examples)
        pairs = np.array(pairs, dtype=np.int16)
        for i in range(num_examples):
            x = np.c_[pairs[0][i],pairs[1][i],pairs[1][i]]
            x = torch.from_numpy(x).long()
            out = self.forward(x)
            predictions[i] = out.reshape(-1).data.numpy()
        return predictions

In [6]:
class Net(nn.Module):
    def __init__(self, users_num, items_num, embedding_size=16, out_channels=64, kernel_size=2, stride=1, padding=0, n_class=1):
        super(Net, self).__init__()
        self.embedding_size, self.kernel_size, self.items_num, self.users_num = embedding_size, kernel_size, items_num, users_num
        self.embedding_user  = nn.Embedding(self.users_num, self.embedding_size)
        self.embedding_item = nn.Embedding(self.items_num, self.embedding_size)
        #self.embedding_user  = nn.Embedding.from_pretrained(torch.nn.init.normal(tensor=torch.Tensor(self.users_num, self.embedding_size), mean=0, std=0.1))
        #self.embedding_item = nn.Embedding.from_pretrained(torch.nn.init.normal(tensor=torch.Tensor(self.items_num, self.embedding_size), mean=0, std=0.1))

        self.net_ui = UInet(embedding_user=self.embedding_user, 
                            embedding_item=self.embedding_item, 
                            embedding_size=self.embedding_size, 
                            out_channels=out_channels, 
                            kernel_size=2, 
                            stride=stride, 
                            padding=padding, 
                            n_class=n_class)
        self.net_uii = UIInet(embedding_user=self.embedding_user, 
                              embedding_item=self.embedding_item, 
                              embedding_size=self.embedding_size, 
                              out_channels=out_channels, 
                              kernel_size=self.kernel_size, 
                              stride=stride, 
                              padding=padding, 
                              n_class=n_class)
 
    def forward(self, x):
        out1 = self.net_ui(x)          
        out2 = self.net_uii(x) 
        return out1, out2
    
    def predict(self, pairs, batch_size, verbose):
        """Computes predictions for a given set of user-item pairs.
        Args:
          pairs: A pair of lists (users, items) of the same length.
          batch_size: unused.
          verbose: unused.
        Returns:
          predictions: A list of the same length as users and items, such that
          predictions[i] is the models prediction for (users[i], items[i]).
        """
        del batch_size, verbose
        num_examples = len(pairs[0])
        assert num_examples == len(pairs[1])
        predictions = np.empty(num_examples)
        pairs = np.array(pairs, dtype=np.int16)
        for i in range(num_examples):
            x = np.c_[pairs[0][i],pairs[1][i],pairs[1][i]]
            x = torch.from_numpy(x).long()
            out, _ = self.forward(x)
            predictions[i] = out.reshape(-1).data.numpy()
        return predictions
    
    def get_embeddings(self):
        idx = torch.LongTensor([i for i in range(self.items_num)])
        embeddings = self.embedding_item(idx)
        return embeddings

In [9]:
def train(model, train_mat, test_ratings, test_negatives, users_num, items_num, train_list=None, test_list=None,
          learning_rate = 1e-2, weight_decay=1e-6, alpha=1., positive_size=1, negative_time=4, epochs=64, 
          batch_size=1024, topK=10, mode='hr'):
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    #criterion = nn.BCEWithLogitsLoss()
    criterion = nn.MSELoss()
    
    if train_list!=None:
        train_mat= sequence2mat(sequence=train_list, N=users_num, M=items_num) # train data : user-item matrix
        is_sparse = False
    
    hr_list=[]
    ndcg_list=[]
    hr, ndcg = evaluate(model, test_ratings, test_negatives, K=topK)
    embeddings = model.get_embeddings()
    hr_list.append(hr)
    ndcg_list.append(ndcg)
    print('Init: HR = %.4f, NDCG = %.4f' %(hr, ndcg))
    best_hr, best_ndcg = hr, ndcg
    for epoch in range(epochs):
        data_sequence = generate_instances(train_mat, positive_size=positive_size, negative_time=negative_time, is_sparse=True)
        #data_sequence = read_list("output/" + str(epoch) + ".txt")
        
        train_size = len(data_sequence)
        np.random.shuffle(data_sequence)
        batch_size = batch_size
        total_batch = math.ceil(train_size/batch_size)

        for batch in range(total_batch):
            start = (batch*batch_size)% train_size
            end = min(start+batch_size, train_size)
            data_array = np.array(data_sequence[start:end])
            x = torch.from_numpy(data_array[:,:3]).long()
            y = torch.from_numpy(data_array[:,-1]).reshape(-1,1)
            y1, y2 = model(x)
            loss = criterion(y2.float(), y.float()) + alpha * criterion(y1.float(), y.float())
            optimizer.zero_grad()              # clear gradients for this training step
            loss.backward()                    # backpropagation, compute gradients
            optimizer.step()                   # apply gradients
            
        # Evaluation
        hr, ndcg = evaluate(model, test_ratings, test_negatives, K=topK)
        hr_list.append(hr)
        ndcg_list.append(ndcg)
        print('epoch=%d, loss=%.4f, HR=%.4f, NDCG=%.4f' %(epoch, loss, hr, ndcg))
        
        mlist = hr_list
        if mode == 'ndcg':
            mlist = ndcg_list
        if (len(mlist) > 10) and (mlist[-2] < mlist[-3] > mlist[-1]):
            best_hr, best_ndcg = hr_list[-3], ndcg_list[-3]
            embeddings = model.get_embeddings()
            break
        best_hr, best_ndcg = hr, ndcg
        embeddings = model.get_embeddings()
            
    print("End. Best HR = %.4f, NDCG = %.4f. " %(best_hr, best_ndcg))
    return embeddings

In [64]:
dataset_path = 'data/lastfm'

# Load the dataset
dataset = Dataset(dataset_path)
train_mat, test_ratings, test_negatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives
print('Dataset: #user=%d, #item=%d, #train_pairs=%d, #test_pairs=%d' 
      % (dataset.num_users, dataset.num_items, train_mat.nnz, len(test_ratings)))

embedding_size = 32# e=32,o=32 or 64
out_channels = 8
learning_rate = 1e-2
weight_decay = 1e-6
alpha = 1.0
kernel_size = 2

positive_size = 1
negative_time = 4
epochs = 64
batch_size = 1024 #batch大小
topK = 10
mode = 'hr'

setup_seed(3)
# Initialize the model
model = Net(users_num=dataset.num_users, items_num=dataset.num_items, 
            embedding_size=embedding_size, out_channels=out_channels, kernel_size=kernel_size)

if torch.cuda.is_available():
    model = model.cuda()

# Train and evaluate model
embeddings = train(model=model, 
                  train_mat=train_mat.tocsr(), 
                  test_ratings=test_ratings, 
                  test_negatives=test_negatives, 
                  users_num=dataset.num_users, 
                  items_num=dataset.num_items,  
                  learning_rate=learning_rate,
                  weight_decay=weight_decay,
                  alpha=alpha,
                  positive_size=positive_size,
                  negative_time=negative_time,
                  epochs=epochs,
                  batch_size=batch_size,
                  topK=topK,
                  mode=mode)
print('----------------------------------------------------------')

Dataset: #user=518, #item=3488, #train_pairs=45654, #test_pairs=518
Init: HR = 0.1120, NDCG = 0.0532
epoch=0, loss=0.2675, HR=0.4730, NDCG=0.2999
epoch=1, loss=0.2381, HR=0.4788, NDCG=0.2959
epoch=2, loss=0.2332, HR=0.4884, NDCG=0.3024
epoch=3, loss=0.2064, HR=0.5695, NDCG=0.3722
epoch=4, loss=0.1819, HR=0.6506, NDCG=0.4532
epoch=5, loss=0.1630, HR=0.6776, NDCG=0.4723
epoch=6, loss=0.1713, HR=0.6969, NDCG=0.4862
epoch=7, loss=0.1696, HR=0.7162, NDCG=0.4921
epoch=8, loss=0.1431, HR=0.7008, NDCG=0.5033
epoch=9, loss=0.1468, HR=0.7317, NDCG=0.5302
epoch=10, loss=0.1344, HR=0.7375, NDCG=0.5356
epoch=11, loss=0.1298, HR=0.7375, NDCG=0.5191
epoch=12, loss=0.1164, HR=0.7375, NDCG=0.5259
epoch=13, loss=0.1344, HR=0.7490, NDCG=0.5323
epoch=14, loss=0.1210, HR=0.7568, NDCG=0.5326
epoch=15, loss=0.1179, HR=0.7529, NDCG=0.5406
epoch=16, loss=0.1303, HR=0.7548, NDCG=0.5419
epoch=17, loss=0.1192, HR=0.7606, NDCG=0.5313
epoch=18, loss=0.1164, HR=0.7548, NDCG=0.5221
epoch=19, loss=0.1206, HR=0.7568, N

In [8]:
dataset_path = 'data/100k'

# Load the dataset
dataset = Dataset(dataset_path)
train_mat, test_ratings, test_negatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives
print('Dataset: #user=%d, #item=%d, #train_pairs=%d, #test_pairs=%d' 
      % (dataset.num_users, dataset.num_items, train_mat.nnz, len(test_ratings)))

embedding_size = 32 # e=32,o=32 or 64 when kernel_size==3
out_channels = 64
learning_rate = 1e-2
weight_decay = 1e-6
alpha = 1.0
kernel_size = 3

positive_size = 1
negative_time = 4
epochs = 64
batch_size = 1024 #batch大小
topK = 10
mode = 'hr'
seed = 10

setup_seed(seed)
# Initialize the model
model = Net(users_num=dataset.num_users, items_num=dataset.num_items, 
            embedding_size=embedding_size, out_channels=out_channels, kernel_size=kernel_size)

if torch.cuda.is_available():
    model = model.cuda()

# Train and evaluate model
embeddings = train(model=model, 
                  train_mat=train_mat.tocsr(), 
                  test_ratings=test_ratings, 
                  test_negatives=test_negatives, 
                  users_num=dataset.num_users, 
                  items_num=dataset.num_items,  
                  learning_rate=learning_rate,
                  weight_decay=weight_decay,
                  alpha=alpha,
                  positive_size=positive_size,
                  negative_time=negative_time,
                  epochs=epochs,
                  batch_size=batch_size,
                  topK=topK,
                  mode=mode)
print('----------------------------------------------------------')

Dataset: #user=943, #item=1682, #train_pairs=99057, #test_pairs=943
10
Init: HR = 0.0954, NDCG = 0.0439
epoch=0, loss=0.2044, HR=0.4698, NDCG=0.2583
epoch=1, loss=0.2236, HR=0.4751, NDCG=0.2656
epoch=2, loss=0.2082, HR=0.4719, NDCG=0.2660
epoch=3, loss=0.1948, HR=0.4825, NDCG=0.2658
epoch=4, loss=0.1928, HR=0.5716, NDCG=0.3157
epoch=5, loss=0.1681, HR=0.6246, NDCG=0.3637
epoch=6, loss=0.1427, HR=0.6543, NDCG=0.3731
epoch=7, loss=0.1357, HR=0.6829, NDCG=0.3982
epoch=8, loss=0.1260, HR=0.6967, NDCG=0.4124
epoch=9, loss=0.1400, HR=0.6903, NDCG=0.4039
epoch=10, loss=0.1313, HR=0.7063, NDCG=0.4203
epoch=11, loss=0.1178, HR=0.7137, NDCG=0.4267
epoch=12, loss=0.1345, HR=0.7031, NDCG=0.4067
epoch=13, loss=0.1144, HR=0.7010, NDCG=0.4122
End. Best HR = 0.7137, NDCG = 0.4267. 
----------------------------------------------------------


In [44]:
dataset_path = 'data/100k'

# Load the dataset
dataset = Dataset(dataset_path)
train_mat, test_ratings, test_negatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives
print('Dataset: #user=%d, #item=%d, #train_pairs=%d, #test_pairs=%d' 
      % (dataset.num_users, dataset.num_items, train_mat.nnz, len(test_ratings)))

embedding_size = 32 # e=32,o=8 when kernel_size==2
out_channels = 16
learning_rate = 1e-2
weight_decay = 1e-6
alpha = 1.0
kernel_size = 2

positive_size = 1
negative_time = 4
epochs = 64
batch_size = 1024 #batch大小
topK = 10
mode = 'hr'
seed = 17


setup_seed(seed)
# Initialize the model
model = Net(users_num=dataset.num_users, items_num=dataset.num_items, 
            embedding_size=embedding_size, out_channels=out_channels, kernel_size=kernel_size)

if torch.cuda.is_available():
    model = model.cuda()

# Train and evaluate model
embeddings = train(model=model, 
                  train_mat=train_mat.tocsr(), 
                  test_ratings=test_ratings, 
                  test_negatives=test_negatives, 
                  users_num=dataset.num_users, 
                  items_num=dataset.num_items,  
                  learning_rate=learning_rate,
                  weight_decay=weight_decay,
                  alpha=alpha,
                  positive_size=positive_size,
                  negative_time=negative_time,
                  epochs=epochs,
                  batch_size=batch_size,
                  topK=topK,
                  mode=mode)
print('----------------------------------------------------------')

Dataset: #user=943, #item=1682, #train_pairs=99057, #test_pairs=943
Init: HR = 0.0838, NDCG = 0.0373
epoch=0, loss=0.2003, HR=0.4719, NDCG=0.2534
epoch=1, loss=0.2092, HR=0.4942, NDCG=0.2671
epoch=2, loss=0.1691, HR=0.6151, NDCG=0.3450
epoch=3, loss=0.1700, HR=0.6532, NDCG=0.3780
epoch=4, loss=0.1415, HR=0.6776, NDCG=0.3911
epoch=5, loss=0.1506, HR=0.6882, NDCG=0.4033
epoch=6, loss=0.1276, HR=0.6914, NDCG=0.4099
epoch=7, loss=0.1406, HR=0.7020, NDCG=0.4112
epoch=8, loss=0.1369, HR=0.6999, NDCG=0.4137
epoch=9, loss=0.1320, HR=0.7052, NDCG=0.4213
epoch=10, loss=0.1236, HR=0.6935, NDCG=0.4077
epoch=11, loss=0.1430, HR=0.6903, NDCG=0.4150
End. Best HR = 0.7052, NDCG = 0.4213. 
----------------------------------------------------------


#### build the dict: original id <-> new id, original id <-> name

In [12]:
file_dir = 'datasets/ml-100k/u.item'
id_name_dict = id_name(file_dir) # original id : movie name

file_dir = 'datasets/ml-100k/u.data'
item_dict = get_item_dict(file_dir) # original id : new id

In [45]:
movieid_list = [174, 195, 449]
    
for movieid in movieid_list:
    print('MovieID:', movieid, '; MovieName:', id_name_dict[movieid])
    original_id = str(movieid)
    target_item = item_dict[original_id]

    top5 = get_similar_items(embeddings.data.numpy(), idx=target_item)
    movie_list = [get_key(item_dict=item_dict, value=i) for i in top5]
    rec_list = [id_name_dict[int(movie_id)] for movie_id in movie_list[1:]]
    for i in range(len(rec_list)):
        print('\n{0}: {1}'.format(i+1, rec_list[i]))
    print('------------------------------------------------------------------')

MovieID: 174 ; MovieName: Raiders of the Lost Ark (1981)

1: Empire Strikes Back, The (1980)

2: Back to the Future (1985)

3: Pulp Fiction (1994)

4: Aliens (1986)

5: Alien (1979)
------------------------------------------------------------------
MovieID: 195 ; MovieName: Terminator, The (1984)

1: Terminator 2: Judgment Day (1991)

2: Aliens (1986)

3: Speed (1994)

4: Empire Strikes Back, The (1980)

5: Jurassic Park (1993)
------------------------------------------------------------------
MovieID: 449 ; MovieName: Star Trek: The Motion Picture (1979)

1: Star Trek V: The Final Frontier (1989)

2: Star Trek III: The Search for Spock (1984)

3: Waterworld (1995)

4: Star Trek VI: The Undiscovered Country (1991)

5: Star Trek IV: The Voyage Home (1986)
------------------------------------------------------------------
