Original Paper: 

https://arxiv.org/abs/2108.12184

Original code and dataset: 

https://github.com/usydnlp/Glocal_K

One part of our project is the reconstruction is 
the reconstruction of the GLocal_K algorithm with a different approach: we replaced MOST of tensorflow codes with pytorch, which essentially means to redo all the key parts regarding to the model. Other than that, we basically follow the original ideas of the paper and structure of the existing code. 

There is one exception (a helpful function) that we used tensorflow instead of pytorch for simplicity.

In [254]:
from google.colab import drive
drive.mount('/content/drive')

from zipfile import ZipFile
file_name = '/content/data.zip'

with ZipFile(file_name, 'r') as zip:
  zip.extractall()
  print('Done')
import warnings
warnings.filterwarnings("ignore")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Done


In [255]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from time import time
from scipy.sparse import csc_matrix
import numpy as np
import h5py

# Data Loader Function

In [256]:
def load_data_100k(path='./', delimiter='\t'):

    train = np.loadtxt(path+'movielens_100k_u1.base', skiprows=0, delimiter=delimiter).astype('int32')
    test = np.loadtxt(path+'movielens_100k_u1.test', skiprows=0, delimiter=delimiter).astype('int32')
    total = np.concatenate((train, test), axis=0)

    n_u = np.unique(total[:,0]).size  # num of users
    n_m = np.unique(total[:,1]).size  # num of movies
    n_train = train.shape[0]  # num of training ratings
    n_test = test.shape[0]  # num of test ratings

    train_r = np.zeros((n_m, n_u), dtype='float32')
    test_r = np.zeros((n_m, n_u), dtype='float32')

    for i in range(n_train):
        train_r[train[i,1]-1, train[i,0]-1] = train[i,2]

    for i in range(n_test):
        test_r[test[i,1]-1, test[i,0]-1] = test[i,2]

    train_m = np.greater(train_r, 1e-12).astype('float32')  # masks indicating non-zero entries
    test_m = np.greater(test_r, 1e-12).astype('float32')

    print('data matrix loaded')
    print('num of users: {}'.format(n_u))
    print('num of movies: {}'.format(n_m))
    print('num of training ratings: {}'.format(n_train))
    print('num of test ratings: {}'.format(n_test))

    return n_m, n_u, train_r, train_m, test_r, test_m

In [257]:
def load_data_1m(path='./', delimiter='::', frac=0.1, seed=1234):

    tic = time()
    print('reading data...')
    data = np.loadtxt(path+'movielens_1m_dataset.dat', skiprows=0, delimiter=delimiter).astype('int32')
    print('taken', time() - tic, 'seconds')

    n_u = np.unique(data[:,0]).size  # num of users
    n_m = np.unique(data[:,1]).size  # num of movies
    n_r = data.shape[0]  # num of ratings

    udict = {}
    for i, u in enumerate(np.unique(data[:,0]).tolist()):
        udict[u] = i
    mdict = {}
    for i, m in enumerate(np.unique(data[:,1]).tolist()):
        mdict[m] = i

    np.random.seed(seed)
    idx = np.arange(n_r)
    np.random.shuffle(idx)

    train_r = np.zeros((n_m, n_u), dtype='float32')
    test_r = np.zeros((n_m, n_u), dtype='float32')

    for i in range(n_r):
        u_id = data[idx[i], 0]
        m_id = data[idx[i], 1]
        r = data[idx[i], 2]

        if i < int(frac * n_r):
            test_r[mdict[m_id], udict[u_id]] = r
        else:
            train_r[mdict[m_id], udict[u_id]] = r

    train_m = np.greater(train_r, 1e-12).astype('float32')  # masks indicating non-zero entries
    test_m = np.greater(test_r, 1e-12).astype('float32')

    print('data matrix loaded')
    print('num of users: {}'.format(n_u))
    print('num of movies: {}'.format(n_m))
    print('num of training ratings: {}'.format(n_r - int(frac * n_r)))
    print('num of test ratings: {}'.format(int(frac * n_r)))

    return n_m, n_u, train_r, train_m, test_r, test_m

In [258]:
def load_matlab_file(path_file, name_field):
    
    db = h5py.File(path_file, 'r')
    ds = db[name_field]

    try:
        if 'ir' in ds.keys():
            data = np.asarray(ds['data'])
            ir   = np.asarray(ds['ir'])
            jc   = np.asarray(ds['jc'])
            out  = csc_matrix((data, ir, jc)).astype(np.float32)
    except AttributeError:
        out = np.asarray(ds).astype(np.float32).T

    db.close()

    return out

In [259]:
def load_data_monti(path='./'):

    M = load_matlab_file(path+'douban_monti_dataset.mat', 'M')
    Otraining = load_matlab_file(path+'douban_monti_dataset.mat', 'Otraining') * M
    Otest = load_matlab_file(path+'douban_monti_dataset.mat', 'Otest') * M

    n_u = M.shape[0]  # num of users
    n_m = M.shape[1]  # num of movies
    n_train = Otraining[np.where(Otraining)].size  # num of training ratings
    n_test = Otest[np.where(Otest)].size  # num of test ratings

    train_r = Otraining.T
    test_r = Otest.T

    train_m = np.greater(train_r, 1e-12).astype('float32')  # masks indicating non-zero entries
    test_m = np.greater(test_r, 1e-12).astype('float32')

    print('data matrix loaded')
    print('num of users: {}'.format(n_u))
    print('num of movies: {}'.format(n_m))
    print('num of training ratings: {}'.format(n_train))
    print('num of test ratings: {}'.format(n_test))

    return n_m, n_u, train_r, train_m, test_r, test_m

# Load Data

In [260]:
# Insert the path of a data directory by yourself (e.g., '/content/.../data')
# .-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._
data_path = '/content/data'
# .-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._

In [261]:
# Select a dataset among 'ML-1M', 'ML-100K', and 'Douban'
# .-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._
dataset = 'ML-100K'
# .-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._

In [262]:
# Data Load
try:
    if dataset == 'ML-100K':
        path = data_path + '/MovieLens_100K/'
        n_m, n_u, train_r, train_m, test_r, test_m = load_data_100k(path=path, delimiter='\t')

    elif dataset == 'ML-1M':
        path = data_path + '/MovieLens_1M/'
        n_m, n_u, train_r, train_m, test_r, test_m = load_data_1m(path=path, delimiter='::', frac=0.1, seed=1234)

    elif dataset == 'Douban':
        path = data_path + '/Douban_monti/'
        n_m, n_u, train_r, train_m, test_r, test_m = load_data_monti(path=path)

    else:
        raise ValueError

except ValueError:
    print('Error: Unable to load data')

# Dim of data
print("train_r: ", train_r.shape)
print("train_m: ", train_m.shape)
print("test_r: ", test_r.shape)
print("test_m: ", test_m.shape)
print("n_m: ", n_m)
print("n_u: ", n_u)

data matrix loaded
num of users: 943
num of movies: 1682
num of training ratings: 80000
num of test ratings: 20000
train_r:  (1682, 943)
train_m:  (1682, 943)
test_r:  (1682, 943)
test_m:  (1682, 943)
n_m:  1682
n_u:  943


# Hyperparameter Settings

In [263]:
# Common hyperparameter settings
n_hid = 500
n_dim = 5
n_layers = 2
gk_size = 3

In [264]:
# Different hyperparameter settings for each dataset
if dataset == 'ML-100K':
    lambda_2 = 20.  # l2 regularisation
    lambda_s = 0.006
    iter_p = 5  # optimisation
    iter_f = 5
    epoch_p = 30  # training epoch
    epoch_f = 60
    dot_scale = 1  # scaled dot product

elif dataset == 'ML-1M':
    lambda_2 = 70.
    lambda_s = 0.018
    iter_p = 50
    iter_f = 10
    epoch_p = 20
    epoch_f = 30
    dot_scale = 0.5

elif dataset == 'Douban':
    lambda_2 = 10.
    lambda_s = 0.022
    iter_p = 5
    iter_f = 5
    epoch_p = 20
    epoch_f = 60
    dot_scale = 2

In [265]:
input_shape = (n_m, n_u)

# Create an input tensor
R = torch.zeros(input_shape, dtype=torch.float32, requires_grad=True)
# print(R.shape)

# Network Function

## Local Kernel:

In [266]:
def local_kernel(u, v):
    dist = torch.norm(u - v, dim=2, p=2)
    hat = torch.clamp(1 - dist**2, min=0)
    return hat

## Global Kernel

In [267]:
def global_kernel(input, gk_size, dot_scale):
    avg_pooling = torch.mean(input, dim=1).unsqueeze(0)
    n_kernel = avg_pooling.size(1)

    conv_kernel = nn.Parameter(torch.empty(n_kernel, gk_size**2).normal_(0, 0.1))
    gk = torch.matmul(avg_pooling, conv_kernel) * dot_scale
    gk = gk.view(gk_size, gk_size, 1, 1)

    return gk

def global_conv(input, W):

    input = tf.reshape(input, [1, input.shape[0], input.shape[1], 1])
    # print("input", type(input))
    # print("W", type(W))
    conv2d = tf.nn.relu(tf.nn.conv2d(input, W.detach().numpy(), strides=[1,1,1,1], padding='SAME'))

    return tf.reshape(conv2d, [conv2d.shape[1], conv2d.shape[2]])

In [268]:
class KernelLayer(nn.Module):
    def __init__(self, input_shape, n_hid, n_dim, activation, lambda_s=1e-4, lambda_2=1e-4, name=''):
        super(KernelLayer, self).__init__()

        self.W = nn.Parameter(torch.empty(input_shape[1], n_hid))
        nn.init.xavier_uniform_(self.W)

        self.u = nn.Parameter(torch.empty(input_shape[1], 1, n_dim).normal_(0, 1e-3))
        self.v = nn.Parameter(torch.empty(1, n_hid, n_dim).normal_(0, 1e-3))
        self.b = nn.Parameter(torch.zeros(n_hid))

        self.activation = activation
        self.lambda_s = lambda_s
        self.lambda_2 = lambda_2

    def forward(self, x):

        w_hat = local_kernel(self.u, self.v)
        # print("x.shape", x.shape)
        sparse_reg_term = torch.norm(w_hat, p=2) * self.lambda_s
        l2_reg_term = torch.norm(self.W, p=2) * self.lambda_2
        
        W_eff = self.W * w_hat  # Local kernelised weight matrix
        y = torch.matmul(x, W_eff) + self.b
        y = self.activation(y)
        return y, sparse_reg_term + l2_reg_term

# Network Instantiation

## Pre-training
One thing to notice here is that I did not implement this part as typical Model. Instead, to ***reconstruct*** the original work, I used a for loop to create such "model".

In [276]:
y = R
reg_losses = 0
for i in range(n_layers):
    # print(y.shape)
    layer = KernelLayer(y.shape, n_hid, n_dim, torch.sigmoid, name=str(i))
    y, reg_loss = layer(y)
    
    reg_losses += reg_loss.item()

# print(reg_losses)

# Compute output and add regularization loss

layer_reg = KernelLayer(y.shape, n_u, n_dim, activation=lambda x: x, name='out')
pred, reg_loss = layer_reg.forward(y)
pred = nn.Parameter(pred)  # wrap pred in a nn.Parameter object
reg_losses += reg_loss.item()

# Compute L2 loss

train_r = torch.tensor(train_r)
train_m = torch.tensor(train_m)

diff = train_m * (train_r - pred)

sqE = torch.nn.functional.mse_loss(diff, torch.zeros_like(diff), reduction='sum')
loss_p = sqE + reg_losses
print(loss_p)

# Use L-BFGS-B optimizer to minimize loss
optimizer_p = optim.LBFGS(params=[pred], max_iter=iter_p, max_eval=None, tolerance_grad=1e-07, tolerance_change=1e-09, history_size=10, line_search_fn=None)


tensor(1111901., grad_fn=<AddBackward0>)


## Fine-tuning

In [270]:
y = R
reg_losses = None
for i in range(n_layers):
    layer = KernelLayer(y.shape, n_hid, n_dim, torch.sigmoid, name=str(i))
    
    y, _ = layer(y)

layer_reg = KernelLayer(y.shape, n_u, n_dim, activation=lambda x: x, name='out')
y_dash, _ = layer_reg.forward(y)

gk = global_kernel(y_dash, gk_size, dot_scale)
y_hat = global_conv(train_r, gk)
# print(type(y_hat))
y_hat = torch.tensor(np.array(y_hat))
# print(type(y_hat))
# pred = nn.Parameter(pred)  # wrap pred in a nn.Parameter object
# reg_losses += reg_loss.item()

for j in range(n_layers):
    layer_hat = KernelLayer(y_hat.shape , n_hid, n_dim, torch.sigmoid, name=str(i))
    y_hat, reg_loss = layer_hat(y_hat)
    reg_losses = reg_loss if reg_losses is None else reg_losses + reg_loss


layer_tun = KernelLayer(y_hat.shape, n_u, n_dim, activation=lambda x: x, name='out')
pred_tun, reg_loss = layer_tun.forward(y_hat)
pred_tun = nn.Parameter(pred_tun)  # wrap pred in a nn.Parameter object
reg_losses += reg_loss.item()





# Compute L2 loss

train_r = torch.tensor(train_r)
train_m = torch.tensor(train_m)

diff_f = train_m * (train_r - pred)

sqE = torch.nn.functional.mse_loss(diff_f, torch.zeros_like(diff_f), reduction='sum')
loss_f = sqE + reg_losses
print(loss_f)


optimizer_f = optim.LBFGS(params=[pred_tun], max_iter=iter_f, max_eval=None, tolerance_grad=1e-07, tolerance_change=1e-09, history_size=10, line_search_fn=None)




tensor(1114689.2500, grad_fn=<AddBackward0>)


# Evaluation code

In [271]:
def dcg_k(score_label, k):
    dcg, i = 0., 0
    for s in score_label:
        if i < k:
            dcg += (2**s[1]-1) / np.log2(2+i)
            i += 1
    return dcg

In [272]:
def ndcg_k(y_hat, y, k):
    score_label = np.stack([y_hat, y], axis=1).tolist()
    score_label = sorted(score_label, key=lambda d:d[0], reverse=True)
    score_label_ = sorted(score_label, key=lambda d:d[1], reverse=True)
    norm, i = 0., 0
    for s in score_label_:
        if i < k:
            norm += (2**s[1]-1) / np.log2(2+i)
            i += 1
    dcg = dcg_k(score_label, k)
    return dcg / norm

In [273]:
def call_ndcg(y_hat, y):
    ndcg_sum, num = 0, 0
    y_hat, y = y_hat.T, y.T
    n_users = y.shape[0]

    for i in range(n_users):
        y_hat_i = y_hat[i][np.where(y[i])]
        y_i = y[i][np.where(y[i])]

        if y_i.shape[0] < 2:
            continue

        ndcg_sum += ndcg_k(y_hat_i, y_i, y_i.shape[0])  # user-wise calculation
        num += 1

    return ndcg_sum / num

# Training and Test Loop

Due to the difference between tensorflow and pytorch, here, instead of following the original work, I create a model using nn.Module (also for convenience). 

In [274]:
best_rmse_ep, best_mae_ep, best_ndcg_ep = 0, 0, 0
best_rmse, best_mae, best_ndcg = float("inf"), float("inf"), 0
time_cumulative = 0

train_r = torch.tensor(train_r)
train_m = torch.tensor(train_m)
test_r = torch.tensor(test_r)
test_m = torch.tensor(test_m)


# Just assume n_layers = 2
class KernelModel(nn.Module):
    def __init__(self, n_layers, input_shape, n_hid, n_dim, n_u):
        super(KernelModel, self).__init__()
        self.layers = nn.ModuleList()
        for _ in range(n_layers):
            layer = KernelLayer(input_shape, n_hid, n_dim, torch.sigmoid)
            self.layers.append(layer)
            input_shape = (input_shape[0], n_hid)  # Update input_shape for the next layer
        self.output_layer = KernelLayer(input_shape, n_u, n_dim, activation=lambda x: x)

    def forward(self, x):
        reg_losses = 0
        for layer in self.layers:
            x, reg_loss = layer(x)
            reg_losses += reg_loss.item()
        pred, reg_loss = self.output_layer(x)
        reg_losses += reg_loss.item()
        return pred, reg_losses

# Initialize the model
model_p = KernelModel(n_layers, R.shape, n_hid, n_dim, n_u)

# Now you can use model_p in the training loop
pre, reg_losses = model_p(train_r)
pre = pre.detach().numpy()


# Some helpful functions
def closure_p():
    optimizer_p.zero_grad()
    diff = train_m * (train_r - pred)
    loss_p = torch.nn.functional.mse_loss(diff, torch.zeros_like(diff), reduction='sum') + reg_losses
    loss_p.backward()
    return loss_p

def closure_f():
    optimizer_f.zero_grad()
    diff = train_m * (train_r - pred)
    loss_p = torch.nn.functional.mse_loss(diff, torch.zeros_like(diff), reduction='sum') + reg_losses
    loss_p.backward()
    return loss_p

def loss_fn(pred, train_r, train_m, clip=True):
    diff = train_m * (train_r - pred)
    if clip:
        diff = torch.clamp(diff, min=1., max=5.)
    sqE = torch.nn.functional.mse_loss(diff, torch.zeros_like(diff), reduction='sum')
    return sqE

# optimizer_p 
for i in range(epoch_p):
  tic = time()
  
  # Replace the optimizer_p.minimize() with a PyTorch training step

  optimizer_p.step(closure_p)

  # Replace 'pre' with the output of the PyTorch model
  
  pre, reg_losses = model_p(train_r)


  t = time() - tic
  time_cumulative += t

  # Calculate error, test_rmse, and train_rmse using PyTorch tensor operations
  error = (test_m * (torch.clamp(pre, 1., 5.) - test_r) ** 2).sum() / test_m.sum()
  test_rmse = torch.sqrt(error)

  error_train = (train_m * (torch.clamp(pre, 1., 5.) - train_r) ** 2).sum() / train_m.sum()
  train_rmse = torch.sqrt(error_train)

  print('.-^-._' * 12)
  print('PRE-TRAINING')
  print('Epoch:', i+1, 'test rmse:', test_rmse, 'train rmse:', train_rmse)
  print('Time:', t, 'seconds')
  print('Time cumulative:', time_cumulative, 'seconds')
  print('.-^-._' * 12)


# optimizer_f
class KernelModelF(nn.Module):
    def __init__(self, n_layers, input_shape, n_hid, n_dim, n_u, gk_size, dot_scale):
        super(KernelModelF, self).__init__()
        self.layers1 = nn.ModuleList()
        self.layers2 = nn.ModuleList()
        for _ in range(n_layers):
            layer = KernelLayer(input_shape, n_hid, n_dim, torch.sigmoid)
            self.layers1.append(layer)
            input_shape = (input_shape[0], n_hid)  # Update input_shape for the next layer
        for _ in range(n_layers):
            layer = KernelLayer(input_shape, n_hid, n_dim, torch.sigmoid)
            self.layers2.append(layer)
            input_shape = (input_shape[0], n_hid)  # Update input_shape for the next layer
        self.output_layer = KernelLayer(input_shape, n_u, n_dim, activation=lambda x: x)
        self.gk_size = gk_size
        self.dot_scale = dot_scale

    def forward(self, x):
        reg_losses = 0
        for layer in self.layers1:
            x, reg_loss = layer(x)
            reg_losses += reg_loss.item()
        y_dash, reg_loss = self.output_layer(x)
        reg_losses += reg_loss.item()

        gk = global_kernel(y_dash, self.gk_size, self.dot_scale)
        y_hat = global_conv(x.detach().numpy(), gk)
        y_hat = torch.tensor(np.array(y_hat))

        for layer in self.layers2:
            y_hat, reg_loss = layer(y_hat)
            reg_losses += reg_loss.item()
        pred_tun, reg_loss = self.output_layer(y_hat)
        reg_losses += reg_loss.item()

        return pred_tun, reg_losses

# Initialize the model
model_f = KernelModelF(n_layers, R.shape, n_hid, n_dim, n_u, gk_size, dot_scale)

test_r = test_r.numpy()
test_m = test_m.numpy()
# Now you can use model_with_global_kernel in the training loop
pred_tun, reg_losses = model_f(train_r)
pred_tun = pred_tun.detach().numpy()

for i in range(epoch_p):
  tic = time()
  
  # Replace the optimizer_p.minimize() with a PyTorch training step

  optimizer_f.step(closure_f)

  # Replace 'pre' with the output of the PyTorch model
  
  pre, reg_losses = model_f(train_r)
  pre = pre.detach().numpy()

  t = time() - tic
  time_cumulative += t

  error = (test_m * (np.clip(pre, 1., 5.) - test_r) ** 2).sum() / test_m.sum()  # test error
  test_rmse = np.sqrt(error)

  error_train = (train_m * (np.clip(pre, 1., 5.) - train_r.numpy()) ** 2).sum() / train_m.sum()  # train error
  train_rmse = np.sqrt(error_train)

  test_mae = (test_m * np.abs(np.clip(pre, 1., 5.) - test_r)).sum() / test_m.sum()
  train_mae = (train_m * np.abs(np.clip(pre, 1., 5.) - train_r.numpy())).sum() / train_m.sum()

  test_ndcg = call_ndcg(np.clip(pre, 1., 5.), test_r)
  train_ndcg = call_ndcg(np.clip(pre, 1., 5.), train_r)

  if test_rmse < best_rmse:
      best_rmse = test_rmse
      best_rmse_ep = i+1

  if test_mae < best_mae:
      best_mae = test_mae
      best_mae_ep = i+1

  if best_ndcg < test_ndcg:
      best_ndcg = test_ndcg
      best_ndcg_ep = i+1

  print('.-^-._' * 12)
  print('FINE-TUNING')
  print('Epoch:', i+1, 'test rmse:', test_rmse, 'test mae:', test_mae, 'test ndcg:', test_ndcg)
  print('Epoch:', i+1, 'train rmse:', train_rmse, 'train mae:', train_mae, 'train ndcg:', train_ndcg)
  print('Time:', t, 'seconds')
  print('Time cumulative:', time_cumulative, 'seconds')
  print('.-^-._' * 12)


.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._
PRE-TRAINING
Epoch: 1 test rmse: tensor(2.7835, grad_fn=<SqrtBackward0>) train rmse: tensor(2.7638, grad_fn=<SqrtBackward0>)
Time: 0.3860139846801758 seconds
Time cumulative: 0.3860139846801758 seconds
.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._
.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._
PRE-TRAINING
Epoch: 2 test rmse: tensor(2.7835, grad_fn=<SqrtBackward0>) train rmse: tensor(2.7638, grad_fn=<SqrtBackward0>)
Time: 0.20011019706726074 seconds
Time cumulative: 0.5861241817474365 seconds
.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._
.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._
PRE-TRAINING
Epoch: 3 test rmse: tensor(2.7835, grad_fn=<SqrtBackward0>) train rmse: tensor(2.7638, grad_fn=<SqrtBackward0>)
Time: 0.1889195442199707 seconds
Time cumulative: 0.7750437259674072 seconds
.-^-._.-^-._.-^-._.-^-._.-^-

In [275]:
# Final result
print('Epoch:', best_rmse_ep, ' best rmse:', best_rmse)
print('Epoch:', best_mae_ep, ' best mae:', best_mae)
print('Epoch:', best_ndcg_ep, ' best ndcg:', best_ndcg)

Epoch: 23  best rmse: 2.7853196
Epoch: 23  best mae: 2.535229
Epoch: 27  best ndcg: 0.8378764538115717
