In [9]:
import numpy as np
import time
import random
from math import sqrt,fabs,log
import sys
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED

In [12]:
class HNERec:
    def __init__(self, unum, inum, ratedim, userdim, itemdim, user_metapaths,item_metapaths, trainfile, testfile, steps, delta, beta_e, beta_h, beta_p, beta_w, beta_b, reg_u, reg_v):
        self.unum = unum
        self.inum = inum
        self.ratedim = ratedim
        self.userdim = userdim
        self.itemdim = itemdim
        self.steps = steps
        self.delta = delta
        self.beta_e = beta_e
        self.beta_h = beta_h
        self.beta_p = beta_p
        self.beta_w = beta_w
        self.beta_b = beta_b
        self.reg_u = reg_u
        self.reg_v = reg_v

        self.user_metapathnum = len(user_metapaths)
        self.item_metapathnum = len(item_metapaths)

        self.X, self.user_metapathdims = self.load_embedding(user_metapaths, unum)
        print('Load user embeddings finished.')

        self.Y, self.item_metapathdims = self.load_embedding(item_metapaths, inum)
        print('Load user embeddings finished.')

        self.R, self.T, self.ba = self.load_rating(trainfile, testfile)
        print('Load rating finished.')
        print('train size : ', len(self.R))
        print('test size : ', len(self.T))

        self.initialize();
        self.recommend();

    def load_embedding(self, metapaths, num):
        X = {}
        for i in range(num):
            X[i] = {}
        metapathdims = []
    
        ctn = 0
        for metapath in metapaths:
            sourcefile = '../data/embeddings/' + metapath
            #print sourcefile
            with open(sourcefile) as infile:
                
                k = int(infile.readline().strip().split(' ')[1])
                metapathdims.append(k)
                for i in range(num):
                    X[i][ctn] = np.zeros(k)

                n = 0
                for line in infile.readlines():
                    n += 1
                    arr = line.strip().split(' ')
                    i = int(arr[0]) - 1
                    for j in range(k):
                        X[i][ctn][j] = float(arr[j + 1])
                print('metapath ', metapath, 'numbers ', n)
            ctn += 1
        return X, metapathdims

    def load_rating(self, trainfile, testfile):
        R_train = []
        R_test = []
        ba = 0.0
        n = 0
        user_test_dict = dict()
        with open(trainfile) as infile:
            for line in infile.readlines():
                user, item, rating = line.strip().split('\t')
                R_train.append([int(user)-1, int(item)-1, int(rating)])
                ba += int(rating)
                n += 1
        ba = ba / n
        ba = 0
        with open(testfile) as infile:
            for line in infile.readlines():
                user, item, rating = line.strip().split('\t')
                R_test.append([int(user)-1, int(item)-1, int(rating)])

        return R_train, R_test, ba

    def initialize(self):
        self.E = np.random.randn(self.unum, self.itemdim) * 0.1
        self.H = np.random.randn(self.inum, self.userdim) * 0.1
        self.U = np.random.randn(self.unum, self.ratedim) * 0.1
        self.V = np.random.randn(self.inum, self.ratedim) * 0.1

        self.pu = np.ones((self.unum, self.user_metapathnum)) * 1.0 / self.user_metapathnum
        self.pv = np.ones((self.inum, self.item_metapathnum)) * 1.0 / self.item_metapathnum


        self.Wu = {}
        self.bu = {}
        for k in range(self.user_metapathnum):
            self.Wu[k] = np.random.randn(self.userdim, self.user_metapathdims[k]) * 0.1
            self.bu[k] = np.random.randn(self.userdim) * 0.1

        self.Wv = {}
        self.bv = {}
        for k in range(self.item_metapathnum):
            self.Wv[k] = np.random.randn(self.itemdim, self.item_metapathdims[k]) * 0.1
            self.bv[k] = np.random.randn(self.itemdim) * 0.1

    def cal_u(self, i):
        ui = np.zeros(self.userdim)
        for k in range(self.user_metapathnum):
            ui += self.pu[i][k] * (self.Wu[k].dot(self.X[i][k]) + self.bu[k])
        return ui

    def cal_v(self, j):
        vj = np.zeros(self.itemdim)
        for k in range(self.item_metapathnum):
            vj += self.pv[j][k] * (self.Wv[k].dot(self.Y[j][k]) + self.bv[k])
        return vj

    def get_rating(self, i, j):
        ui = self.cal_u(i)
        vj = self.cal_v(j)
        return self.U[i, :].dot(self.V[j, :]) + self.reg_u * ui.dot(self.H[j, :]) + self.reg_v * self.E[i, :].dot(vj)

    def maermse(self):
        m = 0.0
        mae = 0.0
        rmse = 0.0
        n = 0
        for t in self.T:
            n += 1
            i = t[0]
            j = t[1]
            r = t[2]
            r_p = self.get_rating(i, j)

            if r_p > 5: r_p = 5
            if r_p < 1: r_p = 1
            m = fabs(r_p - r)
            mae += m
            rmse += m * m
        mae = mae * 1.0 / n
        rmse = sqrt(rmse * 1.0 / n)
        return mae, rmse

    def recommend(self):
        mae = []
        rmse = []
        starttime = time.process_time()
        perror = 99999
        cerror = 9999
        n = len(self.R)

        for step in range(steps):
            total_error = 0.0
            train_start_time = time.time()
            for t in self.R:
                i = t[0]
                j = t[1]
                rij = t[2]

                rij_t = self.get_rating(i, j)
                eij = rij - rij_t
                total_error += eij * eij
                
                U_g = -eij * self.V[j, :] + self.beta_e * self.U[i, :]
                V_g = -eij * self.U[i, :] + self.beta_h * self.V[j, :]

                self.U[i, :] -= delta * U_g
                self.V[j, :] -= delta * V_g

                ui = self.cal_u(i)
                for k in range(self.user_metapathnum):
                    pu_g = self.reg_u * -eij * self.H[j, :].dot(self.Wu[k].dot(self.X[i][k]) + self.bu[k]) + self.beta_p * self.pu[i][k]
                    Wu_g = self.reg_u * -eij * self.pu[i][k] * np.array([self.H[j, :]]).T.dot(np.array([self.X[i][k]])) + self.beta_w * self.Wu[k]
                    bu_g = self.reg_u * -eij * self.pu[i][k] * self.H[j, :] + self.beta_b * self.bu[k]

                    self.pu[i][k] -= 0.1 * self.delta * pu_g
                    self.Wu[k] -= 0.1 * self.delta * Wu_g
                    self.bu[k] -= 0.1 * self.delta * bu_g

                H_g = self.reg_u * -eij * ui + self.beta_h * self.H[j, :]
                self.H[j, :] -= self.delta * H_g

                vj = self.cal_v(j)
                for k in range(self.item_metapathnum):
                    pv_g = self.reg_v * -eij * self.E[i, :].dot(self.Wv[k].dot(self.Y[j][k]) + self.bv[k]) + self.beta_p * self.pv[j][k]
                    Wv_g = self.reg_v * -eij * self.pv[j][k] * np.array([self.E[i, :]]).T.dot(np.array([self.Y[j][k]])) + self.beta_w * self.Wv[k]
                    bv_g = self.reg_v * -eij * self.pv[j][k] * self.E[i, :] + self.beta_b * self.bv[k]

                    self.pv[j][k] -= 0.1 * self.delta * pv_g
                    self.Wv[k] -= 0.1 * self.delta * Wv_g
                    self.bv[k] -= 0.1 * self.delta * bv_g

                E_g = self.reg_v * -eij * vj + 0.01 * self.E[i, :]
                self.E[i, :] -= self.delta * E_g

            perror = cerror
            cerror = total_error / n

            self.delta = 0.93 * self.delta

            if(abs(perror - cerror) < 0.0001):
                break
            print('step ', step, 'crror : ', sqrt(cerror))
            train_end_time = time.time()
            print('train time : ', (train_end_time - train_start_time))
            MAE, RMSE = self.maermse()
            mae.append(MAE)
            rmse.append(RMSE)
            #if step % 5 == 0:
            print('step, MAE, RMSE ', step, MAE, RMSE)
            test_time = time.time()
            print('time: ', test_time - train_end_time)
        print('MAE: ', min(mae), ' RMSE: ', min(rmse))

In [13]:
unum = 4010
inum = 9788
ratedim = 10#int(sys.argv[1])
userdim = 30
itemdim = 10
train_rate = 0.8
    
user_metapaths = ['umu', 'umamu', 'umdmu', 'umtmu']
item_metapaths = ['mam', 'mdm', 'mtm', 'mum']

for i in range(len(user_metapaths)):
    user_metapaths[i] += '_' + str(train_rate) + '.embedding'
for i in range(len(item_metapaths)):
    item_metapaths[i] += '_' + str(train_rate) + '.embedding'

#user_metapaths = ['ubu_' + str(train_rate) +'.embedding', 'ubcibu_''.embedding', 'ubcabu_0.8.embedding']
    
#item_metapaths = ['bub_0.8.embedding', 'bcib_0.8.embedding', 'bcab_0.8.embedding']
trainfile = '../data/um_' + str(train_rate) +'.train'
testfile = '../data/um_' + str(train_rate) + '.test'
steps = 100
delta = 0.01
beta_e = 0.1
beta_h = 0.1
beta_p = 2
beta_w = 0.1
beta_b = 0.01
reg_u = 1.0
reg_v = 1.0
print('train_rate: ', train_rate)
print('ratedim: ', ratedim, ' userdim: ', userdim, ' itemdim: ', itemdim)
print('max_steps: ', steps)
print('delta: ', delta, 'beta_e: ', beta_e, 'beta_h: ', beta_h, 'beta_p: ', beta_p, 'beta_w: ', beta_w, 'beta_b', beta_b, 'reg_u', reg_u, 'reg_v', reg_v)

HERec_pl=HNERec(unum, inum, ratedim, userdim, itemdim, user_metapaths, item_metapaths, trainfile, testfile, steps, delta, beta_e, beta_h, beta_p, beta_w, beta_b, reg_u, reg_v)

train_rate:  0.8
ratedim:  10  userdim:  30  itemdim:  10
max_steps:  100
delta:  0.01 beta_e:  0.1 beta_h:  0.1 beta_p:  2 beta_w:  0.1 beta_b 0.01 reg_u 1.0 reg_v 1.0
metapath  umu_0.8.embedding numbers  3870
metapath  umamu_0.8.embedding numbers  3860
metapath  umdmu_0.8.embedding numbers  3787
metapath  umtmu_0.8.embedding numbers  3872
Load user embeddings finished.
metapath  mam_0.8.embedding numbers  9100
metapath  mdm_0.8.embedding numbers  7745
metapath  mtm_0.8.embedding numbers  9787
metapath  mum_0.8.embedding numbers  9338
Load user embeddings finished.
Load rating finished.
train size :  258697
test size :  64675
step  0 crror :  1.2434801232659016
train time :  40.080870628356934
step, MAE, RMSE  0 0.6389285019594038 0.8238848947249358
time:  1.4821619987487793
step  1 crror :  0.789131754711417
train time :  41.1482617855072
step, MAE, RMSE  1 0.6171665487218861 0.7917516121751406
time:  1.6193530559539795
step  2 crror :  0.7619896934361788
train time :  40.97644162178

In [16]:
pred_rating=np.zeros([unum,inum])
eu=np.zeros([unum,userdim])
for i in range(unum):
    ui = HERec_pl.cal_u(i)
    eu[i,:]=ui.T

ev=np.zeros([inum,itemdim])
for i in range(inum):
    vi = HERec_pl.cal_v(i)
    ev[i,:]=vi.T

pred_rating=HERec_pl.U.dot(HERec_pl.V.T) + HERec_pl.reg_u * eu.dot(HERec_pl.H.T) + HERec_pl.reg_v * HERec_pl.E.dot(ev.T)
print(np.shape(pred_rating))
print(pred_rating)

(4010, 9788)
[[2.85009949 3.17948071 2.98700219 ... 3.23466831 3.46505734 2.44952021]
 [4.51672439 4.54144713 4.27596405 ... 3.68519519 5.1562636  4.10856501]
 [3.20084217 3.44306807 3.42914715 ... 3.98207255 3.72683856 2.61112819]
 ...
 [2.64713427 2.66133131 2.88941077 ... 3.71199879 3.01418939 1.90230968]
 [4.65982943 4.63686452 4.44617563 ... 3.83635378 5.35498835 4.13474626]
 [4.56516109 4.51282862 4.39208628 ... 3.93467705 5.21410549 3.98049682]]


In [17]:
import pandas as pd
data = []
with open('../data/um_0.8.test', 'r') as infile:
    for line in infile.readlines():
        user, item, rating = line.strip().split('\t')
        
        data.append([int(user), int(item), rating])
        
test = pd.DataFrame(data=data, columns=["userID", "itemID", "rating"])

In [18]:
R_t=np.zeros([unum,inum])
with open('../data/um_0.8.train', 'r') as infile:
    for line in infile.readlines():
        user, item, rating = line.strip().split('\t')
        R_t[int(user)-1,int(item)-1]=1

In [20]:
data = []
for u in range(unum):
    pred_rating_user_item = np.argsort(pred_rating[u,:])[::-1][:]
    n=0
    i=0
    while n<100:
        if R_t[u,int(pred_rating_user_item[i])]==0:
            data.append([int(u)+1,int(pred_rating_user_item[i])+1,pred_rating[u,pred_rating_user_item[i]]])
            n+=1
            i+=1
        else:
            i+=1
all_predictions = pd.DataFrame(data=data, columns=["userID", "itemID", "prediction"])
all_predictions

Unnamed: 0,userID,itemID,prediction
0,1,5676,4.993701
1,1,9360,4.958818
2,1,261,4.899161
3,1,6141,4.774704
4,1,393,4.645872
...,...,...,...
400995,4010,2429,5.568940
400996,4010,2470,5.568780
400997,4010,6111,5.567759
400998,4010,8727,5.567755


In [23]:
k = 20
eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=k)
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=k)

print("Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

Precision@K:	0.002399
Recall@K:	0.023167
