In [59]:
import sys
import random
from time import time

import pandas as pd
from tqdm import tqdm
import torch.optim as optim

from KGAT import KGAT
from utils.parser_kgat import *
from utils.log_helper import *
from utils.metrics import *
from utils.model_helper import *
from data_loader import DataLoaderKGAT

from scipy.sparse import coo_matrix
import copy


def evaluate(model, dataloader, Ks, device):
    """Score every test user against all items and aggregate ranking metrics.

    Args:
        model: network invoked as ``model(user_ids, item_ids, mode='predict')``.
        dataloader: object exposing ``test_batch_size``, ``train_user_dict``,
            ``test_user_dict`` and ``n_items``.
        Ks: re-iterable collection of cut-offs; each one becomes a key of the
            returned metrics dictionary.
        device: torch device the id tensors are moved to before the forward pass.

    Returns:
        tuple: ``(cf_scores, metrics_dict)``. ``cf_scores`` is the row-wise
        concatenation of all batch score matrices as a NumPy array, and
        ``metrics_dict[k][name]`` is the mean over the concatenated per-batch
        values for ``name`` in ``'precision'``, ``'recall'``, ``'ndcg'``.
    """
    model.eval()

    batch_size = dataloader.test_batch_size
    train_user_dict = dataloader.train_user_dict
    test_user_dict = dataloader.test_user_dict

    # Split the test users into consecutive chunks of at most `batch_size` ids.
    all_users = list(test_user_dict.keys())
    user_batches = [
        torch.LongTensor(all_users[lo: lo + batch_size])
        for lo in range(0, len(all_users), batch_size)
    ]

    # Every user is scored against the complete item catalogue.
    item_ids = torch.arange(dataloader.n_items, dtype=torch.long).to(device)

    metric_names = ['precision', 'recall', 'ndcg']
    collected = {k: {name: [] for name in metric_names} for k in Ks}
    score_chunks = []

    with tqdm(total=len(user_batches), desc='Evaluating Iteration') as pbar:
        for users in user_batches:
            users = users.to(device)

            with torch.no_grad():
                scores = model(users, item_ids, mode='predict')       # (n_batch_users, n_items)
            scores = scores.cpu()

            per_batch = calc_metrics_at_k(scores, train_user_dict, test_user_dict, users.cpu().numpy(), item_ids.cpu().numpy(), Ks)
            score_chunks.append(scores.numpy())

            for k in Ks:
                for name in metric_names:
                    collected[k][name].append(per_batch[k][name])
            pbar.update(1)

    cf_scores = np.concatenate(score_chunks, axis=0)

    # Collapse the per-batch value arrays into one mean per (k, metric).
    for k in Ks:
        per_metric = collected[k]
        for name in metric_names:
            per_metric[name] = np.concatenate(per_metric[name]).mean()
    return cf_scores, collected

def predict(args):
    """Load the pretrained KGAT model, evaluate it and save the score matrix.

    The scores returned by ``evaluate`` are written to
    ``args.save_dir + 'cf_scores.npy'`` and the precision / recall / NDCG at the
    smallest and largest cut-off in ``args.Ks`` are printed.

    Args:
        args: configuration object; this function reads ``pretrain_model_path``,
            ``Ks`` (a string holding a list literal) and ``save_dir``, and
            forwards the whole object to ``DataLoaderKGAT`` and ``KGAT``.
    """
    import os

    # GPU / CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # load data
    data = DataLoaderKGAT(args, logging)

    # load model
    model = KGAT(args, data.n_users, data.n_entities, data.n_relations)
    load_model(model, args.pretrain_model_path)
    model.to(device)

    # NOTE(review): eval() executes whatever expression args.Ks contains; keep
    # the value under your own control (ast.literal_eval would be the safe
    # choice if only list literals are ever passed).
    Ks = eval(args.Ks)
    k_min = min(Ks)
    k_max = max(Ks)

    # Create the output folder *before* the long evaluation so that a missing
    # directory cannot make np.save fail after all the work has been done.
    scores_path = args.save_dir + 'cf_scores.npy'
    scores_dir = os.path.dirname(scores_path)
    if scores_dir:
        os.makedirs(scores_dir, exist_ok=True)

    # predict
    cf_scores, metrics_dict = evaluate(model, data, Ks, device)
    np.save(scores_path, cf_scores)
    print('CF Evaluation: Precision [{:.4f}, {:.4f}], Recall [{:.4f}, {:.4f}], NDCG [{:.4f}, {:.4f}]'.format(
        metrics_dict[k_min]['precision'], metrics_dict[k_max]['precision'], metrics_dict[k_min]['recall'], metrics_dict[k_max]['recall'], metrics_dict[k_min]['ndcg'], metrics_dict[k_max]['ndcg']))

In [2]:
class KGAT_args:
    """Attribute container standing in for the command-line arguments of KGAT.

    Every constructor argument is stored unchanged under the same attribute
    name. ``save_dir`` is not an argument: it is derived from the data set name,
    the embedding sizes, the Laplacian / aggregation types, the convolution
    dimensions, the learning rate and the pretrain flag.

    ``conv_dim_list``, ``mess_dropout`` and ``Ks`` are kept as strings holding
    list literals, exactly as they are passed in.
    """

    def __init__(self,
                 seed=2019,
                 data_name='amazon-book',
                 data_dir='datasets/',
                 use_pretrain=1,
                 pretrain_embedding_dir='datasets/pretrain/',
                 pretrain_model_path='trained_model/KGAT/model_epoch280.pth',
                 cf_batch_size=1024,
                 kg_batch_size=2048,
                 test_batch_size=10000,
                 embed_dim=64,
                 relation_dim=64,
                 laplacian_type='random-walk',
                 aggregation_type='bi-interaction',
                 conv_dim_list='[64, 32, 16]',
                 mess_dropout='[0.1, 0.1, 0.1]',
                 kg_l2loss_lambda=1e-5,
                 cf_l2loss_lambda=1e-5,
                 lr=0.0001,
                 n_epoch=1000,
                 stopping_steps=10,
                 cf_print_every=1,
                 kg_print_every=1,
                 evaluate_every=20,
                 Ks='[20, 40, 60, 80, 100]'):
        # Reproducibility and data location.
        self.seed = seed
        self.data_name = data_name
        self.data_dir = data_dir

        # Pretrained weights.
        self.use_pretrain = use_pretrain
        self.pretrain_embedding_dir = pretrain_embedding_dir
        self.pretrain_model_path = pretrain_model_path

        # Batch sizes.
        self.cf_batch_size = cf_batch_size
        self.kg_batch_size = kg_batch_size
        self.test_batch_size = test_batch_size

        # Model architecture.
        self.embed_dim = embed_dim
        self.relation_dim = relation_dim
        self.laplacian_type = laplacian_type
        self.aggregation_type = aggregation_type
        self.conv_dim_list = conv_dim_list
        self.mess_dropout = mess_dropout

        # Optimisation.
        self.kg_l2loss_lambda = kg_l2loss_lambda
        self.cf_l2loss_lambda = cf_l2loss_lambda
        self.lr = lr
        self.n_epoch = n_epoch
        self.stopping_steps = stopping_steps

        # Logging / evaluation cadence and cut-offs.
        self.cf_print_every = cf_print_every
        self.kg_print_every = kg_print_every
        self.evaluate_every = evaluate_every
        self.Ks = Ks

        # Output folder name encodes the main hyper-parameters; the convolution
        # dimensions are joined with '-' (e.g. '[64, 32, 16]' -> '64-32-16').
        conv_dims_tag = '-'.join(str(dim) for dim in eval(self.conv_dim_list))
        self.save_dir = 'trained_model/KGAT/{}/embed-dim{}_relation-dim{}_{}_{}_{}_lr{}_pretrain{}/'.format(
            self.data_name,
            self.embed_dim,
            self.relation_dim,
            self.laplacian_type,
            self.aggregation_type,
            conv_dims_tag,
            self.lr,
            self.use_pretrain,
        )

In [34]:
# Build the default configuration and use the GPU when one is available.
args = KGAT_args()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# load data
data = DataLoaderKGAT(args, logging)

# load model
model = KGAT(args, data.n_users, data.n_entities, data.n_relations)
load_model(model, args.pretrain_model_path)
model.to(device)
pass  # presumably here so the cell does not end on an expression and display the value of model.to(device)

In [None]:
# Evaluate on the test users; predict() also writes cf_scores.npy under args.save_dir.
predict(args)

In [5]:
# Id ranges as annotated by the author; each printed count is the exclusive upper bound of its range.
print(data.n_items) # ids 0~24914: items
print(data.n_entities) # ids 24915~113486: presumably the remaining (non-item) entities — TODO confirm
print(data.n_users_entities) # ids 113487~184165: presumably users, placed after the entities — TODO confirm

24915
113487
184166


In [20]:
def find_all_path(start, end, target_r_1, target_r_2, adj_matrix):
    """Enumerate the paths ``start -> h -> t -> end`` for one relation pair.

    The three hops are: ``start`` to ``h`` over relation 0, ``h`` to ``t`` over
    ``target_r_1``, and ``t`` to ``end`` over ``target_r_2``. The last hop is
    resolved backwards: the row of ``end`` in the matrix of the relation that
    inverts ``target_r_2`` lists every ``t`` with an edge ``t -> end``.

    Args:
        start: row index of the start node (a user in the notebook's calls).
        end: index of the end node; it is excluded as intermediate node ``h``.
        target_r_1: relation id used for the hop ``h -> t``.
        target_r_2: relation id used for the hop ``t -> end``.
        adj_matrix: mapping from relation id to a sparse matrix that supports
            ``.tocsr()``; it must contain relation 0, ``target_r_1`` and the
            inverse of ``target_r_2``.

    Returns:
        A non-empty list of ``[start, h, target_r_1, t, target_r_2, end]``
        lists, or the integer ``0`` when no path exists (callers test
        ``paths == 0``).
    """
    adj_first = adj_matrix[0].tocsr()
    adj_second = adj_matrix[target_r_1].tocsr()

    # Relation pairing used throughout this notebook: 0 <-> 1, and r <-> r + 39
    # for 2 <= r <= 40. Relation 1 needs its own branch: without it the
    # "<= 40" test would map it to relation 40 instead of relation 0.
    if target_r_2 == 0:
        reverse_r = 1
    elif target_r_2 == 1:
        reverse_r = 0
    elif target_r_2 <= 40:
        reverse_r = target_r_2 + 39
    else:
        reverse_r = target_r_2 - 39
    reverse_adj_last = adj_matrix[reverse_r].tocsr()

    # All nodes reachable from start over relation 0, except the end node itself.
    first_step_candidate = adj_first.indices[adj_first.indptr[start]:adj_first.indptr[start + 1]]
    first_step_candidate = first_step_candidate[first_step_candidate != end]

    # All entity(uiei)/user(uiui) nodes connected to end, except the start node.
    last_step_candidate = reverse_adj_last.indices[reverse_adj_last.indptr[end]:reverse_adj_last.indptr[end + 1]]
    last_step_candidate = last_step_candidate[last_step_candidate != start]

    output_paths = []
    for h in first_step_candidate:
        tails = adj_second.indices[adj_second.indptr[h]:adj_second.indptr[h + 1]]
        # Vectorised membership test instead of a linear scan per tail.
        for t in tails[np.isin(tails, last_step_candidate)]:
            output_paths.append([start, h, target_r_1, t, target_r_2, end])

    # Callers rely on the sentinel 0 (not an empty list) for "no path".
    return output_paths if output_paths else 0

def find_all_path_all_r(start, end, relations, adj_matrix):
    """Collect the paths between ``start`` and ``end`` over several relations.

    Each relation ``r`` in ``relations`` is used for the middle hop and is
    paired with its inverse for the last hop (0 with 1, 1 with 0, ``r + 39``
    for ``r <= 40`` and ``r - 39`` otherwise). The per-relation results of
    ``find_all_path`` are concatenated in iteration order.

    Args:
        start: start node index, forwarded to ``find_all_path``.
        end: end node index, forwarded to ``find_all_path``.
        relations: iterable of relation ids to try.
        adj_matrix: relation id -> sparse adjacency matrix mapping.

    Returns:
        list: all paths found; empty when no relation yields a path.
    """
    collected = []
    for rel in relations:
        # Work out which relation closes the path back into `end`.
        if rel == 0:
            inverse = 1
        elif rel == 1:
            inverse = 0
        elif rel <= 40:
            inverse = rel + 39
        else:
            inverse = rel - 39

        found = find_all_path(start, end, rel, inverse, adj_matrix)
        if found == 0:
            # find_all_path signals "nothing found" with the integer 0.
            continue
        collected.extend(found)
    return collected

In [75]:
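# Relation ids to delete, in their original numbering (shifted by +2 when `delete_relations` is built in a later cell): [2, 3, 6, 11, 21, 22, 32, 35, 37]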

array([24925], dtype=int32)

In [85]:
# Example query: every path from one user to one item over a single relation
# pair. The ids are passed through the variables instead of being repeated as
# literals, so editing `user` / `item` actually changes the query.
user = 113490
item = 54
target_r_1 = 6
target_r_2 = 45
r_all_path = find_all_path(user, item, target_r_1, target_r_2, data.adjacency_dict)
print(r_all_path)

[[113490, 52, 24925, 54], [113490, 53, 24925, 54], [113490, 55, 24925, 54], [113490, 56, 24925, 54], [113490, 57, 24925, 54], [113490, 58, 24925, 54], [113490, 59, 24925, 54], [113490, 60, 24925, 54], [113490, 62, 24925, 54], [113490, 64, 24925, 54], [113490, 65, 24925, 54]]


In [57]:
# 'sum', 'mul', 'max'
def path_score(path, A_in, mode='sum'):
    """Combine the three edge weights along a path into one score.

    Args:
        path: six-element sequence ``[U, I, r_1, E, r_2, I]``; positions 0, 1,
            3 and 5 are node indices, positions 2 and 4 are relation ids and
            are not used for the lookup.
        A_in: 2-D weight lookup indexed as ``A_in[head][tail]``.
        mode: ``'sum'``, ``'mul'`` or ``'max'`` — how the three hop weights are
            combined.

    Returns:
        The combined weight, of whatever type ``A_in`` yields.

    Raises:
        AssertionError: if ``path`` does not have exactly six elements.
        ValueError: if ``mode`` is not one of the supported names.
    """
    assert len(path) == 6  # [U, I, r_1, E, r_2, I]

    # Reject an unknown mode before touching A_in.
    if mode not in ('sum', 'mul', 'max'):
        raise ValueError('mode should be in ["sum", "mul", "max"].')

    # Only the node positions index A_in; the relation ids at positions 2 and 4
    # must be skipped, otherwise r_1 is looked up as if it were a node and the
    # hop into the final item is never scored.
    user, first_item, _, entity, _, last_item = path
    hop_1 = A_in[user][first_item]
    hop_2 = A_in[first_item][entity]
    hop_3 = A_in[entity][last_item]

    if mode == 'sum':
        return hop_1 + hop_2 + hop_3
    if mode == 'mul':
        return hop_1 * hop_2 * hop_3
    return max(hop_1, hop_2, hop_3)

def get_top_k_path(paths, A_in, mode='sum', k=5):
    """Return the ``k`` best-scoring paths together with the ``k`` best scores.

    Args:
        paths: non-empty sequence of paths accepted by ``path_score``.
        A_in: weight lookup forwarded to ``path_score``.
        mode: scoring mode forwarded to ``path_score``.
        k: maximum number of entries to return.

    Returns:
        tuple: ``(top_paths, top_scores)``, both ordered from highest to
        lowest score and at most ``k`` long. Paths with equal scores are
        ordered by comparing the paths themselves.

    Raises:
        AssertionError: if ``paths`` is empty.
    """
    assert len(paths) > 0

    scores = [path_score(candidate, A_in, mode) for candidate in paths]

    # Rank (score, path) pairs from best to worst and keep the leading paths.
    ranking = sorted(zip(scores, paths), reverse=True)
    top_paths = [candidate for _, candidate in ranking[:k]]

    # The score list is ordered on its own, independently of the paths.
    scores.sort(reverse=True)
    top_scores = scores[:k]

    return top_paths, top_scores

def predict_single(model, dataloader, user_id, relations, Ks, device):
    """Recommend items for one user and re-rank them by explanation-path score.

    The model's matching scores select the ``max(Ks)`` best items. For each of
    them the connecting paths are collected with ``find_all_path_all_r`` and
    scored with ``get_top_k_path`` (mode ``'sum'``, top 5). For every cut-off
    ``k`` the first ``k`` items are then re-ordered by the score of their best
    path.

    Args:
        model: network invoked as ``model(user_ids, item_ids, mode='predict')``
            and exposing the weight lookup ``A_in``.
        dataloader: object exposing ``n_items`` and ``adjacency_dict``.
        user_id: id of the user, used both as model input and as path start.
        relations: iterable of relation ids forwarded to ``find_all_path_all_r``.
        Ks: non-empty, re-iterable collection of integer cut-offs.
        device: torch device the id tensors are moved to.

    Returns:
        tuple: ``(output, top_k_all_att_scores, top_k_all_paths)`` where
        ``output[k]`` is ``{'item_ids': [...], 'explainations': [...]}`` (item
        and explanation at the same position belong together) and the two
        lists hold, per candidate item in matching-score order, the top path
        scores and the top paths. An item without any connecting path gets
        empty lists, an empty explanation, and is ranked last.
    """
    model.eval()
    input_user_id = torch.LongTensor([user_id]).to(device)

    item_ids = torch.arange(dataloader.n_items, dtype=torch.long).to(device)
    k_max = max(Ks)

    print(f'Get matching score of user {user_id}......')
    with torch.no_grad():
        # model output is (n_batch_users, n_items); [0] keeps the single user's row
        matching_scores = model(input_user_id, item_ids, mode='predict')[0].cpu().numpy()

    top_k_items = np.argsort(-matching_scores)[:k_max]
    top_k_all_att_scores = []
    top_k_all_paths = []
    top_k_paths = []
    top_k_att_scores = []

    for item_id in tqdm(top_k_items):
        paths = find_all_path_all_r(user_id, item_id, relations, dataloader.adjacency_dict)
        if len(paths) == 0:
            # No explanation exists for this item. get_top_k_path would fail
            # its non-empty assertion, so keep the item with placeholders and
            # give it the lowest possible score.
            top_k_all_att_scores.append([])
            top_k_all_paths.append([])
            top_k_att_scores.append(float('-inf'))
            top_k_paths.append([])
            continue

        reranked_paths, scores = get_top_k_path(paths, model.A_in, mode='sum', k=5)
        top_k_all_att_scores.append(scores)
        top_k_all_paths.append(reranked_paths)
        top_k_att_scores.append(scores[0])
        top_k_paths.append(reranked_paths[0])

    output = {}
    for k in Ks:
        # Compute one ordering and apply it to both lists, so that an item and
        # its explanation can never drift apart when scores tie. The sort is
        # stable: tied items keep their matching-score order.
        n_candidates = min(k, len(top_k_items))
        order = sorted(range(n_candidates), key=lambda idx: top_k_att_scores[idx], reverse=True)
        reranked_items = [top_k_items[idx] for idx in order]
        explainations = [top_k_paths[idx] for idx in order]
        output[k] = {'item_ids': reranked_items, 'explainations': explainations}

    return output, top_k_all_att_scores, top_k_all_paths

def explainable_evaluate(model, dataloader, relations, Ks, device):
    """Evaluate the path-based re-ranking of ``predict_single`` on all test users.

    Args:
        model: network forwarded to ``predict_single``.
        dataloader: object exposing ``train_user_dict``, ``test_user_dict`` and
            ``n_items`` (plus whatever ``predict_single`` reads).
        relations: iterable of relation ids forwarded to ``predict_single``.
        Ks: re-iterable collection of cut-offs; keys of the returned dict.
        device: torch device forwarded to ``predict_single``.

    Returns:
        dict: ``metrics_dict[k][name]`` holding the mean of the concatenated
        per-user values for ``name`` in ``'precision'``, ``'recall'``,
        ``'ndcg'``.
    """
    train_user_dict = dataloader.train_user_dict
    test_user_dict = dataloader.test_user_dict

    model.eval()

    user_ids = list(test_user_dict.keys())

    # The item id array is identical for every user: build it once, on the CPU,
    # instead of converting a device tensor inside the loop.
    item_ids = torch.arange(dataloader.n_items, dtype=torch.long).numpy()

    metric_names = ['precision', 'recall', 'ndcg']
    metrics_dict = {k: {m: [] for m in metric_names} for k in Ks}

    with tqdm(total=len(user_ids), desc='Evaluating Iteration') as pbar:
        for user_id in user_ids:
            rerank_indices, _, _ = predict_single(model, dataloader, user_id, relations, Ks, device)

            # NOTE(review): calc_metrics_at_k_exp is not defined anywhere in
            # this notebook (the recorded run stops here with a NameError).
            # It has to be provided, e.g. by utils.metrics, before this
            # function can complete.
            batch_metrics = calc_metrics_at_k_exp(rerank_indices, train_user_dict, test_user_dict, [user_id], item_ids, Ks)

            for k in Ks:
                for m in metric_names:
                    metrics_dict[k][m].append(batch_metrics[k][m])
            pbar.update(1)

    for k in Ks:
        for m in metric_names:
            metrics_dict[k][m] = np.concatenate(metrics_dict[k][m]).mean()
    return metrics_dict
    
def explainable_recommend(model, data, relations, Ks, device=None):
    """Run the explainable evaluation and print the metrics at the extreme cut-offs.

    Args:
        model: network forwarded to ``explainable_evaluate``.
        data: data loader forwarded to ``explainable_evaluate``.
        relations: iterable of relation ids used for the explanation paths.
        Ks: string holding a list literal of cut-offs, e.g. ``"[20]"``.
        device: torch device to run on. When omitted, the device of the model's
            first parameter is used, falling back to CUDA-if-available for a
            model without parameters. This replaces the previous dependency on
            a notebook-global ``device`` variable.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device
        else:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # predict
    # NOTE(review): eval() executes whatever expression the string contains;
    # only pass values you control.
    Ks = eval(Ks)
    k_min = min(Ks)
    k_max = max(Ks)

    metrics_dict = explainable_evaluate(model, data, relations, Ks, device)
    print('CF Evaluation: Precision [{:.4f}, {:.4f}], Recall [{:.4f}, {:.4f}], NDCG [{:.4f}, {:.4f}]'.format(
        metrics_dict[k_min]['precision'], metrics_dict[k_max]['precision'], metrics_dict[k_min]['recall'], metrics_dict[k_max]['recall'], metrics_dict[k_min]['ndcg'], metrics_dict[k_max]['ndcg']))

In [95]:
# Show the five best paths of the example query under each scoring mode.
for score_mode in ('sum', 'mul', 'max'):
    print(score_mode, get_top_k_path(r_all_path, model.A_in, mode=score_mode))

sum ([[113490, 57, 24925, 54], [113490, 62, 24925, 54], [113490, 60, 24925, 54], [113490, 65, 24925, 54], [113490, 64, 24925, 54]], [tensor(1.1899, device='cuda:0'), tensor(1.0503, device='cuda:0'), tensor(1.0312, device='cuda:0'), tensor(1.0153, device='cuda:0'), tensor(1.0007, device='cuda:0')])
mul ([[113490, 57, 24925, 54], [113490, 62, 24925, 54], [113490, 55, 24925, 54], [113490, 53, 24925, 54], [113490, 60, 24925, 54]], [tensor(3.5424e-07, device='cuda:0'), tensor(1.2501e-07, device='cuda:0'), tensor(9.5676e-08, device='cuda:0'), tensor(9.4368e-08, device='cuda:0'), tensor(7.1571e-08, device='cuda:0')])
max ([[113490, 65, 24925, 54], [113490, 58, 24925, 54], [113490, 60, 24925, 54], [113490, 64, 24925, 54], [113490, 52, 24925, 54]], [tensor(0.9875, device='cuda:0'), tensor(0.9854, device='cuda:0'), tensor(0.9824, device='cuda:0'), tensor(0.9767, device='cuda:0'), tensor(0.9644, device='cuda:0')])


In [22]:
# Relations to delete: original ids [2, 3, 6, 11, 21, 22, 32, 35, 37]
#   -> shifted by +2:          [4, 5, 8, 13, 23, 24, 34, 37, 39]
#   -> their inverses (+39):   [43, 44, 47, 52, 62, 63, 73, 76, 78]
all_relations = range(80)
delete_relations = [4, 5, 8, 13, 23, 24, 34, 37, 39, 43, 44, 47, 52, 62, 63, 73, 76, 78]
target_relations = list(set(all_relations) - set(delete_relations))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# predict_single returns (output, per-item path scores, per-item paths) in that
# order, so the scores are unpacked before the paths.
output, all_scores, all_paths = predict_single(model, data, 113490, target_relations, [20], device)

Get matching score of user 113490......


100%|██████████| 20/20 [00:46<00:00,  2.32s/it]


In [50]:
# Scratch check: eval() of a list literal with a quoted number yields a list of strings, not ints.
eval("['20']")

['20']

In [58]:
# NOTE: in the recorded run this call stops after the first user with
# "NameError: name 'calc_metrics_at_k_exp' is not defined".
explainable_recommend(model, data, target_relations, "[20]")

Evaluating Iteration:   0%|          | 0/70591 [00:00<?, ?it/s]

Get matching score of user 113487......


100%|██████████| 20/20 [00:57<00:00,  2.89s/it]
Evaluating Iteration:   0%|          | 0/70591 [00:58<?, ?it/s]


NameError: name 'calc_metrics_at_k_exp' is not defined

In [36]:
predict(args)
sum ([[113490, 57, 24925, 54], [113490, 62, 24925, 54], [113490, 60, 24925, 54], [113490, 65, 24925, 54], [113490, 64, 24925, 54]], [tensor(1.1899, device='cuda:0'), tensor(1.0503, device='cuda:0'), tensor(1.0312, device='cuda:0'), tensor(1.0153, device='cuda:0'), tensor(1.0007, device='cuda:0')])
mul ([[113490, 57, 24925, 54], [113490, 62, 24925, 54], [113490, 55, 24925, 54], [113490, 53, 24925, 54], [113490, 60, 24925, 54]], [tensor(3.5424e-07, device='cuda:0'), tensor(1.2501e-07, device='cuda:0'), tensor(9.5676e-08, device='cuda:0'), tensor(9.4368e-08, device='cuda:0'), tensor(7.1571e-08, device='cuda:0')])
max ([[113490, 65, 24925, 54], [113490, 58, 24925, 54], [113490, 60, 24925, 54], [113490, 64, 24925, 54], [113490, 52, 24925, 54]], [tensor(0.9875, device='cuda:0'), tensor(0.9854, device='cuda:0'), tensor(0.9824, device='cuda:0'), tensor(0.9767, device='cuda:0'), tensor(0.9644, device='cuda:0')])
array([24925], dtype=int32)
[[113490, 52, 24925, 54], [113490, 53, 24925, 54], [113490, 55, 24925, 54], [113490, 56, 24925, 54], [113490, 57, 24925, 54], [113490, 58, 24925, 54], [113490, 59, 24925, 54], [113490, 60, 24925, 54], [113490, 62, 24925, 54], [113490, 64, 24925, 54], [113490, 65, 24925, 54]]
Evaluating Iteration:   0%|          | 0/70591 [00:00<?, ?it/s]Get matching score of user 113487......
100%|██████████| 20/20 [00:57<00:00,  2.89s/it]
Evaluating Iteration:   0%|          | 0/70591 [00:58<?, ?it/s]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_5536/802476662.py in <module>
----> 1 explainable_recommend(model, data, target_relations, "[20]")

~\AppData\Local\Temp/ipykernel_5536/2790380136.py in explainable_recommend(model, data, relations, Ks)
     98     k_max = max(Ks)
     99 
--> 100     metrics_dict = explainable_evaluate(model, data, relations, Ks, device)
    101     print('CF Evaluation: Precision [{:.4f}, {:.4f}], Recall [{:.4f}, {:.4f}], NDCG [{:.4f}, {:.4f}]'.format(
    102         metrics_dict[k_min]['precision'], metrics_dict[k_max]['precision'], metrics_dict[k_min]['recall'], metrics_dict[k_max]['recall'], metrics_dict[k_min]['ndcg'], metrics_dict[k_max]['ndcg']))

~\AppData\Local\Temp/ipykernel_5536/2790380136.py in explainable_evaluate(model, dataloader, relations, Ks, device)
     80 
     81             rerank_indices, _, _ = predict_single(model, dataloader, user_id, relations, Ks, device)
---> 82             batch_metrics = calc_metrics_at_k_exp(rerank_indices, train_user_dict, test_user_dict, [user_id], item_ids.cpu().numpy(), Ks)
     83 
     84             for k in Ks:

NameError: name 'calc_metrics_at_k_exp' is not defined

Evaluating Iteration: 100%|██████████| 8/8 [03:49<00:00, 28.65s/it]


CF Evaluation: Precision [0.0150, 0.0071], Recall [0.1441, 0.3102], NDCG [0.0765, 0.1143]
