In [1]:
import tensorflow as tf
import numpy as np
import os
import pandas as pd
import heapq
# print(tf.__version__)

In [2]:
class Arg:
    """Stand-in for parsed command-line arguments.

    Stores the experiment settings as plain attributes so the loader
    functions in this notebook can read them as ``args.<name>``.

    Attributes:
        ratio: fraction of the training users to keep (applied only when < 1).
        n_interact: number of interacted items sampled per user.
        n_neighbors: number of neighbor users kept per user.
        dataset: name of the dataset sub-directory under ``./data/``.
        mission: prediction target; the loaders recognise 'gender' and 'age'.
        neighbor_sample_size: number of KG neighbors sampled per entity.
    """

    def __init__(self, ratio, n_interact, n_neighbors, dataset, mission, neighbor_sample_size):
        # Targets are assigned left to right, so the attributes are created
        # in the same order as the constructor parameters.
        (
            self.ratio,
            self.n_interact,
            self.n_neighbors,
            self.dataset,
            self.mission,
            self.neighbor_sample_size,
        ) = (ratio, n_interact, n_neighbors, dataset, mission, neighbor_sample_size)


# Settings used for this notebook run.
args = Arg(
    ratio=1,
    n_interact=20,
    n_neighbors=20,
    dataset='movie',
    mission='gender',
    neighbor_sample_size=4,
)


In [3]:
def _load_txt_with_npy_cache(path_stem):
    """Load ``<path_stem>.npy`` if it exists; otherwise parse ``<path_stem>.txt`` as int64 and cache it as ``.npy``."""
    npy_path = path_stem + '.npy'
    if os.path.exists(npy_path):
        return np.load(npy_path)
    array = np.loadtxt(path_stem + '.txt', dtype=np.int64)
    np.save(npy_path, array)
    return array


def load_rating(args):
    """Load the rating and user-label tables and split them into train/eval/test.

    Args:
        args: settings object; reads ``args.mission`` ('gender' or 'age') and
            ``args.dataset`` here, and is forwarded to ``dataset_split``.

    Returns:
        A 9-tuple ``(n_user, n_item, train_data, eval_data, test_data,
        adj_item, n_classes, rating_np, adj_user)`` where

        * ``n_user`` / ``n_item`` are the numbers of distinct values in
          columns 0 / 1 of the rating table,
        * ``n_classes`` is the number of distinct values in column 1 of the
          user table (the labels to predict),
        * ``rating_np`` is the full rating table, shuffled in place after the
          split was taken,
        * the remaining entries come straight from ``dataset_split``.

    Raises:
        ValueError: if ``args.mission`` is neither 'gender' nor 'age'.
    """
    print('reading rating file ...')

    # Pick the file pair that matches the prediction target.
    data_dir = './data/' + args.dataset
    if args.mission == 'gender':
        rating_file = data_dir + '/ratings_final_pos'
        user_file = data_dir + '/user_gender_final'
    elif args.mission == 'age':
        rating_file = data_dir + '/age/ratings_final_pos'
        user_file = data_dir + '/age/user_age_final'
    else:
        # Previously this fell through and crashed later with an
        # UnboundLocalError on `rating_file`; fail early with a clear message.
        raise ValueError(
            "unsupported mission %r: expected 'gender' or 'age'" % (args.mission,)
        )

    rating_np = _load_txt_with_npy_cache(rating_file)
    users_np = _load_txt_with_npy_cache(user_file)

    n_user = len(set(rating_np[:, 0]))  # number of users
    n_item = len(set(rating_np[:, 1]))  # number of interacted items
    n_classes = len(set(users_np[:, 1]))  # number of target classes (e.g. 2 for gender)
    train_data, eval_data, test_data, adj_item, adj_user = dataset_split(n_user, users_np, rating_np, args)
    # Shuffle only after the split so the split itself sees the file order.
    np.random.shuffle(rating_np)

    return n_user, n_item, train_data, eval_data, test_data, adj_item, n_classes, rating_np, adj_user

In [4]:
def _sample_positions(n_available, size):
    """Draw ``size`` positions from ``range(n_available)``, with replacement only when there are too few."""
    return np.random.choice(n_available, size=size, replace=n_available < size)


def _shuffled_ratings_of(user_table, rows_by_user, rating_np):
    """Return a shuffled copy of the rating rows that belong to the users in column 0 of ``user_table``."""
    rows = []
    for user in user_table[:, 0]:
        # One user id can own several rating rows; users without ratings add nothing.
        rows.extend(rows_by_user.get(user, ()))
    picked = rating_np[np.asarray(rows, dtype=np.intp)]  # fancy indexing -> copy
    np.random.shuffle(picked)
    return picked


def dataset_split(n_user, users_np, rating_np, args):
    """Split users into train/eval/test and build the sampled adjacency tables.

    Args:
        n_user: number of users to split; positions ``0 .. n_user-1`` are used
            as row indices into ``users_np``
            (assumes ``users_np`` has at least ``n_user`` rows - TODO confirm).
        users_np: 2-D array whose column 0 is the user id.
        rating_np: 2-D array whose column 0 is the user id and column 1 the
            item id of one interaction.
        args: settings object; reads ``ratio``, ``n_interact`` and
            ``n_neighbors``.

    Returns:
        ``(train_rating, eval_rating, test_rating, adj_item, adj_user)``:

        * ``*_rating``: shuffled copies of the rating rows of the users in
          each split (train:eval:test = 7:1.5:1.5 over users; the train part
          is further subsampled when ``args.ratio < 1``),
        * ``adj_item``: ``[max_user_id + 1, n_interact]`` int64, row ``u``
          holds items sampled from the items user ``u`` interacted with,
        * ``adj_user``: ``[max_user_id + 1, n_neighbors]`` int64, row ``u``
          holds the users sharing the most items with ``u``.

        Rows of ``adj_item`` for ids that never occur in ``rating_np`` stay
        all zero.
    """
    print('splitting dataset ...')

    # train:eval:test = 7:1.5:1.5
    eval_ratio = 0.15  # validation set
    test_ratio = 0.15  # test set

    # ---- user table: draw eval, then test from the rest, train is what remains ----
    eval_indices = np.random.choice(list(range(n_user)), size=int(n_user * eval_ratio), replace=False)
    left = set(range(n_user)) - set(eval_indices)
    test_indices = np.random.choice(list(left), size=int(n_user * test_ratio), replace=False)
    train_indices = list(left - set(test_indices))
    if args.ratio < 1:
        # Keep only the requested fraction of the remaining users for training.
        train_indices = np.random.choice(train_indices, size=int(len(train_indices) * args.ratio), replace=False)

    train_data = users_np[train_indices]
    eval_data = users_np[eval_indices]
    test_data = users_np[test_indices]

    # ---- one pass over the ratings builds every lookup needed below ----
    user_items = dict()  # user -> set of items the user interacted with
    item_users = dict()  # item -> set of users that interacted with it
    rows_by_user = dict()  # user -> positions of the user's rows in rating_np
    for row in range(rating_np.shape[0]):
        user = rating_np[row, 0]
        item = rating_np[row, 1]
        user_items.setdefault(user, set()).add(item)
        item_users.setdefault(item, set()).add(user)
        rows_by_user.setdefault(user, []).append(row)

    # ---- rating rows of each split (replaces a full table scan per user) ----
    train_rating = _shuffled_ratings_of(train_data, rows_by_user, rating_np)
    eval_rating = _shuffled_ratings_of(eval_data, rows_by_user, rating_np)
    test_rating = _shuffled_ratings_of(test_data, rows_by_user, rating_np)

    n_rows = int(max(user_items)) + 1  # tables are indexed directly by user id

    # ---- user -> sampled items ----
    adj_item = np.zeros([n_rows, args.n_interact], dtype=np.int64)
    for user, items in user_items.items():
        # Every user in the dict has at least one item, so sampling is always defined.
        items = list(items)
        picks = _sample_positions(len(items), args.n_interact)
        adj_item[user] = np.array([items[p] for p in picks], dtype=np.int64)

    # ---- user x user co-occurrence counts (dense, n_rows x n_rows) ----
    adj_user_count = np.zeros([n_rows, n_rows], dtype=np.int64)
    for users in item_users.values():
        members = np.fromiter(users, dtype=np.int64, count=len(users))
        # Count the item once for every ordered pair of its users. The old
        # loop removed users from an aliased set, so each pair was counted in
        # one direction only and the last user of every item got nothing.
        adj_user_count[np.ix_(members, members)] += 1
    np.fill_diagonal(adj_user_count, 0)  # a user is not its own neighbor

    # ---- user -> sampled neighbor users ----
    adj_user = np.zeros([n_rows, args.n_neighbors], dtype=np.int64)
    for user in range(n_rows):
        # NOTE(review): the top-n is taken regardless of the count, so users
        # with fewer than n_neighbors real neighbors are padded with users
        # that share no item with them.
        top = heapq.nlargest(args.n_neighbors, range(n_rows), adj_user_count[user].take)
        if not top:
            continue  # only possible when n_neighbors == 0; nothing to fill
        picks = _sample_positions(len(top), args.n_neighbors)
        adj_user[user] = np.array([top[p] for p in picks], dtype=np.int64)

    return train_rating, eval_rating, test_rating, adj_item, adj_user

In [5]:
def construct_kg(kg_np):
    """Build an adjacency dictionary from knowledge-graph triples.

    Args:
        kg_np: iterable of triples indexed as ``(head, relation, tail)``.

    Returns:
        dict mapping each entity to a list of ``(neighbor, relation)`` pairs.
        Every triple is recorded in both directions, i.e. the graph is
        treated as undirected.
    """
    print('constructing knowledge graph ...')
    kg = {}
    for triple in kg_np:
        head, relation, tail = triple[0], triple[1], triple[2]
        # Forward edge first, then the reverse edge, so dictionary keys are
        # created in head-before-tail order.
        for source, target in ((head, tail), (tail, head)):
            kg.setdefault(source, []).append((target, relation))
    return kg

In [6]:
def construct_adj(args, kg, entity_num):
    """Sample a fixed number of KG neighbors for every entity.

    Args:
        args: settings object; reads ``args.neighbor_sample_size``.
        kg: dict mapping an entity id to a list of ``(neighbor, relation)`` pairs.
        entity_num: accepted for interface compatibility; not used here.

    Returns:
        ``(adj_entity, adj_relation)``: two int64 arrays of shape
        ``[max_entity_id + 1, neighbor_sample_size]``. Row ``e`` of
        ``adj_entity`` holds the sampled neighbor entities of ``e`` and the
        same row of ``adj_relation`` holds the matching relations. Rows for
        ids missing from ``kg`` stay all zero.
    """
    print('constructing adjacency matrix ...')
    sample_size = args.neighbor_sample_size
    n_rows = max(list(kg.keys())) + 1
    # Row e of each table describes entity e.
    adj_entity = np.zeros([n_rows, sample_size], dtype=np.int64)
    adj_relation = np.zeros([n_rows, sample_size], dtype=np.int64)

    for entity in range(n_rows):
        if entity in kg:
            neighbors = kg[entity]
            count = len(neighbors)
            # Sample without replacement when there are enough neighbors,
            # otherwise repeat neighbors to fill the row.
            picks = np.random.choice(
                list(range(count)),
                size=sample_size,
                replace=count < sample_size,
            )
            chosen = [neighbors[p] for p in picks]
            adj_entity[entity] = np.array([pair[0] for pair in chosen])
            adj_relation[entity] = np.array([pair[1] for pair in chosen])
    return adj_entity, adj_relation

In [10]:
def load_kg(args):
    """Load the knowledge-graph triples and build the sampled adjacency tables.

    Reads ``./data/<dataset>/kg_final.npy`` when it exists; otherwise parses
    ``kg_final.txt`` as int64 and writes the ``.npy`` cache next to it.

    Args:
        args: settings object; reads ``args.dataset`` here and is forwarded
            to ``construct_adj``.

    Returns:
        ``(n_entity, n_relation, adj_entity, adj_relation)`` where
        ``n_entity`` counts the distinct values in columns 0 and 2 of the
        triples, ``n_relation`` the distinct values in column 1, and the two
        arrays are the output of ``construct_adj``.
    """
    kg_file = './data/' + args.dataset + '/kg_final'
    cache_path = kg_file + '.npy'
    if not os.path.exists(cache_path):
        # First run: parse the text file and cache the binary form.
        kg_np = np.loadtxt(kg_file + '.txt', dtype=np.int64)
        np.save(cache_path, kg_np)
    else:
        kg_np = np.load(cache_path)

    # Entities can appear as head (column 0) or tail (column 2).
    entities = set(kg_np[:, 0])
    entities |= set(kg_np[:, 2])
    relations = set(kg_np[:, 1])
    n_entity, n_relation = len(entities), len(relations)
    print(n_entity, n_relation)

    adj_entity, adj_relation = construct_adj(args, construct_kg(kg_np), n_entity)
    return n_entity, n_relation, adj_entity, adj_relation

In [8]:
def load_data(args):
    """Load the rating data and the knowledge graph in one call.

    Args:
        args: settings object forwarded to ``load_rating`` and ``load_kg``.

    Returns:
        A 13-tuple ``(n_user, n_item, n_entity, n_relation, train_data,
        eval_data, test_data, adj_entity, adj_relation, adj_item, n_classes,
        ratings_np, adj_user)`` combining the results of both loaders.
    """
    # Rating side: user split plus user-item / user-user adjacency.
    (n_user, n_item,
     train_data, eval_data, test_data,
     adj_item, n_classes, ratings_np, adj_user) = load_rating(args)

    # Knowledge-graph side: entity / relation adjacency.
    (n_entity, n_relation,
     adj_entity, adj_relation) = load_kg(args)

    print('data loaded.')

    return (
        n_user, n_item, n_entity, n_relation,
        train_data, eval_data, test_data,
        adj_entity, adj_relation, adj_item,
        n_classes, ratings_np, adj_user,
    )

In [11]:
# Run the whole loading pipeline with the mock arguments defined earlier in
# the notebook; `ret` keeps the 13-element tuple returned by load_data.
ret = load_data(args)

reading rating file ...
splitting dataset ...
37473 24
constructing knowledge graph ...
constructing adjacency matrix ...
data loaded.
