In [638]:
from collections import defaultdict

import dgl
import numpy as np
import scipy.sparse as sp
import torch as th



In [640]:
# Relative root directory for dataset folders; the ACM files are read from
# the "acm/" sub-folder beneath it.
data_folder = "data/"

In [641]:
# Load the raw ACM files from disk and convert everything to torch tensors.
path = data_folder + "acm/"

# Node counts per type, in the order [paper, author, subject].
type_num = [4019, 7167, 60]
# Split identifiers; they are the suffixes of the train/val/test index files.
# (The upstream source code performs the [20, 40, 60] split at this step.)
ratio = [20, 40, 60]

label = th.LongTensor(np.load(path + "labels.npy"))

# SECURITY NOTE: allow_pickle=True unpickles arbitrary objects and can execute
# code embedded in the file. Only load neighbour files from a trusted source.
# Each file yields one array of neighbour ids per entry; every entry becomes
# its own LongTensor.
nei_a = [th.LongTensor(ids) for ids in np.load(path + "nei_a.npy", allow_pickle=True)]
nei_s = [th.LongTensor(ids) for ids in np.load(path + "nei_s.npy", allow_pickle=True)]

# The upstream source code applies preprocess_features at this step; here the
# features are only densified.
# .toarray() returns a plain ndarray (instead of the legacy np.matrix that
# .todense() produces) holding the same values.
feat_p = th.FloatTensor(sp.load_npz(path + "p_feat.npz").toarray())
# One-hot (identity) features. Building them with th.eye avoids materialising
# an n x n float64 dense matrix through scipy first.
feat_a = th.eye(type_num[1], dtype=th.float32)
feat_s = th.eye(type_num[2], dtype=th.float32)

# Matrices stored alongside the features, converted to sparse float tensors.
pap = th.FloatTensor(sp.load_npz(path + "pap.npz").toarray()).to_sparse()
psp = th.FloatTensor(sp.load_npz(path + "psp.npz").toarray()).to_sparse()
pos = th.FloatTensor(sp.load_npz(path + "pos.npz").toarray()).to_sparse()

# One index tensor per entry of `ratio`, in the same order as `ratio`.
train = [th.LongTensor(np.load(f"{path}train_{r}.npy")) for r in ratio]
test = [th.LongTensor(np.load(f"{path}test_{r}.npy")) for r in ratio]
val = [th.LongTensor(np.load(f"{path}val_{r}.npy")) for r in ratio]

In [642]:
# Build edge-index tensors from the per-node neighbour lists.
# For every neighbour list two (2, num_edges) tensors are appended to `links`:
# first the forward direction (row 0 = position in the list, row 1 = neighbour
# id), then the reversed direction. Resulting order:
#   links[0], links[1] -> nei_a forward / reverse
#   links[2], links[3] -> nei_s forward / reverse
neis = [nei_a, nei_s]
links = []
for nei in neis:
    # All neighbour ids laid out back to back.
    dst_ids = th.cat(nei)
    # Number of neighbours of each source node, used to repeat its index.
    degrees = th.tensor([len(dst_array) for dst_array in nei], dtype=th.long)
    # Source id i is repeated degrees[i] times so it lines up with dst_ids.
    src_ids = th.repeat_interleave(th.arange(len(nei)), degrees)
    links.append(th.vstack([src_ids, dst_ids]))
    links.append(th.vstack([dst_ids, src_ids]))

In [643]:
# Assemble the DGL heterograph, attach features / labels / split masks to it
# and save it to disk.

# Edge lists per canonical edge type, as (source ids, destination ids).
data_dict = {
    ("paper", "paper-author", "author"): (links[0][0], links[0][1]),
    ("author", "author-paper", "paper"): (links[1][0], links[1][1]),
    ("paper", "paper-subject", "subject"): (links[2][0], links[2][1]),
    ("subject", "subject-paper", "paper"): (links[3][0], links[3][1]),
}


# Meta-paths expressed as sequences of canonical edge types.
meta_paths_dict = {
    "PAP": [("paper", "paper-author", "author"), ("author", "author-paper", "paper")],
    "PSP": [
        ("paper", "paper-subject", "subject"),
        ("subject", "subject-paper", "paper"),
    ],
}

# Pass the node counts explicitly. Without them DGL infers each count from the
# largest id seen in the edges, which would undercount (and make the feature
# assignments below fail) if the highest-id nodes of a type had no edges.
num_nodes_dict = {
    "paper": feat_p.shape[0],
    "author": feat_a.shape[0],
    "subject": feat_s.shape[0],
}
hg = dgl.heterograph(data_dict, num_nodes_dict=num_nodes_dict)

hg.nodes["paper"].data["h"] = feat_p
hg.nodes["paper"].data["label"] = label

hg.nodes["author"].data["h"] = feat_a
hg.nodes["subject"].data["h"] = feat_s

# Boolean masks over paper nodes, stored as "<split>_<ratio>", e.g. "train_20".
num_papers = hg.num_nodes("paper")
splits = {"train": train, "val": val, "test": test}
for i, r in enumerate(ratio):
    for split_name, split_ids in splits.items():
        mask = th.zeros(num_papers, dtype=th.bool)
        mask[split_ids[i]] = True
        hg.nodes["paper"].data["%s_%d" % (split_name, r)] = mask

dgl.save_graphs('./data/acm4hgmae.bin', hg)