In [2]:
import dgl
import torch
import torch.nn as nn
from dgl.nn import MetaPath2Vec
from openhgnn.dataset.NodeClassificationDataset import OHGB_NodeClassification
from torch.optim import SparseAdam
from torch.utils.data import DataLoader
from tqdm import tqdm

# Load the OpenHGNN "ohgbn-acm" node-classification benchmark; raw files are
# looked up / stored under ./dataset and no logger is attached.
acm = OHGB_NodeClassification(
    dataset_name="ohgbn-acm", raw_dir="./dataset", logger=None
)

  from .autonotebook import tqdm as notebook_tqdm


Extracting file to ./openhgnn/dataset\ohgbn-acm
Done saving data into cached files.


In [3]:
# Heterogeneous graph of the ACM dataset.
hg = acm.g
# Mapping from metapath name (e.g. "PAP") to its list of canonical edge types.
meta_paths_dict = acm.meta_paths_dict

## test Mp2Vec

In [187]:
# --- MetaPath2Vec hyper-parameters (forwarded to train_mp2vec) ---
# Negative-sampling size handed to MetaPath2Vec.
mp2vec_negative_size = 5
# Embedding dimension of the learned node features.
mp2vec_feat_dim = 128
# Window size handed to MetaPath2Vec.
mp2vec_window_size = 5
# Learning rate of the SparseAdam optimizer.
mp2vec_train_lr = 0.001
# Number of start nodes per training batch.
mp2vec_batch_size = 256
# Number of passes over the start-node list.
mp2vec_train_epoch = 20
# Target walk length; only used to decide how often the metapath is repeated.
mp2vec_rw_walk_length = 10
# How many times each start node appears in the training set.
mp2vec_rw_walks_per_node = 3
# Prefer the GPU, but fall back to CPU so the notebook still runs end to end
# on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"


def train_mp2vec(
    hg,
    category,
    metapaths_dict,
    mp2vec_feat_dim,
    mp2vec_window_size,
    mp2vec_negative_size,
    mp2vec_rw_walk_length,
    mp2vec_rw_walks_per_node,
    mp2vec_train_lr,
    mp2vec_train_epoch,
    mp2vec_batch_size,
):
    """Train a DGL ``MetaPath2Vec`` model and return embeddings for one node type.

    All metapaths in ``metapaths_dict`` are concatenated (in dict order) into a
    single edge-type sequence, which is then repeated so that its length
    roughly follows ``mp2vec_rw_walk_length``. The model is trained with
    ``SparseAdam`` on the module-level ``device``.

    Args:
        hg: Heterogeneous graph; it is moved to ``device`` before training.
        category: Node type whose embeddings are returned.
        metapaths_dict: Mapping of metapath name to a list of canonical edge
            types ``(src_type, edge_type, dst_type)``. Each metapath must start
            and end on the same node type.
        mp2vec_feat_dim: Embedding dimension passed to ``MetaPath2Vec``.
        mp2vec_window_size: Window size passed to ``MetaPath2Vec``.
        mp2vec_negative_size: Negative-sampling size passed to ``MetaPath2Vec``.
        mp2vec_rw_walk_length: Target walk length; only used to compute how many
            times the concatenated metapath is repeated.
        mp2vec_rw_walks_per_node: Number of times every node id of ``category``
            is listed as a start node in the training set.
        mp2vec_train_lr: Learning rate for ``SparseAdam``.
        mp2vec_train_epoch: Number of passes over the start-node list.
        mp2vec_batch_size: Number of start nodes per batch.

    Returns:
        A detached tensor holding one embedding row per ``category`` node.

    Raises:
        ValueError: If ``metapaths_dict`` is empty.
        AssertionError: If a metapath does not start and end on the same node
            type.
    """
    if not metapaths_dict:
        raise ValueError("metapaths_dict must contain at least one metapath.")

    hg = hg.to(device)
    num_nodes = hg.num_nodes(category)

    # Chain every metapath of the *argument* (not the notebook-global dataset)
    # into one long edge-type sequence for the metapath2vec model.
    Mp4Mp2Vec = []
    for mp in metapaths_dict.values():
        assert (
            mp[0][0] == mp[-1][-1]
        ), "The start node type and the end one in metapath should be the same."
        Mp4Mp2Vec += mp

    # Repeat the chained metapath so the walk length approaches the target;
    # always keep at least one copy.
    repeats = max(mp2vec_rw_walk_length // (len(Mp4Mp2Vec) + 1), 1)
    Mp4Mp2Vec *= repeats

    # Node-type sequence visited by the walk: the source type of every edge
    # plus the destination type of the final edge.
    mp_nodes_seq = [etype[0] for etype in Mp4Mp2Vec]
    mp_nodes_seq.append(Mp4Mp2Vec[-1][-1])
    assert (
        mp_nodes_seq[0] == mp_nodes_seq[-1]
    ), "The start node type and the end one in metapath should be the same."
    print("Metapath for training mp2vec models:", mp_nodes_seq)

    m2v_model = MetaPath2Vec(
        hg, Mp4Mp2Vec, mp2vec_window_size, mp2vec_feat_dim, mp2vec_negative_size
    ).to(device)
    m2v_model.train()
    dataloader = DataLoader(
        list(range(num_nodes)) * mp2vec_rw_walks_per_node,
        batch_size=mp2vec_batch_size,
        shuffle=True,
        collate_fn=m2v_model.sample,
    )
    optimizer = SparseAdam(m2v_model.parameters(), lr=mp2vec_train_lr)

    # Pre-bind the batch names so the cleanup below is valid even when the
    # loop body never runs (zero epochs or an empty loader).
    pos_u = pos_v = neg_v = loss = None
    for _ in tqdm(range(mp2vec_train_epoch)):
        for pos_u, pos_v, neg_v in dataloader:
            loss = m2v_model(pos_u.to(device), pos_v.to(device), neg_v.to(device))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # get the embeddings
    nids = torch.LongTensor(m2v_model.local_to_global_nid[category]).to(device)
    emb = m2v_model.node_embed(nids).detach()

    # Drop references to the model and the last batch before releasing cached
    # GPU memory.
    del m2v_model, optimizer, nids, pos_u, pos_v, neg_v, loss
    # Compare the device *type* so "cuda:0" or a torch.device also match.
    if torch.device(device).type == "cuda":
        torch.cuda.empty_cache()
    return emb

In [190]:
# Learn metapath2vec features for the "paper" nodes; keyword arguments follow
# the order of the train_mp2vec signature.
mp2vec_feat = train_mp2vec(
    hg,
    category="paper",
    metapaths_dict=meta_paths_dict,
    mp2vec_feat_dim=mp2vec_feat_dim,
    mp2vec_window_size=mp2vec_window_size,
    mp2vec_negative_size=mp2vec_negative_size,
    mp2vec_rw_walk_length=mp2vec_rw_walk_length,
    mp2vec_rw_walks_per_node=mp2vec_rw_walks_per_node,
    mp2vec_train_lr=mp2vec_train_lr,
    mp2vec_train_epoch=mp2vec_train_epoch,
    mp2vec_batch_size=mp2vec_batch_size,
)

Metapath for training mp2vec models: ['paper', 'author', 'paper', 'subject', 'paper']


100%|██████████| 3025/3025 [00:01<00:00, 1623.98it/s]
100%|██████████| 20/20 [00:06<00:00,  3.00it/s]


In [182]:
# Sanity check: expect one row per "paper" node and mp2vec_feat_dim columns.
mp2vec_feat.shape

torch.Size([3025, 128])

In [183]:
# Eyeball the learned embeddings (torch abbreviates the printed tensor).
mp2vec_feat

tensor([[-0.1002,  0.1190,  0.0887,  ...,  0.0571,  0.1262, -0.0924],
        [-0.0823,  0.1349,  0.0495,  ...,  0.0565,  0.1109, -0.0930],
        [-0.1024,  0.0948,  0.0914,  ...,  0.0962,  0.0945, -0.0886],
        ...,
        [-0.1247,  0.1205,  0.1371,  ...,  0.0969,  0.0379, -0.0608],
        [-0.1033,  0.0826,  0.1067,  ...,  0.0962,  0.1356, -0.0863],
        [-0.3054, -0.0359,  0.2993,  ...,  0.2854,  0.3617, -0.1911]],
       device='cuda:0')

In [None]:
# Length of a single random walk.
# NOTE(review): rw_length / rw_walks are not referenced by any visible cell —
# confirm they are still needed.
rw_length = 20
# Number of random walks started from each node.
rw_walks = 10

In [5]:
# --- MetaPath2Vec hyper-parameters for the script-level experiments below ---
# Negative-sampling size handed to MetaPath2Vec.
mp2vec_negative_size = 5
# Embedding dimension of the learned node features.
mp2vec_feat_dim = 128
# Window size handed to MetaPath2Vec.
mp2vec_window_size = 3
# Learning rate of the SparseAdam optimizer.
mp2vec_train_lr = 0.001
# Number of start nodes per training batch.
mp2vec_batch_size = 256
# Number of passes over the start-node list.
mp2vec_train_epoch = 20
# Target random-walk length.
mp2vec_rw_walk_length = 10
# How many times each start node appears in the training set.
mp2vec_rw_walks_per_node = 3
# Prefer the GPU, but fall back to CPU so the notebook still runs end to end
# on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Number of "paper" nodes; used below to build the list of walk start nodes.
num_nodes = hg.num_nodes("paper")

In [316]:
# Build a MetaPath2Vec model over the single "PAP" metapath, using the
# mp2vec_* hyper-parameters defined in the configuration cell (the former
# m2v_* names are not defined anywhere in this notebook and fail on a fresh
# kernel).
m2v_model = MetaPath2Vec(
    hg,
    meta_paths_dict["PAP"],
    mp2vec_window_size,
    mp2vec_feat_dim,
    mp2vec_negative_size,
)

100%|██████████| 3025/3025 [00:01<00:00, 2831.51it/s]


In [200]:
from dgl.sampling import random_walk

# Start one walk from every "paper" node and follow PAP, then PSP.
# The displayed result is a pair: node-id traces and per-step node-type ids.
pap_then_psp = meta_paths_dict["PAP"] + meta_paths_dict["PSP"]
paper_start_nids = torch.arange(hg.num_nodes("paper"))
random_walk(hg, paper_start_nids, metapath=pap_then_psp)

(tensor([[   0,    0,  734,    2, 1775],
         [   1,    3,    1,    1,  223],
         [   2,    9,  121,    6, 1650],
         ...,
         [3022,  725, 3022,    5, 2923],
         [3023, 5906, 3023,    2, 1449],
         [3024, 5910, 3024,    2,  346]]),
 tensor([1, 0, 1, 2, 1]))

In [9]:
# Flatten every metapath (in dict order) into one edge-type sequence.
Mp4Mp2V = [etype for mp in meta_paths_dict.values() for etype in mp]

# Model: built on the chained metapath, moved to the target device, and put
# into training mode.
m2v_model = MetaPath2Vec(
    hg,
    Mp4Mp2V,
    mp2vec_window_size,
    mp2vec_feat_dim,
    mp2vec_negative_size,
)
m2v_model = m2v_model.to(device)
m2v_model.train()

# Every paper node id is listed mp2vec_rw_walks_per_node times; the model's own
# sampler turns a batch of start nodes into (pos_u, pos_v, neg_v) tensors.
walk_start_nids = list(range(num_nodes)) * mp2vec_rw_walks_per_node
dataloader = DataLoader(
    walk_start_nids,
    batch_size=mp2vec_batch_size,
    shuffle=True,
    collate_fn=m2v_model.sample,
)

optimizer = SparseAdam(m2v_model.parameters(), lr=mp2vec_train_lr)
for _ in tqdm(range(mp2vec_train_epoch)):
    for pos_u, pos_v, neg_v in dataloader:
        # Clear stale gradients, then run forward / backward / update.
        optimizer.zero_grad()
        loss = m2v_model(pos_u.to(device), pos_v.to(device), neg_v.to(device))
        loss.backward()
        optimizer.step()

100%|██████████| 3025/3025 [00:01<00:00, 2832.75it/s]
100%|██████████| 20/20 [00:06<00:00,  3.27it/s]


## openhgnn random_walk_sampler

In [109]:
from openhgnn.sampler import random_walk_sampler

In [168]:
# OpenHGNN random-walk sampler over the "PAP" metapath with two walks per node.
# Window and negative sizes come from the mp2vec_* configuration constants
# (the former m2v_* names are not defined anywhere in this notebook).
mp2vec_sampler = random_walk_sampler.RandomWalkSampler(
    g=hg,
    metapath=meta_paths_dict["PAP"],
    rw_walks=2,
    window_size=mp2vec_window_size,
    neg_size=mp2vec_negative_size,
)

  self.discards = np.sqrt(t / f) + (t / f)


In [159]:
# Batch the sampler's items with its own collate function. The batch size
# comes from the mp2vec_* configuration (m2v_batch_size is not defined in this
# notebook and fails on a fresh kernel).
dataloader = DataLoader(
    mp2vec_sampler,
    batch_size=mp2vec_batch_size,
    shuffle=True,
    collate_fn=mp2vec_sampler.collate,
)

In [160]:
# Iterate over the sampler once; batches whose first component holds at most
# one element are skipped, so pos_u / pos_v / neg_v end up holding the last
# batch that passed the check.
for i, sample_batched in enumerate(tqdm(dataloader)):
    if len(sample_batched[0]) <= 1:
        continue
    pos_u, pos_v, neg_v = sample_batched[0], sample_batched[1], sample_batched[2]

100%|██████████| 12/12 [00:01<00:00, 10.67it/s]


In [185]:
# Show the canonical (src_type, edge_type, dst_type) triples that make up "PAP".
pap_etypes = meta_paths_dict["PAP"]
for canonical_etype in pap_etypes:
    print(canonical_etype)

('paper', 'paper-author', 'author')
('author', 'author-paper', 'paper')


In [219]:
# Translate each edge type of "PAP" into its integer edge-type id in hg.
metapath = list(map(hg.get_etype_id, meta_paths_dict["PAP"]))