In [1]:
import os
import json
import random
from tqdm import tqdm
import os.path as osp

In [2]:
import torch
from torch.utils.data import DataLoader
import numpy as np
import dgl
from dgl.data import DGLDataset
from dgl import save_graphs, load_graphs
from dgl.data.utils import makedirs, save_info, load_info

Using backend: pytorch


In [3]:
from multimodal.datasets import KwaiTagRecoDataset
from multimodal.core import (mean_average_precision, mean_class_accuracy,
                    mmit_mean_average_precision, top_k_accuracy,
                    top_k_recall, top_k_precision)

In [4]:
class KwaiNodeCollator:
    """Collate a batch of video node IDs into a pair graph plus sampled MFGs.

    Intended to be used as ``collate_fn=collator.collate`` on a
    ``torch.utils.data.DataLoader`` that iterates over video node IDs.

    Args:
        g: The full heterogeneous graph. It must expose ``idtype``,
            ``in_edges`` and ``edge_subgraph``.
        block_sampler: Object exposing ``sample_blocks(g, seed_nodes)`` and
            returning a list of blocks (MFGs).
        test_mode: When truthy, ``collate`` builds inference batches from the
            ``WhetherHasVideo`` edges; otherwise it builds training batches
            from the ``HasVideo`` / ``NotHasVideo`` edges.
    """

    def __init__(self, g, block_sampler, test_mode):
        self.g = g
        self.block_sampler = block_sampler
        self.test_mode = test_mode
        # Canonical edge types (src type, relation, dst type) scored per batch.
        self.pos_etype = ('tag', 'HasVideo', 'video')
        self.neg_etype = ('tag', 'NotHasVideo', 'video')
        self.infer_etype = ('tag', 'WhetherHasVideo', 'video')

    def collate(self, video_nids):
        """Dispatch to the train or inference collation based on ``test_mode``."""
        if self.test_mode:
            return self.collate_infer(video_nids)
        return self.collate_train(video_nids)

    def collate_train(self, video_nids):
        """Build a training batch from the positive and negative tag->video edges.

        Args:
            video_nids: Video node IDs of the batch.

        Returns:
            dict with keys ``'input_nodes'``, ``'pair_graph'`` and ``'mfgs'``
            (in that order).
        """
        return self._build_batch(video_nids, (self.pos_etype, self.neg_etype))

    def collate_infer(self, video_nids):
        """Build an inference batch from the not-yet-labelled tag->video edges.

        Args:
            video_nids: Video node IDs of the batch.

        Returns:
            dict with keys ``'input_nodes'``, ``'pair_graph'`` and ``'mfgs'``
            (in that order).
        """
        return self._build_batch(video_nids, (self.infer_etype,))

    def _build_batch(self, video_nids, etypes):
        """Shared pipeline: pair graph over ``etypes``, then MFGs seeded by it."""
        video_ids = torch.as_tensor(video_nids, dtype=self.g.idtype)

        # Collect, per edge type, the IDs of every edge pointing at a batch video.
        pair_eids = {
            etype: self.g.in_edges(video_ids, form='eid', etype=etype)
            for etype in etypes
        }
        pair_graph = self.g.edge_subgraph(pair_eids)
        # no need to apply transform.compact_graphs() since there can't be isolated nodes

        # The copied 'feat' entry is dropped from the pair graph; callers are
        # expected to look features up via 'input_nodes' instead (presumably to
        # avoid carrying them twice -- confirm against the training loop).
        pair_graph.ndata.pop('feat')

        # Sample MFGs, seeded with the original IDs of the pair-graph nodes.
        seed_nodes = pair_graph.ndata[dgl.NID]
        blocks = self.block_sampler.sample_blocks(self.g, seed_nodes)

        # Key order matters to callers that unpack ``batch.values()``.
        return {
            'input_nodes': blocks[0].srcdata[dgl.NID],
            'pair_graph': pair_graph,
            'mfgs': blocks,
        }

In [5]:
# Disabled debugging aid: maps one video of the sampled pair graph back to its
# original node ID and prints the video's ID and the names of its tags.
# NOTE(review): the snippet reads `labels`, which no visible cell defines, and
# `pair_graph`, which is only created further down -- confirm both before
# re-enabling it.
#
# # Trace real video and tags
# new_video_nid = 8
# new_tag_nids = torch.where(labels[new_video_nid]!=0)[0]
# video_nid = pair_graph.ndata[dgl.NID]['video'][new_video_nid].item()
# tag_nids = pair_graph.ndata[dgl.NID]['tag'][new_tag_nids].tolist()
# print(video_nid2pid[video_nid])
# for tag_nid in tag_nids:
#     print(tag_nid2tag[tag_nid])
#
# original_tagid = pair_graph.ndata[dgl.NID]['tag']
# original_videoid = pair_graph.ndata[dgl.NID]['video']
pass

# Load data

In [6]:
# --- Experiment configuration -------------------------------------------------
# Which slice of the dataset to load.
vertical = 'cosmetic'
split = 'train'

# Root directory of the dataset. Defaults to the original location; set the
# KWAI_DATASET_ROOT environment variable to run the notebook on another machine
# without editing this cell.
dataset_root = os.environ.get('KWAI_DATASET_ROOT',
                              '/home/wangxiao13/annotation/data/kwai')
# Embedding directory names handed to the dataset loader.
video_emb_dir = 'video_feat'
tag_emb_dir = 'tag_feat_bert'

# Number of GNN layers; the fanout lists get one entry per layer.
num_gnn_layers = 2

# Cached lookup tables (node ID -> tag / video ID) for this vertical and split.
tag_nid2tag_path = osp.join(dataset_root, f'cache/{vertical}_{split}_tag_nid2tag.pkl')
video_nid2pid_path = osp.join(dataset_root, f'cache/{vertical}_{split}_video_nid2pid.pkl')

In [22]:
# Per-layer neighbour fanouts, keyed by canonical edge type. Only the
# SubTopic and HasTag relations get a non-zero fanout (8); the remaining
# relations are set to 0.
_train_layer_fanout = {
    ('tag', 'HasVideo', 'video'): 0,
    ('tag', 'NotHasVideo', 'video'): 0,
    ('video', 'FollowedBy', 'video'): 0,
    ('tag', 'SubTopic', 'tag'): 8,
    ('video', 'HasTag', 'tag'): 8,
}
# The inference graph additionally carries the WhetherHasVideo relation.
_infer_layer_fanout = {
    ('tag', 'WhetherHasVideo', 'video'): 0,
    **_train_layer_fanout,
}

# "base" is repeated for every layer but the last; "final" is the last layer.
# Each list holds its own copy of the template so the entries stay independent.
train_fanouts_base = [dict(_train_layer_fanout)]
train_fanouts_final = [dict(_train_layer_fanout)]
infer_fanouts_base = [dict(_infer_layer_fanout)]
infer_fanouts_final = [dict(_infer_layer_fanout)]

_num_base_layers = num_gnn_layers - 1
train_fanouts = train_fanouts_base * _num_base_layers + train_fanouts_final
infer_fanouts = infer_fanouts_base * _num_base_layers + infer_fanouts_final

In [23]:
# Instantiate the dataset for the configured vertical and split.
# force_reload=False presumably lets the loader reuse an already processed
# copy when one exists -- confirm in KwaiTagRecoDataset.
dataset_kwargs = dict(
    vertical=vertical,
    split=split,
    dataset_root=dataset_root,
    video_emb_dir=video_emb_dir,
    tag_emb_dir=tag_emb_dir,
    force_reload=False,
)
dataset = KwaiTagRecoDataset(**dataset_kwargs)

Loading processed dataset ...


In [24]:
# Lookup tables saved next to the dataset (node ID -> tag, node ID -> video ID),
# loaded in the same order as before: tags first, then videos.
tag_nid2tag, video_nid2pid = (
    load_info(info_path)
    for info_path in (tag_nid2tag_path, video_nid2pid_path)
)
# The heterogeneous graph that the collator and sampler operate on.
g = dataset.g

In [25]:
# Display the graph summary (node and edge counts per type, metagraph).
g

Graph(num_nodes={'tag': 3114, 'video': 77539},
      num_edges={('tag', 'HasVideo', 'video'): 308665, ('tag', 'NotHasVideo', 'video'): 241147781, ('tag', 'SubTopic', 'tag'): 6955, ('video', 'FollowedBy', 'video'): 6241272, ('video', 'HasTag', 'tag'): 308665},
      metagraph=[('tag', 'video', 'HasVideo'), ('tag', 'video', 'NotHasVideo'), ('tag', 'tag', 'SubTopic'), ('video', 'video', 'FollowedBy'), ('video', 'tag', 'HasTag')])

In [26]:
# Decide which video node IDs the loader iterates over.
# If the graph carries the inference relation, the last `test_videos` video IDs
# are iterated; the count is derived as (#inference edges) // (#tags).
# Otherwise every video is iterated, starting from ID 0.
infer_etype = ('tag', 'WhetherHasVideo', 'video')
total_videos = g.num_nodes('video')
if infer_etype in g.canonical_etypes:
    test_videos = g.num_edges(etype=infer_etype) // g.num_nodes('tag')
    train_videos = total_videos - test_videos
else:
    # No inference edges in this graph. `train_videos` doubles as the first
    # video ID handed to the loader below, hence 0 rather than `total_videos`.
    test_videos = 0
    train_videos = 0

# Any split other than 'train' is treated as inference.
test_mode = split != 'train'
fanouts = infer_fanouts if test_mode else train_fanouts
block_sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts)
collator = KwaiNodeCollator(g, block_sampler, test_mode=test_mode)

# NOTE: despite its name, this loader also serves inference when test_mode is set.
train_dataloader = DataLoader(
    list(range(train_videos, total_videos)),
    batch_size=1024,
    collate_fn=collator.collate,
    shuffle=not test_mode,  # keep a stable order at inference time
    drop_last=False,
    num_workers=0
)

In [27]:
# Pull one batch to inspect what the collator produces.
example_minibatch = next(iter(train_dataloader))
# Index by key instead of unpacking `.values()`, so this cell does not depend
# on the order in which the collator happens to insert its dictionary keys.
input_nodes = example_minibatch['input_nodes']
pair_graph = example_minibatch['pair_graph']
mfgs = example_minibatch['mfgs']

In [29]:
# Report how many source/destination nodes of each type every MFG layer holds.
for layer_no, block in enumerate(mfgs, start=1):
    src_counts = (block.num_src_nodes('video'), block.num_src_nodes('tag'))
    dst_counts = (block.num_dst_nodes('video'), block.num_dst_nodes('tag'))
    print('Layer %d:' % layer_no)
    print('    SRC: %d videos, %d tags' % src_counts)
    print('    DST: %d videos, %d tags' % dst_counts)

Layer 1:
    SRC: 24798 videos, 3114 tags
    DST: 17945 videos, 3114 tags
Layer 2:
    SRC: 17945 videos, 3114 tags
    DST: 1024 videos, 3114 tags


In [21]:
# NOTE(review): this string appears to be MFG sizes recorded from an earlier
# run in which the FollowedBy, SubTopic and HasTag relations were all sampled,
# kept for comparison with the printout above -- confirm with the author.
"""
FollowedBy, SubTopic, HasTag
Layer 1:
    SRC: 37371 videos, 3114 tags
    DST: 20686 videos, 3114 tags
Layer 2:
    SRC: 20686 videos, 3114 tags
    DST: 1024 videos, 3114 tags
"""
pass