In [95]:
import os 
import sys 
import pickle 
import pandas as pd
import numpy as np 
from pandas.api.types import is_numeric_dtype, is_categorical_dtype, is_categorical
import torch
import pandas as pd 
import dgl
import dgl.function as fn
from sklearn.model_selection import train_test_split

Python 3.9.7


In [96]:
import numpy as np
import dgl
import torch
from torch.utils.data import IterableDataset, DataLoader

In [97]:
directory = '/home/mila/r/rebecca.salganik/'

# os.listdir(directory)

In [98]:
ns_music_all_data = pickle.load(open(directory+'ns_music_all_data_ming.p', 'rb'))

In [99]:
df_playlists = ns_music_all_data['df_playlist']
df_playlists_info = ns_music_all_data['df_playlist_info']
df_tracks = ns_music_all_data['df_track']

Unnamed: 0,pid,tid
0,0,0
1,123,0
2,218,0
3,342,0


# 1. Build Playlist-Track graph

In [100]:
def _series_to_tensor(series):
    if is_categorical(series):
        return torch.LongTensor(series.cat.codes.values.astype('int64'))
    else:       # numeric
        return torch.FloatTensor(series.values)

class PandasGraphBuilder(object):
    def __init__(self):
        self.entity_tables = {}
        self.relation_tables = {}

        self.entity_pk_to_name = {}     # mapping from primary key name to entity name
        self.entity_pk = {}             # mapping from entity name to primary key
        self.entity_key_map = {}        # mapping from entity names to primary key values
        self.num_nodes_per_type = {}
        self.edges_per_relation = {}
        self.relation_name_to_etype = {}
        self.relation_src_key = {}      # mapping from relation name to source key
        self.relation_dst_key = {}      # mapping from relation name to destination key

    def add_entities(self, entity_table, primary_key, name):
        entities = entity_table[primary_key].astype('category')
        if not (entities.value_counts() == 1).all():
            raise ValueError('Different entity with the same primary key detected.')
        # preserve the category order in the original entity table
        entities = entities.cat.reorder_categories(entity_table[primary_key].values)

        self.entity_pk_to_name[primary_key] = name
        self.entity_pk[name] = primary_key
        self.num_nodes_per_type[name] = entity_table.shape[0]
        self.entity_key_map[name] = entities
        self.entity_tables[name] = entity_table

    def add_binary_relations(self, relation_table, source_key, destination_key, name):
        src = relation_table[source_key].astype('category')
        src = src.cat.set_categories(
            self.entity_key_map[self.entity_pk_to_name[source_key]].cat.categories)
        dst = relation_table[destination_key].astype('category')
        dst = dst.cat.set_categories(
            self.entity_key_map[self.entity_pk_to_name[destination_key]].cat.categories)
        if src.isnull().any():
            raise ValueError(
                'Some source entities in relation %s do not exist in entity %s.' %
                (name, source_key))
        if dst.isnull().any():
            raise ValueError(
                'Some destination entities in relation %s do not exist in entity %s.' %
                (name, destination_key))

        srctype = self.entity_pk_to_name[source_key]
        dsttype = self.entity_pk_to_name[destination_key]
        etype = (srctype, name, dsttype)
        self.relation_name_to_etype[name] = etype
        self.edges_per_relation[etype] = (src.cat.codes.values.astype('int64'), dst.cat.codes.values.astype('int64'))
        self.relation_tables[name] = relation_table
        self.relation_src_key[name] = source_key
        self.relation_dst_key[name] = destination_key

    def build(self):
        # Create heterograph
        graph = dgl.heterograph(self.edges_per_relation, self.num_nodes_per_type)
        return graph


Build bipartite heterogenous graph:

- track is identified by tid
- playlist is identified by pid 
- edge contains : play list contains track 
- edge contained_by: track is contained by playlist


ids of nodes in graph and rows in dataset are in the same order:
- 1st track node is track with tid =1 
- 1st playlist node is play list with pid = 1

In [101]:
graph_builder = PandasGraphBuilder()

In [102]:
df_playlists_info = df_playlists_info.sort_values('pid').reset_index(drop=True)

In [10]:
df_playlists_info.head(3)

Unnamed: 0,pid,name
0,0,Throwbacks
1,1,Awesome Playlist
2,2,korean


In [103]:
graph_builder = PandasGraphBuilder()
graph_builder.add_entities(df_tracks, 'tid', 'track')
graph_builder.add_entities(df_playlists_info, 'pid', 'playlist')
graph_builder.add_binary_relations(df_playlists, 'pid', 'tid', 'contains')
graph_builder.add_binary_relations(df_playlists, 'tid', 'pid', 'contained_by')


In [104]:
g = graph_builder.build()

Load features to graph
- music features are stored as long tensors (categorical, to be embedded)
- genre, album_img_emb, album_text_emb are stored as numerical features 
- track id, play list id are also included, can be embedded as well

In [105]:
for key in ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']:
    
    g.nodes['track'].data[key] = torch.LongTensor(df_tracks[key].values)
    

In [106]:
g.nodes['track'].data['genre'] = torch.tensor(np.asarray(list(df_tracks['genre'].values))).float()


In [15]:
import psutil
print('The CPU usage is: ', psutil.cpu_percent(4))
print('RAM memory % used:', psutil.virtual_memory()[2])

The CPU usage is:  9.1
RAM memory % used: 21.4


In [None]:
# g.nodes['track'].data['album_img_emb'] = torch.tensor(np.asarray(list(df_tracks['album_img_emb'].values)))
# g.nodes['track'].data['album_text_emb'] = torch.tensor(np.asarray(list(df_tracks['album_text_emb'].values)))

In [107]:
g.nodes['playlist'].data['id'] = torch.arange(g.number_of_nodes('playlist'))
g.nodes['track'].data['id'] = torch.arange(g.number_of_nodes('track'))

In [108]:
g.nodes['playlist']

NodeSpace(data={'id': tensor([     0,      1,      2,  ..., 999997, 999998, 999999])})

In [109]:
g.nodes['track']

NodeSpace(data={'danceability': tensor([3, 3, 3,  ..., 3, 2, 1]), 'energy': tensor([3, 3, 3,  ..., 2, 3, 2]), 'loudness': tensor([2, 3, 2,  ..., 2, 2, 2]), 'speechiness': tensor([3, 3, 3,  ..., 2, 1, 1]), 'acousticness': tensor([1, 1, 1,  ..., 1, 1, 2]), 'instrumentalness': tensor([2, 2, 1,  ..., 2, 1, 2]), 'liveness': tensor([1, 3, 1,  ..., 2, 2, 2]), 'valence': tensor([3, 3, 3,  ..., 2, 3, 1]), 'tempo': tensor([2, 3, 1,  ..., 1, 3, 2]), 'genre': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'id': tensor([      0,       1,       2,  ..., 2262187, 2262188, 2262189])})

# 2. Train test splits

In [110]:
def split_by_pid(df, group_by_val, train_size=.8, val_size = .1, test_size=.1):
    print("***Splitting by Playlist***")
    train_pids, all_test_pids = train_test_split(df[group_by_val].unique(), test_size=test_size+val_size, random_state=1)
    all_test = df[df.pid.isin(all_test_pids)]
    val_pids, test_pids = train_test_split(all_test[group_by_val].unique(), test_size=val_size, random_state=1)
    train = df[df.pid.isin(train_pids)]
    val = df[df.pid.isin(val_pids)]
    test = df[df.pid.isin(test_pids)]
    
    print("***Current Set has {} pids in train, {} pids in val, {} pids in test".format(len(train_pids), len(val_pids), len(test_pids)))
    return list(train.index), list(val.index), list(test.index)

In [111]:
train_indices, val_indices, test_indices = split_by_pid(df = df_playlists, group_by_val = 'pid')

***Splitting by Playlist***
***Current Set has 800000 pids in train, 180000 pids in val, 20000 pids in test


In [117]:
import scipy.sparse as ssp
def build_val_test_matrix(g, val_indices, test_indices, utype, itype, etype):
    n_users = g.number_of_nodes(utype)
    n_items = g.number_of_nodes(itype)
    val_src, val_dst = g.find_edges(val_indices, etype=etype)
    test_src, test_dst = g.find_edges(test_indices, etype=etype)
    val_src = val_src.numpy()
    val_dst = val_dst.numpy()
    test_src = test_src.numpy()
    test_dst = test_dst.numpy()
    val_matrix = ssp.coo_matrix((np.ones_like(val_src), (val_src, val_dst)), (n_users, n_items))
    test_matrix = ssp.coo_matrix((np.ones_like(test_src), (test_src, test_dst)), (n_users, n_items))

    return val_matrix, test_matrix

In [119]:
val_matrix, test_matrix = build_val_test_matrix(g, val_indices, test_indices, 'playlist', 'track', 'contains')

In [122]:
dataset = {
        'train-graph': train_g,
        'val-matrix': val_matrix,
        'test-matrix': test_matrix,
        'item-texts': None,
        'item-images': None,
        'user-type': 'playlist',
        'item-type': 'track',
        'user-to-item-type': 'contained_by',
        'item-to-user-type': 'contains',
        'timestamp-edge-column': None}
with open(directory+"dataset_without_im.pkl", 'wb') as f:
        pickle.dump(dataset, f)

In [21]:
import numpy as np 

In [22]:
def build_train_graph(g, train_indices, utype, itype, etype, etype_rev):
    train_g = g.edge_subgraph(
        {etype: train_indices, etype_rev: train_indices},
        relabel_nodes=False)

    # copy features
    for ntype in g.ntypes:
        for col, data in g.nodes[ntype].data.items():
            train_g.nodes[ntype].data[col] = data
    for etype in g.etypes:
        for col, data in g.edges[etype].data.items():
            train_g.edges[etype].data[col] = data[train_g.edges[etype].data[dgl.EID]]

    return train_g

In [23]:
# train_indices = np.arange(len(df_playlists))
train_g = build_train_graph(g, train_indices, 'playlist', 'track', 'contains', 'contained_by' )
val_g = build_train_graph(g, val_indices, 'playlist', 'track', 'contains', 'contained_by')

# 3. Build Sampler

## 3.1 HEAD POS NEG Tracks sampler

In [24]:
class ItemToItemBatchSampler(IterableDataset):
    def __init__(self, g, user_type, item_type, batch_size):
        self.g = g
        self.user_type = user_type
        self.item_type = item_type
        self.user_to_item_etype = list(g.metagraph()[user_type][item_type])[0]
        self.item_to_user_etype = list(g.metagraph()[item_type][user_type])[0]
        self.batch_size = batch_size

    def __iter__(self):
        while True:
            heads = torch.randint(0, self.g.number_of_nodes(self.item_type), (self.batch_size,))
            tails = dgl.sampling.random_walk(
                self.g,
                heads,
                metapath=[self.item_to_user_etype, self.user_to_item_etype])[0][:, 2]
            neg_tails = torch.randint(0, self.g.number_of_nodes(self.item_type), (self.batch_size,))

            mask = (tails != -1)
            yield heads[mask], tails[mask], neg_tails[mask]


this block below generate a small graph data set that can be used to do parameters tuning

In [None]:
# heads = torch.randint(0, train_g.number_of_nodes('track'), (10000,))

# tails = dgl.sampling.random_walk(
#     train_g,
#     heads,
#     metapath=['contained_by', 'contains'])[0]
# playlist_ids = torch.unique(tails[:, 1]).numpy()
# track_ids = torch.unique(torch.cat([tails[:,0],tails[:,-1]])).numpy()
# df_tracks_small = df_tracks[df_tracks['tid'].isin(track_ids)]
# df_playlists_info_small = df_playlists_info[df_playlists_info['pid'].isin(playlist_ids)]
# df_playlists_small  = df_playlists[df_playlists['pid'].isin(playlist_ids) & df_playlists['tid'].isin(track_ids)]
# df_tracks_small = df_tracks_small.reset_index(drop=True)
# df_playlists_info_small = df_playlists_info_small.reset_index(drop=True)
# df_playlists_small = df_playlists_small.reset_index(drop=True)
# new_track_uri_ids ={x:idx for idx, x in enumerate(list(df_tracks_small['tid']))}
# new_playlists_ids ={x:idx for idx, x in enumerate(list(df_playlists_info_small['pid']))}
# df_tracks_small['tid'] = [new_track_uri_ids[x] for x in list(df_tracks_small['tid'])]
# df_playlists_info_small['pid'] = [new_playlists_ids[x] for x in list(df_playlists_info_small['pid'])]
# df_playlists_small['pid'] = [new_playlists_ids[x] for x in list(df_playlists_small['pid'])]
# df_playlists_small['tid'] = [new_track_uri_ids[x] for x in list(df_playlists_small['tid'])]
# data = {
#     'df_playlist': df_playlists_small,
#     'df_playlist_info': df_playlists_info_small,
#     'df_track': df_tracks_small
# }
# pickle.dump(data, open('ns_music_small_data.p', 'wb'))

The sampler below generate positive and negative edges
- sample a batch of heads
- do track -> playlist -> track random walk to find pairs of positive edges
- random sample batch of nodes as negative edges

In [25]:
batch_sampler = ItemToItemBatchSampler(train_g, 'playlist', 'track', 32)

In [26]:
batch_iter = iter(batch_sampler)

In [27]:
heads, tails, neg_tails = next(batch_iter)

In [28]:
heads, tails, neg_tails

(tensor([ 413061,  879847, 1159623, 1749339,  262613, 1690124, 2200180,  607687,
         1476803,  247953,  103258, 1321047, 1339018,  828669, 1311840, 2196059,
         1786005, 1690300,  791193,  103203,  139474,  816963, 1666529,  866276,
         1782598,  253172,  666214, 1181945,  890482]),
 tensor([  20202,  879834,   50731,  522094,  262617, 1690104, 1311800,    4894,
           15503,   28590,   51565, 1390839, 1339031,  828726,  497469,   27669,
          508329,  154006, 1217686,  366362,  140033,   72889,  199389,   20171,
           16341,   37195,   31381,  124773,  146017]),
 tensor([1279967, 2205394, 1001195, 1603003,  362784,  556222, 1405729,  164426,
         1501382,  737591,  760490, 2174565, 2138340,  130636, 1908287, 2187245,
          266175,  472640, 1087661, 2191565, 1691016, 1953413, 2051470,  170384,
         1915228,  258887,  339759,  579151,  228380]))

## 3.2  Neighborhood sampler default

In [29]:

def compact_and_copy(frontier, seeds):
    block = dgl.to_block(frontier, seeds)
    for col, data in frontier.edata.items():
        if col == dgl.EID:
            continue
        block.edata[col] = data[block.edata[dgl.EID]]
    return block

class NeighborSampler(object):
    def __init__(self, g, user_type, item_type, random_walk_length, random_walk_restart_prob,
                 num_random_walks, num_neighbors, num_layers):
        self.g = g
        self.user_type = user_type
        self.item_type = item_type
        self.user_to_item_etype = list(g.metagraph()[user_type][item_type])[0]
        self.item_to_user_etype = list(g.metagraph()[item_type][user_type])[0]
        self.samplers = [
            dgl.sampling.PinSAGESampler(g, item_type, user_type, random_walk_length,
                                        random_walk_restart_prob, num_random_walks, num_neighbors)
            for _ in range(num_layers)]

    def sample_blocks(self, seeds, heads=None, tails=None, neg_tails=None):
        blocks = []
        for sampler in self.samplers:
            frontier = sampler(seeds)
            if heads is not None:
                eids = frontier.edge_ids(torch.cat([heads, heads]), torch.cat([tails, neg_tails]), return_uv=True)[2]
                if len(eids) > 0:
                    old_frontier = frontier
                    frontier = dgl.remove_edges(old_frontier, eids)
                    # print(old_frontier)
                    # print(frontier)
                    # print(frontier.edata['weights'])
                    # frontier.edata['weights'] = old_frontier.edata['weights'][frontier.edata[dgl.EID]]
            block = compact_and_copy(frontier, seeds)
            seeds = block.srcdata[dgl.NID]
            blocks.insert(0, block)
        return blocks

    def sample_from_item_pairs(self, heads, tails, neg_tails):
        # Create a graph with positive connections only and another graph with negative
        # connections only.
        pos_graph = dgl.graph(
            (heads, tails),
            num_nodes=self.g.number_of_nodes(self.item_type))
        neg_graph = dgl.graph(
            (heads, neg_tails),
            num_nodes=self.g.number_of_nodes(self.item_type))
        pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])
        seeds = pos_graph.ndata[dgl.NID]

        blocks = self.sample_blocks(seeds, heads, tails, neg_tails)
        return pos_graph, neg_graph, blocks


In [30]:
def assign_features_to_blocks(blocks, g, ntype='track'):
    
    data = blocks[0].srcdata
    
    for col in g.nodes[ntype].data.keys():
        if  col == dgl.NID:
            continue
        induced_nodes = data[dgl.NID]
        data[col] = g.nodes[ntype].data[col][induced_nodes]

    
    data = blocks[-1].dstdata
    for col in g.nodes[ntype].data.keys():
        if  col == dgl.NID:
            continue
        induced_nodes = data[dgl.NID]
        data[col] = g.nodes[ntype].data[col][induced_nodes]


In [127]:
class PinSAGECollator(object):
    def __init__(self, sampler, g, ntype):
        self.sampler = sampler
        self.ntype = ntype
        self.g = g

    def collate_train(self, batches):
        heads, tails, neg_tails = batches[0]
        # Construct multilayer neighborhood via PinSAGE...
        pos_graph, neg_graph, blocks = self.sampler.sample_from_item_pairs(heads, tails, neg_tails)
        assign_features_to_blocks(blocks, self.g, self.ntype)

        return pos_graph, neg_graph, blocks
    
    def collate_test(self, samples):
        batch = torch.LongTensor(samples)
        blocks = self.sampler.sample_blocks(batch)
        assign_features_to_blocks(blocks, self.g, self.ntype)
        return blocks


**IMPORTANT** This is the neighbor graph generator

- batch sampler: generate pos and negative pairs
- neigbbor sampler:   generate neighbor graph of nodes from pos + neg pairs

please go through the implementation details if you want to make modifications

The output consists of:

- pos graph: data structure storing positive pairs 
- neg graph: data structure storing negative pairs 
- blocks: data structure fascilitating message passing:
    - blocks[0]  frontier 2 -> frontier 1 
    - blocks[1]  frontier 1 -> nodes of interests

corresponding nodes features are stored in blocks as well 
- `block.srcdata` src nodes data 
- `block.dstdata` dst nodes data

**IMPORTANT** 
- The destination nodes are the destination end of edges in the graph. 
- The source nodes stored in `block.srcnode` are not only the source end of edges in the graph, it consists of:
    - destination end nodes 
    - source end nodes

In [129]:
neighbor_sampler = NeighborSampler(train_g, 'playlist', 'track', 
                                   random_walk_length=2, random_walk_restart_prob=0.5, num_random_walks=10, num_neighbors=3, num_layers=2)
collator = PinSAGECollator(neighbor_sampler, train_g, 'track')


In [130]:
dataloader = DataLoader(
    batch_sampler,
    collate_fn=collator.collate_train,
    num_workers=4) #was 8 
dataloader_test = DataLoader(
    batch_sampler,
    collate_fn=collator.collate_test,
    num_workers=4) #was 8

In [34]:
dataloader_it = iter(dataloader)

In [35]:
pos_graph,neg_graph, blocks = next(dataloader_it)

In [36]:
blocks[0].srcdata

{'_ID': tensor([ 946700, 1098320, 1478866,  ..., 1871281, 1081338, 1081348]), 'danceability': tensor([2, 1, 2,  ..., 2, 2, 2]), 'energy': tensor([3, 3, 1,  ..., 3, 3, 1]), 'loudness': tensor([3, 3, 2,  ..., 1, 2, 1]), 'speechiness': tensor([3, 3, 1,  ..., 3, 3, 3]), 'acousticness': tensor([2, 1, 3,  ..., 3, 3, 3]), 'instrumentalness': tensor([1, 2, 2,  ..., 1, 1, 1]), 'liveness': tensor([1, 1, 3,  ..., 3, 3, 3]), 'valence': tensor([3, 2, 3,  ..., 2, 2, 2]), 'tempo': tensor([3, 1, 2,  ..., 2, 3, 1]), 'genre': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'id': tensor([ 946700, 1098320, 1478866,  ..., 1871281, 1081338, 1081348])}

# 4. Modeling

outline of model procedure:

inputs: `pos_graph`, `neg_graph`, `block_0`, `block_1`

- compute node projection for all nodes,  if you understand the section above you will see all we need to computes are:
    - source nodes of block_0 
    - destination nodes of block_1
    
- run 2 sage layers:
    - layer 1 on block 0: frontier 2 -> frontier 1
    - layer 2 on block 1: frontier 1 -> nodes of interests 
    
- score on nodes of interests
    - for nodes u, v,  dot(u,v) + bias(u) + bias(v)

- loss function sum(neg_pairs_scores)  - sum(pos_pairs_scores) + 1




In [37]:
from torch import nn

Feature projections:

- music feature: 
    - each entry map to embedding of length 16
    - total size 144 
- genre feature: as it is  20 
- album img feature: as it is  2048
- track id feature: map to 128 embeddding


Feature aggregation:

- concatenate music feature, genre feature , album img feature : size 2212
- Fully connected layer 2212x128 reduce the dimentionality of the concatenated feature 
- add id feature

FC(concate[music, genre, img_emb]) + id_emb




In [50]:
def disable_grad(module):
    for param in module.parameters():
        param.requires_grad = False


def _init_input_modules(g, ntype):
    module_dict = nn.ModuleDict()
    
    tracks_data = g.nodes[ntype].data
    
    module_dict['track_id'] = nn.Embedding(tracks_data['id'].max()+1, 128)
    
    for m in ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']: 
        module_dict[m] = nn.Embedding(tracks_data[m].max() + 1, 16)

    return module_dict


# class LinearProjector(nn.Module):
#     """
#     Projects each input feature of the graph linearly and sums them up
#     """

#     def __init__(self, full_graph, ntype):
#         super().__init__()

#         self.ntype = ntype
#         #self.fc = nn.Linear(164, 128)
#         self.fc = nn.Linear(2212, 128)
#         self.inputs = _init_input_modules(full_graph, ntype)

#     def forward(self, ndata):
        
#         # get music feature
#         music_features = []
#         for c in ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']:

#             module = self.inputs[c]
#             music_features.append(module(ndata[c]))
#         music_features = torch.cat(music_features, dim=1)
        
        
#         # id embedding 
#         id_embedding = self.inputs['track_id'](ndata['id'])
        
#         # album feature 
#         img_emb = ndata['album_img_emb']
        
#         # genre 
#         genre = ndata['genre']
        
#         # concatenate 
#         feature = torch.cat([music_features, genre, ndata['album_img_emb']], dim=1)
#         #feature = torch.cat([music_features, genre], dim=1)

#         projection = self.fc(feature) + id_embedding
        
#         return projection
    
class LinearProjector(nn.Module):
    """
    Projects each input feature of the graph linearly and sums them up
    """

    def __init__(self, full_graph, ntype):
        super().__init__()

        self.ntype = ntype
        self.fc = nn.Linear(164, 128)
        #self.fc = nn.Linear(2212, 128)
        self.inputs = _init_input_modules(full_graph, ntype)

    def forward(self, ndata):
        
        # get music feature
        music_features = []
        for c in ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']:

            module = self.inputs[c]
            music_features.append(module(ndata[c]))
        music_features = torch.cat(music_features, dim=1)
        
        
#         # id embedding 
#         id_embedding = self.inputs['track_id'](ndata['id'])
        
#         # album feature 
#         img_emb = ndata['album_img_emb']
        
        # genre 
        genre = ndata['genre']
        
        # concatenate 
        #feature = torch.cat([music_features, genre, ndata['album_img_emb']], dim=1)
        feature = torch.cat([music_features, genre], dim=1)

        projection = self.fc(feature) #+ id_embedding
        
        return projection

## 4.2 sage layers

not too much to discuss, please understand every line of the the following if you intend to make model modification

notice the changes in `itemtoitemscorer`

In [40]:
import torch.nn.functional as F


class WeightedSAGEConv(nn.Module):
    def __init__(self, input_dims, hidden_dims, output_dims, act=F.relu):
        super().__init__()

        self.act = act
        self.Q = nn.Linear(input_dims, hidden_dims)
        self.W = nn.Linear(input_dims + hidden_dims, output_dims)
        self.reset_parameters()
        self.dropout = nn.Dropout(0.5)

    def reset_parameters(self):
        gain = nn.init.calculate_gain('relu')
        nn.init.xavier_uniform_(self.Q.weight, gain=gain)
        nn.init.xavier_uniform_(self.W.weight, gain=gain)
        nn.init.constant_(self.Q.bias, 0)
        nn.init.constant_(self.W.bias, 0)

    def forward(self, g, h, weights):
        """
        g : graph
        h : node features
        weights : scalar edge weights
        """
        h_src, h_dst = h
        with g.local_scope():
            g.srcdata['n'] = self.act(self.Q(self.dropout(h_src)))
            g.edata['w'] = weights.float()
            g.update_all(fn.u_mul_e('n', 'w', 'm'), fn.sum('m', 'n'))
            g.update_all(fn.copy_e('w', 'm'), fn.sum('m', 'ws'))
            n = g.dstdata['n']
            ws = g.dstdata['ws'].unsqueeze(1).clamp(min=1)
            z = self.act(self.W(self.dropout(torch.cat([n / ws, h_dst], 1))))
            z_norm = z.norm(2, 1, keepdim=True)
            z_norm = torch.where(z_norm == 0, torch.tensor(1.).to(z_norm), z_norm)
            z = z / z_norm
            return z


class SAGENet(nn.Module):
    def __init__(self, hidden_dims, n_layers):
        """
        g : DGLHeteroGraph
            The user-item interaction graph.
            This is only for finding the range of categorical variables.
        item_textsets : torchtext.data.Dataset
            The textual features of each item node.
        """
        super().__init__()

        self.convs = nn.ModuleList()
        for _ in range(n_layers):
            self.convs.append(WeightedSAGEConv(hidden_dims, hidden_dims, hidden_dims))

    def forward(self, blocks, h):
        for layer, block in zip(self.convs, blocks):
            h_dst = h[:block.number_of_nodes('DST/' + block.ntypes[0])]
            h = layer(block, (h, h_dst), block.edata['weights'])
        return h

class ItemToItemScorer(nn.Module):
    def __init__(self, full_graph, ntype):
        super().__init__()

        n_nodes = full_graph.number_of_nodes(ntype)
        self.bias = nn.Parameter(torch.zeros(n_nodes))

    def _add_bias(self, edges):
        bias_src = self.bias[edges.src[dgl.NID]]
        bias_dst = self.bias[edges.dst[dgl.NID]]
        return {'s': edges.data['s'] + bias_src + bias_dst}

    def forward(self, item_item_graph, h):
        """
        item_item_graph : graph consists of edges connecting the pairs
        h : hidden state of every node
        """
        with item_item_graph.local_scope():
            item_item_graph.ndata['h'] = h
            item_item_graph.apply_edges(fn.u_dot_v('h', 'h', 's'))
            item_item_graph.edata['s'] = item_item_graph.edata['s'].flatten()
            item_item_graph.apply_edges(self._add_bias)

            pair_score = item_item_graph.edata['s']
        return pair_score


In [90]:
from sklearn.metrics import roc_auc_score, ndcg_score
def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).detach().cpu().numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

class PinSAGEModel(nn.Module):
    def __init__(self, full_graph, ntype, hidden_dims, n_layers):
        super().__init__()

        self.proj = LinearProjector(full_graph, ntype)
        self.sage = SAGENet(hidden_dims, n_layers)
        self.scorer = ItemToItemScorer(full_graph, ntype)

    def forward(self, pos_graph, neg_graph, blocks):
        h_item = self.get_repr(blocks)
        pos_score = self.scorer(pos_graph, h_item)
        neg_score = self.scorer(neg_graph, h_item)
        
        #return h_item, pos_score, neg_score
        auc = compute_auc(pos_score, neg_score)
        return (neg_score - pos_score + 1).clamp(min=0), auc 


    def get_repr(self, blocks):
        h_item = self.proj(blocks[0].srcdata)
        h_item_dst = self.proj(blocks[-1].dstdata)
        return h_item_dst + self.sage(blocks, h_item)


# Run

In [91]:
model = PinSAGEModel(train_g, 'track', 128, 2)
model = model.cuda()

In [124]:
val_matrix = val_matrix.tocsr()

In [92]:
dataloader_it = iter(dataloader)

In [93]:
device = torch.device('cuda:0')

In [132]:
model.train()
opt = torch.optim.Adam(model.parameters(), lr=3e-5)
losses = []
for batch_id in  range(100000000):
    pos_graph, neg_graph, blocks = next(dataloader_it)
    # Copy to GPU
    for i in range(len(blocks)):
        blocks[i] = blocks[i].to(device)
    pos_graph = pos_graph.to(device)
    neg_graph = neg_graph.to(device)
    
    loss, auc = model(pos_graph, neg_graph, blocks)
    
    loss = loss.mean()
    opt.zero_grad()
    loss.backward()
    opt.step()
    if batch_id % 100 == 0:
        print(loss, auc)
        losses.append([loss.item(),auc])
        
model.eval()
with torch.no_grad():
    item_batches = torch.arange(g.number_of_nodes(item_ntype)).split(args.batch_size)
    h_item_batches = []
    for blocks in dataloader_test:
        for i in range(len(blocks)):
            blocks[i] = blocks[i].to(device)

        h_item_batches.append(model.get_repr(blocks))
    h_item = torch.cat(h_item_batches, 0)

    print(evaluation.evaluate_nn(dataset, h_item, args.k, args.batch_size))

tensor(1.2058, device='cuda:0', grad_fn=<MeanBackward0>) 0.6826222684703434


RuntimeError: DataLoader worker (pid(s) 72779, 72910, 72911, 73169) exited unexpectedly

In [None]:
from matplotlib import pyplot as plt
def moving_average(a, n=3) :
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n

plt.plot(moving_average(np.array([x[0] for x in losses]), 100))